#!/usr/bin/env python3
"""
Comprehensive 23andMe Genome Analysis Script
Analyzes Bill Syrros's raw genotype data for health-relevant SNP associations
"""

import json
from collections import defaultdict
from pathlib import Path

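# SNP database with health associations.
# Keys are rs identifiers (e.g. 'rs1801133'). Each entry is a dict with the
# string fields 'gene', 'variant', 'category', 'affects', 'risk_allele',
# 'clinical_significance' and 'notes', plus 'genotype_interpretation', a dict
# mapping a two-character genotype string to its description.
# In the entries below 'clinical_significance' is 'low', 'moderate' or 'high'.
# Each heterozygous genotype is listed under one allele order only (e.g. 'GA'
# for rs6025 but 'AG' for rs1801394), so a genotype must be matched in the
# order stored here.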
SNP_DATABASE = {
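    # Cardiovascular / Heart Health
    # NOTE(review): rs1801282 (PPARG) is listed in this section but its
    # 'category' field is 'Metabolic'.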
    'rs1801133': {
        'gene': 'MTHFR',
        'variant': 'C677T',
        'category': 'Cardiovascular',
        'affects': 'Folate metabolism, homocysteine levels',
        'risk_allele': 'T',
        'genotype_interpretation': {
            'CC': 'Normal folate metabolism, homocysteine levels optimal',
            'CT': 'Heterozygous, mildly reduced enzyme activity (~35% reduction)',
            'TT': 'Homozygous, significantly reduced enzyme activity (~65% reduction), elevated homocysteine risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'Elevated homocysteine increases cardiovascular and thrombotic risk'
    },
    'rs1801131': {
        'gene': 'MTHFR',
        'variant': 'A1298C',
        'category': 'Cardiovascular',
        'affects': 'Folate metabolism (secondary site)',
        'risk_allele': 'C',
        'genotype_interpretation': {
            'AA': 'Normal enzyme activity',
            'AC': 'Mildly reduced enzyme activity',
            'CC': 'Moderately reduced enzyme activity'
        },
        'clinical_significance': 'low',
        'notes': 'Less severe than C677T variant'
    },
    'rs1799963': {
        'gene': 'F2 (Prothrombin)',
        'variant': 'G20210A',
        'category': 'Cardiovascular',
        'affects': 'Blood clotting risk',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Normal, no increased clotting risk',
            'GA': 'Heterozygous, 2-3x increased thrombosis risk',
            'AA': 'Rare homozygous, significantly increased thrombosis risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'One of the most common hereditary thrombophilias'
    },
    'rs6025': {
        'gene': 'F5 (Factor V)',
        'variant': 'Leiden (R506Q)',
        'category': 'Cardiovascular',
        'affects': 'Blood clotting resistance to anticoagulant protein C',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Wild-type, normal anticoagulation response',
            'GA': '3-8x increased thrombosis risk',
            'AA': 'Rare, severe increased thrombosis risk'
        },
        'clinical_significance': 'high',
        'notes': 'Factor V Leiden - most common inherited thrombophilia in Europeans'
    },
    'rs429358': {
        'gene': 'APOE',
        'variant': 'ε2/ε3/ε4 (Cys112Arg)',
        'category': 'Cardiovascular',
        'affects': 'Cholesterol metabolism, Alzheimer\'s disease risk',
        'risk_allele': 'C (ε4)',
        'genotype_interpretation': {
            'TT': 'ε2/ε2 genotype (APOE2) - lowest Alzheimer\'s risk, lower LDL',
            'TC': 'ε2/ε3 or ε3/ε4 genotype - depends on rs7412',
            'CC': 'ε3/ε3 or ε4/ε4 genotype - depends on rs7412'
        },
        'clinical_significance': 'high',
        'notes': 'Requires rs7412 to determine full APOE status'
    },
    'rs7412': {
        'gene': 'APOE',
        'variant': 'ε2/ε3/ε4 (Arg158Cys)',
        'category': 'Cardiovascular',
        'affects': 'Cholesterol metabolism, Alzheimer\'s disease risk',
        'risk_allele': 'T (ε2)',
        'genotype_interpretation': {
            'CC': 'ε3 or ε4 allele',
            'CT': 'ε2 or ε3 allele',
            'TT': 'ε2/ε2 or ε2/ε3 genotype'
        },
        'clinical_significance': 'high',
        'notes': 'Combined with rs429358 to determine APOE2/3/4 status'
    },
    'rs1801282': {
        'gene': 'PPARG',
        'variant': 'Pro12Ala',
        'category': 'Metabolic',
        'affects': 'Insulin sensitivity, metabolic syndrome risk',
        'risk_allele': 'G',
        'genotype_interpretation': {
            'CC': 'Normal insulin sensitivity',
            'CG': 'Ala allele, protective against type 2 diabetes, improved insulin sensitivity',
            'GG': 'Pro/Pro, increased metabolic syndrome and obesity risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'Ala allele is protective, reduces diabetes risk by ~25%'
    },
    'rs662': {
        'gene': 'PON1',
        'variant': 'Q192R',
        'category': 'Cardiovascular',
        'affects': 'Oxidative stress protection, HDL function',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'R allele, better protection against oxidative stress',
            'GA': 'Heterozygous, intermediate protection',
            'AA': 'Q allele, reduced antioxidant capacity'
        },
        'clinical_significance': 'low',
        'notes': 'Affects HDL\'s ability to protect against LDL oxidation'
    },
    'rs4340': {
        'gene': 'ACE',
        'variant': 'I/D proxy (Alu repeat)',
        'category': 'Cardiovascular',
        'affects': 'Blood pressure, ACE enzyme levels',
        'risk_allele': 'None (functional variant)',
        'genotype_interpretation': {
            'AA': 'I/I genotype - lower ACE levels, better endurance capacity',
            'GA': 'I/D genotype - intermediate ACE levels',
            'GG': 'D/D genotype - higher ACE levels, increased hypertension risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'Note: rs4340 is a proxy SNP for the functional I/D polymorphism'
    },
    'rs5186': {
        'gene': 'AGTR1',
        'variant': 'A1166C',
        'category': 'Cardiovascular',
        'affects': 'Blood pressure regulation',
        'risk_allele': 'C',
        'genotype_interpretation': {
            'AA': 'Lower risk for hypertension',
            'AC': 'Intermediate risk',
            'CC': 'Increased hypertension and cardiovascular disease risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'Affects angiotensin II signaling in blood vessels'
    },
    'rs1799983': {
        'gene': 'NOS3',
        'variant': 'G894T',
        'category': 'Cardiovascular',
        'affects': 'Nitric oxide production, vascular function',
        'risk_allele': 'T',
        'genotype_interpretation': {
            'GG': 'Normal nitric oxide production, better endothelial function',
            'GT': 'Intermediate nitric oxide levels',
            'TT': 'Reduced nitric oxide production, impaired vascular relaxation, hypertension risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'Affects endothelial function and vascular health'
    },
    'rs1800796': {
        'gene': 'IL6',
        'variant': '-174G/C',
        'category': 'Cardiovascular',
        'affects': 'Inflammation, IL-6 levels',
        'risk_allele': 'C',
        'genotype_interpretation': {
            'GG': 'Higher IL-6 production, pro-inflammatory',
            'GC': 'Intermediate IL-6 levels',
            'CC': 'Lower IL-6 production, anti-inflammatory benefit'
        },
        'clinical_significance': 'low',
        'notes': 'IL-6 is a key inflammatory cytokine'
    },
    'rs1800795': {
        'gene': 'IL6',
        'variant': '-174G>C',
        'category': 'Cardiovascular',
        'affects': 'IL-6 levels and inflammation',
        'risk_allele': 'G',
        'genotype_interpretation': {
            'GG': 'Increased IL-6 levels',
            'GC': 'Intermediate levels',
            'CC': 'Lower IL-6 levels'
        },
        'clinical_significance': 'low',
        'notes': 'May be same as rs1800796'
    },
    'rs1800629': {
        'gene': 'TNF',
        'variant': '-308G/A',
        'category': 'Cardiovascular',
        'affects': 'TNF-alpha inflammation marker',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Lower TNF-alpha production',
            'GA': 'Intermediate TNF-alpha levels',
            'AA': 'Higher TNF-alpha production, pro-inflammatory'
        },
        'clinical_significance': 'low',
        'notes': 'TNF-alpha is a major inflammatory cytokine'
    },
    'rs3798220': {
        'gene': 'LPA',
        'variant': 'IVS25+323C/G',
        'category': 'Cardiovascular',
        'affects': 'Lipoprotein(a) levels, cardiovascular risk',
        'risk_allele': 'G',
        'genotype_interpretation': {
            'CC': 'Lower Lp(a) levels',
            'CG': 'Intermediate Lp(a) levels',
            'GG': 'Higher Lp(a) levels, increased cardiovascular risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'Elevated Lp(a) is independent cardiovascular risk factor'
    },
    'rs10455872': {
        'gene': 'LPA',
        'variant': 'Pentanucleotide repeat',
        'category': 'Cardiovascular',
        'affects': 'Lipoprotein(a) levels, cardiovascular risk',
        'risk_allele': 'T',
        'genotype_interpretation': {
            'CC': 'Lower Lp(a) levels',
            'CT': 'Intermediate Lp(a) levels',
            'TT': 'Higher Lp(a) levels, increased cardiovascular risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'Works with rs3798220 to determine Lp(a) phenotype'
    },

    # Metabolic / Diabetes / Weight
    'rs7903146': {
        'gene': 'TCF7L2',
        'variant': 'rs7903146',
        'category': 'Metabolic',
        'affects': 'Type 2 diabetes risk (strongest known locus)',
        'risk_allele': 'T',
        'genotype_interpretation': {
            'CC': 'Normal diabetes risk (~7%)',
            'CT': '~1.4x increased diabetes risk',
            'TT': '~2-4x increased type 2 diabetes risk'
        },
        'clinical_significance': 'high',
        'notes': 'Most significant T2D susceptibility gene; each T allele ~20% increased risk'
    },
    'rs5219': {
        'gene': 'KCNJ11',
        'variant': 'E23K',
        'category': 'Metabolic',
        'affects': 'Insulin secretion and glucose control',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'CC': 'K allele, normal insulin secretion',
            'CA': 'Heterozygous, intermediate risk',
            'AA': 'E allele, increased type 2 diabetes risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'Affects ATP-sensitive potassium channel in pancreatic beta cells'
    },
    'rs13266634': {
        'gene': 'SLC30A8',
        'variant': 'R325W',
        'category': 'Metabolic',
        'affects': 'Zinc transporter, insulin secretion',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'CC': 'Normal, protective',
            'CA': 'Intermediate',
            'AA': 'Increased diabetes risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'Zinc transport critical for insulin secretion'
    },
    'rs4402960': {
        'gene': 'IGF2BP2',
        'variant': 'rs4402960',
        'category': 'Metabolic',
        'affects': 'Type 2 diabetes risk',
        'risk_allele': 'T',
        'genotype_interpretation': {
            'CC': 'Lower diabetes risk',
            'CT': 'Intermediate risk',
            'TT': 'Increased diabetes risk'
        },
        'clinical_significance': 'low',
        'notes': 'Involved in glucose metabolism'
    },
    'rs10811661': {
        'gene': 'CDKN2A/B',
        'variant': 'rs10811661',
        'category': 'Metabolic',
        'affects': 'Type 2 diabetes and pancreatic cancer risk',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'CC': 'Lower risk',
            'CA': 'Intermediate risk',
            'AA': 'Increased diabetes risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'Cell cycle regulator'
    },
    'rs9939609': {
        'gene': 'FTO',
        'variant': 'A/T',
        'category': 'Metabolic',
        'affects': 'Obesity risk, appetite regulation',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'TT': 'Lower obesity risk',
            'TA': 'Intermediate obesity risk (~1.3x)',
            'AA': 'Significantly increased obesity risk (~1.6x higher BMI)'
        },
        'clinical_significance': 'moderate',
        'notes': 'Most significant obesity-associated SNP; affects appetite circuits'
    },
    'rs17782313': {
        'gene': 'MC4R',
        'variant': 'rs17782313',
        'category': 'Metabolic',
        'affects': 'Obesity risk, appetite/satiety',
        'risk_allele': 'C',
        'genotype_interpretation': {
            'AA': 'Normal satiety, lower obesity risk',
            'AC': 'Intermediate obesity risk',
            'CC': 'Increased appetite, higher obesity risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'Melanocortin-4 receptor crucial for appetite control'
    },
    'rs1042713': {
        'gene': 'ADRB2',
        'variant': 'Arg16Gly',
        'category': 'Metabolic',
        'affects': 'Beta-2 adrenergic receptor, metabolic rate',
        'risk_allele': 'C',
        'genotype_interpretation': {
            'AA': 'Arg/Arg, higher metabolic rate and norepinephrine sensitivity',
            'AC': 'Intermediate',
            'CC': 'Gly/Gly, lower metabolic rate'
        },
        'clinical_significance': 'low',
        'notes': 'Affects sympathetic nervous system response'
    },
    'rs1042714': {
        'gene': 'ADRB2',
        'variant': 'Gln27Glu',
        'category': 'Metabolic',
        'affects': 'Beta-2 adrenergic receptor, weight regulation',
        'risk_allele': 'G',
        'genotype_interpretation': {
            'AA': 'Gln/Gln, lower weight gain risk',
            'AG': 'Intermediate',
            'GG': 'Glu/Glu, greater weight gain risk'
        },
        'clinical_significance': 'low',
        'notes': 'Works with rs1042713 for metabolic effects'
    },
    'rs4994': {
        'gene': 'ADRB3',
        'variant': 'Trp64Arg',
        'category': 'Metabolic',
        'affects': 'Beta-3 adrenergic receptor, metabolic efficiency',
        'risk_allele': 'C',
        'genotype_interpretation': {
            'TT': 'Trp/Trp, normal metabolism',
            'TC': 'Trp/Arg, intermediate',
            'CC': 'Arg/Arg, reduced metabolic efficiency, higher obesity risk'
        },
        'clinical_significance': 'low',
        'notes': 'Involved in thermogenesis and fat mobilization'
    },

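    # Nutrient Metabolism / Diet
    # NOTE(review): rs4680 (COMT) is listed at the end of this section but its
    # 'category' field is 'Mental/Cognitive'.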
    'rs4988235': {
        'gene': 'MCM6/LCT',
        'variant': 'C/T (-13910)',
        'category': 'Nutrient Metabolism',
        'affects': 'Lactose tolerance',
        'risk_allele': 'A (old nomenclature)',
        'genotype_interpretation': {
            'CC': 'Lactose intolerant (likely)',
            'CT': 'Lactose tolerant (heterozygous)',
            'TT': 'Lactose tolerant (homozygous)'
        },
        'clinical_significance': 'moderate',
        'notes': 'Determines adult lactase persistence; common in Northern European ancestry'
    },
    'rs1801394': {
        'gene': 'MTRR',
        'variant': 'A66G',
        'category': 'Nutrient Metabolism',
        'affects': 'B12 metabolism, methionine cycle',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Normal B12 metabolism',
            'AG': 'Intermediate enzyme activity',
            'AA': 'Reduced methionine synthase reductase activity, potential B12/folate issues'
        },
        'clinical_significance': 'low',
        'notes': 'Related to homocysteine metabolism'
    },
    'rs1805087': {
        'gene': 'MTR',
        'variant': 'A2756G',
        'category': 'Nutrient Metabolism',
        'affects': 'B12 and methionine metabolism',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Normal methionine synthase activity',
            'AG': 'Intermediate',
            'AA': 'Potential B12/methionine metabolism issues'
        },
        'clinical_significance': 'low',
        'notes': 'Cofactor required for methylation reactions'
    },
    'rs602662': {
        'gene': 'FUT2',
        'variant': 'A/G',
        'category': 'Nutrient Metabolism',
        'affects': 'B12 absorption, gut microbiota',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Secretor status, better B12 absorption',
            'GA': 'Intermediate',
            'AA': 'Non-secretor, reduced B12 absorption from food'
        },
        'clinical_significance': 'low',
        'notes': 'FUT2 secretor status affects B12 and micronutrient availability'
    },
    'rs7041': {
        'gene': 'GC/VDBP',
        'variant': 'D432E',
        'category': 'Nutrient Metabolism',
        'affects': 'Vitamin D binding protein, vitamin D status',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Type 1f, higher Vitamin D bioavailability',
            'GA': 'Type 1s/1f intermediate',
            'AA': 'Type 1s, lower Vitamin D bioavailability'
        },
        'clinical_significance': 'moderate',
        'notes': 'Affects vitamin D distribution and bioavailability'
    },
    'rs4588': {
        'gene': 'GC/VDBP',
        'variant': 'T436K',
        'category': 'Nutrient Metabolism',
        'affects': 'Vitamin D binding protein',
        'risk_allele': 'T',
        'genotype_interpretation': {
            'GG': 'Isoform 1f or 2',
            'GA': 'Intermediate',
            'AA': 'Isoform 2'
        },
        'clinical_significance': 'moderate',
        'notes': 'Works with rs7041 to determine VDBP isoform'
    },
    'rs10741657': {
        'gene': 'CYP2R1',
        'variant': 'rs10741657',
        'category': 'Nutrient Metabolism',
        'affects': 'Vitamin D metabolism, 25-OHD levels',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Higher vitamin D levels',
            'GA': 'Intermediate',
            'AA': 'Lower vitamin D levels, impaired metabolism'
        },
        'clinical_significance': 'moderate',
        'notes': 'Primary vitamin D 25-hydroxylase in liver'
    },
    'rs1800562': {
        'gene': 'HFE',
        'variant': 'C282Y',
        'category': 'Nutrient Metabolism',
        'affects': 'Iron overload risk (hemochromatosis)',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'CC': 'Normal iron metabolism',
            'CA': 'Heterozygous carrier, typically no hemochromatosis',
            'AA': 'Homozygous, significant hemochromatosis risk (85% penetrance)'
        },
        'clinical_significance': 'high',
        'notes': 'Most common genetic cause of hemochromatosis'
    },
    'rs1799945': {
        'gene': 'HFE',
        'variant': 'H63D',
        'category': 'Nutrient Metabolism',
        'affects': 'Iron overload risk (secondary)',
        'risk_allele': 'G',
        'genotype_interpretation': {
            'CC': 'Normal',
            'CG': 'Heterozygous carrier',
            'GG': 'Homozygous, mild risk if combined with C282Y'
        },
        'clinical_significance': 'low',
        'notes': 'Secondary hemochromatosis mutation; rarely causes disease alone'
    },
    'rs855791': {
        'gene': 'TMPRSS6',
        'variant': 'A736V',
        'category': 'Nutrient Metabolism',
        'affects': 'Iron levels and hepcidin regulation',
        'risk_allele': 'T',
        'genotype_interpretation': {
            'CC': 'A allele, normal iron regulation',
            'CT': 'Intermediate',
            'TT': 'V allele, lower iron levels'
        },
        'clinical_significance': 'low',
        'notes': 'Regulates iron absorption through hepcidin'
    },
    'rs12325817': {
        'gene': 'BCMO1',
        'variant': 'rs12325817',
        'category': 'Nutrient Metabolism',
        'affects': 'Beta-carotene to vitamin A conversion',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Normal beta-carotene conversion to vitamin A',
            'GA': 'Intermediate conversion',
            'AA': 'Poor beta-carotene converter, need more dietary vitamin A'
        },
        'clinical_significance': 'moderate',
        'notes': 'Affects plant-based vitamin A sufficiency'
    },
    'rs234706': {
        'gene': 'CBS',
        'variant': 'C699T',
        'category': 'Nutrient Metabolism',
        'affects': 'Homocysteine and sulfur metabolism',
        'risk_allele': 'T',
        'genotype_interpretation': {
            'CC': 'Normal cystathionine beta-synthase activity',
            'CT': 'Intermediate',
            'TT': 'Reduced CBS activity, elevated homocysteine and ammonia'
        },
        'clinical_significance': 'low',
        'notes': 'CBS is gateway to sulfur metabolism'
    },
    'rs1695': {
        'gene': 'GSTP1',
        'variant': 'A313G (Ile105Val)',
        'category': 'Nutrient Metabolism',
        'affects': 'Detoxification, glutathione S-transferase activity',
        'risk_allele': 'G',
        'genotype_interpretation': {
            'AA': 'Normal detox capacity',
            'AG': 'Intermediate',
            'GG': 'Reduced GSTP1 activity, lower detoxification capacity'
        },
        'clinical_significance': 'low',
        'notes': 'Phase II detox enzyme for environmental toxins'
    },
    'rs1048943': {
        'gene': 'CYP1A1',
        'variant': 'A2455G',
        'category': 'Nutrient Metabolism',
        'affects': 'Phase I detoxification (CYP1A1 induction)',
        'risk_allele': 'G',
        'genotype_interpretation': {
            'AA': 'Normal CYP1A1 induction',
            'AG': 'Intermediate',
            'GG': 'Increased inducibility, higher detox capacity but more metabolites'
        },
        'clinical_significance': 'low',
        'notes': 'Affects polycyclic aromatic hydrocarbon metabolism'
    },
    'rs4680': {
        'gene': 'COMT',
        'variant': 'Val158Met',
        'category': 'Mental/Cognitive',
        'affects': 'Catechol metabolism, dopamine/norepinephrine breakdown',
        'risk_allele': 'A (Met allele)',
        'genotype_interpretation': {
            'GG': 'Val/Val, faster catechol breakdown, more anxiety-resistant but potentially less focused',
            'GA': 'Val/Met, intermediate metabolism',
            'AA': 'Met/Met, slower catechol breakdown, higher dopamine/norepinephrine, better focus/motivation, higher stress sensitivity'
        },
        'clinical_significance': 'moderate',
        'notes': 'Warrior vs Worrier phenotype; also affects pain sensitivity'
    },

    # Fitness / Performance / Recovery
    'rs1815739': {
        'gene': 'ACTN3',
        'variant': 'R577X',
        'category': 'Fitness',
        'affects': 'Fast-twitch muscle fiber composition',
        'risk_allele': 'X',
        'genotype_interpretation': {
            'CC': 'R/R, alpha-actinin-3 present, good sprint/power performance',
            'CT': 'R/X, intermediate phenotype',
            'TT': 'X/X, no alpha-actinin-3, endurance advantage, may have poor sprint capacity'
        },
        'clinical_significance': 'moderate',
        'notes': 'ACTN3 R allele associated with elite sprinters; X/X with elite endurance athletes'
    },
    'rs4253778': {
        'gene': 'PPARA',
        'variant': 'rs4253778',
        'category': 'Fitness',
        'affects': 'Fat vs carbohydrate fuel preference',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Better carbohydrate utilization',
            'GA': 'Intermediate fuel usage',
            'AA': 'Better fat utilization, favorable for endurance'
        },
        'clinical_significance': 'low',
        'notes': 'Affects metabolic substrate preference'
    },
    'rs8192678': {
        'gene': 'PPARGC1A',
        'variant': 'Gly482Ser',
        'category': 'Fitness',
        'affects': 'Aerobic capacity, mitochondrial biogenesis',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'CC': 'Gly/Gly, superior aerobic capacity and mitochondrial function',
            'CA': 'Intermediate aerobic capacity',
            'AA': 'Ser/Ser, lower aerobic capacity baseline'
        },
        'clinical_significance': 'moderate',
        'notes': 'PGC-1alpha is master regulator of mitochondrial biogenesis'
    },
    'rs1800012': {
        'gene': 'COL1A1',
        'variant': 'Sp1 promoter (Xbal)',
        'category': 'Fitness',
        'affects': 'Collagen synthesis, connective tissue strength',
        'risk_allele': 'S',
        'genotype_interpretation': {
            'SS': 'Higher collagen production, stronger connective tissue',
            'Ss': 'Intermediate collagen levels',
            'ss': 'Lower collagen production, higher tendon/ligament injury risk'
        },
        'clinical_significance': 'low',
        'notes': 'Affects injury susceptibility in athletes'
    },
    'rs1799752': {
        'gene': 'ACE',
        'variant': 'I/D (Alu repeat)',
        'category': 'Fitness',
        'affects': 'ACE enzyme levels, endurance vs power performance',
        'risk_allele': 'D',
        'genotype_interpretation': {
            'II': 'I/I, lower circulating ACE, endurance advantage',
            'ID': 'I/D, intermediate phenotype',
            'DD': 'D/D, higher ACE, power/strength advantage'
        },
        'clinical_significance': 'moderate',
        'notes': 'Affects oxygen utilization and altitude adaptation; classic performance gene'
    },
    'rs7136446': {
        'gene': 'COL5A1',
        'variant': 'rs7136446',
        'category': 'Fitness',
        'affects': 'Collagen V synthesis, ligament/tendon properties',
        'risk_allele': 'C',
        'genotype_interpretation': {
            'AA': 'Normal collagen flexibility',
            'AC': 'Intermediate',
            'CC': 'Increased tendon/ligament stiffness, injury risk'
        },
        'clinical_significance': 'low',
        'notes': 'Type V collagen affects soft tissue properties'
    },

    # Sleep / Circadian
    'rs1801260': {
        'gene': 'CLOCK',
        'variant': 'A3111T',
        'category': 'Sleep/Circadian',
        'affects': 'Circadian rhythm, sleep timing',
        'risk_allele': 'T',
        'genotype_interpretation': {
            'AA': 'Morning preference, earlier sleep/wake timing',
            'AT': 'Intermediate chronotype',
            'TT': 'Evening preference, later sleep/wake timing'
        },
        'clinical_significance': 'low',
        'notes': 'CLOCK is master circadian regulator; affects chronotype'
    },
    'rs57875989': {
        'gene': 'DEC2/BHLHE41',
        'variant': 'rs57875989',
        'category': 'Sleep/Circadian',
        'affects': 'Sleep duration, short sleep phenotype',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Normal sleep need, requires 7-9 hours',
            'GA': 'Intermediate',
            'AA': 'Short sleep gene, may function well on 4-6 hours (rare)'
        },
        'clinical_significance': 'moderate',
        'notes': 'Only 1-3% carry short sleep variants; very rare'
    },
    'rs73598374': {
        'gene': 'ADA',
        'variant': 'rs73598374',
        'category': 'Sleep/Circadian',
        'affects': 'Deep sleep efficiency',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Normal deep sleep',
            'GA': 'Intermediate',
            'AA': 'Reduced deep sleep efficiency'
        },
        'clinical_significance': 'low',
        'notes': 'Adenosine deaminase affects adenosine-mediated sleep pressure'
    },

    # Longevity / Aging
    'rs2802292': {
        'gene': 'FOXO3',
        'variant': 'rs2802292',
        'category': 'Longevity',
        'affects': 'Longevity and healthy aging',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Common allele',
            'GA': 'Intermediate',
            'AA': 'Associated with increased longevity and healthy aging (meta-analyses)'
        },
        'clinical_significance': 'moderate',
        'notes': 'FOXO3 is major longevity gene; G allele associated with longer lifespan'
    },
    'rs1042522': {
        'gene': 'TP53',
        'variant': 'Arg72Pro',
        'category': 'Longevity',
        'affects': 'Tumor suppression, cancer risk',
        'risk_allele': 'G',
        'genotype_interpretation': {
            'CC': 'Pro/Pro, potentially more efficient tumor suppression',
            'CG': 'Arg/Pro, intermediate',
            'GG': 'Arg/Arg, may have slightly different apoptosis response'
        },
        'clinical_significance': 'moderate',
        'notes': 'p53 is guardian of the genome; affects cancer risk'
    },
    'rs1800566': {
        'gene': 'NQO1',
        'variant': 'C609T (Pro187Ser)',
        'category': 'Longevity',
        'affects': 'Antioxidant defense, NAD(P)H quinone oxidoreductase',
        'risk_allele': 'T',
        'genotype_interpretation': {
            'CC': 'Normal NQO1 activity',
            'CT': 'Heterozygous, ~50% activity',
            'TT': 'Homozygous null, no functional NQO1, increased oxidative stress'
        },
        'clinical_significance': 'moderate',
        'notes': 'Deficiency increases cancer risk from benzene and other quinones'
    },
    'rs4880': {
        'gene': 'SOD2',
        'variant': 'Ala16Val (Val9Ala)',
        'category': 'Longevity',
        'affects': 'Mitochondrial antioxidant defense',
        'risk_allele': 'A (Val)',
        'genotype_interpretation': {
            'TT': 'Ala/Ala, better mitochondrial antioxidant capacity',
            'TA': 'Intermediate',
            'AA': 'Val/Val, reduced mitochondrial superoxide dismutase activity'
        },
        'clinical_significance': 'moderate',
        'notes': 'MnSOD is key mitochondrial antioxidant defense'
    },

    # Mental / Cognitive
    'rs6265': {
        'gene': 'BDNF',
        'variant': 'Val66Met',
        'category': 'Mental/Cognitive',
        'affects': 'Brain-derived neurotrophic factor, neuroplasticity, learning',
        'risk_allele': 'A (Met)',
        'genotype_interpretation': {
            'GG': 'Val/Val, higher BDNF, better episodic memory and learning capacity',
            'GA': 'Val/Met, intermediate neuroplasticity',
            'AA': 'Met/Met, lower activity-dependent BDNF, potential learning/memory challenges'
        },
        'clinical_significance': 'moderate',
        'notes': 'Met allele associated with anxiety-like traits and depression risk'
    },
    'rs53576': {
        'gene': 'OXTR',
        'variant': 'rs53576',
        'category': 'Mental/Cognitive',
        'affects': 'Oxytocin receptor, social bonding, empathy',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Lower oxytocin receptor expression, less empathy/social sensitivity',
            'GA': 'Intermediate social sensitivity',
            'AA': 'Higher oxytocin sensitivity, greater empathy and social bonding'
        },
        'clinical_significance': 'low',
        'notes': 'Affects social bonding, parenting, and stress response'
    },
    'rs4570625': {
        'gene': 'TPH2',
        'variant': 'G/A (-703)',
        'category': 'Mental/Cognitive',
        'affects': 'Tryptophan hydroxylase, serotonin synthesis',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Higher serotonin production',
            'GA': 'Intermediate serotonin levels',
            'AA': 'Lower serotonin synthesis, mood-lowering risk'
        },
        'clinical_significance': 'low',
        'notes': 'Polymorphism affects serotonin availability'
    },
    'rs6311': {
        'gene': 'HTR2A',
        'variant': '-1438G/A',
        'category': 'Mental/Cognitive',
        'affects': 'Serotonin 2A receptor, mood, aggression',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Normal serotonin 2A receptor expression',
            'GA': 'Intermediate',
            'AA': 'Higher HTR2A expression, increased serotonergic response'
        },
        'clinical_significance': 'low',
        'notes': 'Associated with depression, suicidality, and aggression'
    },
    'rs1800497': {
        'gene': 'DRD2/ANKK1',
        'variant': 'Taq1A',
        'category': 'Mental/Cognitive',
        'affects': 'Dopamine receptor density, motivation, reward',
        'risk_allele': 'T (A1 allele)',
        'genotype_interpretation': {
            'CC': 'A2/A2, higher dopamine D2 receptor density, better motivation',
            'CT': 'A1/A2, intermediate dopamine sensitivity',
            'TT': 'A1/A1, lower dopamine receptor density, lower motivation, addiction risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'A1 allele associated with lower dopamine sensitivity and addiction'
    },

    # Caffeine / Alcohol / Drug Metabolism
    'rs762551': {
        'gene': 'CYP1A2',
        'variant': 'C163A (-163)',
        'category': 'Drug Metabolism',
        'affects': 'Caffeine metabolism speed',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'CC': 'Fast caffeine metabolizer, can handle more caffeine',
            'CA': 'Intermediate caffeine sensitivity',
            'AA': 'Slow caffeine metabolizer, jitteriness/anxiety at normal doses'
        },
        'clinical_significance': 'low',
        'notes': 'Explains individual caffeine sensitivity; affects 1-2 day half-life'
    },
    'rs1229984': {
        'gene': 'ADH1B',
        'variant': 'Arg47His',
        'category': 'Drug Metabolism',
        'affects': 'Alcohol metabolism speed',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Slow alcohol metabolism',
            'GA': 'Intermediate metabolism',
            'AA': 'Fast alcohol metabolism'
        },
        'clinical_significance': 'moderate',
        'notes': 'Asian populations enriched for fast metabolism; protective against alcoholism'
    },
    'rs671': {
        'gene': 'ALDH2',
        'variant': 'Glu504Lys',
        'category': 'Drug Metabolism',
        'affects': 'Aldehyde dehydrogenase, alcohol flush reaction',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'Normal acetaldehyde metabolism',
            'GA': 'Heterozygous, alcohol flush reaction, protective against alcoholism',
            'AA': 'Deficient ALDH2, severe flush reaction, strong alcohol avoidance'
        },
        'clinical_significance': 'moderate',
        'notes': 'East Asian variant; 50% of populations in China, Japan, Korea have deficiency'
    },
    'rs4149056': {
        'gene': 'SLCO1B1',
        'variant': 'T521C',
        'category': 'Drug Metabolism',
        'affects': 'Statin metabolism, drug interactions',
        'risk_allele': 'C',
        'genotype_interpretation': {
            'TT': 'Normal statin metabolism',
            'TC': 'Increased statin levels',
            'CC': 'High statin levels, increased myopathy risk'
        },
        'clinical_significance': 'moderate',
        'notes': 'Important for statin dosing and side effect risk'
    },

    # Immune / Autoimmune
    'rs2187668': {
        'gene': 'HLA-DQ2.5',
        'variant': 'rs2187668',
        'category': 'Immune',
        'affects': 'Celiac disease susceptibility',
        'risk_allele': 'A',
        'genotype_interpretation': {
            'GG': 'No HLA-DQ2.5, very low celiac risk',
            'GA': 'Carriers, increased risk',
            'AA': 'Homozygous DQ2.5, ~30% develop celiac disease'
        },
        'clinical_significance': 'high',
        'notes': '95% of celiac disease patients carry HLA-DQ2 or HLA-DQ8'
    },
    'rs7454108': {
        'gene': 'HLA-DQ8',
        'variant': 'rs7454108',
        'category': 'Immune',
        'affects': 'Celiac disease susceptibility',
        'risk_allele': 'T',
        'genotype_interpretation': {
            'CC': 'No HLA-DQ8',
            'CT': 'Carriers',
            'TT': 'DQ8 homozygous, increased celiac risk'
        },
        'clinical_significance': 'high',
        'notes': 'Accounts for ~5% of celiac patients; complementary to DQ2'
    }
}

def parse_genome_file(filepath):
    """Load a 23andMe raw-data export into an rsid-keyed lookup table.

    Blank lines and lines starting with '#' are ignored, as are rows with
    fewer than four tab-separated fields. Only the first four fields of a
    row are used; if an rsid occurs more than once, the last row wins.

    Args:
        filepath: Path of the tab-separated raw-data file to read.

    Returns:
        dict mapping rsid -> {'chromosome', 'position', 'genotype'}, with
        every value kept as the string found in the file.
    """
    records = {}
    with open(filepath, 'r') as handle:
        for raw in handle:
            row = raw.strip()
            # Nothing to parse on empty rows or '#' header/comment rows
            if row == '' or row.startswith('#'):
                continue
            fields = row.split('\t')
            if len(fields) < 4:
                continue
            rsid, chromosome, position, genotype = fields[:4]
            records[rsid] = dict(
                chromosome=chromosome,
                position=position,
                genotype=genotype,
            )
    return records

def determine_apoe_status(rs429358_gt, rs7412_gt):
    """Determine APOE ε2/ε3/ε4 status from the rs429358 and rs7412 genotypes.

    Allele meaning:
        rs429358 (Cys112Arg): T = Cys112, C = Arg112 (C marks ε4)
        rs7412   (Arg158Cys): C = Arg158, T = Cys158 (T marks ε2)

    Haplotypes:
        ε2: Cys112/Cys158 = rs429358 T + rs7412 T
        ε3: Cys112/Arg158 = rs429358 T + rs7412 C
        ε4: Arg112/Arg158 = rs429358 C + rs7412 C

    The two genotypes are unphased, so the position of a letter inside the
    genotype string says nothing about which chromosome copy it sits on.
    The result is therefore derived from allele counts: every rs429358 'C'
    contributes one ε4, every rs7412 'T' contributes one ε2, and the
    remaining copies are ε3.

    Args:
        rs429358_gt: Two-letter genotype string for rs429358, or '--'.
        rs7412_gt: Two-letter genotype string for rs7412, or '--'.

    Returns:
        (status, alleles) on success, where status is the two alleles joined
        highest-number first (e.g. 'ε4ε3') and alleles is the same pair as a
        list. (None, reason_string) when either genotype is a no-call, is
        not a two-letter C/T genotype, or the pair cannot be explained by
        two ε2/ε3/ε4 haplotypes.
    """
    if rs429358_gt == '--' or rs7412_gt == '--':
        return None, "No-call genotypes"

    # Both SNPs are C/T; anything else (indel codes, single letters, other
    # bases) cannot be mapped to an APOE allele.
    for genotype in (rs429358_gt, rs7412_gt):
        if len(genotype) != 2 or not set(genotype) <= {'C', 'T'}:
            return None, "Unable to determine"

    e4_copies = rs429358_gt.count('C')
    e2_copies = rs7412_gt.count('T')
    e3_copies = 2 - e4_copies - e2_copies

    # More than two marker alleles in total would require the very rare
    # rs429358-C + rs7412-T haplotype, which is not one of ε2/ε3/ε4.
    # NOTE: CT/CT is reported as ε4/ε2; without phasing it cannot be told
    # apart from that same rare haplotype paired with ε3.
    if e3_copies < 0:
        return None, "Unable to determine"

    # Built highest first, matching the display order callers already expect
    apoe_alleles = ['ε4'] * e4_copies + ['ε3'] * e3_copies + ['ε2'] * e2_copies
    return ''.join(apoe_alleles), apoe_alleles

def get_interpretation_from_alleles(user_gt, snp_info, rsid):
    """Look up the interpretation text for a genotype, tolerating naming differences.

    Lookups are attempted in this order:
      1. the genotype exactly as given;
      2. the genotype with its two letters swapped ('TC' vs 'CT');
      3. the strand complement of the genotype (A<->T, C<->G), in either
         letter order. This is only tried when none of the user's alleles
         occur in any database key, so palindromic A/T or C/G SNPs, where a
         strand flip would be ambiguous, never reach this step.
    If nothing matches, a generic zygosity description is returned.

    Args:
        user_gt: Genotype string from the raw data (e.g. 'AG').
        snp_info: Database entry; only 'genotype_interpretation' is read.
        rsid: Unused; kept so existing callers keep working.

    Returns:
        The matching interpretation string, or "Homozygous <gt> genotype" /
        "Heterozygous genotype" when no database key matches.
    """
    interpretations = snp_info['genotype_interpretation']

    # Try direct match
    if user_gt in interpretations:
        return interpretations[user_gt]

    # Same alleles listed in the opposite order
    swapped = user_gt[::-1]
    if len(user_gt) == 2 and swapped in interpretations:
        return interpretations[swapped]

    # Database keyed on the opposite strand from the raw data
    user_alleles = set(user_gt)
    db_alleles = set(''.join(interpretations))
    if (
        user_alleles
        and user_alleles <= set('ACGT')
        and user_alleles.isdisjoint(db_alleles)
    ):
        complemented = user_gt.translate(str.maketrans('ACGT', 'TGCA'))
        for candidate in (complemented, complemented[::-1]):
            if candidate in interpretations:
                return interpretations[candidate]

    # Generic fallback based only on whether all letters are the same
    if len(user_alleles) == 1:
        return f"Homozygous {user_gt} genotype"
    return "Heterozygous genotype"

def analyze_genome(genotypes_file):
    """Run the full analysis for one raw-data file and return the findings.

    Parses the file with parse_genome_file, prints basic statistics, looks
    up every rsid of SNP_DATABASE in the parsed genotypes, and derives the
    APOE status when both rs429358 and rs7412 are present and called.

    Args:
        genotypes_file: Path of the raw-data file to analyze.

    Returns:
        dict with the keys
          'metadata': total SNP count, no-call rate (percent) and per-
              chromosome SNP counts;
          'snp_findings': one dict per database SNP with a called genotype;
          'apoe_status': APOE details, or {} when not determinable from
              the file;
          'summary_by_category': defaultdict(list) grouping the same
              finding dicts by their 'category'.
    """
    print("=" * 80)
    print("COMPREHENSIVE 23andMe GENOME ANALYSIS")
    print("Individual: Bill Syrros (Vasilios Syrros)")
    print("=" * 80)

    # Parse genome file
    print("\nParsing genome data...")
    genotypes = parse_genome_file(genotypes_file)
    print(f"Total SNPs loaded: {len(genotypes):,}")

    # Calculate basic stats
    chrom_dist = defaultdict(int)
    no_calls = 0
    for data in genotypes.values():
        chrom_dist[data['chromosome']] += 1
        if data['genotype'] == '--':
            no_calls += 1

    no_call_rate = (no_calls / len(genotypes)) * 100 if genotypes else 0

    print("\nBasic Statistics:")
    print(f"  Total SNPs: {len(genotypes):,}")
    print(f"  No-call rate: {no_call_rate:.2f}% ({no_calls:,} SNPs)")
    print(f"  Chromosomes with data: {len(chrom_dist)}")

    # Analyze health-relevant SNPs
    print(f"\nSearching for {len(SNP_DATABASE)} health-relevant SNPs...")
    results = {
        'metadata': {
            'total_snps': len(genotypes),
            'no_call_rate': no_call_rate,
            'chromosomal_distribution': dict(chrom_dist)
        },
        'snp_findings': [],
        'apoe_status': {},
        'summary_by_category': defaultdict(list)
    }

    found_count = 0
    missing_snps = []

    for rsid, snp_info in SNP_DATABASE.items():
        call = genotypes.get(rsid)

        # A '--' no-call has no usable genotype, so it is tallied together
        # with SNPs absent from the file; this keeps the "Found" and
        # "Missing/no-call" totals printed below consistent with the
        # findings that are actually reported.
        if call is None or call['genotype'] == '--':
            missing_snps.append(rsid)
            continue

        found_count += 1
        user_gt = call['genotype']

        snp_result = {
            'rsid': rsid,
            'gene': snp_info['gene'],
            'variant': snp_info['variant'],
            'category': snp_info['category'],
            'affects': snp_info['affects'],
            'user_genotype': user_gt,
            'risk_allele': snp_info['risk_allele'],
            'chromosome': call['chromosome'],
            'position': call['position']
        }

        # Get interpretation with fallback logic
        snp_result['interpretation'] = get_interpretation_from_alleles(user_gt, snp_info, rsid)
        snp_result['clinical_significance'] = snp_info['clinical_significance']
        snp_result['notes'] = snp_info['notes']

        # The same dict object is stored in both the flat list and the
        # per-category grouping
        results['snp_findings'].append(snp_result)
        results['summary_by_category'][snp_info['category']].append(snp_result)

    # Determine APOE status if available
    if 'rs429358' in genotypes and 'rs7412' in genotypes:
        gt429358 = genotypes['rs429358']['genotype']
        gt7412 = genotypes['rs7412']['genotype']
        if gt429358 != '--' and gt7412 != '--':
            apoe_status, alleles = determine_apoe_status(gt429358, gt7412)
            results['apoe_status'] = {
                'rs429358_genotype': gt429358,
                'rs7412_genotype': gt7412,
                'apoe_status': apoe_status,
                'alleles_detected': alleles,
                'interpretation': get_apoe_interpretation(apoe_status)
            }

    print(f"\nFound {found_count}/{len(SNP_DATABASE)} health-relevant SNPs")
    print(f"Missing/no-call: {len(missing_snps)}")

    return results

def get_apoe_interpretation(apoe_status):
    """Get human-readable APOE interpretation.

    The two alleles may be given in either order: 'ε4ε3' and 'ε3ε4' yield
    the same text. This matters because determine_apoe_status emits the
    higher-numbered allele first while the table below is keyed lowest
    first.

    Args:
        apoe_status: Two concatenated allele labels such as 'ε3ε4', or a
            falsy value when the status could not be determined.

    Returns:
        The interpretation text, "Unable to determine" for a falsy status,
        or "Unknown" for a status that is not in the table.
    """
    if not apoe_status:
        return "Unable to determine"

    interpretations = {
        'ε2ε2': 'Lowest Alzheimer\'s disease risk, longer lifespan, lower LDL cholesterol',
        'ε2ε3': 'Very low AD risk, favorable cholesterol profile',
        'ε2ε4': 'Rare combination, mixed risk profile',
        'ε3ε3': 'Common genotype, average AD risk, average cardiovascular risk',
        'ε3ε4': 'Moderately increased AD risk, higher LDL cholesterol',
        'ε4ε4': 'Highest AD risk (~55% by age 85), highest LDL, cardiovascular risk'
    }

    # Each allele label is two characters ('ε' + digit); sorting the labels
    # puts them in the lowest-first order used by the table keys.
    labels = [apoe_status[i:i + 2] for i in range(0, len(apoe_status), 2)]
    normalized = ''.join(sorted(labels))
    return interpretations.get(normalized, "Unknown")

def generate_summary_report(results, output_file):
    """Generate the human-readable summary report as a UTF-8 text file.

    Args:
        results: Analysis result dict. Reads results['metadata']
            ('total_snps', 'no_call_rate'), results['snp_findings'] (only
            its length), results['apoe_status'] (section skipped when
            empty) and results['summary_by_category'] (category name ->
            list of SNP finding dicts).
        output_file: Path of the report file; created or overwritten.
    """
    heavy_rule = "=" * 100
    light_rule = "-" * 100
    metadata = results['metadata']

    lines = [
        heavy_rule + "\n",
        "23andMe GENOME ANALYSIS REPORT\n",
        "Subject: Bill Syrros (Vasilios Syrros)\n",
        "Date: 2026-04-02\n",
        heavy_rule + "\n\n",
        # Summary statistics
        "SUMMARY STATISTICS\n",
        light_rule + "\n",
        f"Total SNPs analyzed: {metadata['total_snps']:,}\n",
        f"No-call rate: {metadata['no_call_rate']:.2f}%\n",
        f"Health-relevant SNPs found: {len(results['snp_findings'])}\n\n",
    ]

    # APOE Status
    apoe = results['apoe_status']
    if apoe:
        lines += [
            "APOE STATUS (Critical for Alzheimer's Disease Risk)\n",
            light_rule + "\n",
            f"APOE Genotype: {apoe['apoe_status']}\n",
            f"  rs429358: {apoe['rs429358_genotype']}\n",
            f"  rs7412: {apoe['rs7412_genotype']}\n",
            f"Interpretation: {apoe['interpretation']}\n\n",
        ]

    # By category, in alphabetical order
    by_category = results['summary_by_category']
    for category in sorted(by_category):
        lines += [
            "\n" + heavy_rule + "\n",
            f"{category.upper()}\n",
            heavy_rule + "\n\n",
        ]
        for snp in by_category[category]:
            lines += [
                f"SNP: {snp['rsid']}\n",
                f"Gene: {snp['gene']} ({snp['variant']})\n",
                f"Location: Chromosome {snp['chromosome']}:{snp['position']}\n",
                f"Affects: {snp['affects']}\n",
                f"Your Genotype: {snp['user_genotype']}\n",
                f"Risk Allele: {snp['risk_allele']}\n",
                f"Clinical Significance: {snp['clinical_significance'].upper()}\n",
                f"\nMeaning: {snp['interpretation']}\n",
                f"Notes: {snp['notes']}\n",
                light_rule + "\n\n",
            ]

    # The APOE genotype contains the Greek letter 'ε', which the platform
    # default encoding cannot always represent; pin UTF-8 so the write
    # cannot fail with UnicodeEncodeError on such systems.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(lines)

if __name__ == '__main__':
    import sys

    # The raw-data file can be given as the first command-line argument;
    # without one, the original hard-coded export is analyzed.
    default_genome_file = '/sessions/lucid-sleepy-lamport/23andme/genome_Vasilios_Syrros_v4_Full_20260402081234.txt'
    genomes_file = sys.argv[1] if len(sys.argv) > 1 else default_genome_file

    # Outputs are written next to the input file (for the default input this
    # is the same directory the paths were previously hard-coded to).
    output_dir = Path(genomes_file).parent

    # Run analysis
    results = analyze_genome(genomes_file)

    # Convert defaultdict to regular dict for JSON serialization
    results['summary_by_category'] = dict(results['summary_by_category'])

    # Save JSON results
    json_output = output_dir / 'genome_analysis.json'
    with open(json_output, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(f"\nJSON results saved to: {json_output}")

    # Generate and save text summary
    txt_output = output_dir / 'genome_summary.txt'
    generate_summary_report(results, txt_output)
    print(f"Summary report saved to: {txt_output}")

    # Print to console
    print("\n" + "=" * 100)
    print("DETAILED FINDINGS BY CATEGORY")
    print("=" * 100)

    for category in sorted(results['summary_by_category']):
        snps = results['summary_by_category'][category]
        print(f"\n{category.upper()} ({len(snps)} SNPs)")
        print("-" * 100)
        for snp in snps:
            print(f"\n  {snp['rsid']} - {snp['gene']} ({snp['variant']})")
            print(f"  Genotype: {snp['user_genotype']} | Significance: {snp['clinical_significance'].upper()}")
            print(f"  {snp['interpretation']}")
