"""
Filter variant_summary.txt (ClinVar) to high-confidence pathogenic variants.

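Keeps GRCh38 rows where:
  - ClinicalSignificance contains Pathogenic or Likely pathogenic (case-insensitive)
  - GeneSymbol is not empty or '-'
  - PhenotypeList is not empty, '-', or 'not provided'
  - ReviewStatus is 'criteria provided, multiple submitters, no conflicts',
    'reviewed by expert panel', OR 'practice guideline'
    (high-confidence only — reduces ~450K → ~60K rows, ~10MB)

Rows are de-duplicated by #AlleleID; the first matching row for each ID is kept.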

Output: data/clinvar_pathogenic.tsv
Run once locally before committing the repo.
"""

from __future__ import annotations

import csv
import sys
from pathlib import Path

# Parent of the directory containing this file; assumed to be the repository
# root (i.e. this script is expected to live one directory below it — confirm).
REPO_ROOT = Path(__file__).parent.parent
# ClinVar dump read by main().
INPUT_PATH = REPO_ROOT / "data" / "variant_summary.txt"
# Filtered TSV written by main().
OUTPUT_PATH = REPO_ROOT / "data" / "clinvar_pathogenic.tsv"

# Input columns copied into the output file. The list order is the output
# column order, and the names are also written as the output header.
KEEP_COLUMNS = [
    "#AlleleID",
    "GeneSymbol",
    "ClinicalSignificance",
    "PhenotypeList",
    "PhenotypeIDS",
    "Type",
    "Name",
    "Chromosome",
    "Start",
]

# Lower-cased classification terms that count as pathogenic. is_pathogenic()
# compares them against the individual components of a ClinicalSignificance
# value, not against arbitrary substrings.
PATHOGENIC_TERMS = {"pathogenic", "likely pathogenic", "pathogenic/likely pathogenic"}

# Lower-cased ReviewStatus values accepted as high-confidence.
HIGH_CONFIDENCE_REVIEW = {
    "criteria provided, multiple submitters, no conflicts",
    "reviewed by expert panel",
    "practice guideline",
}

# Translation table used by is_pathogenic(): every character treated as a
# separator between combined classifications is mapped onto ";", and "_" is
# mapped to a space so "Likely_pathogenic" reads as "likely pathogenic".
_CLNSIG_NORMALIZE = str.maketrans({"/": ";", ",": ";", "|": ";", "_": " "})


def is_pathogenic(clnsig: str) -> bool:
    """Return True if *clnsig* lists a pathogenic or likely-pathogenic classification.

    The value is lower-cased and split on ``/``, ``;``, ``,`` and ``|``; each
    component is stripped and must equal one of ``PATHOGENIC_TERMS``.

    Whole-component matching is deliberate: a plain substring test also
    accepts values such as "Conflicting interpretations of pathogenicity",
    because "pathogenicity" contains "pathogenic".

    Args:
        clnsig: Raw ClinicalSignificance value; may be empty.

    Returns:
        True when at least one component is a pathogenic term, else False.
    """
    normalized = clnsig.lower().translate(_CLNSIG_NORMALIZE)
    return any(part.strip() in PATHOGENIC_TERMS for part in normalized.split(";"))


def is_high_confidence(review_status: str) -> bool:
    """Return True if *review_status* is one of the accepted review levels.

    Surrounding whitespace is ignored and the comparison is case-insensitive;
    the accepted values are listed in ``HIGH_CONFIDENCE_REVIEW``.
    """
    normalized = review_status.strip().lower()
    return normalized in HIGH_CONFIDENCE_REVIEW


def _field(row: dict[str, str | None], name: str) -> str:
    """Return ``row[name]`` as a string, mapping a missing or ``None`` value to ``""``."""
    # csv.DictReader stores None for columns that a short row does not reach,
    # so ``row.get(name, "")`` alone can still hand back None.
    return row.get(name) or ""


def main() -> None:
    """Filter INPUT_PATH into OUTPUT_PATH, keeping high-confidence pathogenic rows.

    A row is kept when its Assembly is exactly "GRCh38", its
    ClinicalSignificance passes ``is_pathogenic``, its GeneSymbol and
    PhenotypeList are real values (not empty or "-", and the phenotype is not
    "not provided"), and its ReviewStatus passes ``is_high_confidence``. Only
    the first kept row for each #AlleleID is written, restricted to
    ``KEEP_COLUMNS``.

    Progress and a final summary are printed to stdout.

    Raises:
        SystemExit: With status 1 when INPUT_PATH does not exist or its header
            lacks a column this script relies on.
    """
    if not INPUT_PATH.exists():
        print(f"ERROR: {INPUT_PATH} not found", file=sys.stderr)
        sys.exit(1)

    print(f"Reading {INPUT_PATH} ...")
    written = 0
    seen_allele_ids: set[str] = set()

    with open(INPUT_PATH, "r", encoding="utf-8", errors="replace") as fin:
        reader = csv.DictReader(fin, delimiter="\t")

        # Validate the header before OUTPUT_PATH is opened for writing: a
        # wrong or empty input file would otherwise replace an existing
        # output with a header-only file and still report success.
        header = reader.fieldnames or []
        required = (*KEEP_COLUMNS, "Assembly", "ReviewStatus")
        missing = [col for col in required if col not in header]
        if missing:
            print(
                f"ERROR: {INPUT_PATH} is missing expected column(s): "
                f"{', '.join(missing)}",
                file=sys.stderr,
            )
            sys.exit(1)

        with open(OUTPUT_PATH, "w", encoding="utf-8", newline="") as fout:
            writer = csv.DictWriter(fout, fieldnames=KEEP_COLUMNS, delimiter="\t")
            writer.writeheader()

            for i, row in enumerate(reader):
                if i % 500_000 == 0 and i > 0:
                    print(f"  {i:,} rows scanned, {written:,} kept ...")

                if _field(row, "Assembly") != "GRCh38":
                    continue

                if not is_pathogenic(_field(row, "ClinicalSignificance")):
                    continue

                # "-" is treated as a placeholder for "no gene".
                gene = _field(row, "GeneSymbol").strip()
                if not gene or gene == "-":
                    continue

                phenotype = _field(row, "PhenotypeList").strip()
                if phenotype.lower() in ("", "-", "not provided"):
                    continue

                if not is_high_confidence(_field(row, "ReviewStatus")):
                    continue

                # Deduplicate by AlleleID: only the first kept row per ID is written.
                allele_id = _field(row, "#AlleleID")
                if allele_id in seen_allele_ids:
                    continue
                seen_allele_ids.add(allele_id)

                writer.writerow({col: _field(row, col) for col in KEEP_COLUMNS})
                written += 1

    size_mb = OUTPUT_PATH.stat().st_size / 1_048_576
    print("\nDone.")
    print(f"  Written: {written:,} unique variants")
    print(f"  Size:    {size_mb:.1f} MB")
    print(f"  Saved:   {OUTPUT_PATH}")


# Run the filter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()