narada-env / scripts /filter_clinvar.py
Krishna
Initial ClinDetect OpenEnv environment
077e61e
Raw
History Blame Contribute Delete
3.2 kB
"""
Filter variant_summary.txt (ClinVar) to high-confidence pathogenic variants.
Keeps GRCh38 rows where:
- ClinicalSignificance contains Pathogenic or Likely_pathogenic
- GeneSymbol is not empty
- PhenotypeList is not empty, '-', or 'not provided'
- ReviewStatus is 'criteria provided, multiple submitters, no conflicts', 'reviewed by expert panel', or 'practice guideline'
(high-confidence only — reduces ~450K → ~60K rows, ~10MB)
Output: data/clinvar_pathogenic.tsv
Run once locally before committing the repo.
"""
from __future__ import annotations
import csv
import sys
from pathlib import Path
# Directory two levels up from this file, i.e. the parent of the directory
# that contains this script.
REPO_ROOT = Path(__file__).parent.parent
# Tab-delimited input file read by main().
INPUT_PATH = REPO_ROOT / "data" / "variant_summary.txt"
# Filtered tab-delimited file written by main().
OUTPUT_PATH = REPO_ROOT / "data" / "clinvar_pathogenic.tsv"
# Columns copied from every kept input row into the output; this list is also
# used as the output header, so its order is the output column order.
KEEP_COLUMNS = [
    "#AlleleID",
    "GeneSymbol",
    "ClinicalSignificance",
    "PhenotypeList",
    "PhenotypeIDS",
    "Type",
    "Name",
    "Chromosome",
    "Start",
]
# Individual clinical-significance terms (lower-case) that count as pathogenic.
# The combined "pathogenic/likely pathogenic" spelling is kept for completeness;
# is_pathogenic() splits on "/" so either half already matches on its own.
PATHOGENIC_TERMS = {"pathogenic", "likely pathogenic", "pathogenic/likely pathogenic"}

# Review statuses (lower-case, exact match) accepted as high confidence.
HIGH_CONFIDENCE_REVIEW = {
    "criteria provided, multiple submitters, no conflicts",
    "reviewed by expert panel",
    "practice guideline",
}


def is_pathogenic(clnsig: str) -> bool:
    """Return True if any term of *clnsig* is a pathogenic classification.

    The value is lower-cased, underscores are treated as spaces (so
    ``Likely_pathogenic`` is accepted), and the string is split into
    individual terms on ``/``, ``;`` and ``,``.  Each term is compared for
    exact membership in ``PATHOGENIC_TERMS``.

    Exact term matching is deliberate: a plain substring test on
    ``"pathogenic"`` also matches values such as
    ``"Conflicting interpretations of pathogenicity"``, which is not a
    pathogenic call.

    Args:
        clnsig: Raw clinical-significance text; may be empty.

    Returns:
        True when at least one term is in ``PATHOGENIC_TERMS``, else False.
    """
    normalized = clnsig.lower().replace("_", " ")
    # Funnel every separator into one delimiter so a single split suffices.
    for separator in ("/", ";", ","):
        normalized = normalized.replace(separator, "|")
    # " ".join(term.split()) trims the term and collapses repeated whitespace.
    return any(
        " ".join(term.split()) in PATHOGENIC_TERMS
        for term in normalized.split("|")
    )


def is_high_confidence(review_status: str) -> bool:
    """Return True if *review_status* is one of the accepted review levels.

    The comparison ignores case and surrounding whitespace but is otherwise
    an exact match against ``HIGH_CONFIDENCE_REVIEW``.

    Args:
        review_status: Raw review-status text; may be empty.

    Returns:
        True when the normalised status is in ``HIGH_CONFIDENCE_REVIEW``.
    """
    normalized = review_status.strip().lower()
    return normalized in HIGH_CONFIDENCE_REVIEW
def main() -> None:
    """Stream INPUT_PATH once and write the filtered, de-duplicated rows.

    A row is kept when all of the following hold:
      * ``Assembly`` is exactly ``GRCh38``;
      * ``is_pathogenic`` accepts its ``ClinicalSignificance``;
      * ``GeneSymbol`` is neither empty nor ``-``;
      * ``PhenotypeList`` is neither empty, ``-`` nor ``not provided``;
      * ``is_high_confidence`` accepts its ``ReviewStatus``;
      * its ``#AlleleID`` has not been written already.

    Only the columns in ``KEEP_COLUMNS`` are written to OUTPUT_PATH.
    Progress and a final summary are printed to stdout.

    Exits with status 1 (message on stderr) when the input file does not
    exist or when its header lacks a column the filters depend on.  The
    header is checked before the output file is opened, so an existing
    output is not truncated by a run that cannot succeed.
    """
    if not INPUT_PATH.exists():
        print(f"ERROR: {INPUT_PATH} not found", file=sys.stderr)
        sys.exit(1)

    # Columns read by the filters and the de-duplication.  If any is absent
    # every row would be rejected (or collapse onto one empty allele ID), so
    # fail loudly instead of silently producing a near-empty file.
    required_columns = (
        "#AlleleID",
        "Assembly",
        "ClinicalSignificance",
        "GeneSymbol",
        "PhenotypeList",
        "ReviewStatus",
    )

    print(f"Reading {INPUT_PATH} ...")
    written = 0
    seen_allele_ids: set[str] = set()

    # newline="" lets the csv module do its own line-ending handling.
    with open(
        INPUT_PATH, "r", encoding="utf-8", errors="replace", newline=""
    ) as fin:
        # restval="" makes short (e.g. truncated) rows yield empty strings
        # rather than None, which would break .strip()/.lower() below.
        reader = csv.DictReader(fin, delimiter="\t", restval="")
        header = reader.fieldnames or []
        missing = [col for col in required_columns if col not in header]
        if missing:
            print(
                f"ERROR: {INPUT_PATH} is missing required column(s): "
                f"{', '.join(missing)}",
                file=sys.stderr,
            )
            sys.exit(1)

        with open(OUTPUT_PATH, "w", encoding="utf-8", newline="") as fout:
            writer = csv.DictWriter(fout, fieldnames=KEEP_COLUMNS, delimiter="\t")
            writer.writeheader()

            for i, row in enumerate(reader):
                if i % 500_000 == 0 and i > 0:
                    print(f"  {i:,} rows scanned, {written:,} kept ...")

                if row["Assembly"] != "GRCh38":
                    continue
                if not is_pathogenic(row["ClinicalSignificance"]):
                    continue
                if row["GeneSymbol"].strip() in ("", "-"):
                    continue
                if row["PhenotypeList"].strip().lower() in ("", "-", "not provided"):
                    continue
                if not is_high_confidence(row["ReviewStatus"]):
                    continue

                # De-duplicate by AlleleID: only the first qualifying row
                # for a given allele is written.
                allele_id = row["#AlleleID"]
                if allele_id in seen_allele_ids:
                    continue
                seen_allele_ids.add(allele_id)

                # Output-only columns may legitimately be absent from the
                # input header; they are written as empty strings.
                writer.writerow({col: row.get(col, "") for col in KEEP_COLUMNS})
                written += 1

    size_mb = OUTPUT_PATH.stat().st_size / 1_048_576
    print("\nDone.")
    print(f"  Written: {written:,} unique variants")
    print(f"  Size:    {size_mb:.1f} MB")
    print(f"  Saved:   {OUTPUT_PATH}")
# Run the filter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()