Spaces:

KrishVenky
/

narada-env

Sleeping

Krishna

Initial ClinDetect OpenEnv environment

077e61e 2 months ago

3.2 kB

	"""
	Filter variant_summary.txt (ClinVar) to high-confidence pathogenic variants.

	Keeps GRCh38 rows where:
	- ClinicalSignificance contains Pathogenic or Likely pathogenic
	- GeneSymbol is not empty or '-'
	- PhenotypeList is not empty, '-', or 'not provided'
	- ReviewStatus is 'criteria provided, multiple submitters, no conflicts',
	  'reviewed by expert panel', or 'practice guideline'
	(high-confidence only — reduces ~450K → ~60K rows, ~10MB)
	Rows are de-duplicated by #AlleleID; the first matching row is kept.

	Output: data/clinvar_pathogenic.tsv
	Run once locally before committing the repo.
	"""

	from __future__ import annotations

	import csv
	import sys
	from pathlib import Path

	# REPO_ROOT is the grandparent directory of this file; the raw ClinVar dump
	# and the filtered output both live in its "data" sub-directory.
	REPO_ROOT = Path(__file__).parent.parent
	INPUT_PATH = REPO_ROOT.joinpath("data", "variant_summary.txt")
	OUTPUT_PATH = REPO_ROOT.joinpath("data", "clinvar_pathogenic.tsv")

	# Input columns copied into the output file. The list order is also the
	# column order of the written TSV header and rows.
	KEEP_COLUMNS = [
	    "#AlleleID", "GeneSymbol", "ClinicalSignificance",
	    "PhenotypeList", "PhenotypeIDS",
	    "Type", "Name", "Chromosome", "Start",
	]

	# Lower-case classification terms accepted as pathogenic. The combined
	# "pathogenic/likely pathogenic" entry is kept for backward compatibility;
	# is_pathogenic() reaches it through its two component terms.
	PATHOGENIC_TERMS = {"pathogenic", "likely pathogenic", "pathogenic/likely pathogenic"}

	# Lower-case ReviewStatus values accepted as high confidence.
	HIGH_CONFIDENCE_REVIEW = {
	    "criteria provided, multiple submitters, no conflicts",
	    "reviewed by expert panel",
	    "practice guideline",
	}

	# Maps the list separators ";", "," and "|" to "/" and "_" to a space, so a
	# classification string can be split into its individual terms in one pass.
	_CLNSIG_TRANSLATION = str.maketrans(";,|_", "/// ")


	def is_pathogenic(clnsig: str) -> bool:
	    """Return True if any term in *clnsig* is a pathogenic classification.

	    The value is lower-cased and split on "/", ";", "," and "|"; each
	    resulting term is stripped and compared as a whole against
	    PATHOGENIC_TERMS. Whole-term matching is deliberate: a plain substring
	    test would also accept "Conflicting interpretations of pathogenicity",
	    because "pathogenicity" contains "pathogenic".

	    Args:
	        clnsig: Raw ClinicalSignificance value, e.g. "Pathogenic; risk factor".
	            Empty or None yields False.

	    Returns:
	        True when at least one term equals an entry of PATHOGENIC_TERMS.
	    """
	    if not clnsig:
	        return False
	    terms = clnsig.lower().translate(_CLNSIG_TRANSLATION).split("/")
	    return any(term.strip() in PATHOGENIC_TERMS for term in terms)


	def is_high_confidence(review_status: str) -> bool:
	    """Return True if *review_status* is one of HIGH_CONFIDENCE_REVIEW.

	    The comparison ignores case and surrounding whitespace but is otherwise
	    an exact match, so "criteria provided, single submitter" is rejected.

	    Args:
	        review_status: Raw ReviewStatus value. Empty or None yields False.
	    """
	    if not review_status:
	        return False
	    return review_status.strip().lower() in HIGH_CONFIDENCE_REVIEW


	def main() -> None:
	    """Filter the ClinVar summary at INPUT_PATH into OUTPUT_PATH.

	    Streams the tab-separated input row by row and keeps a row only when:
	    Assembly is "GRCh38", is_pathogenic() accepts ClinicalSignificance,
	    GeneSymbol and PhenotypeList are real values (not empty, "-", or
	    "not provided" for the phenotype), and is_high_confidence() accepts
	    ReviewStatus. Rows whose #AlleleID was already written are skipped.
	    Only KEEP_COLUMNS are written. Progress and a final summary go to
	    stdout.

	    Exits with status 1 (message on stderr) if INPUT_PATH does not exist.
	    """
	    if not INPUT_PATH.exists():
	        print(f"ERROR: {INPUT_PATH} not found", file=sys.stderr)
	        sys.exit(1)

	    print(f"Reading {INPUT_PATH} ...")
	    written = 0
	    seen_allele_ids: set[str] = set()

	    with (
	        open(INPUT_PATH, "r", encoding="utf-8", errors="replace") as fin,
	        open(OUTPUT_PATH, "w", encoding="utf-8", newline="") as fout,
	    ):
	        # restval="" makes a short (truncated) row yield empty strings for its
	        # missing columns. The DictReader default is None, which would make
	        # the .strip()/.lower() calls below raise AttributeError.
	        reader = csv.DictReader(fin, delimiter="\t", restval="")
	        writer = csv.DictWriter(fout, fieldnames=KEEP_COLUMNS, delimiter="\t")
	        writer.writeheader()

	        for i, row in enumerate(reader):
	            if i % 500_000 == 0 and i > 0:
	                print(f" {i:,} rows scanned, {written:,} kept ...")

	            if row.get("Assembly", "") != "GRCh38":
	                continue

	            if not is_pathogenic(row.get("ClinicalSignificance", "")):
	                continue

	            gene = row.get("GeneSymbol", "").strip()
	            if not gene or gene == "-":
	                continue

	            phenotype = row.get("PhenotypeList", "").strip()
	            if phenotype.lower() in ("", "-", "not provided"):
	                continue

	            if not is_high_confidence(row.get("ReviewStatus", "")):
	                continue

	            # Deduplicate by AlleleID: only the first qualifying row is kept.
	            allele_id = row.get("#AlleleID", "")
	            if allele_id in seen_allele_ids:
	                continue
	            seen_allele_ids.add(allele_id)

	            writer.writerow({col: row.get(col, "") for col in KEEP_COLUMNS})
	            written += 1

	    # Both files are closed here, so the reported size is the final one.
	    size_mb = OUTPUT_PATH.stat().st_size / 1_048_576
	    print("\nDone.")
	    print(f" Written: {written:,} unique variants")
	    print(f" Size: {size_mb:.1f} MB")
	    print(f" Saved: {OUTPUT_PATH}")


	if __name__ == "__main__":
	    main()