Codex committed on
Commit
3e219fa
·
0 Parent(s):

Initial VariantLens clinical readiness scaffold

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .env.example +98 -0
  2. .gitignore +56 -0
  3. AGENTS.md +105 -0
  4. CLAUDE.md +105 -0
  5. Makefile +63 -0
  6. README.md +82 -0
  7. alembic.ini +44 -0
  8. backend/Dockerfile +23 -0
  9. backend/alembic/env.py +57 -0
  10. backend/alembic/script.py.mako +27 -0
  11. backend/alembic/versions/0001_init.py +77 -0
  12. backend/app/__init__.py +0 -0
  13. backend/app/api/__init__.py +0 -0
  14. backend/app/api/evidence.py +78 -0
  15. backend/app/api/pipeline.py +102 -0
  16. backend/app/api/reports.py +98 -0
  17. backend/app/api/variants.py +43 -0
  18. backend/app/config.py +82 -0
  19. backend/app/main.py +67 -0
  20. backend/app/models/__init__.py +5 -0
  21. backend/app/models/classification.py +51 -0
  22. backend/app/models/db.py +23 -0
  23. backend/app/models/variant.py +24 -0
  24. backend/app/schemas/__init__.py +33 -0
  25. backend/app/schemas/classification.py +38 -0
  26. backend/app/schemas/evidence.py +97 -0
  27. backend/app/schemas/variant.py +34 -0
  28. backend/app/services/__init__.py +0 -0
  29. backend/app/services/acmg/__init__.py +4 -0
  30. backend/app/services/acmg/combiner.py +218 -0
  31. backend/app/services/acmg/rules.py +215 -0
  32. backend/app/services/clinvar.py +218 -0
  33. backend/app/services/exports.py +208 -0
  34. backend/app/services/gnomad.py +118 -0
  35. backend/app/services/insilico.py +159 -0
  36. backend/app/services/llm/__init__.py +5 -0
  37. backend/app/services/llm/prompts.py +109 -0
  38. backend/app/services/llm/reasoner.py +201 -0
  39. backend/app/services/llm/synthesizer.py +81 -0
  40. backend/app/services/normalization.py +209 -0
  41. backend/app/services/pvs1.py +111 -0
  42. backend/app/services/rag/__init__.py +6 -0
  43. backend/app/services/rag/chunker.py +72 -0
  44. backend/app/services/rag/embedder.py +77 -0
  45. backend/app/services/rag/fetcher.py +136 -0
  46. backend/app/services/rag/retriever.py +68 -0
  47. backend/app/services/repository.py +80 -0
  48. backend/app/services/vep.py +122 -0
  49. backend/app/worker.py +20 -0
  50. backend/tests/__init__.py +0 -0
.env.example ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # VariantLens — environment variables
3
+ # Copy this file to `.env` and fill in real values. Never commit `.env`.
4
+ # =============================================================================
5
+
6
+ # ---- LLM ---------------------------------------------------------------------
7
+ # Anthropic API key for the Claude reasoning layer.
8
+ # Get one at https://console.anthropic.com
9
+ ANTHROPIC_API_KEY=
10
+
11
+ # Default model for the literature-evidence reasoning layer.
12
+ # claude-sonnet-4-6 is the cost/quality default; claude-opus-4-7 for hard cases.
13
+ ANTHROPIC_MODEL=claude-sonnet-4-6
14
+ ANTHROPIC_MAX_TOKENS=2000
15
+
16
+ # Air-gap toggle. When true, the reasoner uses a local Ollama model instead of
17
+ # the Anthropic API. Required for fully on-premise clinical deployments.
18
+ USE_LOCAL_LLM=false
19
+ LOCAL_LLM_BASE_URL=http://localhost:11434
20
+ LOCAL_LLM_MODEL=qwen2.5:14b-instruct
21
+
22
+ # ---- External biomedical APIs -----------------------------------------------
23
+ # NCBI E-utilities key. Free; raises rate limit from 3 to 10 req/s.
24
+ # https://www.ncbi.nlm.nih.gov/account/settings/
25
+ NCBI_API_KEY=
26
+ NCBI_EMAIL=
27
+
28
+ # OMIM API key. Free for academic use.
29
+ # https://www.omim.org/api
30
+ OMIM_API_KEY=
31
+
32
+ # Mutalyzer + gnomAD do not require keys.
33
+ MUTALYZER_BASE_URL=https://mutalyzer.nl/api
34
+ GNOMAD_GRAPHQL_URL=https://gnomad.broadinstitute.org/api
35
+ SPLICEAI_LOOKUP_URL=https://spliceailookup-api.broadinstitute.org
36
+ CADD_API_URL=https://cadd.gs.washington.edu/api
37
+
38
+ # ---- Storage -----------------------------------------------------------------
39
+ # PostgreSQL — audit trail, classifications, curator sign-offs.
40
+ POSTGRES_HOST=postgres
41
+ POSTGRES_PORT=5432
42
+ POSTGRES_DB=variantlens
43
+ POSTGRES_USER=variantlens
44
+ POSTGRES_PASSWORD=change_me_locally
45
+
46
+ DATABASE_URL=postgresql+psycopg://variantlens:change_me_locally@postgres:5432/variantlens
47
+
48
+ # ChromaDB — local vector store. Embedded mode requires only the persist path.
49
+ CHROMA_PERSIST_DIR=./data/chroma
50
+ CHROMA_COLLECTION=variantlens_pubmed
51
+
52
+ # Local SQLite caches and pre-scored tables.
53
+ # Build the prediction DBs once with `python -m scripts.build_revel_db <csv>`
54
+ # and `python -m scripts.build_alphamissense_db <tsv.gz>`.
55
+ REVEL_DB_PATH=./data/revel_scores.db
56
+ ALPHAMISSENSE_DB_PATH=./data/alphamissense.db
57
+ GNOMAD_CACHE_DB=./data/gnomad_cache.db
58
+ CLINVAR_VCF_PATH=./data/clinvar.vcf.gz
59
+
60
+ # ---- Embeddings --------------------------------------------------------------
61
+ # BioLinkBERT for biomedical accuracy; all-MiniLM-L6-v2 for speed.
62
+ EMBEDDING_MODEL=michiyasunaga/BioLinkBERT-base
63
+ EMBEDDING_DEVICE=cpu
64
+
65
+ # ---- App ---------------------------------------------------------------------
66
+ APP_ENV=development
67
+ LOG_LEVEL=INFO
68
+ API_HOST=0.0.0.0
69
+ API_PORT=8000
70
+
71
+ # Async job queue (Celery + Redis).
72
+ REDIS_URL=redis://redis:6379/0
73
+ CELERY_BROKER_URL=redis://redis:6379/1
74
+ CELERY_RESULT_BACKEND=redis://redis:6379/2
75
+
76
+ # ---- Auth (placeholder — wire to hospital LDAP/OAuth in deployment) ----------
77
+ JWT_SECRET=change_me_locally_to_a_long_random_string
78
+ JWT_ALGORITHM=HS256
79
+ JWT_EXPIRE_MINUTES=480
80
+
81
+ # ---- Feature flags -----------------------------------------------------------
82
+ # When true, also pull full text from PMC; otherwise abstracts only.
83
+ RAG_FETCH_FULLTEXT=true
84
+ RAG_MAX_PAPERS_PER_VARIANT=200
85
+ RAG_CHUNK_SIZE=512
86
+ RAG_CHUNK_OVERLAP=128
87
+ RAG_TOP_K=8
88
+
89
+ # ACMG ruleset version. Switch to "v4" once SVC v4.0 is finalized.
90
+ ACMG_RULESET_VERSION=v2015
91
+
92
+ # Clinical default is strict Richards 2015 Table 5. "bayesian" and
93
+ # "most_pathogenic" are available for research/validation only.
94
+ ACMG_COMBINER_STRATEGY=table5
95
+
96
+ # PP5/BP6 were deprecated by ACMG SVI in 2018. Keep false for clinical use;
97
+ # set true only for backward-compatible research comparisons.
98
+ ENABLE_DEPRECATED_CLINVAR_CRITERIA=false
.gitignore ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Secrets
2
+ .env
3
+ .env.local
4
+ .env.*.local
5
+
6
+ # Python
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+ *.so
11
+ .Python
12
+ .venv/
13
+ venv/
14
+ env/
15
+ .eggs/
16
+ *.egg-info/
17
+ .pytest_cache/
18
+ .mypy_cache/
19
+ .ruff_cache/
20
+ .coverage
21
+ htmlcov/
22
+
23
+ # Node
24
+ node_modules/
25
+ dist/
26
+ build/
27
+ .next/
28
+ *.log
29
+ npm-debug.log*
30
+ *.tsbuildinfo
31
+
32
+ # IDE
33
+ .vscode/
34
+ .idea/
35
+ .claude/
36
+ *.swp
37
+ .DS_Store
38
+
39
+ # Data — large pre-scored tables and patient data must never be committed
40
+ data/
41
+ !data/.gitkeep
42
+ *.vcf
43
+ *.vcf.gz
44
+ *.tsv.gz
45
+ *.bam
46
+ *.cram
47
+ *.fastq
48
+ *.fastq.gz
49
+
50
+ # ChromaDB persist dir
51
+ chroma/
52
+ *.parquet
53
+
54
+ # Reports / exports (may contain PHI)
55
+ reports/
56
+ exports/
AGENTS.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VariantLens
2
+
3
+ Clinical genomic variant interpretation tool for the Jordan Lerner-Ellis Lab. Built around the ACMG/AMP 2015 framework (Richards et al.) with the SVC v4.0 transition in mind. Modeled on the three tools showcased at the November 2025 GA4GH/ClinGen CGLC session: AI CURA (96% concordance via RAG + DeepSeek-R1), EvAgg (Broad/Microsoft evidence aggregator), and AutoPM3 (HKU PM3 extractor).
4
+
5
+ The full design lives in `docs/VariantLens_Build_Plan.md`. The supporting literature review lives in `docs/AI_Variant_Interpretation_Review.md`. Read those before making non-trivial architectural changes.
6
+
7
+ ## Non-negotiables
8
+
9
+ - **Human-in-the-loop.** A trained curator signs off every classification. The tool surfaces evidence and proposes criteria; it does not autonomously classify for clinical use.
10
+ - **On-prem patient data.** No genomic data is sent to cloud APIs without explicit opt-in. The `USE_LOCAL_LLM` flag must always provide a working air-gapped path (Ollama + open-source model).
11
+ - **Audit trail.** Every triggered ACMG criterion is traceable to a source — a database row, a PMID, or a curator override with free-text justification.
12
+ - **Anti-hallucination is structural, not cosmetic.** Claude is only allowed to reason over RAG-retrieved chunks, must cite PMIDs verbatim, and must emit structured JSON. If the context lacks evidence, the only valid output is "insufficient evidence in provided literature".
13
+ - **Database facts never go through the LLM.** gnomAD AFs, ClinVar classifications, REVEL/SpliceAI/AlphaMissense scores are scored deterministically. Claude only handles literature-dependent criteria: PM3, PP1, PS3/BS3, PS4, PP4, PS2/PM6, PP5/BP6.
14
+
15
+ ## Architecture (one-line summary)
16
+
17
+ `Mutalyzer normalize → parallel evidence (gnomAD, ClinVar, in-silico, autoPVS1) → ACMG rule engine (InterVar-extended) → RAG over PubMed via ChromaDB → Claude reasons over retrieved chunks → Table 5 combiner → curator review UI → PDF/ClinVar/FHIR export.`
18
+
19
+ ## What we reuse vs. build
20
+
21
+ **Reuse (do not reimplement):**
22
+ - `autoPVS1` for PVS1
23
+ - `InterVar` as the rule-engine scaffold (extend from ~18 to all 28 criteria)
24
+ - `Mutalyzer` for HGVS normalization (PyHGVS as offline fallback)
25
+ - Pre-scored tables for REVEL, AlphaMissense, SpliceAI (do not run the models per variant)
26
+ - `ChromaDB` for the vector store, `sentence-transformers` (BioLinkBERT) for embeddings
27
+
28
+ **Build ourselves:**
29
+ - The orchestration layer (FastAPI services in `backend/app/services/`)
30
+ - The criterion-aware RAG retriever (different queries for PM3 vs. PP1 vs. PS3)
31
+ - The Claude prompt templates (one per literature-dependent criterion)
32
+ - The Table 5 combiner with conflict detection
33
+ - The curator dashboard
34
+
35
+ ## Tech stack
36
+
37
+ ```
38
+ Backend: Python 3.12, FastAPI, SQLAlchemy, Celery (async jobs)
39
+ Frontend: React 18, TypeScript, Tailwind, React Query, Zustand
40
+ Databases: PostgreSQL (audit trail), SQLite (REVEL/gnomAD offline cache)
41
+ Vector DB: ChromaDB (embedded, on-prem)
42
+ Embeddings: sentence-transformers (BioLinkBERT preferred; all-MiniLM-L6-v2 fallback)
43
+ LLM: Anthropic Codex (Codex-sonnet-4-6 for the reasoning layer; Codex-opus-4-7 only for hard cases)
44
+ Local fallback: Ollama + qwen2.5 or mistral-nemo
45
+ Containers: Docker + docker-compose
46
+ Tests: pytest, hypothesis (property-based on the combiner)
47
+ ```
48
+
49
+ ## Directory layout
50
+
51
+ ```
52
+ backend/ FastAPI app
53
+ app/api/ Routers: variants, evidence, reports
54
+ app/services/ normalization, gnomad, clinvar, insilico, pvs1, rag/, acmg/, llm/
55
+ app/models/ SQLAlchemy
56
+ tests/
57
+ frontend/ React + TS
58
+ data/ Pre-scored tables, gnomAD cache, ChromaDB persist dir
59
+ docs/ Build plan, literature review, ACMG references
60
+ docker-compose.yml
61
+ .env.example
62
+ .env gitignored — fill from .env.example
63
+ ```
64
+
65
+ ## Phase plan (~5 weeks)
66
+
67
+ 0. Scaffold + Docker (day 1)
68
+ 1. Mutalyzer normalization + 20-variant edge-case test set (day 2–3)
69
+ 2. gnomAD, ClinVar, in-silico predictors, autoPVS1 (day 4–7)
70
+ 3. RAG: PubMed fetch → chunk → embed → ChromaDB → criterion-aware retriever (day 8–11)
71
+ 4. ACMG rule engine: 28 criteria + Table 5 combiner; ≥85% concordance on 50 ClinVar variants (day 12–15)
72
+ 5. Claude reasoning layer with hallucination-suppression prompts (day 16–18)
73
+ 6. React curator dashboard + PDF/ClinVar/FHIR export (day 19–22)
74
+ 7. Validation: 100 4-star ClinVar expert-panel variants; hallucination-guard tests (day 23–25)
75
+
76
+ ## Validation bar
77
+
78
+ - **Classification concordance:** ≥85% on a held-out set of 100 ClinVar 4-star expert-panel variants. Stretch: match AI CURA's 96%.
79
+ - **Hallucination guard:** When fed deliberately empty/wrong literature contexts, Claude must NOT trigger PM3/PP1/PS3 and must only cite PMIDs that are present in the provided context.
80
+ - **Performance:** <30 s per variant (RAG included); 100 variants/hour batch throughput.
81
+ - **Audit:** Every triggered criterion has a traceable source field. No criterion fires with empty `evidence_text`.
82
+
83
+ ## Conventions
84
+
85
+ - Pydantic models for every service input/output. No `dict[str, Any]` at module boundaries.
86
+ - All LLM calls return JSON validated against a pydantic schema; if validation fails, retry once with a "your previous output was invalid JSON, here is the schema" repair prompt, then fail closed.
87
+ - Every external API client implements local caching (SQLite or filesystem) and respects rate limits — NCBI is 3 req/s without a key, 10 req/s with one. Treat cache misses as the slow path, not the default.
88
+ - Never write the canonical HGVS as a free-form string in the DB. Always store the Mutalyzer-normalized form and keep the user-supplied input separately for round-tripping.
89
+ - Keep `Codex-sonnet-4-6` as the default model. Only escalate individual hard variants to `Codex-opus-4-7` after benchmarking shows it changes outcomes.
90
+
91
+ ## Keys and external services
92
+
93
+ See `.env.example` for the full list. Required to run end-to-end:
94
+ - `ANTHROPIC_API_KEY` — paid, console.anthropic.com
95
+ - `NCBI_API_KEY` — free, raises rate limits to 10 req/s
96
+ - `OMIM_API_KEY` — free for academic use
97
+
98
+ `gnomAD` and `Mutalyzer` are open APIs and need no keys.
99
+
100
+ ## Notes for collaborators (and Codex)
101
+
102
+ - This is an intern project under active mentorship. Prefer small, reviewed PRs over big-bang merges.
103
+ - When in doubt about an ACMG criterion, cite the relevant section of Richards 2015 in the code comment, not just a paraphrase.
104
+ - The ACMG SVC v4.0 update (piloted March 2025) will change criterion weighting. Keep the rule logic in `services/acmg/rules.py` versioned (`rules_v2015.py`, `rules_v4.py`) so the swap is mechanical, not a rewrite.
105
+ - GA4GH VRS / VA-Spec interop is a stretch goal but worth keeping the data models compatible with from day one.
CLAUDE.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VariantLens
2
+
3
+ Clinical genomic variant interpretation tool for the Jordan Lerner-Ellis Lab. Built around the ACMG/AMP 2015 framework (Richards et al.) with the SVC v4.0 transition in mind. Modeled on the three tools showcased at the November 2025 GA4GH/ClinGen CGLC session: AI CURA (96% concordance via RAG + DeepSeek-R1), EvAgg (Broad/Microsoft evidence aggregator), and AutoPM3 (HKU PM3 extractor).
4
+
5
+ The full design lives in `docs/VariantLens_Build_Plan.md`. The supporting literature review lives in `docs/AI_Variant_Interpretation_Review.md`. Read those before making non-trivial architectural changes.
6
+
7
+ ## Non-negotiables
8
+
9
+ - **Human-in-the-loop.** A trained curator signs off every classification. The tool surfaces evidence and proposes criteria; it does not autonomously classify for clinical use.
10
+ - **On-prem patient data.** No genomic data is sent to cloud APIs without explicit opt-in. The `USE_LOCAL_LLM` flag must always provide a working air-gapped path (Ollama + open-source model).
11
+ - **Audit trail.** Every triggered ACMG criterion is traceable to a source — a database row, a PMID, or a curator override with free-text justification.
12
+ - **Anti-hallucination is structural, not cosmetic.** Claude is only allowed to reason over RAG-retrieved chunks, must cite PMIDs verbatim, and must emit structured JSON. If the context lacks evidence, the only valid output is "insufficient evidence in provided literature".
13
+ - **Database facts never go through the LLM.** gnomAD AFs, ClinVar classifications, REVEL/SpliceAI/AlphaMissense scores are scored deterministically. Claude only handles literature-dependent criteria: PM3, PP1, PS3/BS3, PS4, PP4, PS2/PM6, PP5/BP6.
14
+
15
+ ## Architecture (one-line summary)
16
+
17
+ `Mutalyzer normalize → parallel evidence (gnomAD, ClinVar, in-silico, autoPVS1) → ACMG rule engine (InterVar-extended) → RAG over PubMed via ChromaDB → Claude reasons over retrieved chunks → Table 5 combiner → curator review UI → PDF/ClinVar/FHIR export.`
18
+
19
+ ## What we reuse vs. build
20
+
21
+ **Reuse (do not reimplement):**
22
+ - `autoPVS1` for PVS1
23
+ - `InterVar` as the rule-engine scaffold (extend from ~18 to all 28 criteria)
24
+ - `Mutalyzer` for HGVS normalization (PyHGVS as offline fallback)
25
+ - Pre-scored tables for REVEL, AlphaMissense, SpliceAI (do not run the models per variant)
26
+ - `ChromaDB` for the vector store, `sentence-transformers` (BioLinkBERT) for embeddings
27
+
28
+ **Build ourselves:**
29
+ - The orchestration layer (FastAPI services in `backend/app/services/`)
30
+ - The criterion-aware RAG retriever (different queries for PM3 vs. PP1 vs. PS3)
31
+ - The Claude prompt templates (one per literature-dependent criterion)
32
+ - The Table 5 combiner with conflict detection
33
+ - The curator dashboard
34
+
35
+ ## Tech stack
36
+
37
+ ```
38
+ Backend: Python 3.12, FastAPI, SQLAlchemy, Celery (async jobs)
39
+ Frontend: React 18, TypeScript, Tailwind, React Query, Zustand
40
+ Databases: PostgreSQL (audit trail), SQLite (REVEL/gnomAD offline cache)
41
+ Vector DB: ChromaDB (embedded, on-prem)
42
+ Embeddings: sentence-transformers (BioLinkBERT preferred; all-MiniLM-L6-v2 fallback)
43
+ LLM: Anthropic Claude (claude-sonnet-4-6 for the reasoning layer; claude-opus-4-7 only for hard cases)
44
+ Local fallback: Ollama + qwen2.5 or mistral-nemo
45
+ Containers: Docker + docker-compose
46
+ Tests: pytest, hypothesis (property-based on the combiner)
47
+ ```
48
+
49
+ ## Directory layout
50
+
51
+ ```
52
+ backend/ FastAPI app
53
+ app/api/ Routers: variants, evidence, reports
54
+ app/services/ normalization, gnomad, clinvar, insilico, pvs1, rag/, acmg/, llm/
55
+ app/models/ SQLAlchemy
56
+ tests/
57
+ frontend/ React + TS
58
+ data/ Pre-scored tables, gnomAD cache, ChromaDB persist dir
59
+ docs/ Build plan, literature review, ACMG references
60
+ docker-compose.yml
61
+ .env.example
62
+ .env gitignored — fill from .env.example
63
+ ```
64
+
65
+ ## Phase plan (~5 weeks)
66
+
67
+ 0. Scaffold + Docker (day 1)
68
+ 1. Mutalyzer normalization + 20-variant edge-case test set (day 2–3)
69
+ 2. gnomAD, ClinVar, in-silico predictors, autoPVS1 (day 4–7)
70
+ 3. RAG: PubMed fetch → chunk → embed → ChromaDB → criterion-aware retriever (day 8–11)
71
+ 4. ACMG rule engine: 28 criteria + Table 5 combiner; ≥85% concordance on 50 ClinVar variants (day 12–15)
72
+ 5. Claude reasoning layer with hallucination-suppression prompts (day 16–18)
73
+ 6. React curator dashboard + PDF/ClinVar/FHIR export (day 19–22)
74
+ 7. Validation: 100 4-star ClinVar expert-panel variants; hallucination-guard tests (day 23–25)
75
+
76
+ ## Validation bar
77
+
78
+ - **Classification concordance:** ≥85% on a held-out set of 100 ClinVar 4-star expert-panel variants. Stretch: match AI CURA's 96%.
79
+ - **Hallucination guard:** When fed deliberately empty/wrong literature contexts, Claude must NOT trigger PM3/PP1/PS3 and must only cite PMIDs that are present in the provided context.
80
+ - **Performance:** <30 s per variant (RAG included); 100 variants/hour batch throughput.
81
+ - **Audit:** Every triggered criterion has a traceable source field. No criterion fires with empty `evidence_text`.
82
+
83
+ ## Conventions
84
+
85
+ - Pydantic models for every service input/output. No `dict[str, Any]` at module boundaries.
86
+ - All LLM calls return JSON validated against a pydantic schema; if validation fails, retry once with a "your previous output was invalid JSON, here is the schema" repair prompt, then fail closed.
87
+ - Every external API client implements local caching (SQLite or filesystem) and respects rate limits — NCBI is 3 req/s without a key, 10 req/s with one. Treat cache misses as the slow path, not the default.
88
+ - Never write the canonical HGVS as a free-form string in the DB. Always store the Mutalyzer-normalized form and keep the user-supplied input separately for round-tripping.
89
+ - Keep `claude-sonnet-4-6` as the default model. Only escalate individual hard variants to `claude-opus-4-7` after benchmarking shows it changes outcomes.
90
+
91
+ ## Keys and external services
92
+
93
+ See `.env.example` for the full list. Required to run end-to-end:
94
+ - `ANTHROPIC_API_KEY` — paid, console.anthropic.com
95
+ - `NCBI_API_KEY` — free, raises rate limits to 10 req/s
96
+ - `OMIM_API_KEY` — free for academic use
97
+
98
+ `gnomAD` and `Mutalyzer` are open APIs and need no keys.
99
+
100
+ ## Notes for collaborators (and Claude)
101
+
102
+ - This is an intern project under active mentorship. Prefer small, reviewed PRs over big-bang merges.
103
+ - When in doubt about an ACMG criterion, cite the relevant section of Richards 2015 in the code comment, not just a paraphrase.
104
+ - The ACMG SVC v4.0 update (piloted March 2025) will change criterion weighting. Keep the rule logic in `services/acmg/rules.py` versioned (`rules_v2015.py`, `rules_v4.py`) so the swap is mechanical, not a rewrite.
105
+ - GA4GH VRS / VA-Spec interop is a stretch goal but worth keeping the data models compatible with from day one.
Makefile ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SHELL := /bin/bash
2
+
3
+ .PHONY: help install up down logs migrate seed test test-fast test-slow lint typecheck frontend-dev frontend-build clean
4
+
5
+ help:
6
+ @echo "VariantLens — common commands"
7
+ @echo ""
8
+ @echo " make install install backend (editable) + frontend deps"
9
+ @echo " make up docker compose up (api, worker, postgres, redis, frontend)"
10
+ @echo " make down docker compose down (preserves volumes)"
11
+ @echo " make logs tail logs from all containers"
12
+ @echo " make migrate run alembic migrations against the running postgres"
13
+ @echo " make seed pull 100 ClinVar 4-star variants into the eval fixture"
14
+ @echo " make test run fast unit tests (skips slow/external)"
15
+ @echo " make test-slow run the concordance harness (needs API keys + seeded fixture)"
16
+ @echo " make lint ruff check"
17
+ @echo " make typecheck mypy backend + tsc frontend"
18
+ @echo " make frontend-dev Vite dev server (no docker)"
19
+ @echo " make clean remove caches and build artifacts (preserves data/)"
20
+
21
+ install:
22
+ pip install -e ".[dev]"
23
+ cd frontend && npm install
24
+
25
+ up:
26
+ docker compose up --build
27
+
28
+ down:
29
+ docker compose down
30
+
31
+ logs:
32
+ docker compose logs -f --tail=200
33
+
34
+ migrate:
35
+ docker compose run --rm api alembic upgrade head
36
+
37
+ seed:
38
+ python -m scripts.seed_eval_set --n 100
39
+
40
+ test:
41
+ pytest -m "not slow"
42
+
43
+ test-slow:
44
+ pytest -m slow
45
+
46
+ lint:
47
+ ruff check backend scripts
48
+
49
+ typecheck:
50
+ mypy backend
51
+ cd frontend && npm run typecheck
52
+
53
+ frontend-dev:
54
+ cd frontend && npm run dev
55
+
56
+ frontend-build:
57
+ cd frontend && npm run build
58
+
59
+ clean:
60
+ rm -rf .pytest_cache .mypy_cache .ruff_cache htmlcov .coverage
61
+ find backend -type d -name __pycache__ -exec rm -rf {} +
62
+ find scripts -type d -name __pycache__ -exec rm -rf {} +
63
+ cd frontend && rm -rf dist node_modules/.vite
README.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VariantLens
2
+
3
+ Clinical genomic variant interpretation tool. ACMG/AMP rule engine + RAG over PubMed + Claude reasoning, with a curator review UI. Built for the Jordan Lerner-Ellis Lab.
4
+
5
+ See [CLAUDE.md](CLAUDE.md) for architecture, conventions, and validation bar. See [docs/](docs/) for the full build plan and literature review.
6
+
7
+ For lab or clinical-trial preparation, start with
8
+ [docs/Clinical_Readiness_Checklist.md](docs/Clinical_Readiness_Checklist.md).
9
+ VariantLens is a human-in-the-loop curator-support tool; it is not an
10
+ autonomous clinical classifier.
11
+
12
+ ## Quick start
13
+
14
+ ```bash
15
+ # 1. Fill in API keys
16
+ cp .env.example .env # then edit .env with your keys
17
+
18
+ # 2. Bring everything up (postgres, redis, api, worker, frontend)
19
+ make up # or: docker compose up --build
20
+
21
+ # 3. Open
22
+ # Frontend: http://localhost:5173
23
+ # API docs: http://localhost:8000/docs
24
+ ```
25
+
26
+ Migrations apply automatically on API startup. Run `make help` for the full list of commands (`make seed`, `make test`, `make test-slow`, `make typecheck`, etc.).
27
+
28
+ For non-docker local dev (debugger-friendly): `./scripts/dev.sh` boots uvicorn against a local SQLite file plus the Vite dev server.
29
+
30
+ ## Required keys
31
+
32
+ - `ANTHROPIC_API_KEY` — paid, [console.anthropic.com](https://console.anthropic.com)
33
+ - `NCBI_API_KEY` + `NCBI_EMAIL` — free, raises NCBI rate limit from 3 to 10 req/s
34
+ - `OMIM_API_KEY` — free for academic use
35
+
36
+ `gnomAD` and `Mutalyzer` are open APIs and need no keys.
37
+
38
+ ## Layout
39
+
40
+ ```
41
+ backend/ FastAPI + SQLAlchemy + Anthropic SDK
42
+ app/api/ Routers: variants, evidence, reports
43
+ app/services/ Domain logic: normalization, databases, RAG, ACMG, LLM
44
+ app/models/ SQLAlchemy ORM
45
+ app/schemas/ Pydantic models for API I/O
46
+ tests/ pytest + hypothesis property tests
47
+ frontend/ React + TypeScript + Vite + Tailwind
48
+ data/ Pre-scored tables (REVEL, AlphaMissense), gnomAD cache, ChromaDB persist
49
+ docs/ Build plan, literature review
50
+ scripts/ Data prep: download REVEL, build SQLite caches, seed evaluation set
51
+ ```
52
+
53
+ ## Development
54
+
55
+ ```bash
56
+ make test # fast unit tests (skips external APIs)
57
+ make test-slow # concordance harness (needs API keys + seeded fixture)
58
+ make lint # ruff check
59
+ make typecheck # mypy backend + tsc frontend
60
+ make seed # pull 100 ClinVar 4-star variants for the eval fixture
61
+ ```
62
+
63
+ ### Data prep (one-time)
64
+
65
+ ```bash
66
+ # REVEL — download revel-v1.3_all_chromosomes.csv from
67
+ # https://sites.google.com/site/revelgenomics/downloads first.
68
+ python -m scripts.build_revel_db /path/to/revel-v1.3_all_chromosomes.csv
69
+
70
+ # Eval fixture — pulls expert-panel ClinVar variants for the test harness.
71
+ make seed
72
+
73
+ # (Optional) pre-warm the gnomAD cache for a known variant list.
74
+ python -m scripts.warm_gnomad_cache variant_ids.txt
75
+ ```
76
+
77
+ ## Validation bar
78
+
79
+ - ≥85% classification concordance against 100 ClinVar 4-star expert-panel variants
80
+ - Hallucination guard: empty/wrong literature contexts must NOT trigger PM3/PP1/PS3 and must only cite PMIDs present in the provided context
81
+ - <30 s per variant including RAG; 100 variants/hour batch throughput
82
+ - Every triggered ACMG criterion has a traceable source field
alembic.ini ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [alembic]
2
+ script_location = backend/alembic
3
+ prepend_sys_path = .
4
+ version_path_separator = os
5
+
6
+ # Read the URL from the environment (DATABASE_URL) at runtime — we set it in
7
+ # backend/alembic/env.py from `backend.app.config.get_settings`.
8
+ sqlalchemy.url = driver://user:pass@host/db
9
+
10
+ [post_write_hooks]
11
+
12
+ [loggers]
13
+ keys = root,sqlalchemy,alembic
14
+
15
+ [handlers]
16
+ keys = console
17
+
18
+ [formatters]
19
+ keys = generic
20
+
21
+ [logger_root]
22
+ level = WARN
23
+ handlers = console
24
+ qualname =
25
+
26
+ [logger_sqlalchemy]
27
+ level = WARN
28
+ handlers =
29
+ qualname = sqlalchemy.engine
30
+
31
+ [logger_alembic]
32
+ level = INFO
33
+ handlers =
34
+ qualname = alembic
35
+
36
+ [handler_console]
37
+ class = StreamHandler
38
+ args = (sys.stderr,)
39
+ level = NOTSET
40
+ formatter = generic
41
+
42
+ [formatter_generic]
43
+ format = %(levelname)-5.5s [%(name)s] %(message)s
44
+ datefmt = %H:%M:%S
backend/Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ WORKDIR /app
4
+
5
+ ENV PYTHONUNBUFFERED=1 \
6
+ PIP_DISABLE_PIP_VERSION_CHECK=1
7
+
8
+ RUN apt-get update && apt-get install -y --no-install-recommends \
9
+ build-essential \
10
+ curl \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ COPY pyproject.toml ./
14
+ RUN pip install --no-cache-dir --upgrade pip && \
15
+ pip install --no-cache-dir -e ".[dev]"
16
+
17
+ COPY backend ./backend
18
+ COPY scripts ./scripts
19
+ COPY alembic.ini ./
20
+
21
+ EXPOSE 8000
22
+
23
+ CMD ["uvicorn", "backend.app.main:app", "--host", "0.0.0.0", "--port", "8000"]
backend/alembic/env.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Alembic env wired to the project Settings + SQLAlchemy Base."""
2
+ from __future__ import annotations
3
+
4
+ from logging.config import fileConfig
5
+
6
+ from alembic import context
7
+ from sqlalchemy import engine_from_config, pool
8
+
9
+ from backend.app.config import get_settings
10
+
11
+ # Import every model so Base.metadata is populated for autogenerate.
12
+ from backend.app.models import classification as _classification # noqa: F401
13
+ from backend.app.models import variant as _variant # noqa: F401
14
+ from backend.app.models.db import Base
15
+
16
+ config = context.config
17
+ if config.config_file_name is not None:
18
+ fileConfig(config.config_file_name)
19
+
20
+ # Override the sqlalchemy.url placeholder from alembic.ini with the live DSN.
21
+ config.set_main_option("sqlalchemy.url", get_settings().database_url)
22
+
23
+ target_metadata = Base.metadata
24
+
25
+
26
+ def run_migrations_offline() -> None:
27
+ context.configure(
28
+ url=config.get_main_option("sqlalchemy.url"),
29
+ target_metadata=target_metadata,
30
+ literal_binds=True,
31
+ dialect_opts={"paramstyle": "named"},
32
+ compare_type=True,
33
+ )
34
+ with context.begin_transaction():
35
+ context.run_migrations()
36
+
37
+
38
+ def run_migrations_online() -> None:
39
+ connectable = engine_from_config(
40
+ config.get_section(config.config_ini_section, {}),
41
+ prefix="sqlalchemy.",
42
+ poolclass=pool.NullPool,
43
+ )
44
+ with connectable.connect() as connection:
45
+ context.configure(
46
+ connection=connection,
47
+ target_metadata=target_metadata,
48
+ compare_type=True,
49
+ )
50
+ with context.begin_transaction():
51
+ context.run_migrations()
52
+
53
+
54
+ if context.is_offline_mode():
55
+ run_migrations_offline()
56
+ else:
57
+ run_migrations_online()
backend/alembic/script.py.mako ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """${message}
2
+
3
+ Revision ID: ${up_revision}
4
+ Revises: ${down_revision | comma,n}
5
+ Create Date: ${create_date}
6
+
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from typing import Sequence, Union
11
+
12
+ from alembic import op
13
+ import sqlalchemy as sa
14
+ ${imports if imports else ""}
15
+
16
+ revision: str = ${repr(up_revision)}
17
+ down_revision: Union[str, None] = ${repr(down_revision)}
18
+ branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
19
+ depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
20
+
21
+
22
+ def upgrade() -> None:
23
+ ${upgrades if upgrades else "pass"}
24
+
25
+
26
+ def downgrade() -> None:
27
+ ${downgrades if downgrades else "pass"}
backend/alembic/versions/0001_init.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """initial schema β€” variants, classifications, criteria
2
+
3
+ Revision ID: 0001_init
4
+ Revises:
5
+ Create Date: 2026-04-28 10:00:00
6
+
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import sqlalchemy as sa
11
+ from alembic import op
12
+
13
+ revision = "0001_init"
14
+ down_revision = None
15
+ branch_labels = None
16
+ depends_on = None
17
+
18
+
19
def _false_flag(name: str) -> sa.Column:
    """Build a NOT NULL boolean column whose server-side default is false."""
    return sa.Column(name, sa.Boolean(), nullable=False, server_default=sa.false())


def upgrade() -> None:
    """Create the ``variants``, ``classifications`` and ``criteria`` tables and their indexes."""
    op.create_table(
        "variants",
        sa.Column("id", sa.String(36), primary_key=True),
        sa.Column("raw_input", sa.String(512), nullable=False),
        sa.Column("hgvs_genomic", sa.String(512)),
        sa.Column("hgvs_coding", sa.String(512)),
        sa.Column("hgvs_protein", sa.String(512)),
        sa.Column("transcript", sa.String(64)),
        # ix_variants_gene_symbol is auto-created by `index=True` on this column
        sa.Column("gene_symbol", sa.String(64), index=True),
        sa.Column("chromosome", sa.String(8)),
        sa.Column("position", sa.Integer()),
        sa.Column(
            "normalization_source",
            sa.String(32),
            nullable=False,
            server_default="mutalyzer",
        ),
        sa.Column("warnings", sa.JSON(), nullable=False, server_default="[]"),
        sa.Column("submitted_at", sa.DateTime(), nullable=False, server_default=sa.func.now()),
    )

    op.create_table(
        "classifications",
        sa.Column("id", sa.String(36), primary_key=True),
        sa.Column(
            "variant_id",
            sa.String(36),
            sa.ForeignKey("variants.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("significance", sa.String(32), nullable=False),
        sa.Column("confidence", sa.String(16), nullable=False, server_default="medium"),
        sa.Column("triggered_criteria", sa.JSON(), nullable=False, server_default="[]"),
        _false_flag("conflicting_evidence"),
        sa.Column("ruleset_version", sa.String(16), nullable=False, server_default="v2015"),
        sa.Column("rationale", sa.Text()),
        _false_flag("curator_signoff"),
        sa.Column("curator_id", sa.String(64)),
        sa.Column("signed_off_at", sa.DateTime()),
        sa.Column("created_at", sa.DateTime(), nullable=False, server_default=sa.func.now()),
    )
    op.create_index("ix_classifications_variant_id", "classifications", ["variant_id"])

    op.create_table(
        "criteria",
        sa.Column("id", sa.String(36), primary_key=True),
        sa.Column(
            "classification_id",
            sa.String(36),
            sa.ForeignKey("classifications.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("code", sa.String(8), nullable=False),
        _false_flag("triggered"),
        sa.Column("strength", sa.String(16), nullable=False),
        sa.Column("source", sa.String(128), nullable=False),
        sa.Column("evidence_text", sa.Text(), nullable=False),
        sa.Column("confidence", sa.String(16), nullable=False, server_default="medium"),
        sa.Column("pmid", sa.String(32)),
        sa.Column("caveat", sa.Text()),
        _false_flag("curator_override"),
        sa.Column("override_justification", sa.Text()),
    )
    op.create_index("ix_criteria_classification_id", "criteria", ["classification_id"])
70
+
71
+
72
def downgrade() -> None:
    """Drop the three tables in reverse creation order, removing explicit indexes first."""
    # (explicit index name or None, table) pairs, processed in order.
    teardown_steps = (
        ("ix_criteria_classification_id", "criteria"),
        ("ix_classifications_variant_id", "classifications"),
        (None, "variants"),  # auto-index drops with the table
    )
    for index_name, table_name in teardown_steps:
        if index_name is not None:
            op.drop_index(index_name, table_name=table_name)
        op.drop_table(table_name)
backend/app/__init__.py ADDED
File without changes
backend/app/api/__init__.py ADDED
File without changes
backend/app/api/evidence.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from typing import Annotated
3
+
4
+ from fastapi import APIRouter, Depends, HTTPException
5
+ from pydantic import BaseModel
6
+ from sqlalchemy.orm import Session
7
+
8
+ from backend.app.models.classification import ClassificationRecord, CriterionRecord
9
+ from backend.app.models.db import get_session
10
+ from backend.app.schemas.evidence import ACMGCriterion
11
+
12
+ router = APIRouter()
13
+ SessionDep = Annotated[Session, Depends(get_session)]
14
+
15
+
16
class CriterionOverride(BaseModel):
    """Request body for a curator's manual override of a single criterion."""

    # New triggered state written onto the criterion record.
    triggered: bool
    # New strength written onto the criterion record.
    strength: str
    # Free text stored in the record's override justification.
    justification: str
    # Identifier stamped into the stored justification.
    curator_id: str
21
+
22
+
23
@router.get("/{classification_id}", response_model=list[ACMGCriterion])
def get_criteria(classification_id: str, db: SessionDep) -> list[ACMGCriterion]:
    """Return every criterion stored for a classification.

    Raises:
        HTTPException: 404 when no classification has the given id.
    """
    classification = db.get(ClassificationRecord, classification_id)
    if not classification:
        raise HTTPException(404, "classification not found")

    # Attributes copied one-to-one from the ORM row onto the response schema.
    copied_fields = (
        "code",
        "triggered",
        "strength",
        "source",
        "evidence_text",
        "confidence",
        "caveat",
        "pmid",
        "curator_override",
        "override_justification",
    )
    return [
        ACMGCriterion(**{name: getattr(row, name) for name in copied_fields})
        for row in classification.criteria
    ]
43
+
44
+
45
@router.post("/{classification_id}/{criterion_code}/override", response_model=ACMGCriterion)
def override_criterion(
    classification_id: str,
    criterion_code: str,
    override: CriterionOverride,
    db: SessionDep,
) -> ACMGCriterion:
    """Apply a curator override to one criterion and return the updated criterion.

    The requested strength is checked against the values the response schema
    accepts *before* anything is written. Otherwise an unsupported strength is
    committed first and only rejected afterwards when the response is built,
    leaving a row that can no longer be serialised.

    Raises:
        HTTPException: 422 when ``override.strength`` is not a supported
            strength; 404 when no criterion matches the classification id and
            criterion code.
    """
    # Function-scope imports: these names are needed only by this handler.
    from datetime import timezone
    from typing import get_args

    from backend.app.schemas.evidence import CriterionStrength

    allowed_strengths = get_args(CriterionStrength)
    if override.strength not in allowed_strengths:
        raise HTTPException(422, f"strength must be one of: {', '.join(allowed_strengths)}")

    rec = (
        db.query(CriterionRecord)
        .filter_by(classification_id=classification_id, code=criterion_code)
        .one_or_none()
    )
    if not rec:
        raise HTTPException(404, "criterion not found")

    rec.triggered = override.triggered
    rec.strength = override.strength
    rec.curator_override = True
    # Timezone-aware UTC stamp (datetime.utcnow() is deprecated); the stored
    # text now carries an explicit "+00:00" offset.
    stamped_at = datetime.now(timezone.utc).isoformat()
    rec.override_justification = f"[{override.curator_id} @ {stamped_at}] {override.justification}"
    db.commit()
    db.refresh(rec)
    return ACMGCriterion(
        code=rec.code,
        triggered=rec.triggered,
        strength=rec.strength,
        source=rec.source,
        evidence_text=rec.evidence_text,
        confidence=rec.confidence,
        caveat=rec.caveat,
        pmid=rec.pmid,
        curator_override=rec.curator_override,
        override_justification=rec.override_justification,
    )
backend/app/api/pipeline.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """End-to-end pipeline that wires services together."""
2
+
3
+ import logging
4
+ from uuid import uuid4
5
+
6
+ from backend.app.schemas.classification import ClassificationResult
7
+ from backend.app.schemas.evidence import EvidenceBundle, LiteratureChunk
8
+ from backend.app.schemas.variant import VariantInput
9
+ from backend.app.services.clinvar import ClinVarClient
10
+ from backend.app.services.gnomad import GnomADClient
11
+ from backend.app.services.insilico import InSilicoPredictor
12
+ from backend.app.services.llm.synthesizer import LITERATURE_CRITERIA, EvidenceSynthesizer
13
+ from backend.app.services.normalization import VariantNormalizer
14
+ from backend.app.services.pvs1 import PVS1Assessor
15
+ from backend.app.services.rag.retriever import LiteratureRetriever
16
+ from backend.app.services.vep import VEPClient
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class VariantPipeline:
    """Runs one variant through normalization, annotation lookups and evidence synthesis.

    Every collaborator can be injected; any argument left as ``None`` (or
    otherwise falsy) is replaced by an instance built with the collaborator's
    no-argument constructor.
    """

    def __init__(
        self,
        normalizer: VariantNormalizer | None = None,
        vep: VEPClient | None = None,
        gnomad: GnomADClient | None = None,
        clinvar: ClinVarClient | None = None,
        insilico: InSilicoPredictor | None = None,
        pvs1: PVS1Assessor | None = None,
        retriever: LiteratureRetriever | None = None,
        synthesizer: EvidenceSynthesizer | None = None,
    ) -> None:
        self.normalizer = normalizer or VariantNormalizer()
        self.vep = vep or VEPClient()
        self.gnomad = gnomad or GnomADClient()
        self.clinvar = clinvar or ClinVarClient()
        self.insilico = insilico or InSilicoPredictor()
        self.pvs1 = pvs1 or PVS1Assessor()
        self.retriever = retriever or LiteratureRetriever()
        self.synthesizer = synthesizer or EvidenceSynthesizer()

    async def run(self, variant_input: VariantInput, skip_rag: bool = False) -> ClassificationResult:
        """Normalize the input, gather evidence and return the synthesized classification.

        Args:
            variant_input: The user-submitted variant.
            skip_rag: When true, literature indexing and retrieval are skipped.
        """
        variant = await self.normalizer.normalize(variant_input)

        # Enrich with chr/pos/ref/alt + transcript + consequence via VEP
        # so REVEL/AlphaMissense/gnomAD have what they need on HGVS-coding input.
        # NOTE(review): described as best-effort, but nothing here catches a
        # VEP failure — presumably enrich() handles its own errors; confirm.
        has_coordinates = all([variant.chromosome, variant.position, variant.ref, variant.alt])
        if not has_coordinates:
            variant = await self.vep.enrich(variant)
        variant_id = str(uuid4())
        # Coding HGVS when available, otherwise the raw user string.
        query_hgvs = variant.hgvs_coding or variant.raw_input

        gnomad_id = self._build_gnomad_id(variant)
        if gnomad_id:
            freq = await self.gnomad.lookup(gnomad_id)
        else:
            freq = None

        clinvar_hits = await self.clinvar.lookup(query_hgvs)
        insilico = await self.insilico.assess(
            chrom=variant.chromosome,
            pos=variant.position,
            ref=variant.ref,
            alt=variant.alt,
            transcript=variant.transcript,
            hgvs_genomic=variant.hgvs_genomic,
        )
        autopvs1 = self.pvs1.assess(variant)

        evidence = EvidenceBundle(
            population_frequency=freq,
            insilico=insilico,
            clinvar_existing=clinvar_hits or [],
            autopvs1=autopvs1,
        )

        literature: dict[str, list[LiteratureChunk]] = {}
        if not skip_rag and variant.gene_symbol:
            literature = await self._retrieve_literature(variant, variant_id, query_hgvs)

        return self.synthesizer.synthesize(
            variant=variant,
            evidence=evidence,
            retrieved_chunks=literature,
            disease=variant_input.disease,
        )

    async def _retrieve_literature(
        self, variant, variant_id: str, query_hgvs: str
    ) -> dict[str, list[LiteratureChunk]]:
        """Index then retrieve literature for the variant; return ``{}`` if either step raises."""
        try:
            await self.retriever.index_for_variant(
                variant_id=variant_id,
                gene=variant.gene_symbol,
                hgvs=query_hgvs,
                protein=variant.hgvs_protein,
                criteria=LITERATURE_CRITERIA,
            )
            return self.retriever.retrieve_for_criteria(
                variant_id=variant_id,
                hgvs=query_hgvs,
                criteria=LITERATURE_CRITERIA,
            )
        except Exception as e:
            logger.warning("RAG indexing/retrieval failed; continuing without literature: %s", e)
            return {}

    @staticmethod
    def _build_gnomad_id(variant) -> str | None:
        """Return ``"<chrom>-<pos>-<ref>-<alt>"`` with "chr" removed, or ``None`` if any part is missing."""
        if not (variant.chromosome and variant.position and variant.ref and variant.alt):
            return None
        chrom = variant.chromosome.replace("chr", "")
        return f"{chrom}-{variant.position}-{variant.ref}-{variant.alt}"
backend/app/api/reports.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import UTC, datetime
2
+ from typing import Annotated
3
+
4
+ from fastapi import APIRouter, Depends, HTTPException
5
+ from fastapi.responses import Response
6
+ from sqlalchemy.orm import Session
7
+
8
+ from backend.app.models.classification import ClassificationRecord
9
+ from backend.app.models.db import get_session
10
+ from backend.app.services.exports import render_clinvar_xml, render_fhir_observation
11
+
12
+ router = APIRouter()
13
+ SessionDep = Annotated[Session, Depends(get_session)]
14
+
15
+
16
@router.get("/{classification_id}")
def get_report(classification_id: str, db: SessionDep) -> dict:
    """Return a classification with its variant and criteria as a plain dict.

    Raises:
        HTTPException: 404 when no classification has the given id.
    """
    record = db.get(ClassificationRecord, classification_id)
    if not record:
        raise HTTPException(404, "classification not found")

    variant_section = None
    if record.variant:
        variant_section = {
            "raw_input": record.variant.raw_input,
            "hgvs_coding": record.variant.hgvs_coding,
            "hgvs_protein": record.variant.hgvs_protein,
            "hgvs_genomic": record.variant.hgvs_genomic,
            "gene_symbol": record.variant.gene_symbol,
        }

    signed_off_at = None
    if record.signed_off_at:
        signed_off_at = record.signed_off_at.isoformat()

    criteria_section = []
    for criterion in record.criteria:
        criteria_section.append(
            {
                "code": criterion.code,
                "triggered": criterion.triggered,
                "strength": criterion.strength,
                "source": criterion.source,
                "evidence_text": criterion.evidence_text,
                "confidence": criterion.confidence,
                "pmid": criterion.pmid,
                "caveat": criterion.caveat,
                "curator_override": criterion.curator_override,
                "override_justification": criterion.override_justification,
            }
        )

    return {
        "classification_id": record.id,
        "variant_id": record.variant_id,
        "variant": variant_section,
        "significance": record.significance,
        "confidence": record.confidence,
        "ruleset_version": record.ruleset_version,
        "rationale": record.rationale,
        "triggered_criteria": record.triggered_criteria,
        "conflicting_evidence": record.conflicting_evidence,
        "curator_signoff": record.curator_signoff,
        "curator_id": record.curator_id,
        "signed_off_at": signed_off_at,
        "criteria": criteria_section,
        # Time this report was rendered (timezone-aware UTC).
        "generated_at": datetime.now(UTC).isoformat(),
    }
57
+
58
+
59
@router.post("/{classification_id}/signoff")
def signoff(classification_id: str, curator_id: str, db: SessionDep) -> dict:
    """Record a curator sign-off on a classification.

    Sign-off is still allowed when the classification carries conflicting
    evidence, but the response now reports that through a
    ``conflicting_evidence`` key so the caller can surface it (previously the
    conflict branch was a no-op).

    Raises:
        HTTPException: 422 when ``curator_id`` is blank; 404 when no
            classification has the given id.
    """
    # A sign-off without an identifiable curator cannot be attributed.
    if not curator_id.strip():
        raise HTTPException(422, "curator_id must not be blank")

    rec = db.get(ClassificationRecord, classification_id)
    if not rec:
        raise HTTPException(404, "classification not found")

    # Allow but flag — clinical curator should know.
    has_conflict = bool(rec.conflicting_evidence)

    rec.curator_signoff = True
    rec.curator_id = curator_id
    # The timestamp is written naive (tzinfo stripped) but represents UTC.
    rec.signed_off_at = datetime.now(UTC).replace(tzinfo=None)
    db.commit()
    return {
        "status": "signed",
        "curator_id": curator_id,
        "signed_off_at": rec.signed_off_at.isoformat(),
        "conflicting_evidence": has_conflict,
    }
76
+
77
+
78
@router.get("/{classification_id}/clinvar-xml")
def clinvar_export(classification_id: str, db: SessionDep) -> Response:
    """Return the classification rendered as a ClinVar XML attachment.

    Raises:
        HTTPException: 404 when the classification does not exist; 409 when it
            has not been signed off.
    """
    record = db.get(ClassificationRecord, classification_id)
    if not record:
        raise HTTPException(404, "classification not found")
    if not record.curator_signoff:
        raise HTTPException(409, "classification must be signed off before ClinVar export")

    download_headers = {
        "Content-Disposition": f'attachment; filename="variantlens_{record.id}.clinvar.xml"',
    }
    body = render_clinvar_xml(record)
    return Response(content=body, media_type="application/xml", headers=download_headers)
89
+
90
+
91
@router.get("/{classification_id}/fhir")
def fhir_export(classification_id: str, db: SessionDep) -> dict:
    """Return the classification rendered by ``render_fhir_observation``.

    Raises:
        HTTPException: 404 when the classification does not exist; 409 when it
            has not been signed off.
    """
    record = db.get(ClassificationRecord, classification_id)
    if not record:
        raise HTTPException(404, "classification not found")
    if not record.curator_signoff:
        raise HTTPException(409, "classification must be signed off before FHIR export")
    observation = render_fhir_observation(record)
    return observation
backend/app/api/variants.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Annotated
3
+
4
+ from fastapi import APIRouter, Depends, HTTPException
5
+ from sqlalchemy.exc import SQLAlchemyError
6
+ from sqlalchemy.orm import Session
7
+
8
+ from backend.app.api.pipeline import VariantPipeline
9
+ from backend.app.models.db import get_session
10
+ from backend.app.schemas.classification import ClassificationResult
11
+ from backend.app.schemas.variant import NormalizedVariant, VariantInput
12
+ from backend.app.services.repository import ClassificationRepository
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ router = APIRouter()
17
+ _pipeline = VariantPipeline()
18
+ SessionDep = Annotated[Session, Depends(get_session)]
19
+
20
+
21
@router.post("/classify", response_model=ClassificationResult)
async def classify(
    variant: VariantInput,
    db: SessionDep,
    skip_rag: bool = False,
) -> ClassificationResult:
    """Run the pipeline on the submitted variant and persist the result.

    Args:
        variant: The submitted variant.
        db: Database session used to save the result.
        skip_rag: Forwarded to the pipeline to skip literature retrieval.

    Raises:
        HTTPException: 500 when the pipeline raises. The response names only
            the exception type; the message and traceback go to the server log,
            because exception text from upstream clients can carry internal
            URLs or credentials.
    """
    try:
        result = await _pipeline.run(variant, skip_rag=skip_rag)
    except Exception as e:
        logger.exception("pipeline failed")
        raise HTTPException(
            status_code=500,
            detail=f"pipeline failed: {type(e).__name__}",
        ) from e

    try:
        return ClassificationRepository(db).save(result)
    except SQLAlchemyError as e:
        logger.warning("DB persistence failed, returning unsaved result: %s", e)
        # Return the in-memory result so the UI still renders during dev.
        return result
39
+
40
+
41
@router.post("/normalize", response_model=NormalizedVariant)
async def normalize(variant: VariantInput) -> NormalizedVariant:
    """Normalize the submitted variant with the shared pipeline's normalizer, without classifying it."""
    normalizer = _pipeline.normalizer
    normalized = await normalizer.normalize(variant)
    return normalized
backend/app/config.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import lru_cache
2
+ from pathlib import Path
3
+ from typing import Literal
4
+
5
+ from pydantic import Field, model_validator
6
+ from pydantic_settings import BaseSettings, SettingsConfigDict
7
+
8
+
9
class Settings(BaseSettings):
    """Application settings, read from the environment and an optional ``.env`` file."""

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # --- Application / API server ---
    app_env: str = "development"
    log_level: str = "INFO"
    api_host: str = "0.0.0.0"
    api_port: int = 8000

    # --- LLM backends ---
    anthropic_api_key: str = ""
    anthropic_model: str = "claude-sonnet-4-6"
    anthropic_max_tokens: int = 2000
    use_local_llm: bool = False
    local_llm_base_url: str = "http://localhost:11434"
    local_llm_model: str = "qwen2.5:14b-instruct"

    # --- External API credentials ---
    ncbi_api_key: str = ""
    ncbi_email: str = ""
    omim_api_key: str = ""

    # --- External service endpoints ---
    mutalyzer_base_url: str = "https://mutalyzer.nl/api"
    gnomad_graphql_url: str = "https://gnomad.broadinstitute.org/api"
    spliceai_lookup_url: str = "https://spliceailookup-api.broadinstitute.org"
    cadd_api_url: str = "https://cadd.gs.washington.edu/api"

    # --- Relational database ---
    database_url: str = "postgresql+psycopg://variantlens:change_me_locally@postgres:5432/variantlens"

    # --- Vector store ---
    chroma_persist_dir: Path = Path("./data/chroma")
    chroma_collection: str = "variantlens_pubmed"

    # --- Local data files ---
    revel_db_path: Path = Path("./data/revel_scores.db")
    alphamissense_db_path: Path = Path("./data/alphamissense.db")
    alphamissense_path: Path = Path("./data/alphamissense.tsv.gz")  # legacy raw TSV path
    gnomad_cache_db: Path = Path("./data/gnomad_cache.db")
    clinvar_vcf_path: Path = Path("./data/clinvar.vcf.gz")

    # --- Embeddings ---
    embedding_model: str = "michiyasunaga/BioLinkBERT-base"
    embedding_device: str = "cpu"

    # --- Redis / Celery ---
    redis_url: str = "redis://redis:6379/0"
    celery_broker_url: str = "redis://redis:6379/1"
    celery_result_backend: str = "redis://redis:6379/2"

    # --- JWT ---
    jwt_secret: str = Field(default="change_me", min_length=8)
    jwt_algorithm: str = "HS256"
    jwt_expire_minutes: int = 480

    # --- Literature retrieval (RAG) ---
    rag_fetch_fulltext: bool = True
    rag_max_papers_per_variant: int = 200
    rag_chunk_size: int = 512
    rag_chunk_overlap: int = 128
    rag_top_k: int = 8

    # --- ACMG rule engine ---
    acmg_ruleset_version: str = "v2015"
    acmg_combiner_strategy: Literal["table5", "bayesian", "most_pathogenic"] = "table5"
    enable_deprecated_clinvar_criteria: bool = False

    @model_validator(mode="after")
    def validate_clinical_safety(self) -> "Settings":
        """Reject placeholder secrets and a missing LLM key for production/clinical environments."""
        # NOTE(review): indentation was lost in the reviewed copy; both checks
        # are assumed to apply only to production/clinical — confirm.
        if self.app_env.lower() not in {"production", "clinical"}:
            return self

        placeholder_secrets = {"change_me", "change_me_locally_to_a_long_random_string"}
        if self.jwt_secret in placeholder_secrets:
            raise ValueError("JWT_SECRET must be changed for production/clinical deployments")
        if not (self.use_local_llm or self.anthropic_api_key):
            raise ValueError("ANTHROPIC_API_KEY is required when USE_LOCAL_LLM=false")
        return self
78
+
79
+
80
@lru_cache
def get_settings() -> Settings:
    """Return the ``Settings`` instance, built on the first call and reused afterwards by ``lru_cache``."""
    loaded = Settings()
    return loaded
backend/app/main.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from contextlib import asynccontextmanager
3
+ from pathlib import Path
4
+
5
+ from alembic import command
6
+ from alembic.config import Config
7
+ from fastapi import FastAPI
8
+ from fastapi.middleware.cors import CORSMiddleware
9
+
10
+ from backend.app.api import evidence, reports, variants
11
+ from backend.app.config import get_settings
12
+
13
+ settings = get_settings()
14
+ logging.basicConfig(level=settings.log_level)
15
+ logger = logging.getLogger(__name__)
16
+
17
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
18
+
19
+
20
def _run_migrations() -> None:
    """Upgrade the database to the latest Alembic revision, best-effort.

    A missing ``alembic.ini`` or any failure during the upgrade is logged as a
    warning and otherwise ignored, so the caller is never interrupted.
    """
    cfg_path = PROJECT_ROOT / "alembic.ini"
    if not cfg_path.exists():
        logger.warning("alembic.ini not found at %s; skipping auto-migrate", cfg_path)
        return
    try:
        cfg = Config(str(cfg_path))
        # The option is stored through ConfigParser, where "%" starts an
        # interpolation token. Double every literal "%" (e.g. URL-encoded
        # credentials such as "%40") so the DSN round-trips unchanged instead
        # of raising and silently skipping the migration.
        cfg.set_main_option("sqlalchemy.url", settings.database_url.replace("%", "%%"))
        command.upgrade(cfg, "head")
        logger.info("alembic migrations applied")
    except Exception as e:
        logger.warning("alembic auto-migrate failed (continuing): %s", e)
32
+
33
+
34
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run the auto-migration step, then yield control to the application."""
    _run_migrations()
    yield
38
+
39
+
40
+ app = FastAPI(
41
+ title="VariantLens",
42
+ description="Clinical genomic variant interpretation tool with ACMG rule engine and Claude RAG reasoning.",
43
+ version="0.1.0",
44
+ lifespan=lifespan,
45
+ )
46
+
47
+ app.add_middleware(
48
+ CORSMiddleware,
49
+ allow_origins=["http://localhost:5173", "http://localhost:3000"],
50
+ allow_credentials=True,
51
+ allow_methods=["*"],
52
+ allow_headers=["*"],
53
+ )
54
+
55
+ app.include_router(variants.router, prefix="/variants", tags=["variants"])
56
+ app.include_router(evidence.router, prefix="/evidence", tags=["evidence"])
57
+ app.include_router(reports.router, prefix="/reports", tags=["reports"])
58
+
59
+
60
@app.get("/health")
async def health() -> dict[str, str]:
    """Report a static "ok" status together with the configured environment name."""
    payload = {"status": "ok", "env": settings.app_env}
    return payload
63
+
64
+
65
@app.get("/")
async def root() -> dict[str, str]:
    """Return the service name, version and the path of the API docs."""
    service_info = {"name": "VariantLens", "version": "0.1.0", "docs": "/docs"}
    return service_info
backend/app/models/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from backend.app.models.classification import ClassificationRecord, CriterionRecord
2
+ from backend.app.models.db import Base, get_session
3
+ from backend.app.models.variant import VariantRecord
4
+
5
+ __all__ = ["Base", "get_session", "VariantRecord", "ClassificationRecord", "CriterionRecord"]
backend/app/models/classification.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from uuid import uuid4
3
+
4
+ from sqlalchemy import JSON, Boolean, DateTime, ForeignKey, String, Text
5
+ from sqlalchemy.orm import Mapped, mapped_column, relationship
6
+
7
+ from backend.app.models.db import Base
8
+ from backend.app.models.variant import VariantRecord # noqa: F401 β€” needed for relationship
9
+
10
+
11
class ClassificationRecord(Base):
    """ORM row of the ``classifications`` table: one classification of one variant."""

    __tablename__ = "classifications"

    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=lambda: str(uuid4()))
    # ondelete="CASCADE" mirrors the foreign key created by the 0001_init
    # migration, so model metadata and the migrated schema agree.
    variant_id: Mapped[str] = mapped_column(
        String(36), ForeignKey("variants.id", ondelete="CASCADE"), index=True
    )
    significance: Mapped[str] = mapped_column(String(32), nullable=False)
    confidence: Mapped[str] = mapped_column(String(16), default="medium")
    triggered_criteria: Mapped[list] = mapped_column(JSON, default=list)
    conflicting_evidence: Mapped[bool] = mapped_column(Boolean, default=False)
    ruleset_version: Mapped[str] = mapped_column(String(16), default="v2015")
    rationale: Mapped[str | None] = mapped_column(Text)
    curator_signoff: Mapped[bool] = mapped_column(Boolean, default=False)
    curator_id: Mapped[str | None] = mapped_column(String(64))
    signed_off_at: Mapped[datetime | None] = mapped_column(DateTime)
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)

    # Criteria are owned by the classification: removing one from the list or
    # deleting the classification deletes it as well.
    criteria: Mapped[list["CriterionRecord"]] = relationship(
        back_populates="classification", cascade="all, delete-orphan"
    )
    # Loaded with a JOIN in the same query as the classification.
    variant: Mapped["VariantRecord"] = relationship("VariantRecord", lazy="joined")
31
+
32
+
33
class CriterionRecord(Base):
    """ORM row of the ``criteria`` table: one criterion attached to a classification."""

    __tablename__ = "criteria"

    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=lambda: str(uuid4()))
    # ondelete="CASCADE" mirrors the foreign key created by the 0001_init
    # migration, so model metadata and the migrated schema agree.
    classification_id: Mapped[str] = mapped_column(
        String(36), ForeignKey("classifications.id", ondelete="CASCADE"), index=True
    )
    code: Mapped[str] = mapped_column(String(8), nullable=False)
    triggered: Mapped[bool] = mapped_column(Boolean, default=False)
    strength: Mapped[str] = mapped_column(String(16))
    source: Mapped[str] = mapped_column(String(128))
    evidence_text: Mapped[str] = mapped_column(Text)
    confidence: Mapped[str] = mapped_column(String(16), default="medium")
    pmid: Mapped[str | None] = mapped_column(String(32))
    caveat: Mapped[str | None] = mapped_column(Text)
    curator_override: Mapped[bool] = mapped_column(Boolean, default=False)
    override_justification: Mapped[str | None] = mapped_column(Text)

    classification: Mapped["ClassificationRecord"] = relationship(back_populates="criteria")
backend/app/models/db.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections.abc import Generator
2
+
3
+ from sqlalchemy import create_engine
4
+ from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
5
+
6
+ from backend.app.config import get_settings
7
+
8
+ settings = get_settings()
9
+
10
+ engine = create_engine(settings.database_url, pool_pre_ping=True)
11
+ SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
12
+
13
+
14
class Base(DeclarativeBase):
    """Declarative base class that the ORM models inherit from."""
16
+
17
+
18
def get_session() -> Generator[Session, None, None]:
    """Yield a new session from ``SessionLocal`` and close it once the consumer is done."""
    # The session's context manager closes it on exit, including on error.
    with SessionLocal() as db:
        yield db
backend/app/models/variant.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from uuid import uuid4
3
+
4
+ from sqlalchemy import JSON, DateTime, String
5
+ from sqlalchemy.orm import Mapped, mapped_column
6
+
7
+ from backend.app.models.db import Base
8
+
9
+
10
class VariantRecord(Base):
    """ORM row of the ``variants`` table: a submitted variant and its normalized forms."""

    __tablename__ = "variants"

    # Primary key: a generated UUID4 string.
    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=lambda: str(uuid4()))

    # The string as submitted.
    raw_input: Mapped[str] = mapped_column(String(512), nullable=False)

    # HGVS representations; each is optional.
    hgvs_genomic: Mapped[str | None] = mapped_column(String(512))
    hgvs_coding: Mapped[str | None] = mapped_column(String(512))
    hgvs_protein: Mapped[str | None] = mapped_column(String(512))

    # Optional annotation and coordinates.
    transcript: Mapped[str | None] = mapped_column(String(64))
    gene_symbol: Mapped[str | None] = mapped_column(String(64), index=True)
    chromosome: Mapped[str | None] = mapped_column(String(8))
    position: Mapped[int | None] = mapped_column()

    # Normalization source and warnings.
    normalization_source: Mapped[str] = mapped_column(String(32), default="mutalyzer")
    warnings: Mapped[list] = mapped_column(JSON, default=list)

    submitted_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
backend/app/schemas/__init__.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from backend.app.schemas.classification import (
2
+ Classification,
3
+ ClassificationResult,
4
+ ClinicalSignificance,
5
+ )
6
+ from backend.app.schemas.evidence import (
7
+ ACMGCriterion,
8
+ CriterionStrength,
9
+ EvidenceBundle,
10
+ InSilicoResult,
11
+ LiteratureChunk,
12
+ PopulationFrequency,
13
+ )
14
+ from backend.app.schemas.variant import (
15
+ NormalizedVariant,
16
+ VariantInput,
17
+ VariantOutput,
18
+ )
19
+
20
+ __all__ = [
21
+ "VariantInput",
22
+ "VariantOutput",
23
+ "NormalizedVariant",
24
+ "ACMGCriterion",
25
+ "CriterionStrength",
26
+ "EvidenceBundle",
27
+ "InSilicoResult",
28
+ "LiteratureChunk",
29
+ "PopulationFrequency",
30
+ "Classification",
31
+ "ClassificationResult",
32
+ "ClinicalSignificance",
33
+ ]
backend/app/schemas/classification.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from backend.app.schemas.evidence import ACMGCriterion, EvidenceBundle
6
+ from backend.app.schemas.variant import NormalizedVariant
7
+
8
+ ClinicalSignificance = Literal[
9
+ "Pathogenic",
10
+ "Likely Pathogenic",
11
+ "Uncertain Significance",
12
+ "Likely Benign",
13
+ "Benign",
14
+ ]
15
+
16
+
17
class Classification(BaseModel):
    """The classification verdict: significance, confidence and the criteria behind it."""

    significance: ClinicalSignificance
    confidence: Literal["high", "medium", "low"] = "medium"
    # Codes of the triggered criteria.
    triggered_criteria: list[str] = Field(default_factory=list)
    conflicting_evidence: bool = False
    rationale: str | None = None
23
+
24
+
25
class ClassificationResult(BaseModel):
    """Full classification output: the variant, its evidence bundle and the verdict."""

    id: str | None = None
    variant: NormalizedVariant
    evidence: EvidenceBundle
    classification: Classification
    ruleset_version: str = "v2015"

    # Curator sign-off state.
    curator_signoff: bool = False
    curator_id: str | None = None
    signed_off_at: str | None = None

    analysed_at: str | None = None

    @property
    def auditable_criteria(self) -> list[ACMGCriterion]:
        """Triggered criteria from the evidence bundle, in their stored order."""
        return list(filter(lambda criterion: criterion.triggered, self.evidence.criteria))
backend/app/schemas/evidence.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+
3
+ from pydantic import BaseModel, ConfigDict, Field
4
+
5
+ CriterionStrength = Literal["very_strong", "strong", "moderate", "supporting", "standalone"]
6
+ CriterionConfidence = Literal["high", "medium", "low"]
7
+
8
+ ACMG_CRITERIA = [
9
+ "PVS1",
10
+ "PS1", "PS2", "PS3", "PS4",
11
+ "PM1", "PM2", "PM3", "PM4", "PM5", "PM6",
12
+ "PP1", "PP2", "PP3", "PP4", "PP5",
13
+ "BA1",
14
+ "BS1", "BS2", "BS3", "BS4",
15
+ "BP1", "BP2", "BP3", "BP4", "BP5", "BP6", "BP7",
16
+ ]
17
+
18
+
19
class ACMGCriterion(BaseModel):
    """A single ACMG criterion with its evaluation, provenance and any curator override."""

    code: str = Field(..., description="ACMG criterion code (e.g., PVS1, PM2)")
    triggered: bool
    strength: CriterionStrength
    source: str = Field(..., description="Database name, PMID, or 'curator'")
    evidence_text: str = Field(..., description="Quote, numeric value, or rule trace")
    confidence: CriterionConfidence = "medium"
    caveat: str | None = None
    pmid: str | None = None

    # Curator override state.
    curator_override: bool = False
    override_justification: str | None = None
30
+
31
+
32
class PopulationFrequency(BaseModel):
    """Population allele-frequency data for a variant; the source defaults to gnomAD v4.1."""

    overall_af: float | None = None
    # Frequency per population, keyed by a population name.
    by_population: dict[str, float] = Field(default_factory=dict)
    homozygote_count: int | None = None
    coverage_warning: str | None = None
    source: str = "gnomAD v4.1"
38
+
39
+
40
class InSilicoResult(BaseModel):
    """In-silico predictor scores plus the PP3/BP4 flags derived from them."""

    # Individual predictor scores; None when unavailable.
    revel: float | None = None
    alphamissense: float | None = None
    spliceai_max: float | None = None
    cadd_phred: float | None = None

    # Concordance summary; None when not determined.
    concordant_pathogenic: bool | None = None
    concordant_benign: bool | None = None

    pp3_triggered: bool = False
    bp4_triggered: bool = False
49
+
50
+
51
class ClinVarSubmission(BaseModel):
    """One existing ClinVar submission for a variant."""

    accession: str
    submitter: str = "unknown"
    classification: str
    stars: int = 0
    date: str = ""
    condition: str = ""
58
+
59
+
60
class AutoPVS1Step(BaseModel):
    """One step of the PVS1 reasoning trace."""

    # Allow population by field name (``pass_``) as well as by the "pass" alias.
    model_config = ConfigDict(populate_by_name=True)

    step: int
    label: str
    value: str
    # ``pass`` is a Python keyword, so the field is named ``pass_`` and aliased.
    pass_: bool = Field(..., alias="pass")
67
+
68
+
69
class AutoPVS1Result(BaseModel):
    """Outcome of the PVS1 assessment, with its step-by-step reasoning."""

    triggered: bool
    strength: CriterionStrength = "very_strong"
    rule: str = "PVS1"
    # Ordered reasoning steps behind the outcome.
    reasoning: list[AutoPVS1Step] = Field(default_factory=list)
    conclusion: str = ""
    source: str = "autoPVS1"
    caveats: list[str] = Field(default_factory=list)
77
+
78
+
79
class LiteratureChunk(BaseModel):
    """A chunk of text from a publication, with its citation metadata and AI annotations."""

    # Citation metadata.
    pmid: str
    year: int | None = None
    title: str | None = None
    journal: str | None = None

    # Chunk text and relevance information.
    chunk_text: str
    criteria_relevance: list[str] = Field(default_factory=list)
    score: float | None = None

    # AI annotations, when present.
    ai_interpretation: str | None = None
    ai_confidence: str | None = None
89
+
90
+
91
class EvidenceBundle(BaseModel):
    """All evidence gathered for one variant; every section is optional or defaults to empty."""

    population_frequency: PopulationFrequency | None = None
    insilico: InSilicoResult | None = None
    clinvar_existing: list[ClinVarSubmission] = Field(default_factory=list)
    autopvs1: AutoPVS1Result | None = None
    literature_chunks: list[LiteratureChunk] = Field(default_factory=list)
    criteria: list[ACMGCriterion] = Field(default_factory=list)
backend/app/schemas/variant.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class VariantInput(BaseModel):
    """User-submitted variant together with optional clinical context."""

    raw: str = Field(
        ...,
        description="User-supplied variant string (HGVS, VCF, or protein notation)",
    )
    # Notation of ``raw``; defaults to "auto".
    notation: Literal["hgvs", "vcf", "protein", "auto"] = "auto"
    gene_symbol: str | None = None
    disease: str | None = None
    hpo_terms: list[str] = Field(default_factory=list)
    # Mode of inheritance; both None and the literal "unknown" are accepted.
    inheritance: Literal["AD", "AR", "XL", "MT", "unknown"] | None = None
13
+
14
+
15
class NormalizedVariant(BaseModel):
    """Normalized representation of a submitted variant.

    Only ``raw_input`` is required; every derived field defaults to ``None``.
    """

    raw_input: str

    # HGVS descriptions at genomic, coding and protein level.
    hgvs_genomic: str | None = None
    hgvs_coding: str | None = None
    hgvs_protein: str | None = None
    transcript: str | None = None
    gene_symbol: str | None = None

    # Coordinate-style description.
    chromosome: str | None = None
    position: int | None = None
    ref: str | None = None
    alt: str | None = None

    consequence: str | None = None
    # Which normalizer produced this record.
    normalization_source: Literal["mutalyzer", "pyhgvs", "passthrough"] = "mutalyzer"
    warnings: list[str] = Field(default_factory=list)
29
+
30
+
31
class VariantOutput(BaseModel):
    """API response for a submitted variant: its id, normalized form and submission time."""

    id: str
    normalized: NormalizedVariant
    # Submission timestamp carried as a string.
    submitted_at: str
backend/app/services/__init__.py ADDED
File without changes
backend/app/services/acmg/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from backend.app.services.acmg.combiner import combine_criteria
2
+ from backend.app.services.acmg.rules import RuleEngine
3
+
4
+ __all__ = ["RuleEngine", "combine_criteria"]
backend/app/services/acmg/combiner.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ACMG/AMP variant classification combiner.
2
+
3
+ This module implements two classifiers:
4
+
5
+ 1. **Strict Table 5** (Richards 2015) — the original combinatorial rules.
6
+ This is the clinical default because it is auditable and conservative.
7
+
8
+ 2. **Bayesian point system** (Tavtigian 2018; ClinGen SVI 2020) β€” assigns
9
+ numeric points to each triggered criterion based on its strength, then
10
+ classifies by total. This can be enabled explicitly for validation and
11
+ research cohorts.
12
+
13
+ Point thresholds (Tavtigian 2018, Genet Med 20:1054):
14
+ ≥10 → Pathogenic
15
+ 6 to 9 → Likely Pathogenic
16
+ 0 to 5 → VUS
17
+ -6 to -1 → Likely Benign
18
+ ≤-7 → Benign
19
+
20
+ Point values:
21
+ very_strong=8, strong=4, moderate=2, supporting=1
22
+ standalone=-8, benign equivalents flip sign
23
+
24
+ The previous implementation selected the more pathogenic result by default.
25
+ That is useful for exploration, but too permissive for lab-facing defaults.
26
+ """
27
+
28
+ from backend.app.config import get_settings
29
+ from backend.app.schemas.classification import Classification, ClinicalSignificance
30
+ from backend.app.schemas.evidence import ACMGCriterion
31
+
32
+ PATHOGENIC_PREFIX = ("PVS", "PS", "PM", "PP")
33
+ BENIGN_PREFIX = ("BA", "BS", "BP")
34
+
35
+ POINTS_PATH = {"very_strong": 8, "strong": 4, "moderate": 2, "supporting": 1}
36
+ POINTS_BEN = {"standalone": 8, "strong": 4, "moderate": 2, "supporting": 1}
37
+
38
+
39
+ def _bayesian_score(criteria: list[ACMGCriterion]) -> int:
40
+ """Tavtigian 2018 point system. Pathogenic criteria add, benign subtract."""
41
+ score = 0
42
+ for c in criteria:
43
+ if not c.triggered:
44
+ continue
45
+ if c.code.startswith(PATHOGENIC_PREFIX):
46
+ score += POINTS_PATH.get(c.strength, 0)
47
+ elif c.code.startswith(BENIGN_PREFIX):
48
+ score -= POINTS_BEN.get(c.strength, 0)
49
+ return score
50
+
51
+
52
+ def _bayesian_significance(score: int) -> ClinicalSignificance:
53
+ if score >= 10:
54
+ return "Pathogenic"
55
+ if score >= 6:
56
+ return "Likely Pathogenic"
57
+ if score >= 0:
58
+ return "Uncertain Significance"
59
+ if score >= -6:
60
+ return "Likely Benign"
61
+ return "Benign"
62
+
63
+
64
# Ordinal rank of each significance label, from least (0) to most (4)
# pathogenic; combine_criteria uses it to compare two verdicts.
SIGNIFICANCE_RANK = {
    label: rank
    for rank, label in enumerate(
        (
            "Benign",
            "Likely Benign",
            "Uncertain Significance",
            "Likely Pathogenic",
            "Pathogenic",
        )
    )
}
71
+
72
+
73
def _bucket(criteria: list[ACMGCriterion]) -> dict[str, int]:
    """Count triggered criteria per (strength, direction) bucket.

    Returns a dict with eight integer counters: ``very_strong``,
    ``strong_path``, ``moderate_path``, ``supporting_path``, ``standalone``,
    ``strong_benign``, ``moderate_benign`` and ``supporting_benign``.
    """
    path_bucket = {
        "strong": "strong_path",
        "moderate": "moderate_path",
        "supporting": "supporting_path",
    }
    benign_bucket = {
        "strong": "strong_benign",
        "moderate": "moderate_benign",
        "supporting": "supporting_benign",
    }
    counts = dict.fromkeys(
        (
            "very_strong",
            "strong_path",
            "moderate_path",
            "supporting_path",
            "standalone",
            "strong_benign",
            "moderate_benign",
            "supporting_benign",
        ),
        0,
    )

    for criterion in criteria:
        if not criterion.triggered:
            continue
        strength = criterion.strength
        # NOTE(review): very_strong and standalone are counted without
        # looking at the criterion code, so a benign-coded criterion tagged
        # "very_strong" lands in the pathogenic very_strong bucket — confirm
        # this is intended.
        if strength == "very_strong":
            counts["very_strong"] += 1
        elif strength == "standalone":
            counts["standalone"] += 1
        elif criterion.code.startswith(PATHOGENIC_PREFIX):
            key = path_bucket.get(strength)
            if key is not None:
                counts[key] += 1
        elif criterion.code.startswith(BENIGN_PREFIX):
            key = benign_bucket.get(strength)
            if key is not None:
                counts[key] += 1
    return counts
85
+
86
+
87
+ def _is_pathogenic(b: dict[str, int]) -> bool:
88
+ if b["very_strong"] >= 1:
89
+ if b["strong_path"] >= 1:
90
+ return True
91
+ if b["moderate_path"] >= 2:
92
+ return True
93
+ if b["moderate_path"] >= 1 and b["supporting_path"] >= 1:
94
+ return True
95
+ if b["supporting_path"] >= 2:
96
+ return True
97
+ if b["strong_path"] >= 2:
98
+ return True
99
+ if b["strong_path"] >= 1:
100
+ if b["moderate_path"] >= 3:
101
+ return True
102
+ if b["moderate_path"] >= 2 and b["supporting_path"] >= 2:
103
+ return True
104
+ return b["moderate_path"] >= 1 and b["supporting_path"] >= 4
105
+ return False
106
+
107
+
108
+ def _is_likely_pathogenic(b: dict[str, int]) -> bool:
109
+ if b["very_strong"] >= 1 and b["moderate_path"] >= 1:
110
+ return True
111
+ if b["strong_path"] >= 1 and 1 <= b["moderate_path"] <= 2:
112
+ return True
113
+ if b["strong_path"] >= 1 and b["supporting_path"] >= 2:
114
+ return True
115
+ if b["moderate_path"] >= 3:
116
+ return True
117
+ if b["moderate_path"] >= 2 and b["supporting_path"] >= 2:
118
+ return True
119
+ return b["moderate_path"] >= 1 and b["supporting_path"] >= 4
120
+
121
+
122
+ def _is_benign(b: dict[str, int]) -> bool:
123
+ if b["standalone"] >= 1:
124
+ return True
125
+ return b["strong_benign"] >= 2
126
+
127
+
128
+ def _is_likely_benign(b: dict[str, int]) -> bool:
129
+ if b["strong_benign"] >= 1 and b["supporting_benign"] >= 1:
130
+ return True
131
+ return b["supporting_benign"] >= 2
132
+
133
+
134
def combine_criteria(criteria: list[ACMGCriterion]) -> Classification:
    """Classify a variant from its ACMG criteria using the configured strategy.

    Both the strict Table 5 verdict and the point-based verdict are computed.
    The ``acmg_combiner_strategy`` setting picks between them:

    * ``"bayesian"`` uses the point-based verdict.
    * ``"most_pathogenic"`` uses the point-based verdict when it ranks at
      least as pathogenic as Table 5, otherwise Table 5.
    * anything else uses Table 5.

    If both pathogenic and benign criteria fired, the result is forced to
    "Uncertain Significance" and flagged as conflicting, whatever the
    strategy chose.
    """
    strategy = get_settings().acmg_combiner_strategy
    fired = [c for c in criteria if c.triggered]
    buckets = _bucket(criteria)

    # Strict Table 5 verdict; pathogenic tiers take precedence over benign.
    if _is_pathogenic(buckets):
        table5_sig: ClinicalSignificance = "Pathogenic"
    elif _is_likely_pathogenic(buckets):
        table5_sig = "Likely Pathogenic"
    elif _is_benign(buckets):
        table5_sig = "Benign"
    elif _is_likely_benign(buckets):
        table5_sig = "Likely Benign"
    else:
        table5_sig = "Uncertain Significance"

    # Point-based verdict.
    points = _bayesian_score(criteria)
    bayes_sig = _bayesian_significance(points)

    use_bayes = strategy == "bayesian" or (
        strategy == "most_pathogenic"
        and SIGNIFICANCE_RANK[bayes_sig] >= SIGNIFICANCE_RANK[table5_sig]
    )
    if use_bayes:
        significance: ClinicalSignificance = bayes_sig
        used_classifier = f"Bayesian {points:+d} pts"
    else:
        significance = table5_sig
        used_classifier = "Richards 2015 Table 5"

    # Evidence pointing both ways overrides whichever verdict was chosen.
    pathogenic_hits = sum(
        buckets[key]
        for key in ("very_strong", "strong_path", "moderate_path", "supporting_path")
    )
    benign_hits = sum(
        buckets[key]
        for key in ("standalone", "strong_benign", "moderate_benign", "supporting_benign")
    )
    conflicting = pathogenic_hits > 0 and benign_hits > 0
    if conflicting:
        significance = "Uncertain Significance"

    # Overall confidence: low with no evidence or two or more low-confidence
    # criteria, high only when every triggered criterion is high.
    low_count = sum(1 for c in fired if c.confidence == "low")
    if not fired or low_count >= 2:
        confidence = "low"
    elif all(c.confidence == "high" for c in fired):
        confidence = "high"
    else:
        confidence = "medium"

    return Classification(
        significance=significance,
        confidence=confidence,
        triggered_criteria=[c.code for c in fired],
        conflicting_evidence=conflicting,
        rationale=_build_rationale(buckets, significance, points, used_classifier),
    )
192
+
193
+
194
+ def _build_rationale(
195
+ b: dict[str, int],
196
+ significance: ClinicalSignificance,
197
+ points: int,
198
+ classifier: str,
199
+ ) -> str:
200
+ parts = []
201
+ if b["very_strong"]:
202
+ parts.append(f"{b['very_strong']}Γ— Very Strong")
203
+ if b["strong_path"]:
204
+ parts.append(f"{b['strong_path']}Γ— Strong (P)")
205
+ if b["moderate_path"]:
206
+ parts.append(f"{b['moderate_path']}Γ— Moderate (P)")
207
+ if b["supporting_path"]:
208
+ parts.append(f"{b['supporting_path']}Γ— Supporting (P)")
209
+ if b["standalone"]:
210
+ parts.append(f"{b['standalone']}Γ— Stand-alone (B)")
211
+ if b["strong_benign"]:
212
+ parts.append(f"{b['strong_benign']}Γ— Strong (B)")
213
+ if b["moderate_benign"]:
214
+ parts.append(f"{b['moderate_benign']}Γ— Moderate (B)")
215
+ if b["supporting_benign"]:
216
+ parts.append(f"{b['supporting_benign']}Γ— Supporting (B)")
217
+ counts = " + ".join(parts) if parts else "no triggered criteria"
218
+ return f"{significance} ({classifier}, {points:+d} pts) β€” {counts}"
backend/app/services/acmg/rules.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from backend.app.config import get_settings
4
+ from backend.app.schemas.evidence import (
5
+ ACMGCriterion,
6
+ AutoPVS1Result,
7
+ ClinVarSubmission,
8
+ EvidenceBundle,
9
+ InSilicoResult,
10
+ PopulationFrequency,
11
+ )
12
+
13
+ logger = logging.getLogger(__name__)
14
+ settings = get_settings()
15
+
16
+ PM2_THRESHOLD = 0.0001
17
+ BS1_THRESHOLD = 0.005
18
+ BA1_THRESHOLD = 0.05
19
+ BS2_HOM_THRESHOLD = 2
20
+
21
+ # PM2 strength — Richards 2015 originally specified MODERATE.
22
+ # ClinGen SVI 2020 recommended downgrading to SUPPORTING for general use,
23
+ # but most clinical labs and ClinGen VCEPs still apply MODERATE in practice.
24
+ # NOTE: no env switch is wired up; RuleEngine.score_population hard-codes "supporting" and never reads this constant.
25
+ PM2_STRENGTH = "moderate"
26
+
27
+
28
class RuleEngine:
    """Auto-scorers for database-derived ACMG criteria.

    Literature criteria (PM3, PP1, PS3, PS4, PP4, PS2/PM6, PP5/BP6) are
    populated by the LLM layer.
    """

    def score_pvs1(self, autopvs1_result: AutoPVS1Result | None) -> ACMGCriterion | None:
        """Turn a triggered autoPVS1 result into a PVS1 criterion.

        Returns ``None`` when there is no result or it did not trigger.
        """
        if not autopvs1_result or not autopvs1_result.triggered:
            return None
        return ACMGCriterion(
            code="PVS1",
            triggered=True,
            strength=autopvs1_result.strength,
            source=autopvs1_result.source,
            evidence_text=autopvs1_result.conclusion,
            confidence="high",
            # An empty caveat list joins to "" and is reported as None.
            caveat="; ".join(autopvs1_result.caveats) or None,
        )

    def score_population(self, freq: PopulationFrequency | None) -> list[ACMGCriterion]:
        """Derive BA1 / BS1 / PM2 and BS2 from population frequency data.

        BA1, BS1 and PM2 are mutually exclusive and chosen from the overall
        allele frequency; BS2 is added independently when the homozygote
        count reaches ``BS2_HOM_THRESHOLD``. Returns an empty list (and logs
        a warning) when the frequency record or its overall AF is missing.
        """
        if not freq or freq.overall_af is None:
            logger.warning("Population frequency missing; PM2 not triggered until coverage is verified")
            return []

        out: list[ACMGCriterion] = []
        af = freq.overall_af

        if af >= BA1_THRESHOLD:
            out.append(ACMGCriterion(
                code="BA1",
                triggered=True,
                strength="standalone",
                source="gnomAD v4.1",
                evidence_text=f"overall AF = {af:.4f} ≥ 5%",
                confidence="high",
            ))
        elif af >= BS1_THRESHOLD:
            out.append(ACMGCriterion(
                code="BS1",
                triggered=True,
                strength="strong",
                source="gnomAD v4.1",
                evidence_text=f"overall AF = {af:.4f} > expected",
                confidence="medium",
                caveat="compare against disease-specific BS1 threshold",
            ))
        elif af < PM2_THRESHOLD:
            # PM2 is emitted at "supporting" strength; the module-level
            # PM2_STRENGTH constant is not consulted here.
            out.append(ACMGCriterion(
                code="PM2",
                triggered=True,
                strength="supporting",
                source="gnomAD v4.1",
                evidence_text=f"overall AF = {af:.6f} < 0.0001",
                confidence="high",
            ))

        if (freq.homozygote_count or 0) >= BS2_HOM_THRESHOLD:
            out.append(ACMGCriterion(
                code="BS2",
                triggered=True,
                strength="strong",
                source="gnomAD v4.1",
                evidence_text=f"{freq.homozygote_count} healthy homozygotes",
                confidence="high",
            ))
        return out

    def score_insilico(self, ins: InSilicoResult | None) -> list[ACMGCriterion]:
        """Emit PP3 and/or BP4 with a score-dependent strength.

        A criterion is only emitted when the matching ``pp3_triggered`` /
        ``bp4_triggered`` flag is set; its strength comes from
        ``_pp3_strength`` / ``_bp4_strength``. The original docstring cites
        the ClinGen SVI 2022 recommendations (Pejaver et al. 2022, AJHG) as
        the source of the thresholds.
        """
        if not ins:
            return []
        out = []
        if ins.pp3_triggered:
            out.append(self._insilico_criterion("PP3", self._pp3_strength(ins), ins))
        if ins.bp4_triggered:
            out.append(self._insilico_criterion("BP4", self._bp4_strength(ins), ins))
        return out

    @staticmethod
    def _insilico_criterion(code: str, strength: str, ins: "InSilicoResult") -> ACMGCriterion:
        """Build the PP3/BP4 criterion shared by both branches of score_insilico."""
        return ACMGCriterion(
            code=code,
            triggered=True,
            strength=strength,
            source="REVEL+AlphaMissense+SpliceAI concordant",
            evidence_text=f"REVEL={ins.revel}, AM={ins.alphamissense}, SpliceAI={ins.spliceai_max} → {strength}",
            confidence="high" if strength in ("strong", "moderate") else "medium",
        )

    @staticmethod
    def _pp3_strength(ins: "InSilicoResult") -> str:
        """Return the PP3 strength from REVEL and AlphaMissense scores.

        Missing scores are treated as 0.0, i.e. they never raise the strength.
        """
        revel = ins.revel or 0.0
        am = ins.alphamissense or 0.0
        if revel >= 0.932 and am >= 0.95:
            return "strong"
        if revel >= 0.773 or am >= 0.834:
            return "moderate"
        return "supporting"

    @staticmethod
    def _bp4_strength(ins: "InSilicoResult") -> str:
        """Return the BP4 strength from REVEL and AlphaMissense scores.

        Missing scores are treated as 1.0, i.e. they never raise the strength.
        """
        revel = ins.revel if ins.revel is not None else 1.0
        am = ins.alphamissense if ins.alphamissense is not None else 1.0
        if revel <= 0.183 and am <= 0.099:
            return "strong"
        if revel <= 0.290 or am <= 0.099:
            return "moderate"
        return "supporting"

    def score_clinvar(self, submissions: list[ClinVarSubmission] | None) -> list[ACMGCriterion]:
        """Map the ClinVar consensus to optional PP5/BP6 evidence.

        The first submission is treated as the aggregate consensus. Nothing
        is emitted unless ``settings.enable_deprecated_clinvar_criteria`` is
        set, because (per the original author's note) ACMG SVI deprecated
        PP5/BP6 as standalone criteria in 2018. Conflicting or non-committal
        classifications yield no criterion.
        """
        if not submissions:
            return []
        if not settings.enable_deprecated_clinvar_criteria:
            logger.info("ClinVar PP5/BP6 auto-scoring disabled; retaining ClinVar as evidence only")
            return []

        # First submission is the aggregate consensus (see clinvar.py); rest are lab-level.
        consensus = submissions[0]
        cls = consensus.classification.lower()
        stars = consensus.stars
        unambiguous = "conflicting" not in cls
        is_path = "pathogenic" in cls and unambiguous
        is_benign = "benign" in cls and unambiguous

        if not (is_path or is_benign):
            return []

        # Strength scales with the review-status stars:
        #   3-4 stars (expert panel / practice guideline) -> strong
        #   2 stars (multiple submitters)                 -> moderate
        #   0-1 stars                                     -> supporting
        if stars >= 3:
            strength = "strong"
        elif stars == 2:
            strength = "moderate"
        else:
            strength = "supporting"
        confidence: str = "high" if stars >= 3 else ("medium" if stars >= 1 else "low")

        # Pathogenic wins if a classification string somehow matches both.
        code = "PP5" if is_path else "BP6"
        caveat = (
            f"ACMG SVI 2018 deprecated {code} as standalone — verify before final sign-off"
            if stars < 3
            else None
        )
        return [ACMGCriterion(
            code=code,
            triggered=True,
            strength=strength,
            source=f"ClinVar consensus {consensus.accession} ({stars}★)",
            evidence_text=f"Aggregate ClinVar classification: {consensus.classification} — {stars}★ review",
            confidence=confidence,
            caveat=caveat,
        )]

    def score_all(self, evidence: EvidenceBundle) -> list[ACMGCriterion]:
        """Run every auto-scorer over the bundle and concatenate the results.

        Order: PVS1, population criteria, in-silico criteria, ClinVar criteria.
        """
        criteria: list[ACMGCriterion] = []
        pvs1 = self.score_pvs1(evidence.autopvs1)
        if pvs1:
            criteria.append(pvs1)
        criteria.extend(self.score_population(evidence.population_frequency))
        criteria.extend(self.score_insilico(evidence.insilico))
        criteria.extend(self.score_clinvar(evidence.clinvar_existing))
        return criteria
backend/app/services/clinvar.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ClinVar lookup — aggregate consensus + per-submitter assertions.
2
+
3
+ The previous implementation only fetched `ids[0]` from esearch, which often
4
+ isn't the canonical VariationArchive (esearch ranks by recency, not by
5
+ match quality). It also ignored the aggregate `GermlineClassification`
6
+ field, so a variant with 50 Pathogenic assertions and a 3-star expert-panel
7
+ review status would render as the first lab-level submission found β€” often
8
+ a discordant single-lab call.
9
+
10
+ This module now:
11
+ 1. Fetches all matching variation IDs from esearch (up to MAX_IDS).
12
+ 2. Extracts the aggregate `Classifications/GermlineClassification` from
13
+ each — that's the curated consensus behind ClinVar's review-status
14
+ star rating.
15
+ 3. Picks the entry whose review status carries the highest weight.
16
+ 4. Returns it as the primary `ClinVarSubmission`, plus up to N
17
+ supporting per-submitter assertions for the UI's evidence list.
18
+ """
19
+ from __future__ import annotations
20
+
21
+ import logging
22
+ import xml.etree.ElementTree as ET
23
+ from typing import Any
24
+
25
+ import httpx
26
+ from tenacity import retry, stop_after_attempt, wait_exponential
27
+
28
+ from backend.app.config import get_settings
29
+ from backend.app.schemas.evidence import ClinVarSubmission
30
+
31
+ logger = logging.getLogger(__name__)
32
+ settings = get_settings()
33
+
34
+ EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
35
+ MAX_IDS = 10
36
+ MAX_ASSERTIONS = 10
37
+
38
+ REVIEW_STATUS_STARS: dict[str, int] = {
39
+ "practice guideline": 4,
40
+ "reviewed by expert panel": 3,
41
+ "criteria provided, multiple submitters, no conflicts": 2,
42
+ "criteria provided, multiple submitters": 2,
43
+ "criteria provided, single submitter": 1,
44
+ "criteria provided, conflicting classifications": 1,
45
+ "criteria provided, conflicting interpretations": 1,
46
+ "no assertion criteria provided": 0,
47
+ "no classification provided": 0,
48
+ "no assertion provided": 0,
49
+ "no classifications from unflagged records": 0,
50
+ }
51
+
52
+
53
+ def _stars_for(review_status: str | None) -> int:
54
+ if not review_status:
55
+ return 0
56
+ return REVIEW_STATUS_STARS.get(review_status.strip().lower(), 0)
57
+
58
+
59
class ClinVarClient:
    """Thin async client for ClinVar via the NCBI E-utilities.

    ``lookup`` is the public entry point; it is best-effort and returns an
    empty list instead of raising when ClinVar cannot be reached or parsed.
    """

    def __init__(self, api_key: str | None = None, email: str | None = None) -> None:
        """Store credentials, falling back to the application settings."""
        self.api_key = api_key or settings.ncbi_api_key
        self.email = email or settings.ncbi_email

    def _params(self, **extra: Any) -> dict[str, Any]:
        """Return the common E-utilities query parameters merged with ``extra``.

        ``extra`` wins on key collisions; ``api_key`` is only sent when set.
        """
        params = {"db": "clinvar", "tool": "VariantLens", "email": self.email}
        if self.api_key:
            params["api_key"] = self.api_key
        return {**params, **extra}

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=8), reraise=True)
    async def search(self, hgvs: str) -> list[str]:
        """Run esearch for ``hgvs`` and return up to ``MAX_IDS`` variation IDs.

        Retried up to three times with exponential back-off; the last error
        is re-raised to the caller.
        """
        async with httpx.AsyncClient(timeout=15.0) as client:
            r = await client.get(
                f"{EUTILS}/esearch.fcgi",
                params=self._params(term=hgvs, retmode="json", retmax=MAX_IDS),
            )
            r.raise_for_status()
            return r.json().get("esearchresult", {}).get("idlist", [])

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=8), reraise=True)
    async def _efetch(self, variation_ids: list[str]) -> str:
        """Bulk-fetch the given variation IDs in one efetch call.

        The IDs are sent comma-separated and the raw XML response text is
        returned; ``_parse`` expects one VariationArchive element per ID.
        """
        async with httpx.AsyncClient(timeout=30.0) as client:
            r = await client.get(
                f"{EUTILS}/efetch.fcgi",
                params=self._params(
                    id=",".join(variation_ids),
                    rettype="vcv",
                    is_variationid="true",
                ),
            )
            r.raise_for_status()
            return r.text

    def _parse_aggregate(self, vcv: ET.Element) -> ClinVarSubmission | None:
        """Extract the aggregate germline consensus from a VariationArchive.

        Returns ``None`` when the archive carries no germline classification
        description.
        """
        accession = vcv.get("Accession") or vcv.get("VariationID") or "unknown"

        germline = ".//Classifications/GermlineClassification"
        cls_node = vcv.find(f"{germline}/Description")
        if cls_node is None or not cls_node.text:
            return None

        review_node = vcv.find(f"{germline}/ReviewStatus")
        review = review_node.text if review_node is not None else None

        # The evaluation date lives on the GermlineClassification element itself.
        date_node = vcv.find(germline)
        date = ""
        if date_node is not None:
            date = date_node.get("DateLastEvaluated") or date_node.get("DateCreated") or ""

        # Prefer the condition name marked "Preferred"; otherwise fall back
        # to the first name that has text.
        cond_nodes = vcv.findall(f"{germline}/ConditionList/TraitSet/Trait/Name/ElementValue")
        condition = "not specified"
        for n in cond_nodes:
            if n.get("Type") == "Preferred" and n.text:
                condition = n.text
                break
        if condition == "not specified" and cond_nodes and cond_nodes[0].text:
            condition = cond_nodes[0].text

        return ClinVarSubmission(
            accession=accession,
            submitter="ClinVar aggregate",
            classification=cls_node.text,
            stars=_stars_for(review),
            date=date,
            condition=condition,
        )

    def _parse_assertions(self, vcv: ET.Element, limit: int) -> list[ClinVarSubmission]:
        """Collect up to ``limit`` individual lab-level assertions.

        These are kept separate from the consensus so the rule engine does
        not count one ClinVar entry as many distinct PP5 hits. Assertions
        without a germline classification text are skipped.
        """
        out: list[ClinVarSubmission] = []
        for scv in vcv.iter("ClinicalAssertion"):
            if len(out) >= limit:
                break
            acc_node = scv.find(".//ClinVarAccession")
            acc = acc_node.get("Accession") if acc_node is not None else "unknown"
            submitter = acc_node.get("SubmitterName") if acc_node is not None else "unknown"

            cls_node = scv.find("Classification/GermlineClassification")
            if cls_node is None or not cls_node.text:
                continue
            classification = cls_node.text

            review_node = scv.find("Classification/ReviewStatus")
            review = review_node.text if review_node is not None else None

            date_node = scv.find("Classification")
            date = date_node.get("DateLastEvaluated") if date_node is not None else ""

            cond_node = scv.find(".//TraitSet/Trait/Name/ElementValue")
            condition = cond_node.text if cond_node is not None and cond_node.text else "not specified"

            out.append(ClinVarSubmission(
                accession=acc or "unknown",
                submitter=submitter or "unknown",
                classification=classification,
                stars=_stars_for(review),
                date=date or "",
                condition=condition,
            ))
        return out

    def _parse(self, xml_text: str) -> list[ClinVarSubmission]:
        """Parse every VariationArchive in an efetch response.

        Returns the highest-star aggregate consensus first, followed by up
        to ``MAX_ASSERTIONS`` per-submitter assertions from the same archive.
        Returns an empty list when the XML is malformed or no archive has a
        consensus.
        """
        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError as e:
            logger.warning("clinvar xml parse failure: %s", e)
            return []

        # esearch may return several IDs (alternative alleles, related
        # variants); keep only archives that carry an aggregate consensus.
        archives_with_consensus: list[tuple[ET.Element, ClinVarSubmission]] = []
        for vcv in root.iter("VariationArchive"):
            agg = self._parse_aggregate(vcv)
            if agg is not None:
                archives_with_consensus.append((vcv, agg))

        if not archives_with_consensus:
            return []

        # max() keeps the first archive among equal star counts, matching a
        # stable descending sort followed by taking the head.
        canonical_vcv, consensus = max(archives_with_consensus, key=lambda t: t[1].stars)
        return [consensus] + self._parse_assertions(canonical_vcv, MAX_ASSERTIONS)

    async def lookup(self, hgvs: str) -> list[ClinVarSubmission]:
        """Look up ``hgvs`` in ClinVar.

        Returns the aggregate consensus followed by supporting assertions,
        or an empty list when nothing matches or ClinVar is unavailable.
        HTTP failures and timeouts are logged and swallowed, as is a search
        response whose body is not valid JSON.
        """
        try:
            ids = await self.search(hgvs)
        # httpx.TimeoutException is a subclass of httpx.HTTPError; ValueError
        # covers the JSON decode error raised for a malformed response body.
        except (httpx.HTTPError, ValueError) as e:
            logger.warning("ClinVar search failed for %s: %s", hgvs, e)
            return []
        if not ids:
            return []

        try:
            xml = await self._efetch(ids[:MAX_IDS])
        except httpx.HTTPError as e:
            logger.warning("ClinVar efetch failed for %s: %s", hgvs, e)
            return []

        return self._parse(xml)
backend/app/services/exports.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Export classifications to standard interchange formats.
2
+
3
+ ClinVar XML — Submission Schema v1.16 (https://www.ncbi.nlm.nih.gov/clinvar/docs/submit/).
4
+ FHIR R4 — Observation resource with LOINC 53037-8 (genetic clinical significance).
5
+
6
+ Both renderers read from a persisted `ClassificationRecord` so the audit
7
+ trail is intact (sign-off and curator overrides are reflected in the export).
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import xml.etree.ElementTree as ET
12
+ from datetime import UTC, datetime
13
+ from typing import Any
14
+ from xml.dom import minidom
15
+
16
+ from backend.app.models.classification import ClassificationRecord
17
+
18
# Internal significance label -> the spelling used in the ClinVar export.
CLINVAR_SIG_MAP = {
    "Pathogenic": "Pathogenic",
    "Likely Pathogenic": "Likely pathogenic",
    "Uncertain Significance": "Uncertain significance",
    "Likely Benign": "Likely benign",
    "Benign": "Benign",
}

# Internal significance label -> LOINC answer coding (system / code / display).
LOINC_CLINSIG = {
    label: {"system": "http://loinc.org", "code": code, "display": display}
    for label, code, display in (
        ("Pathogenic", "LA6668-3", "Pathogenic"),
        ("Likely Pathogenic", "LA26332-9", "Likely pathogenic"),
        ("Uncertain Significance", "LA26333-7", "Uncertain significance"),
        ("Likely Benign", "LA26334-5", "Likely benign"),
        ("Benign", "LA6675-8", "Benign"),
    )
}
33
+
34
+
35
+ def _today() -> str:
36
+ return datetime.now(UTC).strftime("%Y-%m-%d")
37
+
38
+
39
def render_clinvar_xml(rec: ClassificationRecord, *, submitter_org_id: str = "VARIANTLENS_LAB") -> str:
    """Render a minimal ClinVar SCV submission for a single variant.

    The document is built along the path
    ``ClinvarSubmissionSet > ClinvarSubmission > ClinVarAssertion``.

    Args:
        rec: The persisted classification to export. Its related variant,
            when loaded, supplies the HGVS attributes.
        submitter_org_id: Organisation ID written on the ClinVarAccession.

    Returns:
        The pretty-printed XML document as a string.
    """

    def add(parent: ET.Element, tag: str, text: str | None = None, **attrib: str) -> ET.Element:
        """Append a child element, optionally with text and attributes."""
        node = ET.SubElement(parent, tag, attrib=attrib)
        if text is not None:
            node.text = text
        return node

    # Take one date snapshot so every date attribute agrees even if the
    # export runs across midnight UTC.
    today = _today()
    # ElementTree only serializes string attribute values, so the record id
    # is stringified in case the model uses a non-str key type.
    record_id = str(rec.id)

    root = ET.Element("ClinvarSubmissionSet", attrib={"Date": today})
    submission = add(root, "ClinvarSubmission", ID=record_id, SubmissionDate=today)
    assertion = add(submission, "ClinVarAssertion")

    # ClinVarAccession: submitter-assigned IDs.
    add(assertion, "ClinVarAccession", Acc=f"SCV-LOCAL-{record_id}", Type="SCV", OrgID=submitter_org_id)
    add(assertion, "RecordStatus", "current")

    # ClinicalSignificance: the actual call. The evaluation date is the
    # sign-off date when the record has one, otherwise today.
    evaluated = rec.signed_off_at.strftime("%Y-%m-%d") if rec.signed_off_at else today
    cs = add(assertion, "ClinicalSignificance", DateLastEvaluated=evaluated)
    add(
        cs,
        "ReviewStatus",
        "criteria provided, single submitter" if rec.curator_signoff else "no assertion criteria provided",
    )
    # Unknown labels are passed through unchanged.
    add(cs, "Description", CLINVAR_SIG_MAP.get(rec.significance, rec.significance))
    if rec.rationale:
        # NOTE(review): "ConvertedByNCBI" looks wrong for a submitter-authored
        # rationale (the per-criterion comments below use "public") — confirm
        # against the ClinVar submission schema before changing.
        add(cs, "Comment", rec.rationale, Type="ConvertedByNCBI")

    # AssertionMethod: the ruleset that produced the call.
    method = add(assertion, "AssertionMethod")
    add(method, "MethodName", f"ACMG/AMP guidelines (Richards 2015) — VariantLens {rec.ruleset_version}")

    # ObservedIn: placeholder sample description for the proband.
    obs_in = add(assertion, "ObservedIn")
    sample = add(obs_in, "Sample")
    add(sample, "Origin", "germline")
    add(sample, "Species", "human", TaxonomyId="9606")
    add(sample, "AffectedStatus", "yes")
    method_obs = add(obs_in, "Method")
    add(method_obs, "MethodType", "clinical testing")
    obs_data = add(obs_in, "ObservedData")
    add(
        obs_data,
        "Attribute",
        (f"Variant interpreted by VariantLens with "
         f"{len(rec.triggered_criteria or [])} ACMG criteria triggered."),
        Type="Description",
    )

    # MeasureSet: the variant itself, described by whichever HGVS strings
    # the related variant provides.
    measure_set = add(assertion, "MeasureSet", Type="Variant")
    measure = add(measure_set, "Measure", Type="Variation")
    variant = getattr(rec, "variant", None)
    if rec.variant_id and variant is not None:
        for attr_name, attr_type in (
            ("hgvs_coding", "HGVS, coding"),
            ("hgvs_protein", "HGVS, protein"),
            ("hgvs_genomic", "HGVS, genomic"),
        ):
            val = getattr(variant, attr_name, None)
            if val:
                attr_set = add(measure, "AttributeSet")
                add(attr_set, "Attribute", val, Type=attr_type)

    # Per-criterion comments: the audit trail in flat form.
    for c in rec.criteria or []:
        if not c.triggered:
            continue
        bits = [f"{c.code} ({c.strength})", f"source={c.source}"]
        if c.pmid:
            bits.append(f"PMID:{c.pmid}")
        if c.curator_override and c.override_justification:
            bits.append(f"curator override: {c.override_justification}")
        # Guard against a missing evidence text so the join cannot fail.
        bits.append(c.evidence_text or "")
        add(assertion, "Comment", " — ".join(bits), Type="public")

    rough = ET.tostring(root, encoding="utf-8")
    return minidom.parseString(rough).toprettyxml(indent=" ")
132
+
133
+
134
def render_fhir_observation(rec: ClassificationRecord) -> dict[str, Any]:
    """Build a FHIR R4 ``Observation`` dict for one variant interpretation.

    The resource declares the HL7 Genomics Reporting ``variant`` profile and
    contains only what can be derived from ``rec``. ``specimen``, ``subject``
    and real ``performer`` references are expected to be added by the caller
    at the deployment boundary.

    Args:
        rec: Classification record to export. ``rec.variant`` is optional;
            without it no HGVS / gene components are emitted.

    Returns:
        A plain dict with ``resourceType`` set to ``"Observation"``.
    """
    # Significance values without a LOINC mapping fall back to NullFlavor "OTH".
    fallback_coding = {
        "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor",
        "code": "OTH",
        "display": rec.significance,
    }
    significance_coding = LOINC_CLINSIG.get(rec.significance, fallback_coding)

    # (LOINC code, display text, attribute on rec.variant)
    variant_fields = (
        ("48004-6", "DNA change (c.HGVS)", "hgvs_coding"),
        ("48005-3", "Amino acid change (p.HGVS)", "hgvs_protein"),
        ("81290-9", "Genomic DNA change (g.HGVS)", "hgvs_genomic"),
        ("48018-6", "Gene studied [ID]", "gene_symbol"),
    )
    components: list[dict[str, Any]] = []
    variant = getattr(rec, "variant", None)
    if variant is not None:
        for loinc_code, label, field in variant_fields:
            value = getattr(variant, field, None)
            if not value:
                continue
            components.append({
                "code": {"coding": [{"system": "http://loinc.org", "code": loinc_code, "display": label}]},
                "valueString": value,
            })

    # One entry per triggered ACMG criterion; untriggered ones are omitted.
    derived: list[dict[str, Any]] = [
        {
            "extension": [
                {"url": "https://variantlens.local/fhir/criterion-code", "valueString": crit.code},
                {"url": "https://variantlens.local/fhir/criterion-strength", "valueString": crit.strength},
                {"url": "https://variantlens.local/fhir/criterion-source", "valueString": crit.source},
            ],
            "valueString": crit.evidence_text,
        }
        for crit in (rec.criteria or [])
        if crit.triggered
    ]

    # Prefer the sign-off time, then the creation time, then "now".
    if rec.signed_off_at:
        issued = rec.signed_off_at.isoformat()
    elif rec.created_at:
        issued = rec.created_at.isoformat()
    else:
        issued = datetime.now(UTC).isoformat()

    triggered_codes = ", ".join(rec.triggered_criteria or [])
    note_text = f"ACMG ruleset {rec.ruleset_version}; triggered: {triggered_codes}"

    return {
        "resourceType": "Observation",
        "id": rec.id,
        "meta": {
            "profile": [
                "http://hl7.org/fhir/uv/genomics-reporting/StructureDefinition/variant",
            ],
        },
        "status": "final" if rec.curator_signoff else "preliminary",
        "category": [{
            "coding": [{
                "system": "http://terminology.hl7.org/CodeSystem/observation-category",
                "code": "laboratory",
            }],
        }],
        "code": {
            "coding": [{
                "system": "http://loinc.org",
                "code": "53037-8",
                "display": "Genetic variation clinical significance",
            }],
        },
        "issued": issued,
        "performer": [{"display": rec.curator_id or "VariantLens (auto)"}],
        "valueCodeableConcept": {"coding": [significance_coding], "text": rec.significance},
        "interpretation": [{"text": rec.rationale}] if rec.rationale else [],
        "note": [{"text": note_text}],
        "component": components,
        "derivedFrom": derived,
    }
backend/app/services/gnomad.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import sqlite3
3
+ from pathlib import Path
4
+
5
+ import httpx
6
+ from tenacity import retry, stop_after_attempt, wait_exponential
7
+
8
+ from backend.app.config import get_settings
9
+ from backend.app.schemas.evidence import PopulationFrequency
10
+
11
+ logger = logging.getLogger(__name__)
12
+ settings = get_settings()
13
+
14
+ GNOMAD_QUERY = """
15
+ query VariantInfo($variantId: String!, $datasetId: DatasetId!) {
16
+ variant(variantId: $variantId, dataset: $datasetId) {
17
+ variant_id
18
+ exome {
19
+ ac
20
+ an
21
+ af
22
+ ac_hom
23
+ populations { id ac an }
24
+ }
25
+ genome {
26
+ ac
27
+ an
28
+ af
29
+ ac_hom
30
+ populations { id ac an }
31
+ }
32
+ }
33
+ }
34
+ """
35
+
36
+
37
+ class GnomADClient:
38
+ def __init__(self, url: str | None = None, cache_db: Path | None = None) -> None:
39
+ self.url = url or settings.gnomad_graphql_url
40
+ self.cache_db = cache_db or settings.gnomad_cache_db
41
+ self._init_cache()
42
+
43
+ def _init_cache(self) -> None:
44
+ self.cache_db.parent.mkdir(parents=True, exist_ok=True)
45
+ with sqlite3.connect(self.cache_db) as conn:
46
+ conn.execute(
47
+ """
48
+ CREATE TABLE IF NOT EXISTS gnomad_cache (
49
+ variant_id TEXT PRIMARY KEY,
50
+ af REAL,
51
+ homozygotes INTEGER,
52
+ populations TEXT,
53
+ coverage_warning TEXT,
54
+ fetched_at TEXT DEFAULT CURRENT_TIMESTAMP
55
+ )
56
+ """
57
+ )
58
+
59
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=8), reraise=True)
60
+ async def _fetch(self, variant_id: str, dataset: str = "gnomad_r4") -> dict | None:
61
+ async with httpx.AsyncClient(timeout=15.0) as client:
62
+ r = await client.post(
63
+ self.url,
64
+ json={
65
+ "query": GNOMAD_QUERY,
66
+ "variables": {"variantId": variant_id, "datasetId": dataset},
67
+ },
68
+ )
69
+ r.raise_for_status()
70
+ payload = r.json()
71
+ return payload.get("data", {}).get("variant")
72
+
73
+ async def lookup(self, variant_id: str) -> PopulationFrequency:
74
+ with sqlite3.connect(self.cache_db) as conn:
75
+ row = conn.execute(
76
+ "SELECT af, homozygotes, populations, coverage_warning FROM gnomad_cache WHERE variant_id = ?",
77
+ (variant_id,),
78
+ ).fetchone()
79
+ if row:
80
+ af, hom, pops_str, cov = row
81
+ import json
82
+ return PopulationFrequency(
83
+ overall_af=af,
84
+ homozygote_count=hom,
85
+ by_population=json.loads(pops_str) if pops_str else {},
86
+ coverage_warning=cov,
87
+ )
88
+
89
+ try:
90
+ data = await self._fetch(variant_id)
91
+ except (httpx.HTTPError, httpx.TimeoutException) as e:
92
+ logger.warning("gnomAD fetch failed for %s: %s", variant_id, e)
93
+ return PopulationFrequency(coverage_warning=f"fetch failed: {e}")
94
+
95
+ if not data:
96
+ return PopulationFrequency(coverage_warning="not found in gnomAD")
97
+
98
+ exome = data.get("exome") or {}
99
+ genome = data.get("genome") or {}
100
+ af = exome.get("af") or genome.get("af") or 0.0
101
+ hom = (exome.get("ac_hom") or 0) + (genome.get("ac_hom") or 0)
102
+
103
+ populations: dict[str, float] = {}
104
+ for src in (exome, genome):
105
+ for pop in src.get("populations") or []:
106
+ if pop["an"]:
107
+ populations[pop["id"]] = (pop.get("ac") or 0) / pop["an"]
108
+
109
+ import json
110
+ with sqlite3.connect(self.cache_db) as conn:
111
+ conn.execute(
112
+ "INSERT OR REPLACE INTO gnomad_cache (variant_id, af, homozygotes, populations, coverage_warning) VALUES (?, ?, ?, ?, ?)",
113
+ (variant_id, af, hom, json.dumps(populations), None),
114
+ )
115
+
116
+ return PopulationFrequency(
117
+ overall_af=af, homozygote_count=hom, by_population=populations
118
+ )
backend/app/services/insilico.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import sqlite3
3
+ from pathlib import Path
4
+
5
+ import httpx
6
+ from tenacity import retry, stop_after_attempt, wait_exponential
7
+
8
+ from backend.app.config import get_settings
9
+ from backend.app.schemas.evidence import InSilicoResult
10
+
11
+ logger = logging.getLogger(__name__)
12
+ settings = get_settings()
13
+
14
+ REVEL_PATHOGENIC_THRESHOLD = 0.7
15
+ REVEL_BENIGN_THRESHOLD = 0.15
16
+ ALPHAMISSENSE_PATHOGENIC = 0.564
17
+ ALPHAMISSENSE_BENIGN = 0.34
18
+ SPLICEAI_PATHOGENIC = 0.5
19
+ CADD_PATHOGENIC = 25.0
20
+
21
+
22
+ class InSilicoPredictor:
23
+ def __init__(
24
+ self,
25
+ revel_db: Path | None = None,
26
+ alphamissense_db: Path | None = None,
27
+ spliceai_url: str | None = None,
28
+ ) -> None:
29
+ self.revel_db = revel_db or settings.revel_db_path
30
+ self.alphamissense_db = alphamissense_db or settings.alphamissense_db_path
31
+ self.spliceai_url = spliceai_url or settings.spliceai_lookup_url
32
+
33
+ def lookup_revel(self, chrom: str, pos: int, ref: str, alt: str) -> float | None:
34
+ if not self.revel_db.exists():
35
+ logger.debug("REVEL db not present; skip")
36
+ return None
37
+ try:
38
+ with sqlite3.connect(self.revel_db) as conn:
39
+ row = conn.execute(
40
+ "SELECT score FROM revel WHERE chrom = ? AND pos = ? AND ref = ? AND alt = ?",
41
+ (chrom, pos, ref, alt),
42
+ ).fetchone()
43
+ return row[0] if row else None
44
+ except sqlite3.DatabaseError as e:
45
+ logger.warning("REVEL lookup error: %s", e)
46
+ return None
47
+
48
+ def lookup_alphamissense(
49
+ self,
50
+ chrom: str | None,
51
+ pos: int | None,
52
+ ref: str | None,
53
+ alt: str | None,
54
+ transcript: str | None = None,
55
+ ) -> float | None:
56
+ """Genomic-coordinate lookup against the SQLite cache.
57
+
58
+ AlphaMissense scores live at chr/pos/ref/alt Γ— transcript granularity.
59
+ We try (chrom,pos,ref,alt,transcript) first, then fall back to the
60
+ first matching transcript at that locus.
61
+ """
62
+ if not self.alphamissense_db.exists():
63
+ logger.debug("AlphaMissense db not present; skip")
64
+ return None
65
+ if not (chrom and pos and ref and alt):
66
+ return None
67
+ try:
68
+ with sqlite3.connect(self.alphamissense_db) as conn:
69
+ if transcript:
70
+ row = conn.execute(
71
+ "SELECT score FROM alphamissense WHERE chrom = ? AND pos = ? AND ref = ? AND alt = ? AND transcript = ?",
72
+ (chrom.lstrip("chr"), pos, ref, alt, transcript),
73
+ ).fetchone()
74
+ if row:
75
+ return float(row[0])
76
+ row = conn.execute(
77
+ "SELECT score FROM alphamissense WHERE chrom = ? AND pos = ? AND ref = ? AND alt = ? LIMIT 1",
78
+ (chrom.lstrip("chr"), pos, ref, alt),
79
+ ).fetchone()
80
+ return float(row[0]) if row else None
81
+ except sqlite3.DatabaseError as e:
82
+ logger.warning("AlphaMissense lookup error: %s", e)
83
+ return None
84
+
85
+ @retry(stop=stop_after_attempt(2), wait=wait_exponential(min=1, max=5), reraise=True)
86
+ async def lookup_spliceai(self, hgvs_genomic: str) -> float | None:
87
+ try:
88
+ async with httpx.AsyncClient(timeout=15.0) as client:
89
+ r = await client.get(
90
+ f"{self.spliceai_url}/api",
91
+ params={"hg": "38", "distance": "50", "mask": "0", "variant": hgvs_genomic},
92
+ )
93
+ r.raise_for_status()
94
+ data = r.json()
95
+ scores = data.get("scores") or []
96
+ if not scores:
97
+ return None
98
+ ds: list[float] = []
99
+ for score in scores:
100
+ ds.extend([
101
+ float(score.get("DS_AG", 0)),
102
+ float(score.get("DS_AL", 0)),
103
+ float(score.get("DS_DG", 0)),
104
+ float(score.get("DS_DL", 0)),
105
+ ])
106
+ return max(ds)
107
+ except (httpx.HTTPError, httpx.TimeoutException, ValueError) as e:
108
+ logger.warning("SpliceAI lookup failed: %s", e)
109
+ return None
110
+
111
+ async def assess(
112
+ self,
113
+ chrom: str | None,
114
+ pos: int | None,
115
+ ref: str | None,
116
+ alt: str | None,
117
+ transcript: str | None,
118
+ hgvs_genomic: str | None,
119
+ ) -> InSilicoResult:
120
+ revel = (
121
+ self.lookup_revel(chrom, pos, ref, alt)
122
+ if chrom and pos and ref and alt
123
+ else None
124
+ )
125
+ am = self.lookup_alphamissense(chrom, pos, ref, alt, transcript)
126
+ splice = await self.lookup_spliceai(hgvs_genomic) if hgvs_genomic else None
127
+
128
+ path_votes = sum(
129
+ [
130
+ revel is not None and revel >= REVEL_PATHOGENIC_THRESHOLD,
131
+ am is not None and am >= ALPHAMISSENSE_PATHOGENIC,
132
+ splice is not None and splice >= SPLICEAI_PATHOGENIC,
133
+ ]
134
+ )
135
+ benign_votes = sum(
136
+ [
137
+ revel is not None and revel <= REVEL_BENIGN_THRESHOLD,
138
+ am is not None and am <= ALPHAMISSENSE_BENIGN,
139
+ splice is not None and splice < SPLICEAI_PATHOGENIC,
140
+ ]
141
+ )
142
+ total_with_data = sum([revel is not None, am is not None, splice is not None])
143
+
144
+ # ClinGen SVI 2022 β€” fire if at least one strong predictor agrees
145
+ # AND no predictor strongly contradicts. The strict "unanimous"
146
+ # rule was rejecting BP4 whenever REVEL was middling, which
147
+ # missed real benign missense calls.
148
+ pp3 = path_votes >= 1 and benign_votes == 0 and total_with_data >= 1
149
+ bp4 = benign_votes >= 1 and path_votes == 0 and total_with_data >= 1
150
+
151
+ return InSilicoResult(
152
+ revel=revel,
153
+ alphamissense=am,
154
+ spliceai_max=splice,
155
+ concordant_pathogenic=total_with_data >= 2 and path_votes == total_with_data,
156
+ concordant_benign=total_with_data >= 2 and benign_votes == total_with_data,
157
+ pp3_triggered=pp3,
158
+ bp4_triggered=bp4,
159
+ )
backend/app/services/llm/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from backend.app.services.llm.prompts import build_user_prompt, get_system_prompt
2
+ from backend.app.services.llm.reasoner import ClaudeReasoner
3
+ from backend.app.services.llm.synthesizer import EvidenceSynthesizer
4
+
5
+ __all__ = ["ClaudeReasoner", "EvidenceSynthesizer", "build_user_prompt", "get_system_prompt"]
backend/app/services/llm/prompts.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hallucination-suppressed prompt templates for literature-dependent ACMG criteria.
3
+
4
+ Mirrors the AI CURA strategy (Chung, Ma et al. 2025): Claude is allowed to reason
5
+ ONLY over the retrieved chunks. Every output must cite a PMID present in the
6
+ context. Output is structured JSON; no free text.
7
+ """
8
+
9
+ import json
10
+
11
+ from backend.app.schemas.evidence import LiteratureChunk
12
+
13
+ SYSTEM_PROMPT = """You are a clinical genetics variant curator assistant working within an ACMG/AMP framework. Your role is to extract structured evidence from the provided literature context ONLY.
14
+
15
+ CRITICAL RULES:
16
+ 1. Do NOT use any knowledge from your training data about this variant, gene, or disease beyond standard biology background. All claims about specific findings must come from the provided context chunks.
17
+ 2. Only cite evidence that appears verbatim in the provided context chunks.
18
+ 3. If the context does not contain sufficient evidence for a criterion, output: "triggered": false, "evidence": "insufficient evidence in provided literature".
19
+ 4. For each criterion you assess, cite the specific PMID and quote the relevant sentence(s) from the chunk text.
20
+ 5. Output structured JSON only β€” no free text, no markdown, no preamble.
21
+ 6. Flag any ambiguous phasing, uncertain phenotype matches, or potential ascertainment bias in the "caveat" field.
22
+ 7. If a chunk's PMID is not in the context, do NOT cite it. Cited PMIDs MUST appear in the metadata of a provided chunk.
23
+
24
+ OUTPUT SCHEMA per criterion (JSON object):
25
+ {
26
+ "criterion": "PM3" | "PP1" | "PS3" | "BS3" | "PS4" | "PP4" | "PS2" | "PM6" | "PP5" | "BP6",
27
+ "triggered": true | false,
28
+ "strength": "supporting" | "moderate" | "strong" | "very_strong",
29
+ "evidence": "<exact quote from a context chunk>",
30
+ "pmid": "<PMID from chunk metadata>",
31
+ "confidence": "high" | "medium" | "low",
32
+ "caveat": "<optional text or null>"
33
+ }
34
+ Return a JSON array of one object per requested criterion."""
35
+
36
+
37
+ CRITERION_GUIDANCE: dict[str, str] = {
38
+ "PM3": (
39
+ "PM3 β€” observed in trans with another pathogenic/likely-pathogenic variant. "
40
+ "Look for explicit statements of compound heterozygosity, in-trans observation, "
41
+ "or biallelic occurrence with parental confirmation."
42
+ ),
43
+ "PP1": (
44
+ "PP1 β€” co-segregation with disease in multiple affected family members. "
45
+ "Count distinct affected segregating individuals; require β‰₯3 for moderate, β‰₯7 for strong."
46
+ ),
47
+ "PS3": (
48
+ "PS3 β€” well-established in vitro or in vivo functional studies show a deleterious effect. "
49
+ "Penalize assays with poor controls, single replicates, or non-physiological systems."
50
+ ),
51
+ "BS3": (
52
+ "BS3 β€” well-established functional studies show no measurable effect."
53
+ ),
54
+ "PS4": (
55
+ "PS4 β€” variant prevalence in cases significantly increased over controls. "
56
+ "Extract case counts and odds ratios where present."
57
+ ),
58
+ "PP4": (
59
+ "PP4 β€” patient phenotype highly specific for a disease with single genetic etiology. "
60
+ "Require explicit phenotype description, not generic disease name."
61
+ ),
62
+ "PS2": "PS2 β€” confirmed de novo with parental confirmation.",
63
+ "PM6": "PM6 β€” assumed de novo without parental confirmation.",
64
+ "PP5": "PP5 β€” reputable source recently reports as pathogenic.",
65
+ "BP6": "BP6 β€” reputable source recently reports as benign.",
66
+ }
67
+
68
+
69
+ def get_system_prompt() -> str:
70
+ return SYSTEM_PROMPT
71
+
72
+
73
+ def build_user_prompt(
74
+ variant_hgvs: str,
75
+ gene: str,
76
+ disease: str | None,
77
+ auto_scored: list[dict],
78
+ chunks: list[LiteratureChunk],
79
+ criteria: list[str],
80
+ ) -> str:
81
+ chunk_blocks = []
82
+ for i, c in enumerate(chunks):
83
+ chunk_blocks.append(
84
+ f"--- Chunk #{i+1} ---\n"
85
+ f"PMID: {c.pmid}\n"
86
+ f"Year: {c.year or 'unknown'}\n"
87
+ f"Title: {c.title or 'n/a'}\n"
88
+ f"Hint criteria: {', '.join(c.criteria_relevance) or 'none'}\n"
89
+ f"Text:\n{c.chunk_text}\n"
90
+ )
91
+ chunks_str = "\n".join(chunk_blocks) or "(no literature retrieved β€” output insufficient evidence for all criteria)"
92
+
93
+ guidance_str = "\n".join(
94
+ f"- {CRITERION_GUIDANCE.get(c, c)}" for c in criteria
95
+ )
96
+
97
+ return (
98
+ f"Variant: {variant_hgvs}\n"
99
+ f"Gene: {gene}\n"
100
+ f"Disease: {disease or 'unspecified'}\n\n"
101
+ f"PRE-SCORED DATABASE CRITERIA (do not re-evaluate these β€” informational only):\n"
102
+ f"{json.dumps(auto_scored, indent=2)}\n\n"
103
+ f"CRITERIA TO ASSESS FROM LITERATURE ONLY:\n"
104
+ f"{guidance_str}\n\n"
105
+ f"LITERATURE CONTEXT:\n"
106
+ f"{chunks_str}\n\n"
107
+ f"Output a JSON array with one entry per criterion in the order: {criteria}. "
108
+ f"Cite only PMIDs that appear in the context above."
109
+ )
backend/app/services/llm/reasoner.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from typing import Any, cast
4
+
5
+ import anthropic
6
+ import httpx
7
+ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
8
+
9
+ from backend.app.config import get_settings
10
+ from backend.app.schemas.evidence import ACMGCriterion, LiteratureChunk
11
+ from backend.app.services.llm.prompts import build_user_prompt, get_system_prompt
12
+
13
+ logger = logging.getLogger(__name__)
14
+ settings = get_settings()
15
+
16
+
17
+ class ClaudeReasoner:
18
+ def __init__(self, api_key: str | None = None, model: str | None = None) -> None:
19
+ self.api_key = api_key or settings.anthropic_api_key
20
+ self.use_local_llm = settings.use_local_llm
21
+ self.model = model or (settings.local_llm_model if self.use_local_llm else settings.anthropic_model)
22
+ self.client = (
23
+ None
24
+ if self.use_local_llm
25
+ else anthropic.Anthropic(api_key=self.api_key) if self.api_key else None
26
+ )
27
+
28
+ @retry(
29
+ stop=stop_after_attempt(3),
30
+ wait=wait_exponential(min=2, max=20),
31
+ retry=retry_if_exception_type((anthropic.APIError, anthropic.RateLimitError, httpx.HTTPError)),
32
+ reraise=True,
33
+ )
34
+ def _call(self, system: list[dict[str, Any]], user: str) -> str:
35
+ if self.use_local_llm:
36
+ return self._call_local(system, user)
37
+ if self.client is None:
38
+ raise RuntimeError("ANTHROPIC_API_KEY not set; cannot call Claude")
39
+ response = self.client.messages.create(
40
+ model=self.model,
41
+ max_tokens=settings.anthropic_max_tokens,
42
+ system=cast(Any, system),
43
+ messages=[{"role": "user", "content": user}],
44
+ )
45
+ for block in response.content:
46
+ if block.type == "text":
47
+ return block.text
48
+ return ""
49
+
50
+ def _call_local(self, system: list[dict[str, Any]], user: str) -> str:
51
+ system_text = "\n".join(str(part.get("text", "")) for part in system)
52
+ payload = {
53
+ "model": self.model,
54
+ "stream": False,
55
+ "format": "json",
56
+ "messages": [
57
+ {"role": "system", "content": system_text},
58
+ {"role": "user", "content": user},
59
+ ],
60
+ "options": {"temperature": 0},
61
+ }
62
+ response = httpx.post(
63
+ f"{settings.local_llm_base_url.rstrip('/')}/api/chat",
64
+ json=payload,
65
+ timeout=120,
66
+ )
67
+ response.raise_for_status()
68
+ data = response.json()
69
+ message = data.get("message", {})
70
+ content = message.get("content")
71
+ if not isinstance(content, str):
72
+ raise RuntimeError("local LLM response did not include message.content")
73
+ return content
74
+
75
+ def reason_over_criteria(
76
+ self,
77
+ variant_hgvs: str,
78
+ gene: str,
79
+ disease: str | None,
80
+ auto_scored_summary: list[dict[str, Any]],
81
+ chunks: list[LiteratureChunk],
82
+ criteria: list[str],
83
+ ) -> list[ACMGCriterion]:
84
+ if not chunks:
85
+ return [self._fallback_criterion(c, "insufficient evidence in provided literature") for c in criteria]
86
+
87
+ system_text = get_system_prompt()
88
+ # Cache the long system prompt so repeated runs in a session are cheap.
89
+ # The prompt is byte-identical across variants β€” every call should be a cache read.
90
+ system = [
91
+ {
92
+ "type": "text",
93
+ "text": system_text,
94
+ "cache_control": {"type": "ephemeral"},
95
+ }
96
+ ]
97
+ user = build_user_prompt(
98
+ variant_hgvs=variant_hgvs,
99
+ gene=gene,
100
+ disease=disease,
101
+ auto_scored=auto_scored_summary,
102
+ chunks=chunks,
103
+ criteria=criteria,
104
+ )
105
+
106
+ try:
107
+ raw = self._call(system, user)
108
+ except (anthropic.APIError, httpx.HTTPError, RuntimeError) as e:
109
+ logger.error("Claude call failed: %s", e)
110
+ return [self._fallback_criterion(c, str(e)) for c in criteria]
111
+
112
+ try:
113
+ parsed = self._parse_json(raw)
114
+ except ValueError as e:
115
+ logger.warning("Claude output JSON malformed; retrying with repair prompt: %s", e)
116
+ try:
117
+ raw = self._call(
118
+ system,
119
+ user
120
+ + "\n\nYour previous output failed JSON validation. Return ONLY a valid JSON array matching the schema.",
121
+ )
122
+ parsed = self._parse_json(raw)
123
+ except (ValueError, anthropic.APIError, httpx.HTTPError) as e2:
124
+ logger.error("Claude repair attempt failed: %s", e2)
125
+ return [self._fallback_criterion(c, "LLM output unparseable") for c in criteria]
126
+
127
+ chunks_by_pmid: dict[str, list[str]] = {}
128
+ for chunk in chunks:
129
+ chunks_by_pmid.setdefault(chunk.pmid, []).append(chunk.chunk_text)
130
+ valid_pmids = set(chunks_by_pmid)
131
+ out: list[ACMGCriterion] = []
132
+ for entry in parsed:
133
+ try:
134
+ code = entry["criterion"]
135
+ pmid = entry.get("pmid")
136
+ evidence_text = str(entry.get("evidence", "")).strip()
137
+ if entry.get("triggered"):
138
+ rejection = self._trigger_rejection_reason(pmid, evidence_text, chunks_by_pmid, valid_pmids)
139
+ if rejection:
140
+ logger.warning("Suppressing %s from LLM output: %s", code, rejection)
141
+ out.append(self._fallback_criterion(code, rejection))
142
+ continue
143
+ out.append(
144
+ ACMGCriterion(
145
+ code=code,
146
+ triggered=bool(entry.get("triggered", False)),
147
+ strength=entry.get("strength", "supporting"),
148
+ source=f"PMID:{pmid}" if pmid else "literature",
149
+ evidence_text=evidence_text or "insufficient evidence in provided literature",
150
+ confidence=entry.get("confidence", "medium"),
151
+ caveat=entry.get("caveat"),
152
+ pmid=pmid,
153
+ )
154
+ )
155
+ except (KeyError, TypeError) as e:
156
+ logger.warning("malformed entry from Claude: %s β€” %s", entry, e)
157
+ return out
158
+
159
+ @staticmethod
160
+ def _trigger_rejection_reason(
161
+ pmid: Any,
162
+ evidence_text: str,
163
+ chunks_by_pmid: dict[str, list[str]],
164
+ valid_pmids: set[str],
165
+ ) -> str | None:
166
+ if not pmid:
167
+ return "triggered literature criterion missing PMID"
168
+ if pmid not in valid_pmids:
169
+ return "fabricated PMID rejected"
170
+ if not evidence_text:
171
+ return "triggered literature criterion missing evidence quote"
172
+ normalized_evidence = " ".join(evidence_text.split()).lower()
173
+ normalized_chunks = [" ".join(text.split()).lower() for text in chunks_by_pmid[pmid]]
174
+ if not any(normalized_evidence in chunk for chunk in normalized_chunks):
175
+ return "evidence quote not found verbatim in cited PMID chunk"
176
+ return None
177
+
178
+ @staticmethod
179
+ def _parse_json(raw: str) -> list[dict[str, Any]]:
180
+ text = raw.strip()
181
+ if text.startswith("```"):
182
+ text = text.split("```")[1]
183
+ if text.startswith("json"):
184
+ text = text[4:]
185
+ text = text.strip()
186
+ data = json.loads(text)
187
+ if not isinstance(data, list):
188
+ raise ValueError("expected JSON array")
189
+ return data
190
+
191
+ @staticmethod
192
+ def _fallback_criterion(code: str, reason: str) -> ACMGCriterion:
193
+ return ACMGCriterion(
194
+ code=code,
195
+ triggered=False,
196
+ strength="supporting",
197
+ source="LLM",
198
+ evidence_text=f"insufficient evidence β€” {reason}",
199
+ confidence="low",
200
+ caveat=reason,
201
+ )
backend/app/services/llm/synthesizer.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from backend.app.schemas.classification import ClassificationResult
4
+ from backend.app.schemas.evidence import ACMGCriterion, EvidenceBundle, LiteratureChunk
5
+ from backend.app.schemas.variant import NormalizedVariant
6
+ from backend.app.services.acmg.combiner import combine_criteria
7
+ from backend.app.services.acmg.rules import RuleEngine
8
+ from backend.app.services.llm.reasoner import ClaudeReasoner
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ LITERATURE_CRITERIA = ["PM3", "PP1", "PS3", "BS3", "PS4", "PP4"]
13
+
14
+
15
+ class EvidenceSynthesizer:
16
+ def __init__(
17
+ self,
18
+ rule_engine: RuleEngine | None = None,
19
+ reasoner: ClaudeReasoner | None = None,
20
+ ) -> None:
21
+ self.rule_engine = rule_engine or RuleEngine()
22
+ self.reasoner = reasoner or ClaudeReasoner()
23
+
24
+ def synthesize(
25
+ self,
26
+ variant: NormalizedVariant,
27
+ evidence: EvidenceBundle,
28
+ retrieved_chunks: dict[str, list[LiteratureChunk]] | None = None,
29
+ disease: str | None = None,
30
+ ) -> ClassificationResult:
31
+ # 1. Database-driven criteria
32
+ db_criteria = self.rule_engine.score_all(evidence)
33
+
34
+ # 2. Literature-driven criteria via Claude
35
+ llm_criteria: list[ACMGCriterion] = []
36
+ if retrieved_chunks:
37
+ all_chunks = []
38
+ seen = set()
39
+ for chunks in retrieved_chunks.values():
40
+ for c in chunks:
41
+ key = (c.pmid, c.chunk_text[:100])
42
+ if key not in seen:
43
+ seen.add(key)
44
+ all_chunks.append(c)
45
+
46
+ auto_summary = [
47
+ {
48
+ "criterion": c.code,
49
+ "triggered": c.triggered,
50
+ "source": c.source,
51
+ "evidence": c.evidence_text,
52
+ }
53
+ for c in db_criteria
54
+ ]
55
+
56
+ try:
57
+ llm_criteria = self.reasoner.reason_over_criteria(
58
+ variant_hgvs=variant.hgvs_coding or variant.raw_input,
59
+ gene=variant.gene_symbol or "unknown",
60
+ disease=disease,
61
+ auto_scored_summary=auto_summary,
62
+ chunks=all_chunks,
63
+ criteria=LITERATURE_CRITERIA,
64
+ )
65
+ except Exception as e:
66
+ logger.error("LLM reasoning failed: %s", e)
67
+
68
+ # 3. Merge β€” db criteria win on conflict
69
+ merged: dict[str, ACMGCriterion] = {c.code: c for c in db_criteria}
70
+ for c in llm_criteria:
71
+ merged.setdefault(c.code, c)
72
+
73
+ all_criteria = list(merged.values())
74
+ evidence.criteria = all_criteria
75
+
76
+ classification = combine_criteria(all_criteria)
77
+ return ClassificationResult(
78
+ variant=variant,
79
+ evidence=evidence,
80
+ classification=classification,
81
+ )
backend/app/services/normalization.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import re
3
+
4
+ import httpx
5
+ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
6
+
7
+ from backend.app.config import get_settings
8
+ from backend.app.schemas.variant import NormalizedVariant, VariantInput
9
+
10
+ logger = logging.getLogger(__name__)
11
+ settings = get_settings()
12
+
13
# Reference accession (RefSeq or Ensembl) followed by an HGVS coordinate prefix.
HGVS_PATTERN = re.compile(r"^(NM_|NC_|NP_|ENST|ENSP)[\d.]+:[cgpnm]\.")

# chrom-pos-ref-alt with "-" or ":" separators and an optional "chr" prefix.
VCF_PATTERN = re.compile(r"^(chr)?[\dXYM]+[-:]\d+[-:][ACGT]+[-:][ACGT]+$", re.I)

# Bare protein change without a reference accession.
PROTEIN_PATTERN = re.compile(
    # three-letter reference + three-letter alternative, or a stop (* / Ter)
    r"^p\.[A-Z][a-z]{2}\d+([A-Z][a-z]{2}|\*|Ter)$"
    # one-letter reference + one-letter alternative (or *)
    r"|^p\.[A-Z]\d+[A-Z*]$"
)

# RefSeq accessions of the GRCh38 autosomes, in chromosome order 1..22.
_GRCH38_AUTOSOME_ACCESSIONS = (
    "NC_000001.11", "NC_000002.12", "NC_000003.12", "NC_000004.12",
    "NC_000005.10", "NC_000006.12", "NC_000007.14", "NC_000008.11",
    "NC_000009.12", "NC_000010.11", "NC_000011.10", "NC_000012.12",
    "NC_000013.11", "NC_000014.9", "NC_000015.10", "NC_000016.10",
    "NC_000017.11", "NC_000018.10", "NC_000019.10", "NC_000020.11",
    "NC_000021.9", "NC_000022.11",
)

# GRCh38 chromosome name -> RefSeq accession. Mutalyzer rejects `chr17:g.` and
# requires the canonical NC_ identifier for genomic descriptions.
GRCH38_CHROM_TO_NC: dict[str, str] = {
    **{
        str(number): accession
        for number, accession in enumerate(_GRCH38_AUTOSOME_ACCESSIONS, start=1)
    },
    "X": "NC_000023.11",
    "Y": "NC_000024.10",
    # Both spellings of the mitochondrial chromosome map to the same record.
    "M": "NC_012920.1",
    "MT": "NC_012920.1",
}
31
+
32
+
33
class NormalizationError(Exception):
    """Raised when variant input cannot be parsed or normalized."""
35
+
36
+
37
class VariantNormalizer:
    """Normalize HGVS, VCF-style, or protein-only variant input via Mutalyzer.

    Normalization is best-effort: when Mutalyzer cannot be reached or rejects
    the input, ``normalize`` returns a passthrough ``NormalizedVariant`` with
    a warning instead of raising.
    """

    # A digit, a sign, and another digit mark an intronic offset (c.123+1,
    # c.124-2). A leading minus directly after "c." (c.-5) is a 5' UTR
    # position and must not be mistaken for an intronic offset.
    _INTRONIC_OFFSET_RE = re.compile(r"\d[+-]\d")

    # Characters that are legal in a URL path segment and routine in HGVS.
    # Everything else (notably "?", "#", "/", "%", whitespace) is escaped so
    # it cannot change the structure of the request URL.
    _HGVS_URL_SAFE = ":+*=()"

    def __init__(self, base_url: str | None = None, timeout: float = 10.0) -> None:
        """Store the Mutalyzer base URL (settings default) and HTTP timeout."""
        self.base_url = base_url or settings.mutalyzer_base_url
        self.timeout = timeout

    def detect_notation(self, raw: str) -> str:
        """Classify ``raw`` as ``"hgvs"``, ``"vcf"``, ``"protein"`` or ``"unknown"``."""
        s = raw.strip()
        if HGVS_PATTERN.match(s):
            return "hgvs"
        if VCF_PATTERN.match(s):
            return "vcf"
        if PROTEIN_PATTERN.match(s):
            return "protein"
        return "unknown"

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=8),
        retry=retry_if_exception_type((httpx.HTTPStatusError, httpx.TimeoutException)),
        reraise=True,
    )
    async def _call_mutalyzer(self, hgvs: str) -> dict:
        """GET the Mutalyzer ``normalize`` endpoint for ``hgvs``.

        Raises:
            httpx.HTTPError: On transport failures or non-2xx responses
                (status errors and timeouts are retried up to three times).
            NormalizationError: When the response body is not a JSON object.
        """
        # Local import so the block does not depend on a new module-level import.
        from urllib.parse import quote

        # Uncertain-position HGVS contains "?" (e.g. c.(?_-1)_(*1_?)del); left
        # unescaped it would start the query string and truncate the variant.
        url = f"{self.base_url}/normalize/{quote(hgvs, safe=self._HGVS_URL_SAFE)}"
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            r = await client.get(url)
            r.raise_for_status()
            try:
                data = r.json()
            except ValueError as err:
                raise NormalizationError("Mutalyzer returned a non-JSON response") from err
        if not isinstance(data, dict):
            raise NormalizationError("Mutalyzer returned an unexpected response shape")
        return data

    async def normalize(self, raw_input: VariantInput) -> NormalizedVariant:
        """Normalize ``raw_input`` and return a ``NormalizedVariant``.

        HGVS and VCF input are sent to Mutalyzer; protein-only and
        unrecognized input are passed through with a warning. Any Mutalyzer
        failure degrades to a passthrough result rather than an exception.
        """
        notation = (
            raw_input.notation if raw_input.notation != "auto" else self.detect_notation(raw_input.raw)
        )
        warnings: list[str] = []
        vcf_parts: tuple[str, int, str, str] | None = None

        # For VCF input, lock in chrom/pos/ref/alt up front so the score-DB
        # lookups (REVEL, AlphaMissense, gnomAD) always have what they need,
        # even if the Mutalyzer enrichment call fails.
        hgvs_for_mutalyzer: str | None = None
        if notation == "vcf":
            try:
                hgvs_for_mutalyzer, vcf_parts = self._vcf_to_hgvs_with_parts(raw_input.raw)
            except NormalizationError as e:
                warnings.append(f"VCF parse failed: {e}")

        try:
            if notation == "hgvs":
                # detect_notation matches on the stripped text, so send the
                # stripped text as well.
                data = await self._call_mutalyzer(raw_input.raw.strip())
                return self._parse_mutalyzer(raw_input.raw, data, warnings)
            if notation == "vcf" and hgvs_for_mutalyzer:
                data = await self._call_mutalyzer(hgvs_for_mutalyzer)
                v = self._parse_mutalyzer(raw_input.raw, data, warnings)
                chrom, pos, ref, alt = vcf_parts  # type: ignore[misc]
                return v.model_copy(update={
                    "chromosome": chrom, "position": pos, "ref": ref, "alt": alt,
                    "hgvs_genomic": hgvs_for_mutalyzer,
                    "gene_symbol": v.gene_symbol or raw_input.gene_symbol,
                })
            if notation == "protein":
                warnings.append("protein-only input — coding HGVS unavailable without back-translation")
                return NormalizedVariant(
                    raw_input=raw_input.raw,
                    hgvs_protein=raw_input.raw,
                    gene_symbol=raw_input.gene_symbol,
                    normalization_source="passthrough",
                    warnings=warnings,
                )
            warnings.append(f"unknown notation; passing through: {raw_input.raw}")
            return NormalizedVariant(
                raw_input=raw_input.raw,
                gene_symbol=raw_input.gene_symbol,
                normalization_source="passthrough",
                warnings=warnings,
            )
        except (httpx.HTTPError, NormalizationError) as e:
            # httpx.HTTPError is the common base of status, timeout, and
            # connection errors, so an unreachable Mutalyzer also falls back
            # to passthrough instead of propagating.
            logger.warning("Mutalyzer normalization failed for %s: %s", raw_input.raw, e)
            warnings.append(f"mutalyzer failed: {e}; using passthrough")
            chrom = pos = ref = alt = None
            if vcf_parts:
                chrom, pos, ref, alt = vcf_parts
            return NormalizedVariant(
                raw_input=raw_input.raw,
                hgvs_coding=raw_input.raw if notation == "hgvs" else None,
                hgvs_genomic=hgvs_for_mutalyzer,
                gene_symbol=raw_input.gene_symbol,
                chromosome=chrom,
                position=pos,
                ref=ref,
                alt=alt,
                normalization_source="passthrough",
                warnings=warnings,
            )

    def _vcf_to_hgvs(self, vcf: str) -> str:
        """Return only the genomic HGVS string for a VCF-style description."""
        return self._vcf_to_hgvs_with_parts(vcf)[0]

    def _vcf_to_hgvs_with_parts(self, vcf: str) -> tuple[str, tuple[str, int, str, str]]:
        """Convert ``chrom-pos-ref-alt`` into genomic HGVS plus parsed parts.

        Returns:
            ``(hgvs, (chrom, position, ref, alt))`` where ``chrom`` has no
            ``chr`` prefix and the alleles are upper-cased.

        Raises:
            NormalizationError: If the string does not have four fields, the
                chromosome is unknown, the position is not a positive
                integer, or an allele contains characters other than ACGT.
        """
        parts = re.split(r"[-:]", vcf.strip())
        if len(parts) != 4:
            raise NormalizationError(f"malformed VCF string: {vcf}")
        chrom, pos, ref, alt = parts

        # VCF_PATTERN is case-insensitive, so "CHR17"/"Chr17" must be accepted
        # here too; only a leading prefix is removed.
        chrom = re.sub(r"^chr", "", chrom, flags=re.IGNORECASE).upper()
        nc_acc = GRCH38_CHROM_TO_NC.get(chrom)
        if not nc_acc:
            raise NormalizationError(
                f"unknown chromosome {chrom!r}; expected 1-22, X, Y, M, or MT"
            )

        if not pos.isdecimal() or int(pos) < 1:
            raise NormalizationError(f"invalid position {pos!r} in VCF string: {vcf}")
        position = int(pos)

        ref, alt = ref.upper(), alt.upper()
        for label, allele in (("ref", ref), ("alt", alt)):
            if not re.fullmatch(r"[ACGT]+", allele):
                raise NormalizationError(f"invalid {label} allele {allele!r} in VCF string: {vcf}")

        if len(ref) == 1 and len(alt) == 1:
            change = f"{position}{ref}>{alt}"
        else:
            # "posREF>ALT" is only valid HGVS for single-base substitutions;
            # indels and MNVs are written as a deletion-insertion over the
            # reference span.
            end = position + len(ref) - 1
            span = f"{position}" if end == position else f"{position}_{end}"
            change = f"{span}delins{alt}"
        return f"{nc_acc}:g.{change}", (chrom, position, ref, alt)

    def _parse_mutalyzer(self, raw: str, data: dict, warnings: list[str]) -> NormalizedVariant:
        """Parse the Mutalyzer v3 API response.

        v3 changed the shape entirely from v2:
          - `normalized_description` → canonical c. HGVS string
          - `protein.description`    → canonical p. HGVS string
          - `rna.description`        → canonical r. HGVS string
          - `gene_id`                → HGNC symbol
          - `infos[*].details`       → human-readable warnings
        Genomic coordinates are not returned for transcript-keyed input;
        callers that need chr/pos/ref/alt should pass VCF input.

        ``warnings`` is extended in place with any info messages.
        """
        coding = data.get("normalized_description") or data.get("corrected_description")
        protein = (data.get("protein") or {}).get("description")
        gene = data.get("gene_id")

        # NOTE(review): for genomic (g.) input this stores the genomic
        # description in hgvs_coding and the chromosome accession in
        # transcript. Confirm what downstream consumers expect before
        # changing it.
        transcript: str | None = None
        if coding and ":" in coding:
            transcript = coding.split(":")[0]

        for info in data.get("infos") or []:
            details = info.get("details") or info.get("code", "")
            if details:
                warnings.append(details)

        consequence = self._infer_consequence(coding or "", protein or "")

        return NormalizedVariant(
            raw_input=raw,
            hgvs_coding=coding,
            hgvs_protein=protein,
            transcript=transcript,
            gene_symbol=gene,
            consequence=consequence,
            normalization_source="mutalyzer",
            warnings=warnings,
        )

    @staticmethod
    def _infer_consequence(coding: str, protein: str) -> str | None:
        """Map a Mutalyzer-normalized variant to a SO consequence term.

        Heuristic that covers the cases the rule engine cares about (PVS1
        and PM4). For full annotation switch to VEP at the ingest boundary.

        Args:
            coding: Normalized coding description, or ``""``.
            protein: Predicted protein description, or ``""``.

        Returns:
            A Sequence Ontology term, or ``None`` when nothing can be inferred.
        """
        p = protein.lower()
        c = coding.lower()
        change = c.split(":")[-1]
        has_stop = "ter" in p or "*" in p
        # "=" means the protein is predicted unchanged (p.(=), p.Leu54=, p.Ter330=).
        unchanged = "=" in p

        if "fs" in p:
            return "frameshift_variant"
        # Extensions also contain Ter/* (p.Ter110GlnextTer17), so this must be
        # tested before the stop check or stop-loss is reported as stop-gained.
        if "ext" in p:
            return "stop_lost"
        if has_stop and not unchanged:
            return "stop_gained"
        # Frameshifts have already returned, so a remaining del/dup is in-frame.
        if "del" in c and "ins" not in c:
            return "inframe_deletion"
        if "dup" in c:
            return "inframe_insertion"
        if "met1" in p and "?" in p:
            return "start_lost"
        if "splice" in c or VariantNormalizer._INTRONIC_OFFSET_RE.search(change):
            return "splice_region_variant"
        if change.startswith(("c.-", "c.(-")):
            return "5_prime_UTR_variant"
        if change.startswith(("c.*", "c.(*")):
            return "3_prime_UTR_variant"
        if unchanged:
            return "stop_retained_variant" if has_stop else "synonymous_variant"
        # "p.?" is an unknown effect, not evidence of an amino-acid change.
        if protein and ">" in c and "?" not in p:
            return "missense_variant"
        return None
backend/app/services/pvs1.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import re
3
+
4
+ from backend.app.schemas.evidence import AutoPVS1Result, AutoPVS1Step
5
+ from backend.app.schemas.variant import NormalizedVariant
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
# Sequence Ontology consequence terms treated as null (loss-of-function)
# variants by the PVS1 heuristic.
LOF_CONSEQUENCES = {
    # truncating
    "frameshift_variant",
    "stop_gained",
    # canonical splice sites
    "splice_donor_variant",
    "splice_acceptor_variant",
    # initiation codon
    "start_lost",
}
16
+
17
+
18
class PVS1Assessor:
    """
    Heuristic PVS1 assessment.

    A real deployment should wrap the autoPVS1 package (https://github.com/JiguangPeng/autoPVS1)
    for the full LOF-mechanism / 3'-end / NMD / alternative-splicing logic from
    Tayoun et al. 2018. This wrapper records the rule path for the audit trail.
    """

    def assess(self, variant: NormalizedVariant) -> AutoPVS1Result:
        """Decide whether ``variant`` is a null variant and record each step.

        A variant is considered null when its consequence is one of
        ``LOF_CONSEQUENCES`` or its protein description shows a frameshift or
        a premature stop. Protein descriptions that contain a stop symbol
        without truncating the protein (stop-loss extensions such as
        ``p.Ter110GlnextTer17`` and unchanged stops such as ``p.Ter330=``)
        are excluded.

        Returns:
            An ``AutoPVS1Result`` with the reasoning steps; ``triggered`` is
            ``True`` only for null variants.
        """
        consequence = (variant.consequence or "").lower()
        protein = variant.hgvs_protein or ""
        protein_lc = protein.lower()

        has_frameshift = "fs" in protein_lc
        mentions_stop = "ter" in protein_lc or bool(re.search(r"p\..*\*", protein))
        # Extensions and "=" descriptions name a stop codon but do not
        # introduce a premature one.
        non_truncating = "ext" in protein_lc or "=" in protein
        truncating_stop = mentions_stop and not non_truncating

        # A "stop_gained" consequence contradicted by an extension / unchanged
        # protein description is not trusted: the protein-level description is
        # the more specific evidence, and a false PVS1 is the costlier error.
        lof_consequence = consequence in LOF_CONSEQUENCES and not (
            consequence == "stop_gained" and non_truncating and not has_frameshift
        )
        is_null = lof_consequence or has_frameshift or truncating_stop

        steps: list[AutoPVS1Step] = []

        # Step 1: variant type. Frameshift is tested first because frameshift
        # descriptions (p.Arg97ProfsTer23) also contain "Ter".
        if has_frameshift or "frameshift" in consequence:
            variant_type = "Frameshift"
        elif truncating_stop or (lof_consequence and consequence == "stop_gained"):
            variant_type = "Stop-gained"
        elif "splice" in consequence:
            variant_type = "Splice site"
        elif "start_lost" in consequence:
            variant_type = "Start-lost"
        else:
            variant_type = f"Other ({consequence or 'unknown'})"
        # "pass" is a Python keyword, so it has to be supplied via a dict.
        steps.append(AutoPVS1Step(
            step=1, label="Variant type", value=variant_type, **{"pass": is_null}
        ))

        if not is_null:
            steps.append(AutoPVS1Step(
                step=2, label="Predicted consequence",
                value="No protein-truncating effect inferred",
                **{"pass": False},
            ))
            return AutoPVS1Result(
                triggered=False,
                strength="very_strong",
                rule="PVS1",
                reasoning=steps,
                conclusion="PVS1 not triggered — variant is not null",
                source="autoPVS1-heuristic",
            )

        # Step 2: predicted consequence
        steps.append(AutoPVS1Step(
            step=2, label="Predicted consequence",
            value=f"Premature stop / truncation ({protein or 'inferred'})",
            **{"pass": True},
        ))

        # Step 3: NMD prediction (heuristic); uses the same premature-stop
        # test as the null decision so "*" notation is covered as well.
        nmd_predicted = has_frameshift or truncating_stop
        steps.append(AutoPVS1Step(
            step=3, label="NMD predicted",
            value="Yes — assumed NMD competent (verify against last-exon distance)" if nmd_predicted
            else "Unknown — verify manually",
            **{"pass": nmd_predicted},
        ))

        # Step 4: last exon exception (heuristic placeholder)
        steps.append(AutoPVS1Step(
            step=4, label="Last exon exception",
            value="Not assessed — requires transcript exon table",
            **{"pass": True},
        ))

        # Step 5: gene LOF mechanism (heuristic placeholder)
        steps.append(AutoPVS1Step(
            step=5, label="Gene LOF mechanism",
            value="Assumed — verify against gene LOF tolerance (gnomAD pLI / OMIM)",
            **{"pass": True},
        ))

        caveats: list[str] = []
        if "?" in protein or not protein:
            caveats.append("protein change ambiguous — verify NMD prediction")
        if not variant.transcript:
            caveats.append("transcript not specified — multiple-transcript caveat applies")

        return AutoPVS1Result(
            triggered=True,
            strength="very_strong",
            rule="PVS1",
            reasoning=steps,
            conclusion="PVS1 triggered at Very Strong strength (heuristic — manual verification recommended)",
            source="autoPVS1-heuristic",
            caveats=caveats,
        )
backend/app/services/rag/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from backend.app.services.rag.chunker import ChunkBuilder
2
+ from backend.app.services.rag.embedder import Embedder
3
+ from backend.app.services.rag.fetcher import LiteratureFetcher
4
+ from backend.app.services.rag.retriever import LiteratureRetriever
5
+
6
+ __all__ = ["ChunkBuilder", "Embedder", "LiteratureFetcher", "LiteratureRetriever"]
backend/app/services/rag/chunker.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from backend.app.config import get_settings
2
+ from backend.app.services.rag.fetcher import Paper
3
+
4
+ settings = get_settings()
5
+
6
# Keywords whose presence in a chunk hints that the chunk is relevant to the
# given ACMG criterion. Matching is a case-insensitive substring test.
CRITERION_KEYWORDS: dict[str, list[str]] = {
    "PM3": [
        "in trans",
        "compound heterozygous",
        "biallelic",
        "homozygous",
    ],
    "PP1": [
        "segregation",
        "co-segregat",
        "family",
        "affected",
    ],
    "PS3": [
        "functional",
        "in vitro",
        "in vivo",
        "assay",
        "expression",
    ],
    "BS3": [
        "no effect",
        "wild type",
        "wild-type",
        "indistinguishable",
    ],
    "PS4": [
        "case",
        "prevalence",
        "odds ratio",
        "controls",
    ],
    "PP4": [
        "phenotype",
        "clinical features",
        "presentation",
        "presented with",
    ],
    "PP5": [
        "pathogenic",
        "likely pathogenic",
        "ClinVar",
        "submission",
    ],
    "BP6": [
        "benign",
        "likely benign",
        "ClinVar",
    ],
}
16
+
17
+
18
+ class Chunk:
19
+ def __init__(
20
+ self,
21
+ text: str,
22
+ pmid: str,
23
+ year: int | None,
24
+ title: str,
25
+ criteria_hint: list[str],
26
+ ) -> None:
27
+ self.text = text
28
+ self.pmid = pmid
29
+ self.year = year
30
+ self.title = title
31
+ self.criteria_hint = criteria_hint
32
+
33
+
34
class ChunkBuilder:
    """Split paper text into overlapping windows tagged with criterion hints."""

    def __init__(self, chunk_size: int | None = None, overlap: int | None = None) -> None:
        """Configure the window size and overlap, both expressed in tokens.

        Args:
            chunk_size: Window size; falls back to ``settings.rag_chunk_size``.
            overlap: Tokens shared by consecutive windows; ``None`` falls back
                to ``settings.rag_chunk_overlap``. ``0`` disables overlap.

        Raises:
            ValueError: If the size is not positive or the overlap is not in
                ``[0, chunk_size)``.
        """
        self.chunk_size = chunk_size or settings.rag_chunk_size
        # Only None means "use the default"; an explicit 0 is a valid request
        # for non-overlapping windows.
        self.overlap = settings.rag_chunk_overlap if overlap is None else overlap

        if self.chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {self.chunk_size}")
        if not 0 <= self.overlap < self.chunk_size:
            # An overlap >= chunk_size would stop the window from advancing.
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, got {self.overlap}"
            )

    def detect_criteria(self, chunk_text: str) -> list[str]:
        """Return the criterion codes whose keywords occur in ``chunk_text``."""
        text_lower = chunk_text.lower()
        return [
            crit
            for crit, keywords in CRITERION_KEYWORDS.items()
            if any(kw.lower() in text_lower for kw in keywords)
        ]

    def chunk_paper(self, paper: Paper) -> list[Chunk]:
        """Cut ``paper.text`` into overlapping ``Chunk`` windows.

        Returns:
            The windows in reading order; an empty list for a paper with no text.

        Raises:
            ValueError: If the configured overlap leaves no forward progress.
        """
        text = paper.text
        if not text:
            return []
        # Approx 4 chars per token
        char_size = self.chunk_size * 4
        char_overlap = self.overlap * 4
        if char_size - char_overlap <= 0:
            # Guards against attributes changed after construction; without it
            # the loop below would never terminate.
            raise ValueError("overlap must be smaller than chunk_size")

        chunks: list[Chunk] = []
        start = 0
        while start < len(text):
            end = min(start + char_size, len(text))
            window = text[start:end]
            chunks.append(
                Chunk(
                    text=window,
                    pmid=paper.pmid,
                    year=paper.year,
                    title=paper.title,
                    criteria_hint=self.detect_criteria(window),
                )
            )
            if end >= len(text):
                break
            start = end - char_overlap
        return chunks
backend/app/services/rag/embedder.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import TYPE_CHECKING
3
+
4
+ from backend.app.config import get_settings
5
+ from backend.app.services.rag.chunker import Chunk
6
+
7
+ if TYPE_CHECKING:
8
+ from chromadb.api import ClientAPI
9
+
10
+ logger = logging.getLogger(__name__)
11
+ settings = get_settings()
12
+
13
+
14
class Embedder:
    """Embed text chunks and store/query them in a persistent Chroma collection.

    The sentence-transformers model and the Chroma client are created lazily
    on first use.
    """

    def __init__(self, model_name: str | None = None, persist_dir: str | None = None) -> None:
        """Record configuration; no model or database is opened here."""
        self.model_name = model_name or settings.embedding_model
        self.persist_dir = persist_dir or str(settings.chroma_persist_dir)
        self.collection_name = settings.chroma_collection
        self._model = None
        self._client: ClientAPI | None = None
        self._collection = None

    def _ensure_model(self):
        """Load the embedding model on first call and return it."""
        if self._model is None:
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer(self.model_name, device=settings.embedding_device)
        return self._model

    def _ensure_collection(self):
        """Open (or create) the Chroma collection on first call and return it."""
        if self._collection is None:
            import chromadb
            self._client = chromadb.PersistentClient(path=self.persist_dir)
            self._collection = self._client.get_or_create_collection(self.collection_name)
        return self._collection

    def encode(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding vector per input text."""
        model = self._ensure_model()
        return model.encode(texts, show_progress_bar=False, convert_to_numpy=True).tolist()

    def index_chunks(self, chunks: list[Chunk], variant_id: str, gene: str) -> int:
        """Embed ``chunks`` and write them to the collection.

        Record IDs are ``{variant_id}:{pmid}:{k}`` where ``k`` counts chunks
        within one paper, so the ID of a chunk does not depend on the order
        in which papers were fetched. Records are upserted, which makes
        re-indexing a variant idempotent.

        Returns:
            The number of chunks written (0 for an empty list).
        """
        if not chunks:
            return 0
        coll = self._ensure_collection()
        embeddings = self.encode([c.text for c in chunks])

        per_paper_count: dict[str, int] = {}
        ids: list[str] = []
        for c in chunks:
            k = per_paper_count.get(c.pmid, 0)
            per_paper_count[c.pmid] = k + 1
            ids.append(f"{variant_id}:{c.pmid}:{k}")

        metadatas = [
            {
                "pmid": c.pmid,
                # 0 stands in for "unknown year"; readers map it back to None.
                "year": c.year or 0,
                "title": c.title or "",
                "variant_id": variant_id,
                "gene": gene,
                "criteria_hint": ",".join(c.criteria_hint),
            }
            for c in chunks
        ]
        coll.upsert(
            ids=ids,
            documents=[c.text for c in chunks],
            embeddings=embeddings,
            metadatas=metadatas,
        )
        return len(chunks)

    def query(
        self, query_text: str, variant_id: str, top_k: int, criteria: list[str] | None = None
    ) -> list[dict]:
        """Return the ``top_k`` nearest chunks stored for ``variant_id``.

        Args:
            query_text: Natural-language query to embed.
            variant_id: Restricts the search to chunks indexed for this variant.
            top_k: Maximum number of results.
            criteria: Accepted for interface compatibility; not used for
                filtering.

        Returns:
            Dicts with ``text``, ``metadata`` and ``score`` keys, where
            ``score`` is the distance reported by the collection (or ``None``).
        """
        coll = self._ensure_collection()
        emb = self.encode([query_text])[0]
        where: dict = {"variant_id": variant_id}
        results = coll.query(query_embeddings=[emb], n_results=top_k, where=where)

        # Each field is a list with one entry per query; a field may be
        # missing or None, so fall back to an empty row.
        documents = (results.get("documents") or [[]])[0] or []
        metadatas = (results.get("metadatas") or [[]])[0] or []
        distances = (results.get("distances") or [[]])[0] or []

        out = []
        for i, doc in enumerate(documents):
            meta = (metadatas[i] if i < len(metadatas) else None) or {}
            score = distances[i] if i < len(distances) else None
            out.append({"text": doc, "metadata": meta, "score": score})
        return out
backend/app/services/rag/fetcher.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import xml.etree.ElementTree as ET
3
+ from typing import Any
4
+
5
+ import httpx
6
+ from tenacity import retry, stop_after_attempt, wait_exponential
7
+
8
+ from backend.app.config import get_settings
9
+
10
+ logger = logging.getLogger(__name__)
11
+ settings = get_settings()
12
+
13
# NCBI E-utilities base URL (esearch / efetch live underneath it).
EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# PMC Open Access web-service endpoint.
PMC_FULLTEXT = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi"

# Extra PubMed search terms, per ACMG criterion, that are AND-ed onto the
# base variant query to surface papers relevant to that criterion.
CRITERION_QUERY_AUGMENTS: dict[str, str] = {
    # observed in trans / biallelic
    "PM3": '"in trans" OR "compound heterozygous" OR "biallelic"',
    # segregation in families
    "PP1": '"segregation" OR "affected family members" OR "co-segregates"',
    # functional evidence (damaging)
    "PS3": '"functional" OR "in vitro" OR "in vivo" OR "assay"',
    # functional evidence (no effect)
    "BS3": '"functional" OR "no effect" OR "wild type"',
    # case/control prevalence
    "PS4": '"cases" OR "prevalence" OR "odds ratio"',
    # phenotype specificity
    "PP4": '"phenotype" OR "clinical features" OR "presentation"',
}
24
+
25
+
26
+ class Paper:
27
+ def __init__(self, pmid: str, title: str, abstract: str, year: int | None, body: str | None = None) -> None:
28
+ self.pmid = pmid
29
+ self.title = title
30
+ self.abstract = abstract
31
+ self.year = year
32
+ self.body = body
33
+
34
+ @property
35
+ def text(self) -> str:
36
+ return self.body or self.abstract
37
+
38
+
39
class LiteratureFetcher:
    """Search PubMed for a variant and download the matching abstracts."""

    def __init__(self, max_results: int | None = None, fetch_fulltext: bool | None = None) -> None:
        """Read limits and NCBI credentials from settings unless overridden."""
        self.max_results = max_results or settings.rag_max_papers_per_variant
        self.fetch_fulltext = settings.rag_fetch_fulltext if fetch_fulltext is None else fetch_fulltext
        self.api_key = settings.ncbi_api_key
        self.email = settings.ncbi_email

    def _params(self, **extra: Any) -> dict[str, Any]:
        """Return E-utilities query parameters: identification plus ``extra``."""
        p = {"tool": "VariantLens", "email": self.email}
        if self.api_key:
            p["api_key"] = self.api_key
        return {**p, **extra}

    def build_query(self, gene: str, hgvs: str, protein: str | None) -> str:
        """Build ``("gene") AND ("hgvs" OR "protein")`` as a PubMed term."""
        variant_terms = [f'"{hgvs}"']
        if protein:
            variant_terms.append(f'"{protein}"')
        return f'("{gene}") AND ({" OR ".join(variant_terms)})'

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10), reraise=True)
    async def search_pubmed(self, query: str) -> list[str]:
        """Run an esearch and return the PMIDs in the order PubMed returned them."""
        async with httpx.AsyncClient(timeout=20.0) as client:
            r = await client.get(
                f"{EUTILS}/esearch.fcgi",
                params=self._params(db="pubmed", term=query, retmax=self.max_results, retmode="json"),
            )
            r.raise_for_status()
            return r.json().get("esearchresult", {}).get("idlist", [])

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10), reraise=True)
    async def fetch_abstracts(self, pmids: list[str]) -> list[Paper]:
        """Download and parse the PubMed records for ``pmids``."""
        if not pmids:
            return []
        async with httpx.AsyncClient(timeout=30.0) as client:
            r = await client.get(
                f"{EUTILS}/efetch.fcgi",
                params=self._params(db="pubmed", id=",".join(pmids), rettype="abstract", retmode="xml"),
            )
            r.raise_for_status()
            return self._parse_pubmed_xml(r.text)

    @staticmethod
    def _element_text(element: ET.Element | None) -> str:
        """Return all text inside ``element``, including text in child tags."""
        if element is None:
            return ""
        # .text stops at the first child element; titles and abstracts
        # routinely contain inline markup (<i>BRCA1</i>, <sub>2</sub>), so
        # the whole subtree has to be collected.
        return "".join(element.itertext()).strip()

    def _parse_pubmed_xml(self, xml_text: str) -> list[Paper]:
        """Parse an efetch XML document into ``Paper`` objects.

        Records without a PMID are skipped; unparseable XML yields ``[]``.
        """
        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError as e:
            logger.warning("PubMed XML parse failed: %s", e)
            return []
        papers: list[Paper] = []
        for art in root.iter("PubmedArticle"):
            pmid = self._element_text(art.find(".//PMID"))
            if not pmid:
                continue
            title = self._element_text(art.find(".//ArticleTitle"))
            sections = (self._element_text(a) for a in art.findall(".//Abstract/AbstractText"))
            abstract = " ".join(s for s in sections if s)
            year_text = self._element_text(art.find(".//PubDate/Year"))
            year = int(year_text) if year_text.isdigit() else None
            papers.append(Paper(pmid=pmid, title=title, abstract=abstract, year=year))
        return papers

    async def fetch_full_texts(self, papers: list[Paper]) -> list[Paper]:
        """Placeholder for full-text retrieval; returns ``papers`` unchanged.

        When full-text fetching is enabled the PMC endpoint is contacted for
        each paper, but the archive is not parsed yet, so no body is attached.
        """
        if not self.fetch_fulltext:
            return papers
        # NOTE(review): the response is currently discarded, so these requests
        # have no effect on the result. Consider skipping them until the
        # archive parsing is implemented.
        async with httpx.AsyncClient(timeout=30.0) as client:
            for p in papers:
                try:
                    r = await client.get(PMC_FULLTEXT, params={"id": p.pmid, "format": "tgz"})
                    if r.status_code == 200 and "tgz" in r.headers.get("content-type", "").lower():
                        # Parsing tar.gz -> XML -> body is non-trivial; skip for MVP
                        # and rely on abstract. Implementation can extend here.
                        pass
                except httpx.HTTPError as e:
                    logger.debug("full-text fetch skipped for %s: %s", p.pmid, e)
        return papers

    async def fetch_for_variant(
        self, gene: str, hgvs: str, protein: str | None, criteria: list[str] | None = None
    ) -> list[Paper]:
        """Collect papers for a variant from the base and per-criterion searches.

        Search failures are logged and skipped. PMIDs are de-duplicated in
        first-seen order (base query first, then criteria in the given order)
        before the ``max_results`` cap is applied, so the selection is
        reproducible and favours the most relevant hits.

        Raises:
            Exception: Whatever ``fetch_abstracts`` raises after its retries.
        """
        base_query = self.build_query(gene, hgvs, protein)
        # dict keys preserve insertion order and give set-like de-duplication.
        ordered_pmids: dict[str, None] = {}
        try:
            ordered_pmids.update(dict.fromkeys(await self.search_pubmed(base_query)))
        except httpx.HTTPError as e:
            logger.warning("base PubMed search failed: %s", e)

        for crit in criteria or []:
            aug = CRITERION_QUERY_AUGMENTS.get(crit)
            if not aug:
                continue
            try:
                found = await self.search_pubmed(f"{base_query} AND ({aug})")
            except httpx.HTTPError as e:
                logger.warning("criterion-augmented search failed for %s: %s", crit, e)
                continue
            ordered_pmids.update(dict.fromkeys(found))

        capped = list(ordered_pmids)[: self.max_results]
        papers = await self.fetch_abstracts(capped)
        return await self.fetch_full_texts(papers)
backend/app/services/rag/retriever.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from backend.app.config import get_settings
4
+ from backend.app.schemas.evidence import LiteratureChunk
5
+ from backend.app.services.rag.chunker import ChunkBuilder
6
+ from backend.app.services.rag.embedder import Embedder
7
+ from backend.app.services.rag.fetcher import LiteratureFetcher
8
+
9
+ logger = logging.getLogger(__name__)
10
+ settings = get_settings()
11
+
12
# Natural-language retrieval question per ACMG criterion; `{variant}` is
# filled with the variant's HGVS string before the question is embedded.
CRITERION_QUERY_TEMPLATES: dict[str, str] = {
    "PM3": (
        "Was {variant} observed in trans with another pathogenic variant "
        "or compound heterozygous?"
    ),
    "PP1": "Did {variant} co-segregate with disease in affected family members?",
    "PS3": (
        "What functional studies have been performed on {variant} "
        "and what do they show?"
    ),
    "BS3": "Do functional studies show {variant} has no measurable effect?",
    "PS4": (
        "How prevalent is {variant} in cases compared to controls? "
        "Is there an odds ratio?"
    ),
    "PP4": (
        "Is the patient phenotype highly specific for the disease "
        "associated with {variant}?"
    ),
}
20
+
21
+
22
class LiteratureRetriever:
    """Index literature for a variant and retrieve passages per criterion."""

    def __init__(
        self,
        fetcher: LiteratureFetcher | None = None,
        chunker: ChunkBuilder | None = None,
        embedder: Embedder | None = None,
    ) -> None:
        """Keep the supplied components, constructing defaults if missing."""
        self.fetcher = fetcher if fetcher else LiteratureFetcher()
        self.chunker = chunker if chunker else ChunkBuilder()
        self.embedder = embedder if embedder else Embedder()

    async def index_for_variant(
        self, variant_id: str, gene: str, hgvs: str, protein: str | None, criteria: list[str]
    ) -> int:
        """Fetch, chunk, and index the papers for a variant.

        Returns:
            The value returned by the embedder's ``index_chunks``.
        """
        papers = await self.fetcher.fetch_for_variant(gene, hgvs, protein, criteria)
        chunks = []
        for paper in papers:
            chunks.extend(self.chunker.chunk_paper(paper))
        return self.embedder.index_chunks(chunks, variant_id=variant_id, gene=gene)

    def retrieve_for_criterion(
        self, variant_id: str, hgvs: str, criterion: str, top_k: int | None = None
    ) -> list[LiteratureChunk]:
        """Return indexed passages relevant to one criterion.

        A criterion without a query template yields an empty list.
        """
        template = CRITERION_QUERY_TEMPLATES.get(criterion)
        if not template:
            return []

        hits = self.embedder.query(
            query_text=template.format(variant=hgvs),
            variant_id=variant_id,
            top_k=top_k or settings.rag_top_k,
        )

        passages = []
        for hit in hits:
            meta = hit["metadata"]
            passages.append(
                LiteratureChunk(
                    pmid=meta.get("pmid", "unknown"),
                    # A falsy stored year (e.g. 0) is reported as None.
                    year=meta.get("year") or None,
                    title=meta.get("title"),
                    chunk_text=hit["text"],
                    criteria_relevance=[criterion],
                    score=hit.get("score"),
                )
            )
        return passages

    def retrieve_for_criteria(
        self, variant_id: str, hgvs: str, criteria: list[str], top_k: int | None = None
    ) -> dict[str, list[LiteratureChunk]]:
        """Run ``retrieve_for_criterion`` for each criterion, keyed by code."""
        by_criterion: dict[str, list[LiteratureChunk]] = {}
        for code in criteria:
            by_criterion[code] = self.retrieve_for_criterion(variant_id, hgvs, code, top_k=top_k)
        return by_criterion
backend/app/services/repository.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Persistence layer that bridges Pydantic schemas <-> SQLAlchemy records.
2
+
3
+ The repository is the single point that writes a `ClassificationResult` to
4
+ the audit-trail database and reads it back. Keeping it isolated from the
5
+ pipeline means the pipeline can run dry (no DB) for tests and one-off CLI
6
+ runs, while the FastAPI router persists every successful classification.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from datetime import UTC, datetime
12
+
13
+ from sqlalchemy.orm import Session
14
+
15
+ from backend.app.models.classification import ClassificationRecord, CriterionRecord
16
+ from backend.app.models.variant import VariantRecord
17
+ from backend.app.schemas.classification import ClassificationResult
18
+
19
+
20
class ClassificationRepository:
    """Write ``ClassificationResult`` objects to the database and read them back."""

    def __init__(self, db: Session) -> None:
        """Bind the repository to an open SQLAlchemy session."""
        self.db = db

    def save(self, result: ClassificationResult) -> ClassificationResult:
        """Persist the variant, its classification, and every criterion.

        All rows are written in one transaction; if anything fails the
        session is rolled back and the original exception is re-raised.

        Returns:
            A copy of ``result`` with ``id`` and ``analysed_at`` (ISO-8601,
            UTC) filled in from the stored record.
        """
        v = result.variant
        cls = result.classification
        try:
            variant_record = VariantRecord(
                raw_input=v.raw_input,
                hgvs_genomic=v.hgvs_genomic,
                hgvs_coding=v.hgvs_coding,
                hgvs_protein=v.hgvs_protein,
                transcript=v.transcript,
                gene_symbol=v.gene_symbol,
                chromosome=v.chromosome,
                position=v.position,
                normalization_source=v.normalization_source,
                warnings=v.warnings,
            )
            self.db.add(variant_record)
            self.db.flush()  # populate variant_record.id

            record = ClassificationRecord(
                variant_id=variant_record.id,
                significance=cls.significance,
                confidence=cls.confidence,
                triggered_criteria=list(cls.triggered_criteria),
                conflicting_evidence=cls.conflicting_evidence,
                ruleset_version=result.ruleset_version,
                rationale=cls.rationale,
            )
            self.db.add(record)
            self.db.flush()  # populate record.id for the criterion rows

            for c in result.evidence.criteria:
                self.db.add(CriterionRecord(
                    classification_id=record.id,
                    code=c.code,
                    triggered=c.triggered,
                    strength=c.strength,
                    source=c.source,
                    evidence_text=c.evidence_text,
                    confidence=c.confidence,
                    pmid=c.pmid,
                    caveat=c.caveat,
                    curator_override=c.curator_override,
                    override_justification=c.override_justification,
                ))

            self.db.commit()
            self.db.refresh(record)
        except Exception:
            # Leave the session usable for the caller and keep partial rows
            # out of the audit trail.
            self.db.rollback()
            raise

        created = record.created_at
        if created is None:
            analysed_at = datetime.now(UTC)
        elif created.tzinfo is None:
            # Naive timestamps are taken to be UTC.
            analysed_at = created.replace(tzinfo=UTC)
        else:
            # Convert rather than relabel an already-aware timestamp.
            analysed_at = created.astimezone(UTC)

        return result.model_copy(update={
            "id": record.id,
            "analysed_at": analysed_at.isoformat(),
        })

    def get(self, classification_id: str) -> ClassificationRecord | None:
        """Return the stored classification with this primary key, or ``None``."""
        return self.db.get(ClassificationRecord, classification_id)
backend/app/services/vep.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Ensembl VEP REST client — enriches HGVS-coding input with genomic coords.
2
+
3
+ Mutalyzer v3 normalizes the c./p. forms but returns nothing for chr/pos/ref/alt.
4
+ Without those fields, REVEL/AlphaMissense/gnomAD all silently no-op, leaving
5
+ the rule engine blind to common pathogenicity signals (PP3/BP4/PM2 from AF).
6
+
7
+ VEP's REST API solves this for free (no key, ~3 req/s polite-use cap).
8
+ For each HGVS coding string, it returns:
9
+ - chrom, position, allele_string (ref/alt for SNVs)
10
+ - most_severe_consequence (Sequence Ontology term)
11
+ - per-transcript hgvsc, hgvsp, gene_symbol, transcript_id
12
+
13
+ We treat VEP as best-effort — if it fails we still have whatever Mutalyzer
14
+ already populated, and the pipeline continues.
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import logging
19
+
20
+ import httpx
21
+ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
22
+
23
+ from backend.app.schemas.variant import NormalizedVariant
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+ VEP_BASE = "https://rest.ensembl.org"
28
+
29
+
30
class VEPClient:
    """Best-effort client for the Ensembl VEP REST ``hgvs`` endpoint."""

    # Characters that are legal in a URL path segment and routine in HGVS.
    # Everything else (notably "?", "#", "/", "%", whitespace) is escaped.
    _HGVS_URL_SAFE = ":+*=()"

    def __init__(self, base_url: str | None = None, timeout: float = 15.0) -> None:
        """Store the REST base URL (Ensembl by default) and HTTP timeout."""
        self.base_url = base_url or VEP_BASE
        self.timeout = timeout

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(min=1, max=8),
        retry=retry_if_exception_type((httpx.HTTPStatusError, httpx.TimeoutException)),
        reraise=True,
    )
    async def annotate_hgvs(self, hgvs: str) -> dict | None:
        """Annotate one HGVS string and return VEP's first result record.

        Returns:
            The first element of the response list, or ``None`` when VEP
            rejects the input (HTTP 400) or returns no records.

        Raises:
            httpx.HTTPError: On transport failures or other non-2xx responses
                (status errors and timeouts are retried up to three times).
        """
        # Local import so the block does not depend on a new module-level import.
        from urllib.parse import quote

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            r = await client.get(
                f"{self.base_url}/vep/human/hgvs/{quote(hgvs, safe=self._HGVS_URL_SAFE)}",
                # VEP omits the per-transcript `canonical` flag and the
                # `hgvsc`/`hgvsp` strings unless they are requested; enrich()
                # reads both.
                params={"canonical": 1, "hgvs": 1},
                headers={"Accept": "application/json"},
            )
            if r.status_code == 400:
                # VEP can't parse some normalized forms (e.g. complex indels); give up gracefully
                logger.debug("VEP rejected %s: %s", hgvs, r.text[:200])
                return None
            r.raise_for_status()
            data = r.json()
            return data[0] if isinstance(data, list) and data else None

    @staticmethod
    def _split_alleles(allele_string: str | None) -> tuple[str | None, str | None]:
        """Split VEP's `allele_string` into (ref, alt).

        Format examples:
            'G/A'    → ('G', 'A')      SNV
            'TC/T'   → ('TC', 'T')     deletion
            'T/TC'   → ('T', 'TC')     insertion
            '-/C'    → ('', 'C')       pure insertion (rare)
            'C/-'    → ('C', '')       pure deletion (rare)
            'AT/CG'  → ('AT', 'CG')    MNV

        Returns ``(None, None)`` when the string is empty or has no ``/``.
        """
        if not allele_string or "/" not in allele_string:
            return None, None
        ref, alt = allele_string.split("/", 1)
        ref = "" if ref == "-" else ref
        alt = "" if alt == "-" else alt
        return ref, alt

    async def enrich(self, normalized: NormalizedVariant) -> NormalizedVariant:
        """Enrich a NormalizedVariant with chrom/pos/ref/alt + transcript info.

        Only fills fields that are currently ``None``; values set by Mutalyzer
        or the VCF parser are never overridden. Any VEP failure returns
        ``normalized`` unchanged.
        """
        # Choose the best HGVS string to send to VEP
        hgvs = normalized.hgvs_coding or normalized.hgvs_genomic or normalized.raw_input
        if not hgvs:
            return normalized
        try:
            data = await self.annotate_hgvs(hgvs)
        except (httpx.HTTPError, ValueError) as e:
            # httpx.HTTPError covers status, timeout and connection errors;
            # ValueError covers a response body that is not valid JSON.
            logger.warning("VEP annotation failed for %s: %s", hgvs, e)
            return normalized
        if not data:
            return normalized

        updates: dict = {}
        if normalized.chromosome is None and data.get("seq_region_name"):
            updates["chromosome"] = str(data["seq_region_name"])
        if normalized.position is None and data.get("start"):
            updates["position"] = int(data["start"])

        ref, alt = self._split_alleles(data.get("allele_string"))
        if normalized.ref is None and ref is not None:
            updates["ref"] = ref
        if normalized.alt is None and alt is not None:
            updates["alt"] = alt

        if normalized.consequence is None and data.get("most_severe_consequence"):
            updates["consequence"] = data["most_severe_consequence"]

        # Pick the canonical transcript if available, else the first
        transcripts = data.get("transcript_consequences") or []
        if transcripts:
            canonical = next((t for t in transcripts if t.get("canonical")), transcripts[0])
            if normalized.gene_symbol is None and canonical.get("gene_symbol"):
                updates["gene_symbol"] = canonical["gene_symbol"]
            if normalized.transcript is None and canonical.get("transcript_id"):
                updates["transcript"] = canonical["transcript_id"]
            if normalized.hgvs_protein is None and canonical.get("hgvsp"):
                updates["hgvs_protein"] = canonical["hgvsp"]

        if not updates:
            return normalized
        warnings = list(normalized.warnings)
        warnings.append(f"VEP enriched: {', '.join(updates.keys())}")
        return normalized.model_copy(update={**updates, "warnings": warnings})
backend/app/worker.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from celery import Celery
2
+
3
+ from backend.app.config import get_settings
4
+
5
settings = get_settings()

# Celery application for background work; broker and result backend URLs
# come from the application settings.
celery_app = Celery(
    "variantlens",
    backend=settings.celery_result_backend,
    broker=settings.celery_broker_url,
)

celery_app.conf.update(
    # Exchange tasks and results as JSON only.
    accept_content=["json"],
    task_serializer="json",
    result_serializer="json",
    # Schedule and report in UTC.
    enable_utc=True,
    timezone="UTC",
    # Report the "started" state so callers can tell queued from running.
    task_track_started=True,
)
+ )
backend/tests/__init__.py ADDED
File without changes