Spaces:
Sleeping
Sleeping
Codex commited on
Commit Β·
3e219fa
0
Parent(s):
Initial VariantLens clinical readiness scaffold
Browse filesThis view is limited to 50 files because it contains too many changes. Β See raw diff
- .env.example +98 -0
- .gitignore +56 -0
- AGENTS.md +105 -0
- CLAUDE.md +105 -0
- Makefile +63 -0
- README.md +82 -0
- alembic.ini +44 -0
- backend/Dockerfile +23 -0
- backend/alembic/env.py +57 -0
- backend/alembic/script.py.mako +27 -0
- backend/alembic/versions/0001_init.py +77 -0
- backend/app/__init__.py +0 -0
- backend/app/api/__init__.py +0 -0
- backend/app/api/evidence.py +78 -0
- backend/app/api/pipeline.py +102 -0
- backend/app/api/reports.py +98 -0
- backend/app/api/variants.py +43 -0
- backend/app/config.py +82 -0
- backend/app/main.py +67 -0
- backend/app/models/__init__.py +5 -0
- backend/app/models/classification.py +51 -0
- backend/app/models/db.py +23 -0
- backend/app/models/variant.py +24 -0
- backend/app/schemas/__init__.py +33 -0
- backend/app/schemas/classification.py +38 -0
- backend/app/schemas/evidence.py +97 -0
- backend/app/schemas/variant.py +34 -0
- backend/app/services/__init__.py +0 -0
- backend/app/services/acmg/__init__.py +4 -0
- backend/app/services/acmg/combiner.py +218 -0
- backend/app/services/acmg/rules.py +215 -0
- backend/app/services/clinvar.py +218 -0
- backend/app/services/exports.py +208 -0
- backend/app/services/gnomad.py +118 -0
- backend/app/services/insilico.py +159 -0
- backend/app/services/llm/__init__.py +5 -0
- backend/app/services/llm/prompts.py +109 -0
- backend/app/services/llm/reasoner.py +201 -0
- backend/app/services/llm/synthesizer.py +81 -0
- backend/app/services/normalization.py +209 -0
- backend/app/services/pvs1.py +111 -0
- backend/app/services/rag/__init__.py +6 -0
- backend/app/services/rag/chunker.py +72 -0
- backend/app/services/rag/embedder.py +77 -0
- backend/app/services/rag/fetcher.py +136 -0
- backend/app/services/rag/retriever.py +68 -0
- backend/app/services/repository.py +80 -0
- backend/app/services/vep.py +122 -0
- backend/app/worker.py +20 -0
- backend/tests/__init__.py +0 -0
.env.example
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# VariantLens — environment variables
|
| 3 |
+
# Copy this file to `.env` and fill in real values. Never commit `.env`.
|
| 4 |
+
# =============================================================================
|
| 5 |
+
|
| 6 |
+
# ---- LLM ---------------------------------------------------------------------
|
| 7 |
+
# Anthropic API key for the Claude reasoning layer.
|
| 8 |
+
# Get one at https://console.anthropic.com
|
| 9 |
+
ANTHROPIC_API_KEY=
|
| 10 |
+
|
| 11 |
+
# Default model for the literature-evidence reasoning layer.
|
| 12 |
+
# claude-sonnet-4-6 is the cost/quality default; claude-opus-4-7 for hard cases.
|
| 13 |
+
ANTHROPIC_MODEL=claude-sonnet-4-6
|
| 14 |
+
ANTHROPIC_MAX_TOKENS=2000
|
| 15 |
+
|
| 16 |
+
# Air-gap toggle. When true, the reasoner uses a local Ollama model instead of
|
| 17 |
+
# the Anthropic API. Required for fully on-premise clinical deployments.
|
| 18 |
+
USE_LOCAL_LLM=false
|
| 19 |
+
LOCAL_LLM_BASE_URL=http://localhost:11434
|
| 20 |
+
LOCAL_LLM_MODEL=qwen2.5:14b-instruct
|
| 21 |
+
|
| 22 |
+
# ---- External biomedical APIs -----------------------------------------------
|
| 23 |
+
# NCBI E-utilities key. Free; raises rate limit from 3 to 10 req/s.
|
| 24 |
+
# https://www.ncbi.nlm.nih.gov/account/settings/
|
| 25 |
+
NCBI_API_KEY=
|
| 26 |
+
NCBI_EMAIL=
|
| 27 |
+
|
| 28 |
+
# OMIM API key. Free for academic use.
|
| 29 |
+
# https://www.omim.org/api
|
| 30 |
+
OMIM_API_KEY=
|
| 31 |
+
|
| 32 |
+
# Mutalyzer + gnomAD do not require keys.
|
| 33 |
+
MUTALYZER_BASE_URL=https://mutalyzer.nl/api
|
| 34 |
+
GNOMAD_GRAPHQL_URL=https://gnomad.broadinstitute.org/api
|
| 35 |
+
SPLICEAI_LOOKUP_URL=https://spliceailookup-api.broadinstitute.org
|
| 36 |
+
CADD_API_URL=https://cadd.gs.washington.edu/api
|
| 37 |
+
|
| 38 |
+
# ---- Storage -----------------------------------------------------------------
|
| 39 |
+
# PostgreSQL — audit trail, classifications, curator sign-offs.
|
| 40 |
+
POSTGRES_HOST=postgres
|
| 41 |
+
POSTGRES_PORT=5432
|
| 42 |
+
POSTGRES_DB=variantlens
|
| 43 |
+
POSTGRES_USER=variantlens
|
| 44 |
+
POSTGRES_PASSWORD=change_me_locally
|
| 45 |
+
|
| 46 |
+
DATABASE_URL=postgresql+psycopg://variantlens:change_me_locally@postgres:5432/variantlens
|
| 47 |
+
|
| 48 |
+
# ChromaDB — local vector store. Embedded mode requires only the persist path.
|
| 49 |
+
CHROMA_PERSIST_DIR=./data/chroma
|
| 50 |
+
CHROMA_COLLECTION=variantlens_pubmed
|
| 51 |
+
|
| 52 |
+
# Local SQLite caches and pre-scored tables.
|
| 53 |
+
# Build the prediction DBs once with `python -m scripts.build_revel_db <csv>`
|
| 54 |
+
# and `python -m scripts.build_alphamissense_db <tsv.gz>`.
|
| 55 |
+
REVEL_DB_PATH=./data/revel_scores.db
|
| 56 |
+
ALPHAMISSENSE_DB_PATH=./data/alphamissense.db
|
| 57 |
+
GNOMAD_CACHE_DB=./data/gnomad_cache.db
|
| 58 |
+
CLINVAR_VCF_PATH=./data/clinvar.vcf.gz
|
| 59 |
+
|
| 60 |
+
# ---- Embeddings --------------------------------------------------------------
|
| 61 |
+
# BioLinkBERT for biomedical accuracy; all-MiniLM-L6-v2 for speed.
|
| 62 |
+
EMBEDDING_MODEL=michiyasunaga/BioLinkBERT-base
|
| 63 |
+
EMBEDDING_DEVICE=cpu
|
| 64 |
+
|
| 65 |
+
# ---- App ---------------------------------------------------------------------
|
| 66 |
+
APP_ENV=development
|
| 67 |
+
LOG_LEVEL=INFO
|
| 68 |
+
API_HOST=0.0.0.0
|
| 69 |
+
API_PORT=8000
|
| 70 |
+
|
| 71 |
+
# Async job queue (Celery + Redis).
|
| 72 |
+
REDIS_URL=redis://redis:6379/0
|
| 73 |
+
CELERY_BROKER_URL=redis://redis:6379/1
|
| 74 |
+
CELERY_RESULT_BACKEND=redis://redis:6379/2
|
| 75 |
+
|
| 76 |
+
# ---- Auth (placeholder — wire to hospital LDAP/OAuth in deployment) ----------
|
| 77 |
+
JWT_SECRET=change_me_locally_to_a_long_random_string
|
| 78 |
+
JWT_ALGORITHM=HS256
|
| 79 |
+
JWT_EXPIRE_MINUTES=480
|
| 80 |
+
|
| 81 |
+
# ---- Feature flags -----------------------------------------------------------
|
| 82 |
+
# When true, also pull full text from PMC; otherwise abstracts only.
|
| 83 |
+
RAG_FETCH_FULLTEXT=true
|
| 84 |
+
RAG_MAX_PAPERS_PER_VARIANT=200
|
| 85 |
+
RAG_CHUNK_SIZE=512
|
| 86 |
+
RAG_CHUNK_OVERLAP=128
|
| 87 |
+
RAG_TOP_K=8
|
| 88 |
+
|
| 89 |
+
# ACMG ruleset version. Switch to "v4" once SVC v4.0 is finalized.
|
| 90 |
+
ACMG_RULESET_VERSION=v2015
|
| 91 |
+
|
| 92 |
+
# Clinical default is strict Richards 2015 Table 5. "bayesian" and
|
| 93 |
+
# "most_pathogenic" are available for research/validation only.
|
| 94 |
+
ACMG_COMBINER_STRATEGY=table5
|
| 95 |
+
|
| 96 |
+
# PP5/BP6 were deprecated by ACMG SVI in 2018. Keep false for clinical use;
|
| 97 |
+
# set true only for backward-compatible research comparisons.
|
| 98 |
+
ENABLE_DEPRECATED_CLINVAR_CRITERIA=false
|
.gitignore
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Secrets
|
| 2 |
+
.env
|
| 3 |
+
.env.local
|
| 4 |
+
.env.*.local
|
| 5 |
+
|
| 6 |
+
# Python
|
| 7 |
+
__pycache__/
|
| 8 |
+
*.py[cod]
|
| 9 |
+
*$py.class
|
| 10 |
+
*.so
|
| 11 |
+
.Python
|
| 12 |
+
.venv/
|
| 13 |
+
venv/
|
| 14 |
+
env/
|
| 15 |
+
.eggs/
|
| 16 |
+
*.egg-info/
|
| 17 |
+
.pytest_cache/
|
| 18 |
+
.mypy_cache/
|
| 19 |
+
.ruff_cache/
|
| 20 |
+
.coverage
|
| 21 |
+
htmlcov/
|
| 22 |
+
|
| 23 |
+
# Node
|
| 24 |
+
node_modules/
|
| 25 |
+
dist/
|
| 26 |
+
build/
|
| 27 |
+
.next/
|
| 28 |
+
*.log
|
| 29 |
+
npm-debug.log*
|
| 30 |
+
*.tsbuildinfo
|
| 31 |
+
|
| 32 |
+
# IDE
|
| 33 |
+
.vscode/
|
| 34 |
+
.idea/
|
| 35 |
+
.claude/
|
| 36 |
+
*.swp
|
| 37 |
+
.DS_Store
|
| 38 |
+
|
| 39 |
+
# Data — large pre-scored tables and patient data must never be committed
|
| 40 |
+
data/
|
| 41 |
+
!data/.gitkeep
|
| 42 |
+
*.vcf
|
| 43 |
+
*.vcf.gz
|
| 44 |
+
*.tsv.gz
|
| 45 |
+
*.bam
|
| 46 |
+
*.cram
|
| 47 |
+
*.fastq
|
| 48 |
+
*.fastq.gz
|
| 49 |
+
|
| 50 |
+
# ChromaDB persist dir
|
| 51 |
+
chroma/
|
| 52 |
+
*.parquet
|
| 53 |
+
|
| 54 |
+
# Reports / exports (may contain PHI)
|
| 55 |
+
reports/
|
| 56 |
+
exports/
|
AGENTS.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VariantLens
|
| 2 |
+
|
| 3 |
+
Clinical genomic variant interpretation tool for the Jordan Lerner-Ellis Lab. Built around the ACMG/AMP 2015 framework (Richards et al.) with the SVC v4.0 transition in mind. Modeled on the three tools showcased at the November 2025 GA4GH/ClinGen CGLC session: AI CURA (96% concordance via RAG + DeepSeek-R1), EvAgg (Broad/Microsoft evidence aggregator), and AutoPM3 (HKU PM3 extractor).
|
| 4 |
+
|
| 5 |
+
The full design lives in `docs/VariantLens_Build_Plan.md`. The supporting literature review lives in `docs/AI_Variant_Interpretation_Review.md`. Read those before making non-trivial architectural changes.
|
| 6 |
+
|
| 7 |
+
## Non-negotiables
|
| 8 |
+
|
| 9 |
+
- **Human-in-the-loop.** A trained curator signs off every classification. The tool surfaces evidence and proposes criteria; it does not autonomously classify for clinical use.
|
| 10 |
+
- **On-prem patient data.** No genomic data is sent to cloud APIs without explicit opt-in. The `USE_LOCAL_LLM` flag must always provide a working air-gapped path (Ollama + open-source model).
|
| 11 |
+
- **Audit trail.** Every triggered ACMG criterion is traceable to a source — a database row, a PMID, or a curator override with free-text justification.
|
| 12 |
+
- **Anti-hallucination is structural, not cosmetic.** Claude is only allowed to reason over RAG-retrieved chunks, must cite PMIDs verbatim, and must emit structured JSON. If the context lacks evidence, the only valid output is "insufficient evidence in provided literature".
|
| 13 |
+
- **Database facts never go through the LLM.** gnomAD AFs, ClinVar classifications, REVEL/SpliceAI/AlphaMissense scores are scored deterministically. Claude only handles literature-dependent criteria: PM3, PP1, PS3/BS3, PS4, PP4, PS2/PM6, PP5/BP6.
|
| 14 |
+
|
| 15 |
+
## Architecture (one-line summary)
|
| 16 |
+
|
| 17 |
+
`Mutalyzer normalize → parallel evidence (gnomAD, ClinVar, in-silico, autoPVS1) → ACMG rule engine (InterVar-extended) → RAG over PubMed via ChromaDB → Claude reasons over retrieved chunks → Table 5 combiner → curator review UI → PDF/ClinVar/FHIR export.`
|
| 18 |
+
|
| 19 |
+
## What we reuse vs. build
|
| 20 |
+
|
| 21 |
+
**Reuse (do not reimplement):**
|
| 22 |
+
- `autoPVS1` for PVS1
|
| 23 |
+
- `InterVar` as the rule-engine scaffold (extend from ~18 to all 28 criteria)
|
| 24 |
+
- `Mutalyzer` for HGVS normalization (PyHGVS as offline fallback)
|
| 25 |
+
- Pre-scored tables for REVEL, AlphaMissense, SpliceAI (do not run the models per variant)
|
| 26 |
+
- `ChromaDB` for the vector store, `sentence-transformers` (BioLinkBERT) for embeddings
|
| 27 |
+
|
| 28 |
+
**Build ourselves:**
|
| 29 |
+
- The orchestration layer (FastAPI services in `backend/app/services/`)
|
| 30 |
+
- The criterion-aware RAG retriever (different queries for PM3 vs. PP1 vs. PS3)
|
| 31 |
+
- The Claude prompt templates (one per literature-dependent criterion)
|
| 32 |
+
- The Table 5 combiner with conflict detection
|
| 33 |
+
- The curator dashboard
|
| 34 |
+
|
| 35 |
+
## Tech stack
|
| 36 |
+
|
| 37 |
+
```
|
| 38 |
+
Backend: Python 3.12, FastAPI, SQLAlchemy, Celery (async jobs)
|
| 39 |
+
Frontend: React 18, TypeScript, Tailwind, React Query, Zustand
|
| 40 |
+
Databases: PostgreSQL (audit trail), SQLite (REVEL/gnomAD offline cache)
|
| 41 |
+
Vector DB: ChromaDB (embedded, on-prem)
|
| 42 |
+
Embeddings: sentence-transformers (BioLinkBERT preferred; all-MiniLM-L6-v2 fallback)
|
| 43 |
+
LLM: Anthropic Claude (claude-sonnet-4-6 for the reasoning layer; claude-opus-4-7 only for hard cases)
|
| 44 |
+
Local fallback: Ollama + qwen2.5 or mistral-nemo
|
| 45 |
+
Containers: Docker + docker-compose
|
| 46 |
+
Tests: pytest, hypothesis (property-based on the combiner)
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
## Directory layout
|
| 50 |
+
|
| 51 |
+
```
|
| 52 |
+
backend/ FastAPI app
|
| 53 |
+
app/api/ Routers: variants, evidence, pipeline, reports
|
| 54 |
+
app/services/ normalization, gnomad, clinvar, insilico, pvs1, rag/, acmg/, llm/
|
| 55 |
+
app/models/ SQLAlchemy
|
| 56 |
+
tests/
|
| 57 |
+
frontend/ React + TS
|
| 58 |
+
data/ Pre-scored tables, gnomAD cache, ChromaDB persist dir
|
| 59 |
+
docs/ Build plan, literature review, ACMG references
|
| 60 |
+
docker-compose.yml
|
| 61 |
+
.env.example
|
| 62 |
+
.env gitignored — fill from .env.example
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
## Phase plan (~5 weeks)
|
| 66 |
+
|
| 67 |
+
0. Scaffold + Docker (day 1)
|
| 68 |
+
1. Mutalyzer normalization + 20-variant edge-case test set (day 2–3)
|
| 69 |
+
2. gnomAD, ClinVar, in-silico predictors, autoPVS1 (day 4–7)
|
| 70 |
+
3. RAG: PubMed fetch → chunk → embed → ChromaDB → criterion-aware retriever (day 8–11)
|
| 71 |
+
4. ACMG rule engine: 28 criteria + Table 5 combiner; ≥85% concordance on 50 ClinVar variants (day 12–15)
|
| 72 |
+
5. Claude reasoning layer with hallucination-suppression prompts (day 16–18)
|
| 73 |
+
6. React curator dashboard + PDF/ClinVar/FHIR export (day 19–22)
|
| 74 |
+
7. Validation: 100 4-star ClinVar expert-panel variants; hallucination-guard tests (day 23–25)
|
| 75 |
+
|
| 76 |
+
## Validation bar
|
| 77 |
+
|
| 78 |
+
- **Classification concordance:** ≥85% on a held-out set of 100 ClinVar 4-star expert-panel variants. Stretch: match AI CURA's 96%.
|
| 79 |
+
- **Hallucination guard:** When fed deliberately empty/wrong literature contexts, Claude must NOT trigger PM3/PP1/PS3 and must only cite PMIDs that are present in the provided context.
|
| 80 |
+
- **Performance:** <30 s per variant (RAG included); 100 variants/hour batch throughput.
|
| 81 |
+
- **Audit:** Every triggered criterion has a traceable source field. No criterion fires with empty `evidence_text`.
|
| 82 |
+
|
| 83 |
+
## Conventions
|
| 84 |
+
|
| 85 |
+
- Pydantic models for every service input/output. No `dict[str, Any]` at module boundaries.
|
| 86 |
+
- All LLM calls return JSON validated against a pydantic schema; if validation fails, retry once with a "your previous output was invalid JSON, here is the schema" repair prompt, then fail closed.
|
| 87 |
+
- Every external API client implements local caching (SQLite or filesystem) and respects rate limits — NCBI is 3 req/s without a key, 10 req/s with one. Treat cache misses as the slow path, not the default.
|
| 88 |
+
- Never write the canonical HGVS as a free-form string in the DB. Always store the Mutalyzer-normalized form and keep the user-supplied input separately for round-tripping.
|
| 89 |
+
- Keep `claude-sonnet-4-6` as the default model. Only escalate individual hard variants to `claude-opus-4-7` after benchmarking shows it changes outcomes.
|
| 90 |
+
|
| 91 |
+
## Keys and external services
|
| 92 |
+
|
| 93 |
+
See `.env.example` for the full list. Required to run end-to-end:
|
| 94 |
+
- `ANTHROPIC_API_KEY` — paid, console.anthropic.com
|
| 95 |
+
- `NCBI_API_KEY` — free, raises rate limits to 10 req/s
|
| 96 |
+
- `OMIM_API_KEY` — free for academic use
|
| 97 |
+
|
| 98 |
+
`gnomAD` and `Mutalyzer` are open APIs and need no keys.
|
| 99 |
+
|
| 100 |
+
## Notes for collaborators (and Codex)
|
| 101 |
+
|
| 102 |
+
- This is an intern project under active mentorship. Prefer small, reviewed PRs over big-bang merges.
|
| 103 |
+
- When in doubt about an ACMG criterion, cite the relevant section of Richards 2015 in the code comment, not just a paraphrase.
|
| 104 |
+
- The ACMG SVC v4.0 update (piloted March 2025) will change criterion weighting. Keep the rule logic in `services/acmg/rules.py` versioned (`rules_v2015.py`, `rules_v4.py`) so the swap is mechanical, not a rewrite.
|
| 105 |
+
- GA4GH VRS / VA-Spec interop is a stretch goal but worth keeping the data models compatible with from day one.
|
CLAUDE.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VariantLens
|
| 2 |
+
|
| 3 |
+
Clinical genomic variant interpretation tool for the Jordan Lerner-Ellis Lab. Built around the ACMG/AMP 2015 framework (Richards et al.) with the SVC v4.0 transition in mind. Modeled on the three tools showcased at the November 2025 GA4GH/ClinGen CGLC session: AI CURA (96% concordance via RAG + DeepSeek-R1), EvAgg (Broad/Microsoft evidence aggregator), and AutoPM3 (HKU PM3 extractor).
|
| 4 |
+
|
| 5 |
+
The full design lives in `docs/VariantLens_Build_Plan.md`. The supporting literature review lives in `docs/AI_Variant_Interpretation_Review.md`. Read those before making non-trivial architectural changes.
|
| 6 |
+
|
| 7 |
+
## Non-negotiables
|
| 8 |
+
|
| 9 |
+
- **Human-in-the-loop.** A trained curator signs off every classification. The tool surfaces evidence and proposes criteria; it does not autonomously classify for clinical use.
|
| 10 |
+
- **On-prem patient data.** No genomic data is sent to cloud APIs without explicit opt-in. The `USE_LOCAL_LLM` flag must always provide a working air-gapped path (Ollama + open-source model).
|
| 11 |
+
- **Audit trail.** Every triggered ACMG criterion is traceable to a source — a database row, a PMID, or a curator override with free-text justification.
|
| 12 |
+
- **Anti-hallucination is structural, not cosmetic.** Claude is only allowed to reason over RAG-retrieved chunks, must cite PMIDs verbatim, and must emit structured JSON. If the context lacks evidence, the only valid output is "insufficient evidence in provided literature".
|
| 13 |
+
- **Database facts never go through the LLM.** gnomAD AFs, ClinVar classifications, REVEL/SpliceAI/AlphaMissense scores are scored deterministically. Claude only handles literature-dependent criteria: PM3, PP1, PS3/BS3, PS4, PP4, PS2/PM6, PP5/BP6.
|
| 14 |
+
|
| 15 |
+
## Architecture (one-line summary)
|
| 16 |
+
|
| 17 |
+
`Mutalyzer normalize → parallel evidence (gnomAD, ClinVar, in-silico, autoPVS1) → ACMG rule engine (InterVar-extended) → RAG over PubMed via ChromaDB → Claude reasons over retrieved chunks → Table 5 combiner → curator review UI → PDF/ClinVar/FHIR export.`
|
| 18 |
+
|
| 19 |
+
## What we reuse vs. build
|
| 20 |
+
|
| 21 |
+
**Reuse (do not reimplement):**
|
| 22 |
+
- `autoPVS1` for PVS1
|
| 23 |
+
- `InterVar` as the rule-engine scaffold (extend from ~18 to all 28 criteria)
|
| 24 |
+
- `Mutalyzer` for HGVS normalization (PyHGVS as offline fallback)
|
| 25 |
+
- Pre-scored tables for REVEL, AlphaMissense, SpliceAI (do not run the models per variant)
|
| 26 |
+
- `ChromaDB` for the vector store, `sentence-transformers` (BioLinkBERT) for embeddings
|
| 27 |
+
|
| 28 |
+
**Build ourselves:**
|
| 29 |
+
- The orchestration layer (FastAPI services in `backend/app/services/`)
|
| 30 |
+
- The criterion-aware RAG retriever (different queries for PM3 vs. PP1 vs. PS3)
|
| 31 |
+
- The Claude prompt templates (one per literature-dependent criterion)
|
| 32 |
+
- The Table 5 combiner with conflict detection
|
| 33 |
+
- The curator dashboard
|
| 34 |
+
|
| 35 |
+
## Tech stack
|
| 36 |
+
|
| 37 |
+
```
|
| 38 |
+
Backend: Python 3.12, FastAPI, SQLAlchemy, Celery (async jobs)
|
| 39 |
+
Frontend: React 18, TypeScript, Tailwind, React Query, Zustand
|
| 40 |
+
Databases: PostgreSQL (audit trail), SQLite (REVEL/gnomAD offline cache)
|
| 41 |
+
Vector DB: ChromaDB (embedded, on-prem)
|
| 42 |
+
Embeddings: sentence-transformers (BioLinkBERT preferred; all-MiniLM-L6-v2 fallback)
|
| 43 |
+
LLM: Anthropic Claude (claude-sonnet-4-6 for the reasoning layer; claude-opus-4-7 only for hard cases)
|
| 44 |
+
Local fallback: Ollama + qwen2.5 or mistral-nemo
|
| 45 |
+
Containers: Docker + docker-compose
|
| 46 |
+
Tests: pytest, hypothesis (property-based on the combiner)
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
## Directory layout
|
| 50 |
+
|
| 51 |
+
```
|
| 52 |
+
backend/ FastAPI app
|
| 53 |
+
app/api/ Routers: variants, evidence, pipeline, reports
|
| 54 |
+
app/services/ normalization, gnomad, clinvar, insilico, pvs1, rag/, acmg/, llm/
|
| 55 |
+
app/models/ SQLAlchemy
|
| 56 |
+
tests/
|
| 57 |
+
frontend/ React + TS
|
| 58 |
+
data/ Pre-scored tables, gnomAD cache, ChromaDB persist dir
|
| 59 |
+
docs/ Build plan, literature review, ACMG references
|
| 60 |
+
docker-compose.yml
|
| 61 |
+
.env.example
|
| 62 |
+
.env gitignored — fill from .env.example
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
## Phase plan (~5 weeks)
|
| 66 |
+
|
| 67 |
+
0. Scaffold + Docker (day 1)
|
| 68 |
+
1. Mutalyzer normalization + 20-variant edge-case test set (day 2–3)
|
| 69 |
+
2. gnomAD, ClinVar, in-silico predictors, autoPVS1 (day 4–7)
|
| 70 |
+
3. RAG: PubMed fetch → chunk → embed → ChromaDB → criterion-aware retriever (day 8–11)
|
| 71 |
+
4. ACMG rule engine: 28 criteria + Table 5 combiner; ≥85% concordance on 50 ClinVar variants (day 12–15)
|
| 72 |
+
5. Claude reasoning layer with hallucination-suppression prompts (day 16–18)
|
| 73 |
+
6. React curator dashboard + PDF/ClinVar/FHIR export (day 19–22)
|
| 74 |
+
7. Validation: 100 4-star ClinVar expert-panel variants; hallucination-guard tests (day 23–25)
|
| 75 |
+
|
| 76 |
+
## Validation bar
|
| 77 |
+
|
| 78 |
+
- **Classification concordance:** ≥85% on a held-out set of 100 ClinVar 4-star expert-panel variants. Stretch: match AI CURA's 96%.
|
| 79 |
+
- **Hallucination guard:** When fed deliberately empty/wrong literature contexts, Claude must NOT trigger PM3/PP1/PS3 and must only cite PMIDs that are present in the provided context.
|
| 80 |
+
- **Performance:** <30 s per variant (RAG included); 100 variants/hour batch throughput.
|
| 81 |
+
- **Audit:** Every triggered criterion has a traceable source field. No criterion fires with empty `evidence_text`.
|
| 82 |
+
|
| 83 |
+
## Conventions
|
| 84 |
+
|
| 85 |
+
- Pydantic models for every service input/output. No `dict[str, Any]` at module boundaries.
|
| 86 |
+
- All LLM calls return JSON validated against a pydantic schema; if validation fails, retry once with a "your previous output was invalid JSON, here is the schema" repair prompt, then fail closed.
|
| 87 |
+
- Every external API client implements local caching (SQLite or filesystem) and respects rate limits — NCBI is 3 req/s without a key, 10 req/s with one. Treat cache misses as the slow path, not the default.
|
| 88 |
+
- Never write the canonical HGVS as a free-form string in the DB. Always store the Mutalyzer-normalized form and keep the user-supplied input separately for round-tripping.
|
| 89 |
+
- Keep `claude-sonnet-4-6` as the default model. Only escalate individual hard variants to `claude-opus-4-7` after benchmarking shows it changes outcomes.
|
| 90 |
+
|
| 91 |
+
## Keys and external services
|
| 92 |
+
|
| 93 |
+
See `.env.example` for the full list. Required to run end-to-end:
|
| 94 |
+
- `ANTHROPIC_API_KEY` — paid, console.anthropic.com
|
| 95 |
+
- `NCBI_API_KEY` — free, raises rate limits to 10 req/s
|
| 96 |
+
- `OMIM_API_KEY` — free for academic use
|
| 97 |
+
|
| 98 |
+
`gnomAD` and `Mutalyzer` are open APIs and need no keys.
|
| 99 |
+
|
| 100 |
+
## Notes for collaborators (and Claude)
|
| 101 |
+
|
| 102 |
+
- This is an intern project under active mentorship. Prefer small, reviewed PRs over big-bang merges.
|
| 103 |
+
- When in doubt about an ACMG criterion, cite the relevant section of Richards 2015 in the code comment, not just a paraphrase.
|
| 104 |
+
- The ACMG SVC v4.0 update (piloted March 2025) will change criterion weighting. Keep the rule logic in `services/acmg/rules.py` versioned (`rules_v2015.py`, `rules_v4.py`) so the swap is mechanical, not a rewrite.
|
| 105 |
+
- GA4GH VRS / VA-Spec interop is a stretch goal but worth keeping the data models compatible with from day one.
|
Makefile
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
SHELL := /bin/bash
|
| 2 |
+
|
| 3 |
+
.PHONY: help install up down logs migrate seed test test-fast test-slow lint typecheck frontend-dev frontend-build clean
|
| 4 |
+
|
| 5 |
+
help:
|
| 6 |
+
@echo "VariantLens — common commands"
|
| 7 |
+
@echo ""
|
| 8 |
+
@echo " make install install backend (editable) + frontend deps"
|
| 9 |
+
@echo " make up docker compose up (api, worker, postgres, redis, frontend)"
|
| 10 |
+
@echo " make down docker compose down (preserves volumes)"
|
| 11 |
+
@echo " make logs tail logs from all containers"
|
| 12 |
+
@echo " make migrate run alembic migrations against the running postgres"
|
| 13 |
+
@echo " make seed pull 100 ClinVar 4-star variants into the eval fixture"
|
| 14 |
+
@echo " make test run fast unit tests (skips slow/external)"
|
| 15 |
+
@echo " make test-slow run the concordance harness (needs API keys + seeded fixture)"
|
| 16 |
+
@echo " make lint ruff check"
|
| 17 |
+
@echo " make typecheck mypy backend + tsc frontend"
|
| 18 |
+
@echo " make frontend-dev Vite dev server (no docker)"
|
| 19 |
+
@echo " make clean remove caches and build artifacts (preserves data/)"
|
| 20 |
+
|
| 21 |
+
install:
|
| 22 |
+
pip install -e ".[dev]"
|
| 23 |
+
cd frontend && npm install
|
| 24 |
+
|
| 25 |
+
up:
|
| 26 |
+
docker compose up --build
|
| 27 |
+
|
| 28 |
+
down:
|
| 29 |
+
docker compose down
|
| 30 |
+
|
| 31 |
+
logs:
|
| 32 |
+
docker compose logs -f --tail=200
|
| 33 |
+
|
| 34 |
+
migrate:
|
| 35 |
+
docker compose run --rm api alembic upgrade head
|
| 36 |
+
|
| 37 |
+
seed:
|
| 38 |
+
python -m scripts.seed_eval_set --n 100
|
| 39 |
+
|
| 40 |
+
test:
|
| 41 |
+
pytest -m "not slow"
|
| 42 |
+
|
| 43 |
+
test-slow:
|
| 44 |
+
pytest -m slow
|
| 45 |
+
|
| 46 |
+
lint:
|
| 47 |
+
ruff check backend scripts
|
| 48 |
+
|
| 49 |
+
typecheck:
|
| 50 |
+
mypy backend
|
| 51 |
+
cd frontend && npm run typecheck
|
| 52 |
+
|
| 53 |
+
frontend-dev:
|
| 54 |
+
cd frontend && npm run dev
|
| 55 |
+
|
| 56 |
+
frontend-build:
|
| 57 |
+
cd frontend && npm run build
|
| 58 |
+
|
| 59 |
+
clean:
|
| 60 |
+
rm -rf .pytest_cache .mypy_cache .ruff_cache htmlcov .coverage
|
| 61 |
+
find backend -type d -name __pycache__ -exec rm -rf {} +
|
| 62 |
+
find scripts -type d -name __pycache__ -exec rm -rf {} +
|
| 63 |
+
cd frontend && rm -rf dist node_modules/.vite
|
README.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VariantLens
|
| 2 |
+
|
| 3 |
+
Clinical genomic variant interpretation tool. ACMG/AMP rule engine + RAG over PubMed + Claude reasoning, with a curator review UI. Built for the Jordan Lerner-Ellis Lab.
|
| 4 |
+
|
| 5 |
+
See [CLAUDE.md](CLAUDE.md) for architecture, conventions, and validation bar. See [docs/](docs/) for the full build plan and literature review.
|
| 6 |
+
|
| 7 |
+
For lab or clinical-trial preparation, start with
|
| 8 |
+
[docs/Clinical_Readiness_Checklist.md](docs/Clinical_Readiness_Checklist.md).
|
| 9 |
+
VariantLens is a human-in-the-loop curator-support tool; it is not an
|
| 10 |
+
autonomous clinical classifier.
|
| 11 |
+
|
| 12 |
+
## Quick start
|
| 13 |
+
|
| 14 |
+
```bash
|
| 15 |
+
# 1. Fill in API keys
|
| 16 |
+
cp .env.example .env # then edit .env with your keys
|
| 17 |
+
|
| 18 |
+
# 2. Bring everything up (postgres, redis, api, worker, frontend)
|
| 19 |
+
make up # or: docker compose up --build
|
| 20 |
+
|
| 21 |
+
# 3. Open
|
| 22 |
+
# Frontend: http://localhost:5173
|
| 23 |
+
# API docs: http://localhost:8000/docs
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
Migrations apply automatically on API startup. Run `make help` for the full list of commands (`make seed`, `make test`, `make test-slow`, `make typecheck`, etc.).
|
| 27 |
+
|
| 28 |
+
For non-docker local dev (debugger-friendly): `./scripts/dev.sh` boots uvicorn against a local SQLite file plus the Vite dev server.
|
| 29 |
+
|
| 30 |
+
## Required keys
|
| 31 |
+
|
| 32 |
+
- `ANTHROPIC_API_KEY` — paid, [console.anthropic.com](https://console.anthropic.com)
|
| 33 |
+
- `NCBI_API_KEY` + `NCBI_EMAIL` — free, raises NCBI rate limit from 3 to 10 req/s
|
| 34 |
+
- `OMIM_API_KEY` — free for academic use
|
| 35 |
+
|
| 36 |
+
`gnomAD` and `Mutalyzer` are open APIs and need no keys.
|
| 37 |
+
|
| 38 |
+
## Layout
|
| 39 |
+
|
| 40 |
+
```
|
| 41 |
+
backend/ FastAPI + SQLAlchemy + Anthropic SDK
|
| 42 |
+
app/api/ Routers: variants, evidence, pipeline, reports
|
| 43 |
+
app/services/ Domain logic: normalization, databases, RAG, ACMG, LLM
|
| 44 |
+
app/models/ SQLAlchemy ORM
|
| 45 |
+
app/schemas/ Pydantic models for API I/O
|
| 46 |
+
tests/ pytest + hypothesis property tests
|
| 47 |
+
frontend/ React + TypeScript + Vite + Tailwind
|
| 48 |
+
data/ Pre-scored tables (REVEL, AlphaMissense), gnomAD cache, ChromaDB persist
|
| 49 |
+
docs/ Build plan, literature review
|
| 50 |
+
scripts/ Data prep: download REVEL, build SQLite caches, seed evaluation set
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
## Development
|
| 54 |
+
|
| 55 |
+
```bash
|
| 56 |
+
make test # fast unit tests (skips external APIs)
|
| 57 |
+
make test-slow # concordance harness (needs API keys + seeded fixture)
|
| 58 |
+
make lint # ruff check
|
| 59 |
+
make typecheck # mypy backend + tsc frontend
|
| 60 |
+
make seed # pull 100 ClinVar 4-star variants for the eval fixture
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
### Data prep (one-time)
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
# REVEL — download revel-v1.3_all_chromosomes.csv from
|
| 67 |
+
# https://sites.google.com/site/revelgenomics/downloads first.
|
| 68 |
+
python -m scripts.build_revel_db /path/to/revel-v1.3_all_chromosomes.csv
|
| 69 |
+
|
| 70 |
+
# Eval fixture — pulls expert-panel ClinVar variants for the test harness.
|
| 71 |
+
make seed
|
| 72 |
+
|
| 73 |
+
# (Optional) pre-warm the gnomAD cache for a known variant list.
|
| 74 |
+
python -m scripts.warm_gnomad_cache variant_ids.txt
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
## Validation bar
|
| 78 |
+
|
| 79 |
+
- ≥85% classification concordance against 100 ClinVar 4-star expert-panel variants
|
| 80 |
+
- Hallucination guard: empty/wrong literature contexts must NOT trigger PM3/PP1/PS3 and must only cite PMIDs present in the provided context
|
| 81 |
+
- <30 s per variant including RAG; 100 variants/hour batch throughput
|
| 82 |
+
- Every triggered ACMG criterion has a traceable source field
|
alembic.ini
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[alembic]
|
| 2 |
+
script_location = backend/alembic
|
| 3 |
+
prepend_sys_path = .
|
| 4 |
+
version_path_separator = os
|
| 5 |
+
|
| 6 |
+
# Read the URL from the environment (DATABASE_URL) at runtime — we set it in
|
| 7 |
+
# backend/alembic/env.py from `backend.app.config.get_settings`.
|
| 8 |
+
sqlalchemy.url = driver://user:pass@host/db
|
| 9 |
+
|
| 10 |
+
[post_write_hooks]
|
| 11 |
+
|
| 12 |
+
[loggers]
|
| 13 |
+
keys = root,sqlalchemy,alembic
|
| 14 |
+
|
| 15 |
+
[handlers]
|
| 16 |
+
keys = console
|
| 17 |
+
|
| 18 |
+
[formatters]
|
| 19 |
+
keys = generic
|
| 20 |
+
|
| 21 |
+
[logger_root]
|
| 22 |
+
level = WARN
|
| 23 |
+
handlers = console
|
| 24 |
+
qualname =
|
| 25 |
+
|
| 26 |
+
[logger_sqlalchemy]
|
| 27 |
+
level = WARN
|
| 28 |
+
handlers =
|
| 29 |
+
qualname = sqlalchemy.engine
|
| 30 |
+
|
| 31 |
+
[logger_alembic]
|
| 32 |
+
level = INFO
|
| 33 |
+
handlers =
|
| 34 |
+
qualname = alembic
|
| 35 |
+
|
| 36 |
+
[handler_console]
|
| 37 |
+
class = StreamHandler
|
| 38 |
+
args = (sys.stderr,)
|
| 39 |
+
level = NOTSET
|
| 40 |
+
formatter = generic
|
| 41 |
+
|
| 42 |
+
[formatter_generic]
|
| 43 |
+
format = %(levelname)-5.5s [%(name)s] %(message)s
|
| 44 |
+
datefmt = %H:%M:%S
|
backend/Dockerfile
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 6 |
+
PIP_DISABLE_PIP_VERSION_CHECK=1
|
| 7 |
+
|
| 8 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 9 |
+
build-essential \
|
| 10 |
+
curl \
|
| 11 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
+
|
| 13 |
+
COPY pyproject.toml ./
|
| 14 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 15 |
+
pip install --no-cache-dir -e ".[dev]"
|
| 16 |
+
|
| 17 |
+
COPY backend ./backend
|
| 18 |
+
COPY scripts ./scripts
|
| 19 |
+
COPY alembic.ini ./
|
| 20 |
+
|
| 21 |
+
EXPOSE 8000
|
| 22 |
+
|
| 23 |
+
CMD ["uvicorn", "backend.app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
backend/alembic/env.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Alembic env wired to the project Settings + SQLAlchemy Base."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from logging.config import fileConfig
|
| 5 |
+
|
| 6 |
+
from alembic import context
|
| 7 |
+
from sqlalchemy import engine_from_config, pool
|
| 8 |
+
|
| 9 |
+
from backend.app.config import get_settings
|
| 10 |
+
|
| 11 |
+
# Import every model so Base.metadata is populated for autogenerate.
|
| 12 |
+
from backend.app.models import classification as _classification # noqa: F401
|
| 13 |
+
from backend.app.models import variant as _variant # noqa: F401
|
| 14 |
+
from backend.app.models.db import Base
|
| 15 |
+
|
| 16 |
+
config = context.config
if config.config_file_name is not None:
    # fileConfig() disables every already-created logger unless told otherwise.
    # The application also runs these migrations in-process at startup, so keep
    # its loggers alive instead of silencing them after the first upgrade.
    fileConfig(config.config_file_name, disable_existing_loggers=False)

# Override the sqlalchemy.url placeholder from alembic.ini with the live DSN.
# set_main_option() goes through ConfigParser interpolation, so a literal "%"
# (e.g. from a URL-encoded password) must be doubled or the call raises.
config.set_main_option(
    "sqlalchemy.url",
    get_settings().database_url.replace("%", "%%"),
)

target_metadata = Base.metadata
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def run_migrations_offline() -> None:
    """Run migrations in offline mode.

    The Alembic context is configured from the ``sqlalchemy.url`` option only
    (no connection is passed), with literal binds and type comparison enabled.
    """
    offline_options = {
        "url": config.get_main_option("sqlalchemy.url"),
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
        "compare_type": True,
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def run_migrations_online() -> None:
    """Run migrations in online mode.

    An engine is built from the ``sqlalchemy.``-prefixed options of the main
    ini section using ``NullPool``, and the migrations run inside a
    transaction on a connection obtained from that engine.
    """
    ini_section = config.get_section(config.config_ini_section, {})
    engine = engine_from_config(
        ini_section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as connection:
        context.configure(
            connection=connection,
            target_metadata=target_metadata,
            compare_type=True,
        )
        with context.begin_transaction():
            context.run_migrations()
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
if context.is_offline_mode():
|
| 55 |
+
run_migrations_offline()
|
| 56 |
+
else:
|
| 57 |
+
run_migrations_online()
|
backend/alembic/script.py.mako
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""${message}
|
| 2 |
+
|
| 3 |
+
Revision ID: ${up_revision}
|
| 4 |
+
Revises: ${down_revision | comma,n}
|
| 5 |
+
Create Date: ${create_date}
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from typing import Sequence, Union
|
| 11 |
+
|
| 12 |
+
from alembic import op
|
| 13 |
+
import sqlalchemy as sa
|
| 14 |
+
${imports if imports else ""}
|
| 15 |
+
|
| 16 |
+
revision: str = ${repr(up_revision)}
|
| 17 |
+
down_revision: Union[str, None] = ${repr(down_revision)}
|
| 18 |
+
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
|
| 19 |
+
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def upgrade() -> None:
|
| 23 |
+
${upgrades if upgrades else "pass"}
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def downgrade() -> None:
|
| 27 |
+
${downgrades if downgrades else "pass"}
|
backend/alembic/versions/0001_init.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""initial schema — variants, classifications, criteria
|
| 2 |
+
|
| 3 |
+
Revision ID: 0001_init
|
| 4 |
+
Revises:
|
| 5 |
+
Create Date: 2026-04-28 10:00:00
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import sqlalchemy as sa
|
| 11 |
+
from alembic import op
|
| 12 |
+
|
| 13 |
+
revision = "0001_init"
|
| 14 |
+
down_revision = None
|
| 15 |
+
branch_labels = None
|
| 16 |
+
depends_on = None
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def upgrade() -> None:
    """Create the initial schema: ``variants``, ``classifications``, ``criteria``.

    Both child tables reference their parent with ``ON DELETE CASCADE`` and get
    an explicit index on the foreign-key column.
    """
    variant_columns = [
        sa.Column("id", sa.String(length=36), primary_key=True),
        sa.Column("raw_input", sa.String(length=512), nullable=False),
        sa.Column("hgvs_genomic", sa.String(length=512)),
        sa.Column("hgvs_coding", sa.String(length=512)),
        sa.Column("hgvs_protein", sa.String(length=512)),
        sa.Column("transcript", sa.String(length=64)),
        # ix_variants_gene_symbol is auto-created by `index=True` on this column.
        sa.Column("gene_symbol", sa.String(length=64), index=True),
        sa.Column("chromosome", sa.String(length=8)),
        sa.Column("position", sa.Integer()),
        sa.Column(
            "normalization_source",
            sa.String(length=32),
            nullable=False,
            server_default="mutalyzer",
        ),
        sa.Column("warnings", sa.JSON(), nullable=False, server_default="[]"),
        sa.Column(
            "submitted_at",
            sa.DateTime(),
            nullable=False,
            server_default=sa.func.now(),
        ),
    ]
    op.create_table("variants", *variant_columns)

    variant_fk = sa.ForeignKey("variants.id", ondelete="CASCADE")
    classification_columns = [
        sa.Column("id", sa.String(length=36), primary_key=True),
        sa.Column("variant_id", sa.String(length=36), variant_fk, nullable=False),
        sa.Column("significance", sa.String(length=32), nullable=False),
        sa.Column(
            "confidence",
            sa.String(length=16),
            nullable=False,
            server_default="medium",
        ),
        sa.Column("triggered_criteria", sa.JSON(), nullable=False, server_default="[]"),
        sa.Column(
            "conflicting_evidence",
            sa.Boolean(),
            nullable=False,
            server_default=sa.false(),
        ),
        sa.Column(
            "ruleset_version",
            sa.String(length=16),
            nullable=False,
            server_default="v2015",
        ),
        sa.Column("rationale", sa.Text()),
        sa.Column(
            "curator_signoff",
            sa.Boolean(),
            nullable=False,
            server_default=sa.false(),
        ),
        sa.Column("curator_id", sa.String(length=64)),
        sa.Column("signed_off_at", sa.DateTime()),
        sa.Column(
            "created_at",
            sa.DateTime(),
            nullable=False,
            server_default=sa.func.now(),
        ),
    ]
    op.create_table("classifications", *classification_columns)
    op.create_index("ix_classifications_variant_id", "classifications", ["variant_id"])

    classification_fk = sa.ForeignKey("classifications.id", ondelete="CASCADE")
    criterion_columns = [
        sa.Column("id", sa.String(length=36), primary_key=True),
        sa.Column(
            "classification_id",
            sa.String(length=36),
            classification_fk,
            nullable=False,
        ),
        sa.Column("code", sa.String(length=8), nullable=False),
        sa.Column("triggered", sa.Boolean(), nullable=False, server_default=sa.false()),
        sa.Column("strength", sa.String(length=16), nullable=False),
        sa.Column("source", sa.String(length=128), nullable=False),
        sa.Column("evidence_text", sa.Text(), nullable=False),
        sa.Column(
            "confidence",
            sa.String(length=16),
            nullable=False,
            server_default="medium",
        ),
        sa.Column("pmid", sa.String(length=32)),
        sa.Column("caveat", sa.Text()),
        sa.Column(
            "curator_override",
            sa.Boolean(),
            nullable=False,
            server_default=sa.false(),
        ),
        sa.Column("override_justification", sa.Text()),
    ]
    op.create_table("criteria", *criterion_columns)
    op.create_index("ix_criteria_classification_id", "criteria", ["classification_id"])
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def downgrade() -> None:
    """Drop the three tables in reverse dependency order (children first)."""
    indexed_children = (
        ("ix_criteria_classification_id", "criteria"),
        ("ix_classifications_variant_id", "classifications"),
    )
    for index_name, table_name in indexed_children:
        op.drop_index(index_name, table_name=table_name)
        op.drop_table(table_name)

    # The gene_symbol index on variants goes away together with the table.
    op.drop_table("variants")
|
backend/app/__init__.py
ADDED
|
File without changes
|
backend/app/api/__init__.py
ADDED
|
File without changes
|
backend/app/api/evidence.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
from typing import Annotated
|
| 3 |
+
|
| 4 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
from sqlalchemy.orm import Session
|
| 7 |
+
|
| 8 |
+
from backend.app.models.classification import ClassificationRecord, CriterionRecord
|
| 9 |
+
from backend.app.models.db import get_session
|
| 10 |
+
from backend.app.schemas.evidence import ACMGCriterion
|
| 11 |
+
|
| 12 |
+
router = APIRouter()
|
| 13 |
+
SessionDep = Annotated[Session, Depends(get_session)]
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CriterionOverride(BaseModel):
    """Request body carrying a curator's manual override of one criterion."""

    # New triggered state written onto the stored criterion.
    triggered: bool
    # New strength value written onto the stored criterion.
    strength: str
    # Free-text reason; saved as part of the criterion's override_justification.
    justification: str
    # Identifier of the curator; prefixed onto the saved justification.
    curator_id: str
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@router.get("/{classification_id}", response_model=list[ACMGCriterion])
def get_criteria(classification_id: str, db: SessionDep) -> list[ACMGCriterion]:
    """Return every criterion stored for one classification.

    Raises:
        HTTPException: 404 when no classification has the given id.
    """
    classification = db.get(ClassificationRecord, classification_id)
    if not classification:
        raise HTTPException(404, "classification not found")

    def _to_schema(row: CriterionRecord) -> ACMGCriterion:
        # Field-by-field copy from the ORM row into the API schema.
        return ACMGCriterion(
            code=row.code,
            triggered=row.triggered,
            strength=row.strength,
            source=row.source,
            evidence_text=row.evidence_text,
            confidence=row.confidence,
            caveat=row.caveat,
            pmid=row.pmid,
            curator_override=row.curator_override,
            override_justification=row.override_justification,
        )

    return [_to_schema(row) for row in classification.criteria]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@router.post("/{classification_id}/{criterion_code}/override", response_model=ACMGCriterion)
def override_criterion(
    classification_id: str,
    criterion_code: str,
    override: CriterionOverride,
    db: SessionDep,
) -> ACMGCriterion:
    """Apply a curator override to one criterion of a classification.

    The criterion's ``triggered`` and ``strength`` are replaced, it is marked as
    curator-overridden, and the justification is stored prefixed with the
    curator id and a UTC timestamp.

    Args:
        classification_id: Id of the classification owning the criterion.
        criterion_code: Criterion code to look up within that classification.
        override: New values plus the curator's justification.
        db: Request-scoped database session.

    Returns:
        The criterion as stored after the commit.

    Raises:
        HTTPException: 404 when no matching criterion exists.
    """
    # Function-scope import: the module itself only imports `datetime`.
    from datetime import UTC

    rec = (
        db.query(CriterionRecord)
        .filter_by(classification_id=classification_id, code=criterion_code)
        .one_or_none()
    )
    if not rec:
        raise HTTPException(404, "criterion not found")

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and drop tzinfo so the stamp keeps its existing naive ISO format.
    stamp = datetime.now(UTC).replace(tzinfo=None).isoformat()

    rec.triggered = override.triggered
    rec.strength = override.strength
    rec.curator_override = True
    rec.override_justification = f"[{override.curator_id} @ {stamp}] {override.justification}"
    db.commit()
    db.refresh(rec)

    return ACMGCriterion(
        code=rec.code,
        triggered=rec.triggered,
        strength=rec.strength,
        source=rec.source,
        evidence_text=rec.evidence_text,
        confidence=rec.confidence,
        caveat=rec.caveat,
        pmid=rec.pmid,
        curator_override=rec.curator_override,
        override_justification=rec.override_justification,
    )
|
backend/app/api/pipeline.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""End-to-end pipeline that wires services together."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from uuid import uuid4
|
| 5 |
+
|
| 6 |
+
from backend.app.schemas.classification import ClassificationResult
|
| 7 |
+
from backend.app.schemas.evidence import EvidenceBundle, LiteratureChunk
|
| 8 |
+
from backend.app.schemas.variant import VariantInput
|
| 9 |
+
from backend.app.services.clinvar import ClinVarClient
|
| 10 |
+
from backend.app.services.gnomad import GnomADClient
|
| 11 |
+
from backend.app.services.insilico import InSilicoPredictor
|
| 12 |
+
from backend.app.services.llm.synthesizer import LITERATURE_CRITERIA, EvidenceSynthesizer
|
| 13 |
+
from backend.app.services.normalization import VariantNormalizer
|
| 14 |
+
from backend.app.services.pvs1 import PVS1Assessor
|
| 15 |
+
from backend.app.services.rag.retriever import LiteratureRetriever
|
| 16 |
+
from backend.app.services.vep import VEPClient
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class VariantPipeline:
    """Runs one variant through normalization, evidence lookups, optional RAG and synthesis."""

    def __init__(
        self,
        normalizer: VariantNormalizer | None = None,
        vep: VEPClient | None = None,
        gnomad: GnomADClient | None = None,
        clinvar: ClinVarClient | None = None,
        insilico: InSilicoPredictor | None = None,
        pvs1: PVS1Assessor | None = None,
        retriever: LiteratureRetriever | None = None,
        synthesizer: EvidenceSynthesizer | None = None,
    ) -> None:
        """Store each supplied collaborator, building a default for any that is not given."""
        self.normalizer = normalizer if normalizer else VariantNormalizer()
        self.vep = vep if vep else VEPClient()
        self.gnomad = gnomad if gnomad else GnomADClient()
        self.clinvar = clinvar if clinvar else ClinVarClient()
        self.insilico = insilico if insilico else InSilicoPredictor()
        self.pvs1 = pvs1 if pvs1 else PVS1Assessor()
        self.retriever = retriever if retriever else LiteratureRetriever()
        self.synthesizer = synthesizer if synthesizer else EvidenceSynthesizer()

    async def run(self, variant_input: VariantInput, skip_rag: bool = False) -> ClassificationResult:
        """Classify a single variant end to end.

        Args:
            variant_input: The submitted variant; its ``disease`` is forwarded
                to the synthesizer.
            skip_rag: When true, literature indexing and retrieval are skipped.

        Returns:
            Whatever the synthesizer produces from the gathered evidence.
        """
        variant = await self.normalizer.normalize(variant_input)

        # Enrich with chr/pos/ref/alt + transcript + consequence via VEP so
        # REVEL/AlphaMissense/gnomAD have what they need on HGVS-coding input.
        # Intended as best-effort; nothing here catches an exception, so that
        # relies on VEPClient.enrich handling its own failures — TODO confirm.
        has_coordinates = variant.chromosome and variant.position and variant.ref and variant.alt
        if not has_coordinates:
            variant = await self.vep.enrich(variant)
        variant_id = str(uuid4())

        gnomad_id = self._build_gnomad_id(variant)
        freq = None
        if gnomad_id:
            freq = await self.gnomad.lookup(gnomad_id)

        # Prefer the coding HGVS; fall back to the raw submission text.
        hgvs_query = variant.hgvs_coding or variant.raw_input

        clinvar_hits = await self.clinvar.lookup(hgvs_query)
        insilico_scores = await self.insilico.assess(
            chrom=variant.chromosome,
            pos=variant.position,
            ref=variant.ref,
            alt=variant.alt,
            transcript=variant.transcript,
            hgvs_genomic=variant.hgvs_genomic,
        )
        pvs1_call = self.pvs1.assess(variant)

        bundle = EvidenceBundle(
            population_frequency=freq,
            insilico=insilico_scores,
            clinvar_existing=clinvar_hits or [],
            autopvs1=pvs1_call,
        )

        literature: dict[str, list[LiteratureChunk]] = {}
        wants_literature = not skip_rag and variant.gene_symbol
        if wants_literature:
            try:
                await self.retriever.index_for_variant(
                    variant_id=variant_id,
                    gene=variant.gene_symbol,
                    hgvs=hgvs_query,
                    protein=variant.hgvs_protein,
                    criteria=LITERATURE_CRITERIA,
                )
                literature = self.retriever.retrieve_for_criteria(
                    variant_id=variant_id,
                    hgvs=hgvs_query,
                    criteria=LITERATURE_CRITERIA,
                )
            except Exception as e:
                # Any RAG failure degrades to "no literature" rather than aborting.
                logger.warning("RAG indexing/retrieval failed; continuing without literature: %s", e)

        return self.synthesizer.synthesize(
            variant=variant,
            evidence=bundle,
            retrieved_chunks=literature,
            disease=variant_input.disease,
        )

    @staticmethod
    def _build_gnomad_id(variant) -> str | None:
        """Return ``chrom-pos-ref-alt`` with any "chr" text removed, or None if a part is missing."""
        if not (variant.chromosome and variant.position and variant.ref and variant.alt):
            return None
        chrom = variant.chromosome.replace("chr", "")
        return f"{chrom}-{variant.position}-{variant.ref}-{variant.alt}"
|
backend/app/api/reports.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import UTC, datetime
|
| 2 |
+
from typing import Annotated
|
| 3 |
+
|
| 4 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 5 |
+
from fastapi.responses import Response
|
| 6 |
+
from sqlalchemy.orm import Session
|
| 7 |
+
|
| 8 |
+
from backend.app.models.classification import ClassificationRecord
|
| 9 |
+
from backend.app.models.db import get_session
|
| 10 |
+
from backend.app.services.exports import render_clinvar_xml, render_fhir_observation
|
| 11 |
+
|
| 12 |
+
router = APIRouter()
|
| 13 |
+
SessionDep = Annotated[Session, Depends(get_session)]
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@router.get("/{classification_id}")
def get_report(classification_id: str, db: SessionDep) -> dict:
    """Return the full report for one classification as a plain dict.

    The report carries the classification fields, a summary of the linked
    variant (``None`` when there is none), every criterion, and a
    ``generated_at`` UTC timestamp.

    Raises:
        HTTPException: 404 when no classification has the given id.
    """
    rec = db.get(ClassificationRecord, classification_id)
    if not rec:
        raise HTTPException(404, "classification not found")

    variant_summary = None
    if rec.variant:
        variant_summary = {
            "raw_input": rec.variant.raw_input,
            "hgvs_coding": rec.variant.hgvs_coding,
            "hgvs_protein": rec.variant.hgvs_protein,
            "hgvs_genomic": rec.variant.hgvs_genomic,
            "gene_symbol": rec.variant.gene_symbol,
        }

    signed_off_at = None
    if rec.signed_off_at:
        signed_off_at = rec.signed_off_at.isoformat()

    # Attribute names copied verbatim (and in this order) into each criterion dict.
    criterion_fields = (
        "code",
        "triggered",
        "strength",
        "source",
        "evidence_text",
        "confidence",
        "pmid",
        "caveat",
        "curator_override",
        "override_justification",
    )
    criteria = [
        {field: getattr(criterion, field) for field in criterion_fields}
        for criterion in rec.criteria
    ]

    return {
        "classification_id": rec.id,
        "variant_id": rec.variant_id,
        "variant": variant_summary,
        "significance": rec.significance,
        "confidence": rec.confidence,
        "ruleset_version": rec.ruleset_version,
        "rationale": rec.rationale,
        "triggered_criteria": rec.triggered_criteria,
        "conflicting_evidence": rec.conflicting_evidence,
        "curator_signoff": rec.curator_signoff,
        "curator_id": rec.curator_id,
        "signed_off_at": signed_off_at,
        "criteria": criteria,
        "generated_at": datetime.now(UTC).isoformat(),
    }
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@router.post("/{classification_id}/signoff")
def signoff(classification_id: str, curator_id: str, db: SessionDep) -> dict:
    """Record a curator's sign-off on a classification.

    Sign-off is permitted even when the classification carries conflicting
    evidence, but that fact is returned as ``conflicting_evidence`` in the
    response so the curator-facing client can surface it.

    Args:
        classification_id: Id of the classification to sign.
        curator_id: Identifier of the signing curator.
        db: Request-scoped database session.

    Returns:
        A dict with ``status``, ``curator_id``, ``signed_off_at`` (ISO string)
        and ``conflicting_evidence``.

    Raises:
        HTTPException: 404 when no classification has the given id.
    """
    rec = db.get(ClassificationRecord, classification_id)
    if not rec:
        raise HTTPException(404, "classification not found")

    # Allow but flag — clinical curator should know.
    has_conflict = bool(rec.conflicting_evidence)

    rec.curator_signoff = True
    rec.curator_id = curator_id
    # Stored without tzinfo: the initial migration declares signed_off_at as a
    # plain sa.DateTime() column.
    rec.signed_off_at = datetime.now(UTC).replace(tzinfo=None)
    db.commit()

    return {
        "status": "signed",
        "curator_id": curator_id,
        "signed_off_at": rec.signed_off_at.isoformat(),
        "conflicting_evidence": has_conflict,
    }
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@router.get("/{classification_id}/clinvar-xml")
def clinvar_export(classification_id: str, db: SessionDep) -> Response:
    """Return a signed-off classification as a downloadable ClinVar XML attachment.

    Raises:
        HTTPException: 404 when the classification does not exist, 409 when it
            has not been signed off.
    """
    rec = db.get(ClassificationRecord, classification_id)
    if not rec:
        raise HTTPException(404, "classification not found")
    if not rec.curator_signoff:
        raise HTTPException(409, "classification must be signed off before ClinVar export")

    payload = render_clinvar_xml(rec)
    download_headers = {
        "Content-Disposition": f'attachment; filename="variantlens_{rec.id}.clinvar.xml"',
    }
    return Response(content=payload, media_type="application/xml", headers=download_headers)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@router.get("/{classification_id}/fhir")
def fhir_export(classification_id: str, db: SessionDep) -> dict:
    """Return a signed-off classification rendered by ``render_fhir_observation``.

    Raises:
        HTTPException: 404 when the classification does not exist, 409 when it
            has not been signed off.
    """
    rec = db.get(ClassificationRecord, classification_id)
    if not rec:
        raise HTTPException(404, "classification not found")
    if not rec.curator_signoff:
        raise HTTPException(409, "classification must be signed off before FHIR export")

    observation = render_fhir_observation(rec)
    return observation
|
backend/app/api/variants.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Annotated
|
| 3 |
+
|
| 4 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 5 |
+
from sqlalchemy.exc import SQLAlchemyError
|
| 6 |
+
from sqlalchemy.orm import Session
|
| 7 |
+
|
| 8 |
+
from backend.app.api.pipeline import VariantPipeline
|
| 9 |
+
from backend.app.models.db import get_session
|
| 10 |
+
from backend.app.schemas.classification import ClassificationResult
|
| 11 |
+
from backend.app.schemas.variant import NormalizedVariant, VariantInput
|
| 12 |
+
from backend.app.services.repository import ClassificationRepository
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
router = APIRouter()
|
| 17 |
+
_pipeline = VariantPipeline()
|
| 18 |
+
SessionDep = Annotated[Session, Depends(get_session)]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@router.post("/classify", response_model=ClassificationResult)
async def classify(
    variant: VariantInput,
    db: SessionDep,
    skip_rag: bool = False,
) -> ClassificationResult:
    """Run the pipeline for one variant and persist the outcome.

    Args:
        variant: The submitted variant.
        db: Request-scoped database session.
        skip_rag: Forwarded to the pipeline to skip literature retrieval.

    Returns:
        The value returned by the repository's ``save``; if persistence fails
        with a SQLAlchemy error, the in-memory pipeline result instead.

    Raises:
        HTTPException: 500 when the pipeline itself raises.
    """
    try:
        outcome = await _pipeline.run(variant, skip_rag=skip_rag)
    except Exception as e:
        logger.exception("pipeline failed")
        raise HTTPException(status_code=500, detail=f"pipeline failed: {e}") from e

    try:
        saved = ClassificationRepository(db).save(outcome)
    except SQLAlchemyError as e:
        logger.warning("DB persistence failed, returning unsaved result: %s", e)
        # Return the in-memory result so the UI still renders during dev.
        return outcome
    return saved
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@router.post("/normalize", response_model=NormalizedVariant)
async def normalize(variant: VariantInput) -> NormalizedVariant:
    """Normalize a submitted variant with the shared pipeline's normalizer, without classifying it."""
    normalized = await _pipeline.normalizer.normalize(variant)
    return normalized
|
backend/app/config.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from functools import lru_cache
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Literal
|
| 4 |
+
|
| 5 |
+
from pydantic import Field, model_validator
|
| 6 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Settings(BaseSettings):
    """Application settings read from the environment and an optional ``.env`` file.

    Names are matched case-insensitively and unknown keys are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # --- API server ---
    app_env: str = "development"
    log_level: str = "INFO"
    api_host: str = "0.0.0.0"
    api_port: int = 8000

    # --- LLM backend ---
    anthropic_api_key: str = ""
    anthropic_model: str = "claude-sonnet-4-6"
    anthropic_max_tokens: int = 2000
    use_local_llm: bool = False
    local_llm_base_url: str = "http://localhost:11434"
    local_llm_model: str = "qwen2.5:14b-instruct"

    # --- External API credentials ---
    ncbi_api_key: str = ""
    ncbi_email: str = ""
    omim_api_key: str = ""

    # --- External service endpoints ---
    mutalyzer_base_url: str = "https://mutalyzer.nl/api"
    gnomad_graphql_url: str = "https://gnomad.broadinstitute.org/api"
    spliceai_lookup_url: str = "https://spliceailookup-api.broadinstitute.org"
    cadd_api_url: str = "https://cadd.gs.washington.edu/api"

    # --- Relational database ---
    database_url: str = "postgresql+psycopg://variantlens:change_me_locally@postgres:5432/variantlens"

    # --- Vector store ---
    chroma_persist_dir: Path = Path("./data/chroma")
    chroma_collection: str = "variantlens_pubmed"

    # --- Local data files ---
    revel_db_path: Path = Path("./data/revel_scores.db")
    alphamissense_db_path: Path = Path("./data/alphamissense.db")
    alphamissense_path: Path = Path("./data/alphamissense.tsv.gz")  # legacy raw TSV path
    gnomad_cache_db: Path = Path("./data/gnomad_cache.db")
    clinvar_vcf_path: Path = Path("./data/clinvar.vcf.gz")

    # --- Embeddings ---
    embedding_model: str = "michiyasunaga/BioLinkBERT-base"
    embedding_device: str = "cpu"

    # --- Redis / Celery ---
    redis_url: str = "redis://redis:6379/0"
    celery_broker_url: str = "redis://redis:6379/1"
    celery_result_backend: str = "redis://redis:6379/2"

    # --- Auth tokens ---
    jwt_secret: str = Field(default="change_me", min_length=8)
    jwt_algorithm: str = "HS256"
    jwt_expire_minutes: int = 480

    # --- RAG tuning ---
    rag_fetch_fulltext: bool = True
    rag_max_papers_per_variant: int = 200
    rag_chunk_size: int = 512
    rag_chunk_overlap: int = 128
    rag_top_k: int = 8

    # --- ACMG engine ---
    acmg_ruleset_version: str = "v2015"
    acmg_combiner_strategy: Literal["table5", "bayesian", "most_pathogenic"] = "table5"
    enable_deprecated_clinical_criteria_placeholder: None = None if False else None
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
@lru_cache
def get_settings() -> Settings:
    """Return the process-wide ``Settings``: built on the first call, then served from the cache."""
    loaded = Settings()
    return loaded
|
backend/app/main.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from contextlib import asynccontextmanager
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from alembic import command
|
| 6 |
+
from alembic.config import Config
|
| 7 |
+
from fastapi import FastAPI
|
| 8 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 9 |
+
|
| 10 |
+
from backend.app.api import evidence, reports, variants
|
| 11 |
+
from backend.app.config import get_settings
|
| 12 |
+
|
| 13 |
+
settings = get_settings()
|
| 14 |
+
logging.basicConfig(level=settings.log_level)
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _run_migrations() -> None:
    """Upgrade the database to Alembic ``head`` on a best-effort basis.

    ``alembic.ini`` is looked up under ``PROJECT_ROOT``. A missing file, or any
    exception raised while configuring or upgrading, is logged as a warning and
    the function returns normally so the caller keeps going.
    """
    ini_file = PROJECT_ROOT / "alembic.ini"
    if ini_file.exists():
        try:
            alembic_cfg = Config(str(ini_file))
            # Use the configured database URL rather than whatever the ini file carries.
            alembic_cfg.set_main_option("sqlalchemy.url", settings.database_url)
            command.upgrade(alembic_cfg, "head")
            logger.info("alembic migrations applied")
        except Exception as exc:
            logger.warning("alembic auto-migrate failed (continuing): %s", exc)
        return

    logger.warning("alembic.ini not found at %s; skipping auto-migrate", ini_file)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run the migration step before yielding control; nothing runs after the yield."""
    _run_migrations()
    yield
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Application object; `lifespan` above is registered as its startup/shutdown hook.
app = FastAPI(
    title="VariantLens",
    description="Clinical genomic variant interpretation tool with ACMG rule engine and Claude RAG reasoning.",
    version="0.1.0",
    lifespan=lifespan,
)

# NOTE(review): allowed origins are two fixed localhost ports, with credentials
# enabled and every method/header permitted — confirm this is intended outside dev.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173", "http://localhost:3000"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Only the variants, evidence and reports routers are mounted in this module.
app.include_router(variants.router, prefix="/variants", tags=["variants"])
app.include_router(evidence.router, prefix="/evidence", tags=["evidence"])
app.include_router(reports.router, prefix="/reports", tags=["reports"])
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@app.get("/health")
async def health() -> dict[str, str]:
    """Report a fixed ``ok`` status together with the configured environment name."""
    payload = {"status": "ok", "env": settings.app_env}
    return payload
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@app.get("/")
async def root() -> dict[str, str]:
    """Return the service name, its version string and the path of the docs page."""
    service_info = dict(name="VariantLens", version="0.1.0", docs="/docs")
    return service_info
|
backend/app/models/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from backend.app.models.classification import ClassificationRecord, CriterionRecord
|
| 2 |
+
from backend.app.models.db import Base, get_session
|
| 3 |
+
from backend.app.models.variant import VariantRecord
|
| 4 |
+
|
| 5 |
+
__all__ = ["Base", "get_session", "VariantRecord", "ClassificationRecord", "CriterionRecord"]
|
backend/app/models/classification.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
from uuid import uuid4
|
| 3 |
+
|
| 4 |
+
from sqlalchemy import JSON, Boolean, DateTime, ForeignKey, String, Text
|
| 5 |
+
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
| 6 |
+
|
| 7 |
+
from backend.app.models.db import Base
|
| 8 |
+
from backend.app.models.variant import VariantRecord  # noqa: F401 — needed for relationship
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class ClassificationRecord(Base):
    """ORM row in ``classifications``: one stored classification of one variant.

    Each row points at a ``variants`` row via ``variant_id`` and owns its
    ``CriterionRecord`` children (cascade ``all, delete-orphan``).
    """

    __tablename__ = "classifications"

    # Primary key is a UUID4 rendered as a 36-character string, generated per row.
    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=lambda: str(uuid4()))
    variant_id: Mapped[str] = mapped_column(String(36), ForeignKey("variants.id"), index=True)
    significance: Mapped[str] = mapped_column(String(32), nullable=False)
    confidence: Mapped[str] = mapped_column(String(16), default="medium")
    # Passing the `list` callable gives every row its own empty list.
    triggered_criteria: Mapped[list] = mapped_column(JSON, default=list)
    conflicting_evidence: Mapped[bool] = mapped_column(Boolean, default=False)
    ruleset_version: Mapped[str] = mapped_column(String(16), default="v2015")
    rationale: Mapped[str | None] = mapped_column(Text)
    # Curator sign-off fields; unset until a sign-off is recorded.
    curator_signoff: Mapped[bool] = mapped_column(Boolean, default=False)
    curator_id: Mapped[str | None] = mapped_column(String(64))
    signed_off_at: Mapped[datetime | None] = mapped_column(DateTime)
    # NOTE(review): datetime.utcnow produces naive datetimes and is deprecated
    # from Python 3.12 — consider a timezone-aware default.
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)

    criteria: Mapped[list["CriterionRecord"]] = relationship(
        back_populates="classification", cascade="all, delete-orphan"
    )
    # Joined loading: the variant is fetched together with the classification.
    variant: Mapped["VariantRecord"] = relationship("VariantRecord", lazy="joined")
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class CriterionRecord(Base):
    """ORM row in ``criteria``: one ACMG criterion attached to a classification."""

    __tablename__ = "criteria"

    # Primary key is a UUID4 rendered as a 36-character string, generated per row.
    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=lambda: str(uuid4()))
    classification_id: Mapped[str] = mapped_column(
        String(36), ForeignKey("classifications.id"), index=True
    )
    # Criterion code, limited to 8 characters by the column type.
    code: Mapped[str] = mapped_column(String(8), nullable=False)
    triggered: Mapped[bool] = mapped_column(Boolean, default=False)
    strength: Mapped[str] = mapped_column(String(16))
    source: Mapped[str] = mapped_column(String(128))
    evidence_text: Mapped[str] = mapped_column(Text)
    confidence: Mapped[str] = mapped_column(String(16), default="medium")
    pmid: Mapped[str | None] = mapped_column(String(32))
    caveat: Mapped[str | None] = mapped_column(Text)
    # Manual-override flag and its free-text justification.
    curator_override: Mapped[bool] = mapped_column(Boolean, default=False)
    override_justification: Mapped[str | None] = mapped_column(Text)

    # Other side of ClassificationRecord.criteria.
    classification: Mapped["ClassificationRecord"] = relationship(back_populates="criteria")
|
backend/app/models/db.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections.abc import Generator
|
| 2 |
+
|
| 3 |
+
from sqlalchemy import create_engine
|
| 4 |
+
from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
|
| 5 |
+
|
| 6 |
+
from backend.app.config import get_settings
|
| 7 |
+
|
| 8 |
+
# Settings are read once at import time.
settings = get_settings()

# pool_pre_ping makes the pool test a connection before handing it out.
engine = create_engine(settings.database_url, pool_pre_ping=True)
# Session factory bound to the engine; sessions neither autocommit nor autoflush.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)


class Base(DeclarativeBase):
    """Declarative base class shared by the ORM models."""

    pass
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_session() -> Generator[Session, None, None]:
    """Yield one session from ``SessionLocal`` and close it once the consumer is done.

    The session is closed whether the consumer finishes normally or raises.
    """
    # Session is a context manager whose exit closes the session.
    with SessionLocal() as session:
        yield session
|
backend/app/models/variant.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
from uuid import uuid4
|
| 3 |
+
|
| 4 |
+
from sqlalchemy import JSON, DateTime, String
|
| 5 |
+
from sqlalchemy.orm import Mapped, mapped_column
|
| 6 |
+
|
| 7 |
+
from backend.app.models.db import Base
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class VariantRecord(Base):
    """ORM row in ``variants``: a submitted variant string plus its normalized forms."""

    __tablename__ = "variants"

    # Primary key is a UUID4 rendered as a 36-character string, generated per row.
    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=lambda: str(uuid4()))
    # The only required text column; every normalized field below is nullable.
    raw_input: Mapped[str] = mapped_column(String(512), nullable=False)
    hgvs_genomic: Mapped[str | None] = mapped_column(String(512))
    hgvs_coding: Mapped[str | None] = mapped_column(String(512))
    hgvs_protein: Mapped[str | None] = mapped_column(String(512))
    transcript: Mapped[str | None] = mapped_column(String(64))
    gene_symbol: Mapped[str | None] = mapped_column(String(64), index=True)
    chromosome: Mapped[str | None] = mapped_column(String(8))
    # Column type is derived from the Mapped[int | None] annotation.
    position: Mapped[int | None] = mapped_column()
    normalization_source: Mapped[str] = mapped_column(String(32), default="mutalyzer")
    # Passing the `list` callable gives every row its own empty list.
    warnings: Mapped[list] = mapped_column(JSON, default=list)
    # NOTE(review): datetime.utcnow produces naive datetimes and is deprecated
    # from Python 3.12 — consider a timezone-aware default.
    submitted_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
backend/app/schemas/__init__.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from backend.app.schemas.classification import (
|
| 2 |
+
Classification,
|
| 3 |
+
ClassificationResult,
|
| 4 |
+
ClinicalSignificance,
|
| 5 |
+
)
|
| 6 |
+
from backend.app.schemas.evidence import (
|
| 7 |
+
ACMGCriterion,
|
| 8 |
+
CriterionStrength,
|
| 9 |
+
EvidenceBundle,
|
| 10 |
+
InSilicoResult,
|
| 11 |
+
LiteratureChunk,
|
| 12 |
+
PopulationFrequency,
|
| 13 |
+
)
|
| 14 |
+
from backend.app.schemas.variant import (
|
| 15 |
+
NormalizedVariant,
|
| 16 |
+
VariantInput,
|
| 17 |
+
VariantOutput,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
__all__ = [
|
| 21 |
+
"VariantInput",
|
| 22 |
+
"VariantOutput",
|
| 23 |
+
"NormalizedVariant",
|
| 24 |
+
"ACMGCriterion",
|
| 25 |
+
"CriterionStrength",
|
| 26 |
+
"EvidenceBundle",
|
| 27 |
+
"InSilicoResult",
|
| 28 |
+
"LiteratureChunk",
|
| 29 |
+
"PopulationFrequency",
|
| 30 |
+
"Classification",
|
| 31 |
+
"ClassificationResult",
|
| 32 |
+
"ClinicalSignificance",
|
| 33 |
+
]
|
backend/app/schemas/classification.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
|
| 5 |
+
from backend.app.schemas.evidence import ACMGCriterion, EvidenceBundle
|
| 6 |
+
from backend.app.schemas.variant import NormalizedVariant
|
| 7 |
+
|
| 8 |
+
# The five classification labels accepted wherever a significance is typed.
ClinicalSignificance = Literal[
    "Pathogenic",
    "Likely Pathogenic",
    "Uncertain Significance",
    "Likely Benign",
    "Benign",
]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Outcome of combining criteria: label, confidence, the criterion codes that
# fired, a conflict flag, and an optional human-readable rationale.
class Classification(BaseModel):
    significance: ClinicalSignificance
    confidence: Literal["high", "medium", "low"] = "medium"
    # default_factory gives each instance its own list.
    triggered_criteria: list[str] = Field(default_factory=list)
    conflicting_evidence: bool = False
    rationale: str | None = None
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Full analysis payload: the normalized variant, the gathered evidence, the
# resulting classification, and curator sign-off metadata.
class ClassificationResult(BaseModel):
    id: str | None = None
    variant: NormalizedVariant
    evidence: EvidenceBundle
    classification: Classification
    ruleset_version: str = "v2015"
    curator_signoff: bool = False
    curator_id: str | None = None
    # Timestamps are carried as strings in this schema.
    signed_off_at: str | None = None
    analysed_at: str | None = None

    @property
    def auditable_criteria(self) -> list[ACMGCriterion]:
        """Return the entries of ``evidence.criteria`` whose ``triggered`` flag is set."""
        return [c for c in self.evidence.criteria if c.triggered]
|
backend/app/schemas/evidence.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 4 |
+
|
| 5 |
+
# Allowed values for a criterion's evidence strength and for its confidence.
CriterionStrength = Literal["very_strong", "strong", "moderate", "supporting", "standalone"]
CriterionConfidence = Literal["high", "medium", "low"]

# The 28 criterion codes, pathogenic families first, then benign.
# NOTE(review): ACMGCriterion.code below is a plain str and is not validated
# against this list in this module.
ACMG_CRITERIA = [
    "PVS1",
    "PS1", "PS2", "PS3", "PS4",
    "PM1", "PM2", "PM3", "PM4", "PM5", "PM6",
    "PP1", "PP2", "PP3", "PP4", "PP5",
    "BA1",
    "BS1", "BS2", "BS3", "BS4",
    "BP1", "BP2", "BP3", "BP4", "BP5", "BP6", "BP7",
]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# One evaluated ACMG criterion: its code, whether it fired, at what strength,
# and the provenance and evidence text backing that decision.
class ACMGCriterion(BaseModel):
    code: str = Field(..., description="ACMG criterion code (e.g., PVS1, PM2)")
    triggered: bool
    strength: CriterionStrength
    source: str = Field(..., description="Database name, PMID, or 'curator'")
    evidence_text: str = Field(..., description="Quote, numeric value, or rule trace")
    confidence: CriterionConfidence = "medium"
    caveat: str | None = None
    pmid: str | None = None
    # Manual-override flag and its free-text justification.
    curator_override: bool = False
    override_justification: str | None = None
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Population allele-frequency evidence; every measurement is optional and the
# data source label defaults to "gnomAD v4.1".
class PopulationFrequency(BaseModel):
    overall_af: float | None = None
    # Per-population values keyed by a population label; empty by default.
    by_population: dict[str, float] = Field(default_factory=dict)
    homozygote_count: int | None = None
    coverage_warning: str | None = None
    source: str = "gnomAD v4.1"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# In-silico predictor evidence. Scores and concordance flags default to None
# (unset); the PP3/BP4 trigger flags default to False.
class InSilicoResult(BaseModel):
    revel: float | None = None
    alphamissense: float | None = None
    spliceai_max: float | None = None
    cadd_phred: float | None = None
    concordant_pathogenic: bool | None = None
    concordant_benign: bool | None = None
    pp3_triggered: bool = False
    bp4_triggered: bool = False
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# One existing ClinVar submission. Only the accession and classification are
# required; the remaining fields fall back to neutral defaults.
class ClinVarSubmission(BaseModel):
    accession: str
    submitter: str = "unknown"
    classification: str
    stars: int = 0
    date: str = ""
    condition: str = ""
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# One step of an AutoPVS1 reasoning trace.
class AutoPVS1Step(BaseModel):
    # populate_by_name lets callers supply the aliased field by either name.
    model_config = ConfigDict(populate_by_name=True)

    step: int
    label: str
    value: str
    # `pass` is a Python keyword, so the attribute is `pass_` with alias "pass".
    pass_: bool = Field(..., alias="pass")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# Outcome of the AutoPVS1 evaluation: trigger flag, strength (very_strong by
# default), the step-by-step reasoning, a conclusion, and any caveats.
class AutoPVS1Result(BaseModel):
    triggered: bool
    strength: CriterionStrength = "very_strong"
    rule: str = "PVS1"
    reasoning: list[AutoPVS1Step] = Field(default_factory=list)
    conclusion: str = ""
    source: str = "autoPVS1"
    caveats: list[str] = Field(default_factory=list)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# A piece of literature text tied to a PMID, with optional bibliographic
# metadata, a score, and optional AI-generated interpretation fields.
class LiteratureChunk(BaseModel):
    pmid: str
    year: int | None = None
    title: str | None = None
    journal: str | None = None
    chunk_text: str
    # Criterion codes this chunk is considered relevant to; empty by default.
    criteria_relevance: list[str] = Field(default_factory=list)
    score: float | None = None
    ai_interpretation: str | None = None
    ai_confidence: str | None = None
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# Container for all evidence gathered for one variant. Single-valued sources
# default to None and list-valued ones to an empty list.
class EvidenceBundle(BaseModel):
    population_frequency: PopulationFrequency | None = None
    insilico: InSilicoResult | None = None
    clinvar_existing: list[ClinVarSubmission] = Field(default_factory=list)
    autopvs1: AutoPVS1Result | None = None
    literature_chunks: list[LiteratureChunk] = Field(default_factory=list)
    criteria: list[ACMGCriterion] = Field(default_factory=list)
|
backend/app/schemas/variant.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Request payload for submitting a variant: the raw string, a notation hint
# ("auto" by default), and optional clinical context.
class VariantInput(BaseModel):
    raw: str = Field(..., description="User-supplied variant string (HGVS, VCF, or protein notation)")
    notation: Literal["hgvs", "vcf", "protein", "auto"] = "auto"
    gene_symbol: str | None = None
    disease: str | None = None
    hpo_terms: list[str] = Field(default_factory=list)
    # Inheritance mode; None means it was not supplied at all.
    inheritance: Literal["AD", "AR", "XL", "MT", "unknown"] | None = None
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# A variant after normalization. Only the original input string is required;
# every derived representation is optional.
class NormalizedVariant(BaseModel):
    raw_input: str
    hgvs_genomic: str | None = None
    hgvs_coding: str | None = None
    hgvs_protein: str | None = None
    transcript: str | None = None
    gene_symbol: str | None = None
    chromosome: str | None = None
    position: int | None = None
    ref: str | None = None
    alt: str | None = None
    consequence: str | None = None
    # Which normalizer produced this record; "mutalyzer" unless stated otherwise.
    normalization_source: Literal["mutalyzer", "pyhgvs", "passthrough"] = "mutalyzer"
    warnings: list[str] = Field(default_factory=list)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# Response payload for a stored variant: its id, the normalized form, and the
# submission timestamp carried as a string.
class VariantOutput(BaseModel):
    id: str
    normalized: NormalizedVariant
    submitted_at: str
|
backend/app/services/__init__.py
ADDED
|
File without changes
|
backend/app/services/acmg/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from backend.app.services.acmg.combiner import combine_criteria
|
| 2 |
+
from backend.app.services.acmg.rules import RuleEngine
|
| 3 |
+
|
| 4 |
+
__all__ = ["RuleEngine", "combine_criteria"]
|
backend/app/services/acmg/combiner.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ACMG/AMP variant classification combiner.
|
| 2 |
+
|
| 3 |
+
This module implements two classifiers:
|
| 4 |
+
|
| 5 |
+
1. **Strict Table 5** (Richards 2015) — the original combinatorial rules.
|
| 6 |
+
This is the clinical default because it is auditable and conservative.
|
| 7 |
+
|
| 8 |
+
2. **Bayesian point system** (Tavtigian 2018; ClinGen SVI 2020) — assigns
|
| 9 |
+
numeric points to each triggered criterion based on its strength, then
|
| 10 |
+
classifies by total. This can be enabled explicitly for validation and
|
| 11 |
+
research cohorts.
|
| 12 |
+
|
| 13 |
+
Point thresholds (Tavtigian 2018, Genet Med 20:1054):
    ≥10       → Pathogenic
    6-9       → Likely Pathogenic
    0-5       → VUS
    -6 to -1  → Likely Benign
    ≤-7       → Benign
|
| 19 |
+
|
| 20 |
+
Point values:
|
| 21 |
+
very_strong=8, strong=4, moderate=2, supporting=1
|
| 22 |
+
standalone=-8, benign equivalents flip sign
|
| 23 |
+
|
| 24 |
+
The previous implementation selected the more pathogenic result by default.
|
| 25 |
+
That is useful for exploration, but too permissive for lab-facing defaults.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from backend.app.config import get_settings
|
| 29 |
+
from backend.app.schemas.classification import Classification, ClinicalSignificance
|
| 30 |
+
from backend.app.schemas.evidence import ACMGCriterion
|
| 31 |
+
|
| 32 |
+
# Criterion-code prefixes that mark a code as pathogenic or benign evidence.
PATHOGENIC_PREFIX = ("PVS", "PS", "PM", "PP")
BENIGN_PREFIX = ("BA", "BS", "BP")

# Point magnitude per strength; benign points are subtracted by the scorer.
POINTS_PATH = {"very_strong": 8, "strong": 4, "moderate": 2, "supporting": 1}
POINTS_BEN = {"standalone": 8, "strong": 4, "moderate": 2, "supporting": 1}


def _bayesian_score(criteria: list[ACMGCriterion]) -> int:
    """Return the net point total over the triggered criteria.

    Pathogenic codes contribute positive points and benign codes negative
    points. A strength with no entry in the relevant table, or a code with
    neither prefix, contributes zero.
    """

    def signed_points(criterion) -> int:
        # startswith accepts the whole prefix tuple in one call.
        if criterion.code.startswith(PATHOGENIC_PREFIX):
            return POINTS_PATH.get(criterion.strength, 0)
        if criterion.code.startswith(BENIGN_PREFIX):
            return -POINTS_BEN.get(criterion.strength, 0)
        return 0

    return sum(signed_points(item) for item in criteria if item.triggered)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _bayesian_significance(score: int) -> ClinicalSignificance:
    """Translate a point total into one of the five significance labels.

    Bands, from the top: 10 or more is Pathogenic, 6 to 9 Likely Pathogenic,
    0 to 5 Uncertain Significance, -6 to -1 Likely Benign, anything lower Benign.
    """
    # (inclusive lower bound, label), highest band first.
    bands = (
        (10, "Pathogenic"),
        (6, "Likely Pathogenic"),
        (0, "Uncertain Significance"),
        (-6, "Likely Benign"),
    )
    for floor, label in bands:
        if score >= floor:
            return label
    return "Benign"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# Ordering of the labels from least (0) to most (4) pathogenic, used to compare
# the verdicts of the two classifiers.
SIGNIFICANCE_RANK = {
    "Benign": 0,
    "Likely Benign": 1,
    "Uncertain Significance": 2,
    "Likely Pathogenic": 3,
    "Pathogenic": 4,
}
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _bucket(criteria: list[ACMGCriterion]) -> dict[str, int]:
    """Count the triggered criteria per strength and direction.

    ``very_strong`` and ``standalone`` are counted on strength alone. The
    other three strengths are split into ``*_path`` and ``*_benign`` by the
    prefix of the criterion code; a code with neither prefix is not counted.
    """
    tally = dict.fromkeys(
        (
            "very_strong",
            "strong_path",
            "moderate_path",
            "supporting_path",
            "standalone",
            "strong_benign",
            "moderate_benign",
            "supporting_benign",
        ),
        0,
    )
    for item in criteria:
        if not item.triggered:
            continue
        level = item.strength
        if level in ("very_strong", "standalone"):
            tally[level] += 1
        elif level in ("strong", "moderate", "supporting"):
            if item.code.startswith(PATHOGENIC_PREFIX):
                tally[f"{level}_path"] += 1
            elif item.code.startswith(BENIGN_PREFIX):
                tally[f"{level}_benign"] += 1
    return tally
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _is_pathogenic(b: dict[str, int]) -> bool:
    """Return True when the bucket counts satisfy one of the Pathogenic combinations.

    Accepted combinations:
      * one Very Strong plus: a Strong, or two Moderate, or one Moderate and
        one Supporting, or two Supporting;
      * two Strong;
      * one Strong plus: three Moderate, or two Moderate and two Supporting,
        or one Moderate and four Supporting.
    """
    if b["very_strong"] >= 1 and (
        b["strong_path"] >= 1
        or b["moderate_path"] >= 2
        or (b["moderate_path"] >= 1 and b["supporting_path"] >= 1)
        or b["supporting_path"] >= 2
    ):
        return True

    if b["strong_path"] >= 2:
        return True

    return b["strong_path"] >= 1 and (
        b["moderate_path"] >= 3
        or (b["moderate_path"] >= 2 and b["supporting_path"] >= 2)
        or (b["moderate_path"] >= 1 and b["supporting_path"] >= 4)
    )
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _is_likely_pathogenic(b: dict[str, int]) -> bool:
    """Return True when the bucket counts satisfy one of the Likely Pathogenic combinations.

    Accepted combinations: Very Strong with a Moderate; Strong with one or two
    Moderate; Strong with two Supporting; three Moderate; two Moderate with two
    Supporting; one Moderate with four Supporting.
    """
    return (
        (b["very_strong"] >= 1 and b["moderate_path"] >= 1)
        or (b["strong_path"] >= 1 and 1 <= b["moderate_path"] <= 2)
        or (b["strong_path"] >= 1 and b["supporting_path"] >= 2)
        or b["moderate_path"] >= 3
        or (b["moderate_path"] >= 2 and b["supporting_path"] >= 2)
        or (b["moderate_path"] >= 1 and b["supporting_path"] >= 4)
    )
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def _is_benign(b: dict[str, int]) -> bool:
    """Return True for at least one stand-alone criterion or at least two strong benign ones."""
    return b["standalone"] >= 1 or b["strong_benign"] >= 2
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def _is_likely_benign(b: dict[str, int]) -> bool:
    """Return True for one strong plus one supporting benign criterion, or two supporting ones."""
    strong_with_support = b["strong_benign"] >= 1 and b["supporting_benign"] >= 1
    return strong_with_support or b["supporting_benign"] >= 2
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def combine_criteria(criteria: list[ACMGCriterion]) -> Classification:
    """Classify a variant from its criteria using the configured combiner strategy.

    Both verdicts are always computed: the strict combination rules and the
    point total. The strategy setting picks which one is reported:
    ``"bayesian"`` reports the point-based label, ``"most_pathogenic"`` reports
    it only when it ranks at least as high as the strict one, and any other
    value reports the strict label.

    Whenever pathogenic and benign criteria both fired, the result is forced
    to "Uncertain Significance" and flagged as conflicting, regardless of
    points. Confidence is "low" with no triggered criteria or with two or more
    low-confidence ones, "high" when every triggered criterion is high, and
    "medium" otherwise.
    """
    strategy = get_settings().acmg_combiner_strategy
    fired = [item for item in criteria if item.triggered]
    counts = _bucket(criteria)

    # Strict verdict: pathogenic tiers are checked before benign ones.
    table5_sig: ClinicalSignificance
    if _is_pathogenic(counts):
        table5_sig = "Pathogenic"
    elif _is_likely_pathogenic(counts):
        table5_sig = "Likely Pathogenic"
    elif _is_benign(counts):
        table5_sig = "Benign"
    elif _is_likely_benign(counts):
        table5_sig = "Likely Benign"
    else:
        table5_sig = "Uncertain Significance"

    points = _bayesian_score(criteria)
    bayes_sig = _bayesian_significance(points)

    prefer_points = strategy == "bayesian" or (
        strategy == "most_pathogenic"
        and SIGNIFICANCE_RANK[bayes_sig] >= SIGNIFICANCE_RANK[table5_sig]
    )
    significance: ClinicalSignificance
    if prefer_points:
        significance = bayes_sig
        used_classifier = f"Bayesian {points:+d} pts"
    else:
        significance = table5_sig
        used_classifier = "Richards 2015 Table 5"

    pathogenic_hits = sum(
        counts[key] for key in ("very_strong", "strong_path", "moderate_path", "supporting_path")
    )
    benign_hits = sum(
        counts[key] for key in ("standalone", "strong_benign", "moderate_benign", "supporting_benign")
    )
    conflicting = pathogenic_hits > 0 and benign_hits > 0
    if conflicting:
        # Evidence in both directions overrides whichever classifier was chosen.
        significance = "Uncertain Significance"

    low_confidence_count = sum(1 for item in fired if item.confidence == "low")
    if not fired or low_confidence_count >= 2:
        confidence = "low"
    elif all(item.confidence == "high" for item in fired):
        confidence = "high"
    else:
        confidence = "medium"

    return Classification(
        significance=significance,
        confidence=confidence,
        triggered_criteria=[item.code for item in fired],
        conflicting_evidence=conflicting,
        rationale=_build_rationale(counts, significance, points, used_classifier),
    )
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def _build_rationale(
    b: dict[str, int],
    significance: ClinicalSignificance,
    points: int,
    classifier: str,
) -> str:
    """Build the one-line, human-readable rationale for a classification.

    Args:
        b: Bucket counts keyed by strength/direction (all eight keys present).
        significance: The label being reported.
        points: Net point total, rendered with an explicit sign.
        classifier: Name of the classifier that produced the label.

    Returns:
        ``"<significance> (<classifier>, <±points> pts) — <counts>"`` where
        ``<counts>`` lists each non-zero bucket as ``"N× Label"`` joined by
        ``" + "``, or ``"no triggered criteria"`` when every bucket is zero.
    """
    # Display order: pathogenic buckets first, then benign; (P)/(B) marks direction.
    labels = (
        ("very_strong", "Very Strong"),
        ("strong_path", "Strong (P)"),
        ("moderate_path", "Moderate (P)"),
        ("supporting_path", "Supporting (P)"),
        ("standalone", "Stand-alone (B)"),
        ("strong_benign", "Strong (B)"),
        ("moderate_benign", "Moderate (B)"),
        ("supporting_benign", "Supporting (B)"),
    )
    # The multiplication sign and em dash replace mis-encoded characters.
    parts = [f"{b[key]}× {label}" for key, label in labels if b[key]]
    counts = " + ".join(parts) if parts else "no triggered criteria"
    return f"{significance} ({classifier}, {points:+d} pts) — {counts}"
|
backend/app/services/acmg/rules.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
from backend.app.config import get_settings
|
| 4 |
+
from backend.app.schemas.evidence import (
|
| 5 |
+
ACMGCriterion,
|
| 6 |
+
AutoPVS1Result,
|
| 7 |
+
ClinVarSubmission,
|
| 8 |
+
EvidenceBundle,
|
| 9 |
+
InSilicoResult,
|
| 10 |
+
PopulationFrequency,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
settings = get_settings()
|
| 15 |
+
|
| 16 |
+
# Cut-offs applied by RuleEngine.score_population to the overall allele
# frequency (fractions, not percentages) and to the homozygote count.
PM2_THRESHOLD = 0.0001  # PM2 fires when the overall AF is below this
BS1_THRESHOLD = 0.005  # BS1 fires at or above this, when BA1 did not fire
BA1_THRESHOLD = 0.05  # BA1 fires at or above this
BS2_HOM_THRESHOLD = 2  # BS2 fires at this many homozygotes or more
|
| 20 |
+
|
| 21 |
+
# PM2 strength — Richards 2015 originally specified MODERATE.
|
| 22 |
+
# ClinGen SVI 2020 recommended downgrading to SUPPORTING for general use,
|
| 23 |
+
# but most clinical labs and ClinGen VCEPs still apply MODERATE in practice.
|
| 24 |
+
# Switch via env if you want the SVI 2020 behavior.
|
| 25 |
+
PM2_STRENGTH = "moderate"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class RuleEngine:
|
| 29 |
+
"""Auto-scorers for database-derived ACMG criteria. Literature criteria
|
| 30 |
+
(PM3, PP1, PS3, PS4, PP4, PS2/PM6, PP5/BP6) are populated by the LLM layer."""
|
| 31 |
+
|
| 32 |
+
def score_pvs1(self, autopvs1_result: AutoPVS1Result | None) -> ACMGCriterion | None:
    """Turn an AutoPVS1 outcome into a PVS1 criterion.

    Returns ``None`` when no result was given or the result did not trigger.
    Otherwise the criterion copies the result's strength, source and
    conclusion, is marked high confidence, and carries the caveats joined by
    ``"; "`` (or ``None`` when there are none).
    """
    fired = bool(autopvs1_result) and autopvs1_result.triggered
    if not fired:
        return None

    joined_caveats = "; ".join(autopvs1_result.caveats)
    return ACMGCriterion(
        code="PVS1",
        triggered=True,
        strength=autopvs1_result.strength,
        source=autopvs1_result.source,
        evidence_text=autopvs1_result.conclusion,
        confidence="high",
        # An empty joined string collapses to None.
        caveat=joined_caveats or None,
    )
|
| 44 |
+
|
| 45 |
+
def score_population(self, freq: PopulationFrequency | None) -> list[ACMGCriterion]:
|
| 46 |
+
if not freq or freq.overall_af is None:
|
| 47 |
+
logger.warning("Population frequency missing; PM2 not triggered until coverage is verified")
|
| 48 |
+
return []
|
| 49 |
+
|
| 50 |
+
out: list[ACMGCriterion] = []
|
| 51 |
+
af = freq.overall_af or 0.0
|
| 52 |
+
|
| 53 |
+
if af >= BA1_THRESHOLD:
|
| 54 |
+
out.append(ACMGCriterion(
|
| 55 |
+
code="BA1",
|
| 56 |
+
triggered=True,
|
| 57 |
+
strength="standalone",
|
| 58 |
+
source="gnomAD v4.1",
|
| 59 |
+
evidence_text=f"overall AF = {af:.4f} β₯ 5%",
|
| 60 |
+
confidence="high",
|
| 61 |
+
))
|
| 62 |
+
elif af >= BS1_THRESHOLD:
|
| 63 |
+
out.append(ACMGCriterion(
|
| 64 |
+
code="BS1",
|
| 65 |
+
triggered=True,
|
| 66 |
+
strength="strong",
|
| 67 |
+
source="gnomAD v4.1",
|
| 68 |
+
evidence_text=f"overall AF = {af:.4f} > expected",
|
| 69 |
+
confidence="medium",
|
| 70 |
+
caveat="compare against disease-specific BS1 threshold",
|
| 71 |
+
))
|
| 72 |
+
elif af < PM2_THRESHOLD:
|
| 73 |
+
out.append(ACMGCriterion(
|
| 74 |
+
code="PM2",
|
| 75 |
+
triggered=True,
|
| 76 |
+
strength="supporting",
|
| 77 |
+
source="gnomAD v4.1",
|
| 78 |
+
evidence_text=f"overall AF = {af:.6f} < 0.0001",
|
| 79 |
+
confidence="high",
|
| 80 |
+
))
|
| 81 |
+
|
| 82 |
+
if (freq.homozygote_count or 0) >= BS2_HOM_THRESHOLD:
|
| 83 |
+
out.append(ACMGCriterion(
|
| 84 |
+
code="BS2",
|
| 85 |
+
triggered=True,
|
| 86 |
+
strength="strong",
|
| 87 |
+
source="gnomAD v4.1",
|
| 88 |
+
evidence_text=f"{freq.homozygote_count} healthy homozygotes",
|
| 89 |
+
confidence="high",
|
| 90 |
+
))
|
| 91 |
+
return out
|
| 92 |
+
|
| 93 |
+
def score_insilico(self, ins: InSilicoResult | None) -> list[ACMGCriterion]:
|
| 94 |
+
"""Modulate PP3/BP4 strength using ClinGen SVI 2022 recommendations
|
| 95 |
+
(Pejaver et al. 2022, AJHG) β REVEL β₯ 0.932 + concordant signals
|
| 96 |
+
upgrade to PP3_strong; β₯ 0.773 to PP3_moderate; otherwise supporting.
|
| 97 |
+
Mirror thresholds for BP4.
|
| 98 |
+
"""
|
| 99 |
+
if not ins:
|
| 100 |
+
return []
|
| 101 |
+
out = []
|
| 102 |
+
if ins.pp3_triggered:
|
| 103 |
+
strength = self._pp3_strength(ins)
|
| 104 |
+
out.append(ACMGCriterion(
|
| 105 |
+
code="PP3",
|
| 106 |
+
triggered=True,
|
| 107 |
+
strength=strength,
|
| 108 |
+
source="REVEL+AlphaMissense+SpliceAI concordant",
|
| 109 |
+
evidence_text=f"REVEL={ins.revel}, AM={ins.alphamissense}, SpliceAI={ins.spliceai_max} β {strength}",
|
| 110 |
+
confidence="high" if strength in ("strong", "moderate") else "medium",
|
| 111 |
+
))
|
| 112 |
+
if ins.bp4_triggered:
|
| 113 |
+
strength = self._bp4_strength(ins)
|
| 114 |
+
out.append(ACMGCriterion(
|
| 115 |
+
code="BP4",
|
| 116 |
+
triggered=True,
|
| 117 |
+
strength=strength,
|
| 118 |
+
source="REVEL+AlphaMissense+SpliceAI concordant",
|
| 119 |
+
evidence_text=f"REVEL={ins.revel}, AM={ins.alphamissense}, SpliceAI={ins.spliceai_max} β {strength}",
|
| 120 |
+
confidence="high" if strength in ("strong", "moderate") else "medium",
|
| 121 |
+
))
|
| 122 |
+
return out
|
| 123 |
+
|
| 124 |
+
@staticmethod
|
| 125 |
+
def _pp3_strength(ins: "InSilicoResult") -> str:
|
| 126 |
+
# Pejaver et al. 2022 calibration β REVEL stratification for PP3
|
| 127 |
+
revel = ins.revel or 0.0
|
| 128 |
+
am = ins.alphamissense or 0.0
|
| 129 |
+
if revel >= 0.932 and am >= 0.95:
|
| 130 |
+
return "strong"
|
| 131 |
+
if revel >= 0.773 or am >= 0.834:
|
| 132 |
+
return "moderate"
|
| 133 |
+
return "supporting"
|
| 134 |
+
|
| 135 |
+
@staticmethod
|
| 136 |
+
def _bp4_strength(ins: "InSilicoResult") -> str:
|
| 137 |
+
revel = ins.revel if ins.revel is not None else 1.0
|
| 138 |
+
am = ins.alphamissense if ins.alphamissense is not None else 1.0
|
| 139 |
+
if revel <= 0.183 and am <= 0.099:
|
| 140 |
+
return "strong"
|
| 141 |
+
if revel <= 0.290 or am <= 0.099:
|
| 142 |
+
return "moderate"
|
| 143 |
+
return "supporting"
|
| 144 |
+
|
| 145 |
+
def score_clinvar(self, submissions: list[ClinVarSubmission] | None) -> list[ACMGCriterion]:
|
| 146 |
+
"""Map ClinVar consensus to optional PP5/BP6 evidence.
|
| 147 |
+
|
| 148 |
+
The first submission is the AGGREGATE consensus from ClinVar (the
|
| 149 |
+
green-star verdict). ACMG SVI deprecated PP5/BP6 as standalone
|
| 150 |
+
criteria in 2018, so VariantLens does not auto-trigger them unless
|
| 151 |
+
explicitly enabled for research/backward-compatibility validation.
|
| 152 |
+
"""
|
| 153 |
+
if not submissions:
|
| 154 |
+
return []
|
| 155 |
+
if not settings.enable_deprecated_clinvar_criteria:
|
| 156 |
+
logger.info("ClinVar PP5/BP6 auto-scoring disabled; retaining ClinVar as evidence only")
|
| 157 |
+
return []
|
| 158 |
+
|
| 159 |
+
# First submission is the aggregate consensus (see clinvar.py); rest are lab-level
|
| 160 |
+
consensus = submissions[0]
|
| 161 |
+
cls = consensus.classification.lower()
|
| 162 |
+
stars = consensus.stars
|
| 163 |
+
is_path = "pathogenic" in cls and "conflicting" not in cls
|
| 164 |
+
is_benign = "benign" in cls and "conflicting" not in cls
|
| 165 |
+
|
| 166 |
+
if not (is_path or is_benign):
|
| 167 |
+
return []
|
| 168 |
+
|
| 169 |
+
# Strength scales with ClinGen review-status stars:
|
| 170 |
+
# 4β
practice guideline β strong
|
| 171 |
+
# 3β
expert panel β strong
|
| 172 |
+
# 2β
multi-submitter ok β moderate
|
| 173 |
+
# 1β
single submitter β supporting
|
| 174 |
+
# 0β
no criteria β supporting (downgraded)
|
| 175 |
+
strength = (
|
| 176 |
+
"strong" if stars >= 3 else
|
| 177 |
+
"moderate" if stars == 2 else
|
| 178 |
+
"supporting"
|
| 179 |
+
)
|
| 180 |
+
confidence: str = "high" if stars >= 3 else ("medium" if stars >= 1 else "low")
|
| 181 |
+
|
| 182 |
+
out: list[ACMGCriterion] = []
|
| 183 |
+
if is_path:
|
| 184 |
+
out.append(ACMGCriterion(
|
| 185 |
+
code="PP5",
|
| 186 |
+
triggered=True,
|
| 187 |
+
strength=strength,
|
| 188 |
+
source=f"ClinVar consensus {consensus.accession} ({stars}β
)",
|
| 189 |
+
evidence_text=f"Aggregate ClinVar classification: {consensus.classification} β {stars}β
review",
|
| 190 |
+
confidence=confidence,
|
| 191 |
+
caveat=("ACMG SVI 2018 deprecated PP5 as standalone β verify before final sign-off"
|
| 192 |
+
if stars < 3 else None),
|
| 193 |
+
))
|
| 194 |
+
elif is_benign:
|
| 195 |
+
out.append(ACMGCriterion(
|
| 196 |
+
code="BP6",
|
| 197 |
+
triggered=True,
|
| 198 |
+
strength=strength,
|
| 199 |
+
source=f"ClinVar consensus {consensus.accession} ({stars}β
)",
|
| 200 |
+
evidence_text=f"Aggregate ClinVar classification: {consensus.classification} β {stars}β
review",
|
| 201 |
+
confidence=confidence,
|
| 202 |
+
caveat=("ACMG SVI 2018 deprecated BP6 as standalone β verify before final sign-off"
|
| 203 |
+
if stars < 3 else None),
|
| 204 |
+
))
|
| 205 |
+
return out
|
| 206 |
+
|
| 207 |
+
def score_all(self, evidence: EvidenceBundle) -> list[ACMGCriterion]:
|
| 208 |
+
criteria: list[ACMGCriterion] = []
|
| 209 |
+
pvs1 = self.score_pvs1(evidence.autopvs1)
|
| 210 |
+
if pvs1:
|
| 211 |
+
criteria.append(pvs1)
|
| 212 |
+
criteria.extend(self.score_population(evidence.population_frequency))
|
| 213 |
+
criteria.extend(self.score_insilico(evidence.insilico))
|
| 214 |
+
criteria.extend(self.score_clinvar(evidence.clinvar_existing))
|
| 215 |
+
return criteria
|
backend/app/services/clinvar.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ClinVar lookup — aggregate consensus + per-submitter assertions.
|
| 2 |
+
|
| 3 |
+
The previous implementation only fetched `ids[0]` from esearch, which often
|
| 4 |
+
isn't the canonical VariationArchive (esearch ranks by recency, not by
|
| 5 |
+
match quality). It also ignored the aggregate `GermlineClassification`
|
| 6 |
+
field, so a variant with 50 Pathogenic assertions and a 3-star expert-panel
|
| 7 |
+
review status would render as the first lab-level submission found — often
|
| 8 |
+
a discordant single-lab call.
|
| 9 |
+
|
| 10 |
+
This module now:
|
| 11 |
+
1. Fetches all matching variation IDs from esearch (up to MAX_IDS).
|
| 12 |
+
2. Extracts the aggregate `Classifications/GermlineClassification` from
|
| 13 |
+
each — that's the curated consensus that ClinGen uses for the green
|
| 14 |
+
star ratings.
|
| 15 |
+
3. Picks the entry whose review status carries the highest weight.
|
| 16 |
+
4. Returns it as the primary `ClinVarSubmission`, plus up to N
|
| 17 |
+
supporting per-submitter assertions for the UI's evidence list.
|
| 18 |
+
"""
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import logging
|
| 22 |
+
import xml.etree.ElementTree as ET
|
| 23 |
+
from typing import Any
|
| 24 |
+
|
| 25 |
+
import httpx
|
| 26 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 27 |
+
|
| 28 |
+
from backend.app.config import get_settings
|
| 29 |
+
from backend.app.schemas.evidence import ClinVarSubmission
|
| 30 |
+
|
| 31 |
+
logger = logging.getLogger(__name__)
settings = get_settings()

# Base URL of the NCBI E-utilities endpoints used below (esearch.fcgi / efetch.fcgi).
EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# Upper bound on variation IDs requested from esearch and passed on to efetch.
MAX_IDS = 10
# Upper bound on per-submitter assertions returned after the aggregate entry.
MAX_ASSERTIONS = 10

# Lower-cased ClinVar review-status text -> star count. _stars_for() looks
# statuses up here and returns 0 for any text that is not listed.
REVIEW_STATUS_STARS: dict[str, int] = {
    "practice guideline": 4,
    "reviewed by expert panel": 3,
    "criteria provided, multiple submitters, no conflicts": 2,
    "criteria provided, multiple submitters": 2,
    "criteria provided, single submitter": 1,
    "criteria provided, conflicting classifications": 1,
    "criteria provided, conflicting interpretations": 1,
    "no assertion criteria provided": 0,
    "no classification provided": 0,
    "no assertion provided": 0,
    "no classifications from unflagged records": 0,
}
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _stars_for(review_status: str | None) -> int:
    """Translate a ClinVar review-status string into its star count.

    Matching ignores surrounding whitespace and letter case. A missing,
    empty, or unlisted status yields 0.
    """
    if review_status:
        normalized = review_status.strip().lower()
        return REVIEW_STATUS_STARS.get(normalized, 0)
    return 0
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class ClinVarClient:
    """Async ClinVar client built on NCBI E-utilities (esearch + efetch).

    ``lookup`` is the entry point: it searches for variation IDs, bulk-fetches
    their VariationArchive XML, and returns the highest-starred aggregate
    classification followed by lab-level assertions from the same archive.
    Network failures are logged and reported as an empty list.
    """

    def __init__(self, api_key: str | None = None, email: str | None = None) -> None:
        """Store NCBI credentials, falling back to application settings."""
        self.api_key = api_key or settings.ncbi_api_key
        self.email = email or settings.ncbi_email

    def _params(self, **extra: Any) -> dict[str, Any]:
        """Build the query parameters shared by every E-utilities call.

        ``extra`` entries are merged last and override the defaults.
        """
        params: dict[str, Any] = {"db": "clinvar", "tool": "VariantLens"}
        # Optional identifiers are sent only when configured, rather than
        # handing an unset (None) value to the HTTP client.
        if self.email:
            params["email"] = self.email
        if self.api_key:
            params["api_key"] = self.api_key
        return {**params, **extra}

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=8), reraise=True)
    async def search(self, hgvs: str) -> list[str]:
        """Return up to MAX_IDS ClinVar variation IDs matching *hgvs* via esearch.

        Raises whatever httpx raises (after three attempts) on transport or
        HTTP-status errors.
        """
        async with httpx.AsyncClient(timeout=15.0) as client:
            r = await client.get(
                f"{EUTILS}/esearch.fcgi",
                params=self._params(term=hgvs, retmode="json", retmax=MAX_IDS),
            )
            r.raise_for_status()
            return r.json().get("esearchresult", {}).get("idlist", [])

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=8), reraise=True)
    async def _efetch(self, variation_ids: list[str]) -> str:
        """Bulk-fetch up to N variation IDs in one call. ClinVar's efetch
        supports comma-separated IDs and returns a single ClinVarResult
        document containing one VariationArchive per ID."""
        async with httpx.AsyncClient(timeout=30.0) as client:
            r = await client.get(
                f"{EUTILS}/efetch.fcgi",
                params=self._params(
                    id=",".join(variation_ids),
                    rettype="vcv",
                    is_variationid="true",
                ),
            )
            r.raise_for_status()
            return r.text

    def _parse_aggregate(self, vcv: ET.Element) -> ClinVarSubmission | None:
        """Extract the canonical aggregate consensus from a VariationArchive.

        This corresponds to the star-rated review at the top of the ClinVar
        web page — the single line of consensus that the ACMG SVI and most
        clinical labs treat as the authoritative ClinVar verdict.

        Returns ``None`` when the archive carries no germline classification
        description.
        """
        accession = vcv.get("Accession") or vcv.get("VariationID") or "unknown"

        # All aggregate fields hang off Classifications/GermlineClassification.
        base = ".//Classifications/GermlineClassification"
        cls_node = vcv.find(f"{base}/Description")
        if cls_node is None or not cls_node.text:
            return None

        review_node = vcv.find(f"{base}/ReviewStatus")
        review = review_node.text if review_node is not None else None

        date = ""
        date_node = vcv.find(base)
        if date_node is not None:
            date = date_node.get("DateLastEvaluated") or date_node.get("DateCreated") or ""

        cond_nodes = vcv.findall(f"{base}/ConditionList/TraitSet/Trait/Name/ElementValue")
        # Prefer the name flagged Type="Preferred"; otherwise take the first
        # name that has text. Using None as the "not found" marker keeps a
        # trait literally named "not specified" from being mistaken for it.
        condition = next(
            (n.text for n in cond_nodes if n.get("Type") == "Preferred" and n.text),
            None,
        )
        if condition is None and cond_nodes and cond_nodes[0].text:
            condition = cond_nodes[0].text

        return ClinVarSubmission(
            accession=accession,
            submitter="ClinVar aggregate",
            classification=cls_node.text,
            stars=_stars_for(review),
            date=date,
            condition=condition or "not specified",
        )

    def _parse_assertions(self, vcv: ET.Element, limit: int) -> list[ClinVarSubmission]:
        """Pull individual lab-level assertions for the UI's evidence list.

        Aggregated separately from the consensus so the rule engine doesn't
        double-count a single ClinVar entry into 50 distinct PP5 hits.
        Assertions without classification text are skipped; at most *limit*
        entries are returned.
        """
        out: list[ClinVarSubmission] = []
        for scv in vcv.iter("ClinicalAssertion"):
            if len(out) >= limit:
                break

            cls_node = scv.find("Classification/GermlineClassification")
            if cls_node is None or not cls_node.text:
                continue

            acc_node = scv.find(".//ClinVarAccession")
            acc = acc_node.get("Accession") if acc_node is not None else None
            submitter = acc_node.get("SubmitterName") if acc_node is not None else None

            review_node = scv.find("Classification/ReviewStatus")
            review = review_node.text if review_node is not None else None

            date_node = scv.find("Classification")
            date = date_node.get("DateLastEvaluated") if date_node is not None else None

            cond_node = scv.find(".//TraitSet/Trait/Name/ElementValue")
            condition = cond_node.text if cond_node is not None and cond_node.text else "not specified"

            out.append(ClinVarSubmission(
                accession=acc or "unknown",
                submitter=submitter or "unknown",
                classification=cls_node.text,
                stars=_stars_for(review),
                date=date or "",
                condition=condition,
            ))
        return out

    def _parse(self, xml_text: str) -> list[ClinVarSubmission]:
        """Parse all VariationArchives in the ClinVar response.

        Returns the strongest aggregate consensus first, then up to
        MAX_ASSERTIONS per-submitter assertions from the same archive.
        Malformed XML is logged and yields an empty list.
        """
        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError as e:
            logger.warning("clinvar xml parse failure: %s", e)
            return []

        # esearch sometimes returns several IDs (alternative alleles, related
        # variants); keep only archives that carry an aggregate consensus.
        candidates = [
            (vcv, agg)
            for vcv in root.iter("VariationArchive")
            if (agg := self._parse_aggregate(vcv)) is not None
        ]
        if not candidates:
            return []

        # max() keeps the first archive among equal star counts, matching a
        # stable descending sort.
        canonical_vcv, consensus = max(candidates, key=lambda pair: pair[1].stars)
        return [consensus] + self._parse_assertions(canonical_vcv, MAX_ASSERTIONS)

    async def lookup(self, hgvs: str) -> list[ClinVarSubmission]:
        """Look up *hgvs* in ClinVar; best-effort, never raises on network errors.

        Returns ``[]`` when the search or fetch fails, when nothing matches,
        or when the response cannot be parsed.
        """
        try:
            ids = await self.search(hgvs)
        # httpx.TimeoutException is already an httpx.HTTPError; ValueError
        # covers an esearch body that is not valid JSON.
        except (httpx.HTTPError, ValueError) as e:
            logger.warning("ClinVar search failed for %s: %s", hgvs, e)
            return []
        if not ids:
            return []

        try:
            xml = await self._efetch(ids[:MAX_IDS])
        except httpx.HTTPError as e:
            logger.warning("ClinVar efetch failed for %s: %s", hgvs, e)
            return []

        return self._parse(xml)
|
backend/app/services/exports.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Export classifications to standard interchange formats.
|
| 2 |
+
|
| 3 |
+
ClinVar XML — Submission Schema v1.16 (https://www.ncbi.nlm.nih.gov/clinvar/docs/submit/).
|
| 4 |
+
FHIR R4 — Observation resource with LOINC 53037-8 (genetic clinical significance).
|
| 5 |
+
|
| 6 |
+
Both renderers read from a persisted `ClassificationRecord` so the audit
|
| 7 |
+
trail is intact (sign-off and curator overrides are reflected in the export).
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import xml.etree.ElementTree as ET
|
| 12 |
+
from datetime import UTC, datetime
|
| 13 |
+
from typing import Any
|
| 14 |
+
from xml.dom import minidom
|
| 15 |
+
|
| 16 |
+
from backend.app.models.classification import ClassificationRecord
|
| 17 |
+
|
| 18 |
+
# Record significance label -> text written to the ClinVar <Description>
# element. render_clinvar_xml passes unknown labels through unchanged.
CLINVAR_SIG_MAP = {
    "Pathogenic": "Pathogenic",
    "Likely Pathogenic": "Likely pathogenic",
    "Uncertain Significance": "Uncertain significance",
    "Likely Benign": "Likely benign",
    "Benign": "Benign",
}

# Record significance label -> coding dict placed in
# valueCodeableConcept.coding by render_fhir_observation, which substitutes a
# NullFlavor "OTH" coding for labels missing from this table.
LOINC_CLINSIG = {
    "Pathogenic": {"system": "http://loinc.org", "code": "LA6668-3", "display": "Pathogenic"},
    "Likely Pathogenic": {"system": "http://loinc.org", "code": "LA26332-9", "display": "Likely pathogenic"},
    "Uncertain Significance": {"system": "http://loinc.org", "code": "LA26333-7", "display": "Uncertain significance"},
    "Likely Benign": {"system": "http://loinc.org", "code": "LA26334-5", "display": "Likely benign"},
    "Benign": {"system": "http://loinc.org", "code": "LA6675-8", "display": "Benign"},
}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _today() -> str:
|
| 36 |
+
return datetime.now(UTC).strftime("%Y-%m-%d")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _sub_text(parent: ET.Element, tag: str, text: str, **attrib: str) -> ET.Element:
    """Append a child element carrying *text* (and optional attributes) to *parent*."""
    node = ET.SubElement(parent, tag, attrib=attrib)
    node.text = text
    return node


def render_clinvar_xml(rec: ClassificationRecord, *, submitter_org_id: str = "VARIANTLENS_LAB") -> str:
    """Render a minimal ClinVar SCV submission for a single variant.

    Builds a ``ClinvarSubmissionSet > ClinvarSubmission > ClinVarAssertion``
    tree from the persisted record and returns it as pretty-printed XML.

    NOTE(review): nothing here validates the result against the ClinVar
    submission schema — confirm element names and attribute values against
    the official XSD before submitting.

    Args:
        rec: Persisted classification, including sign-off state and criteria.
        submitter_org_id: Value written to ``ClinVarAccession/@OrgID``.

    Returns:
        The XML document as a string (with XML declaration).
    """
    # ElementTree can only serialise string attribute values, so coerce the
    # primary key once; the date is read once so every attribute agrees.
    record_id = str(rec.id)
    today = _today()

    root = ET.Element("ClinvarSubmissionSet", attrib={"Date": today})
    submission = ET.SubElement(root, "ClinvarSubmission", attrib={
        "ID": record_id,
        "SubmissionDate": today,
    })
    assertion = ET.SubElement(submission, "ClinVarAssertion")

    # ClinVarAccession — submitter assigned IDs
    ET.SubElement(assertion, "ClinVarAccession", attrib={
        "Acc": f"SCV-LOCAL-{record_id}",
        "Type": "SCV",
        "OrgID": submitter_org_id,
    })

    _sub_text(assertion, "RecordStatus", "current")

    # ClinicalSignificance — the actual call
    evaluated = rec.signed_off_at.strftime("%Y-%m-%d") if rec.signed_off_at else today
    cs = ET.SubElement(assertion, "ClinicalSignificance", attrib={"DateLastEvaluated": evaluated})
    _sub_text(
        cs,
        "ReviewStatus",
        "criteria provided, single submitter" if rec.curator_signoff else "no assertion criteria provided",
    )
    _sub_text(cs, "Description", CLINVAR_SIG_MAP.get(rec.significance, rec.significance))
    if rec.rationale:
        # NOTE(review): "ConvertedByNCBI" reads like a label for NCBI-authored
        # comments, not a submitter rationale — confirm the intended Type.
        _sub_text(cs, "Comment", rec.rationale, Type="ConvertedByNCBI")

    # AssertionMethod — the ruleset
    method = ET.SubElement(assertion, "AssertionMethod")
    _sub_text(
        method,
        "MethodName",
        f"ACMG/AMP guidelines (Richards 2015) — VariantLens {rec.ruleset_version}",
    )

    # ObservedIn — placeholder for the proband
    obs_in = ET.SubElement(assertion, "ObservedIn")
    sample = ET.SubElement(obs_in, "Sample")
    _sub_text(sample, "Origin", "germline")
    _sub_text(sample, "Species", "human", TaxonomyId="9606")
    _sub_text(sample, "AffectedStatus", "yes")
    method_obs = ET.SubElement(obs_in, "Method")
    _sub_text(method_obs, "MethodType", "clinical testing")
    obs_data = ET.SubElement(obs_in, "ObservedData")
    _sub_text(
        obs_data,
        "Attribute",
        (f"Variant interpreted by VariantLens with "
         f"{len(rec.triggered_criteria or [])} ACMG criteria triggered."),
        Type="Description",
    )

    # MeasureSet — the variant itself
    measure_set = ET.SubElement(assertion, "MeasureSet", attrib={"Type": "Variant"})
    measure = ET.SubElement(measure_set, "Measure", attrib={"Type": "Variation"})
    variant = getattr(rec, "variant", None)
    if rec.variant_id and variant is not None:
        # Use the raw HGVS strings from the related variant if available
        for attr_name, attr_type in [
            ("hgvs_coding", "HGVS, coding"),
            ("hgvs_protein", "HGVS, protein"),
            ("hgvs_genomic", "HGVS, genomic"),
        ]:
            val = getattr(variant, attr_name, None)
            if val:
                attr_set = ET.SubElement(measure, "AttributeSet")
                _sub_text(attr_set, "Attribute", val, Type=attr_type)

    # Per-criterion comments — the audit trail in flat form
    for c in rec.criteria or []:
        if not c.triggered:
            continue
        bits = [f"{c.code} ({c.strength})", f"source={c.source}"]
        if c.pmid:
            bits.append(f"PMID:{c.pmid}")
        if c.curator_override and c.override_justification:
            bits.append(f"curator override: {c.override_justification}")
        # A missing evidence text must not break join().
        if c.evidence_text:
            bits.append(c.evidence_text)
        _sub_text(assertion, "Comment", " — ".join(bits), Type="public")

    rough = ET.tostring(root, encoding="utf-8")
    return minidom.parseString(rough).toprettyxml(indent=" ")
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def render_fhir_observation(rec: ClassificationRecord) -> dict[str, Any]:
    """Render a FHIR R4 Observation resource for the variant interpretation.

    Targets the HL7 Genomics Reporting IG ``variant`` profile (declared in
    ``meta.profile``). The encoded structure is the minimum needed for an EHR
    import — extend with ``specimen``, ``subject``, and ``performer``
    references at the deployment boundary.

    NOTE(review): ``derivedFrom`` carries extension/valueString objects, and
    ``interpretation``/``component``/``derivedFrom`` may be empty lists.
    Strict FHIR validators may reject both — confirm with the consuming
    system before changing the shape, since callers may index these keys.

    Args:
        rec: Persisted classification, including sign-off state and criteria.

    Returns:
        A JSON-serialisable dict with ``resourceType`` "Observation".
    """
    # Labels outside the LOINC answer list fall back to a NullFlavor coding.
    sig = LOINC_CLINSIG.get(rec.significance, {
        "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor",
        "code": "OTH",
        "display": rec.significance,
    })

    components: list[dict[str, Any]] = []
    variant = getattr(rec, "variant", None)
    if variant is not None:
        for code, display, attr in [
            ("48004-6", "DNA change (c.HGVS)", "hgvs_coding"),
            ("48005-3", "Amino acid change (p.HGVS)", "hgvs_protein"),
            ("81290-9", "Genomic DNA change (g.HGVS)", "hgvs_genomic"),
            ("48018-6", "Gene studied [ID]", "gene_symbol"),
        ]:
            val = getattr(variant, attr, None)
            if val:
                components.append({
                    "code": {"coding": [{"system": "http://loinc.org", "code": code, "display": display}]},
                    "valueString": val,
                })

    # One entry per triggered criterion; untriggered criteria are omitted.
    derived: list[dict[str, Any]] = [
        {
            "extension": [
                {"url": "https://variantlens.local/fhir/criterion-code", "valueString": c.code},
                {"url": "https://variantlens.local/fhir/criterion-strength", "valueString": c.strength},
                {"url": "https://variantlens.local/fhir/criterion-source", "valueString": c.source},
            ],
            "valueString": c.evidence_text,
        }
        for c in rec.criteria or []
        if c.triggered
    ]

    # Timestamp precedence: sign-off time, then creation time, then "now" (UTC).
    if rec.signed_off_at:
        issued = rec.signed_off_at.isoformat()
    elif rec.created_at:
        issued = rec.created_at.isoformat()
    else:
        issued = datetime.now(UTC).isoformat()

    return {
        "resourceType": "Observation",
        # Resource ids are strings; coerce so a non-string primary key still
        # serialises as one.
        "id": str(rec.id),
        "meta": {
            "profile": [
                "http://hl7.org/fhir/uv/genomics-reporting/StructureDefinition/variant",
            ],
        },
        "status": "final" if rec.curator_signoff else "preliminary",
        "category": [{
            "coding": [{
                "system": "http://terminology.hl7.org/CodeSystem/observation-category",
                "code": "laboratory",
            }],
        }],
        "code": {
            "coding": [{
                "system": "http://loinc.org",
                "code": "53037-8",
                "display": "Genetic variation clinical significance",
            }],
        },
        "issued": issued,
        "performer": [{"display": rec.curator_id or "VariantLens (auto)"}],
        "valueCodeableConcept": {"coding": [sig], "text": rec.significance},
        "interpretation": [{"text": rec.rationale}] if rec.rationale else [],
        "note": [{"text": f"ACMG ruleset {rec.ruleset_version}; "
                          f"triggered: {', '.join(rec.triggered_criteria or [])}"}],
        "component": components,
        "derivedFrom": derived,
    }
|
backend/app/services/gnomad.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import sqlite3
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import httpx
|
| 6 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 7 |
+
|
| 8 |
+
from backend.app.config import get_settings
|
| 9 |
+
from backend.app.schemas.evidence import PopulationFrequency
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
settings = get_settings()

# GraphQL document posted by GnomADClient._fetch. For one variant in one
# dataset it requests exome and genome ac/an/af, homozygote counts (ac_hom)
# and per-population ac/an.
GNOMAD_QUERY = """
query VariantInfo($variantId: String!, $datasetId: DatasetId!) {
variant(variantId: $variantId, dataset: $datasetId) {
variant_id
exome {
ac
an
af
ac_hom
populations { id ac an }
}
genome {
ac
an
af
ac_hom
populations { id ac an }
}
}
}
"""
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class GnomADClient:
|
| 38 |
+
def __init__(self, url: str | None = None, cache_db: Path | None = None) -> None:
    """Configure the endpoint and cache location, then prepare the cache.

    Either argument falls back to the matching application setting when it
    is omitted (or otherwise falsy).
    """
    self.url = url if url else settings.gnomad_graphql_url
    self.cache_db = cache_db if cache_db else settings.gnomad_cache_db
    self._init_cache()
|
| 42 |
+
|
| 43 |
+
def _init_cache(self) -> None:
|
| 44 |
+
self.cache_db.parent.mkdir(parents=True, exist_ok=True)
|
| 45 |
+
with sqlite3.connect(self.cache_db) as conn:
|
| 46 |
+
conn.execute(
|
| 47 |
+
"""
|
| 48 |
+
CREATE TABLE IF NOT EXISTS gnomad_cache (
|
| 49 |
+
variant_id TEXT PRIMARY KEY,
|
| 50 |
+
af REAL,
|
| 51 |
+
homozygotes INTEGER,
|
| 52 |
+
populations TEXT,
|
| 53 |
+
coverage_warning TEXT,
|
| 54 |
+
fetched_at TEXT DEFAULT CURRENT_TIMESTAMP
|
| 55 |
+
)
|
| 56 |
+
"""
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=8), reraise=True)
|
| 60 |
+
async def _fetch(self, variant_id: str, dataset: str = "gnomad_r4") -> dict | None:
|
| 61 |
+
async with httpx.AsyncClient(timeout=15.0) as client:
|
| 62 |
+
r = await client.post(
|
| 63 |
+
self.url,
|
| 64 |
+
json={
|
| 65 |
+
"query": GNOMAD_QUERY,
|
| 66 |
+
"variables": {"variantId": variant_id, "datasetId": dataset},
|
| 67 |
+
},
|
| 68 |
+
)
|
| 69 |
+
r.raise_for_status()
|
| 70 |
+
payload = r.json()
|
| 71 |
+
return payload.get("data", {}).get("variant")
|
| 72 |
+
|
| 73 |
+
async def lookup(self, variant_id: str) -> PopulationFrequency:
|
| 74 |
+
with sqlite3.connect(self.cache_db) as conn:
|
| 75 |
+
row = conn.execute(
|
| 76 |
+
"SELECT af, homozygotes, populations, coverage_warning FROM gnomad_cache WHERE variant_id = ?",
|
| 77 |
+
(variant_id,),
|
| 78 |
+
).fetchone()
|
| 79 |
+
if row:
|
| 80 |
+
af, hom, pops_str, cov = row
|
| 81 |
+
import json
|
| 82 |
+
return PopulationFrequency(
|
| 83 |
+
overall_af=af,
|
| 84 |
+
homozygote_count=hom,
|
| 85 |
+
by_population=json.loads(pops_str) if pops_str else {},
|
| 86 |
+
coverage_warning=cov,
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
try:
|
| 90 |
+
data = await self._fetch(variant_id)
|
| 91 |
+
except (httpx.HTTPError, httpx.TimeoutException) as e:
|
| 92 |
+
logger.warning("gnomAD fetch failed for %s: %s", variant_id, e)
|
| 93 |
+
return PopulationFrequency(coverage_warning=f"fetch failed: {e}")
|
| 94 |
+
|
| 95 |
+
if not data:
|
| 96 |
+
return PopulationFrequency(coverage_warning="not found in gnomAD")
|
| 97 |
+
|
| 98 |
+
exome = data.get("exome") or {}
|
| 99 |
+
genome = data.get("genome") or {}
|
| 100 |
+
af = exome.get("af") or genome.get("af") or 0.0
|
| 101 |
+
hom = (exome.get("ac_hom") or 0) + (genome.get("ac_hom") or 0)
|
| 102 |
+
|
| 103 |
+
populations: dict[str, float] = {}
|
| 104 |
+
for src in (exome, genome):
|
| 105 |
+
for pop in src.get("populations") or []:
|
| 106 |
+
if pop["an"]:
|
| 107 |
+
populations[pop["id"]] = (pop.get("ac") or 0) / pop["an"]
|
| 108 |
+
|
| 109 |
+
import json
|
| 110 |
+
with sqlite3.connect(self.cache_db) as conn:
|
| 111 |
+
conn.execute(
|
| 112 |
+
"INSERT OR REPLACE INTO gnomad_cache (variant_id, af, homozygotes, populations, coverage_warning) VALUES (?, ?, ?, ?, ?)",
|
| 113 |
+
(variant_id, af, hom, json.dumps(populations), None),
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
return PopulationFrequency(
|
| 117 |
+
overall_af=af, homozygote_count=hom, by_population=populations
|
| 118 |
+
)
|
backend/app/services/insilico.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import sqlite3
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import httpx
|
| 6 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 7 |
+
|
| 8 |
+
from backend.app.config import get_settings
|
| 9 |
+
from backend.app.schemas.evidence import InSilicoResult
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
settings = get_settings()
|
| 13 |
+
|
| 14 |
+
REVEL_PATHOGENIC_THRESHOLD = 0.7
|
| 15 |
+
REVEL_BENIGN_THRESHOLD = 0.15
|
| 16 |
+
ALPHAMISSENSE_PATHOGENIC = 0.564
|
| 17 |
+
ALPHAMISSENSE_BENIGN = 0.34
|
| 18 |
+
SPLICEAI_PATHOGENIC = 0.5
|
| 19 |
+
CADD_PATHOGENIC = 25.0
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class InSilicoPredictor:
    """Collect REVEL, AlphaMissense and SpliceAI scores and derive PP3/BP4.

    REVEL and AlphaMissense are read from local SQLite files; SpliceAI is
    queried over HTTP. Every lookup degrades to ``None`` when its data source
    is missing or fails, so ``assess`` always returns a result.
    """

    def __init__(
        self,
        revel_db: Path | None = None,
        alphamissense_db: Path | None = None,
        spliceai_url: str | None = None,
    ) -> None:
        """Resolve data-source locations, falling back to application settings."""
        self.revel_db = revel_db or settings.revel_db_path
        self.alphamissense_db = alphamissense_db or settings.alphamissense_db_path
        self.spliceai_url = spliceai_url or settings.spliceai_lookup_url

    @staticmethod
    def _first_row(db_path: Path, sql: str, params: tuple) -> tuple | None:
        """Run a read query and return its first row, closing the connection.

        ``with sqlite3.connect(...)`` does not close the handle, so it is
        closed explicitly here.
        """
        conn = sqlite3.connect(db_path)
        try:
            return conn.execute(sql, params).fetchone()
        finally:
            conn.close()

    def lookup_revel(self, chrom: str, pos: int, ref: str, alt: str) -> float | None:
        """Return the REVEL score for the exact chrom/pos/ref/alt, else ``None``."""
        if not self.revel_db.exists():
            logger.debug("REVEL db not present; skip")
            return None
        try:
            row = self._first_row(
                self.revel_db,
                "SELECT score FROM revel WHERE chrom = ? AND pos = ? AND ref = ? AND alt = ?",
                (chrom, pos, ref, alt),
            )
            return row[0] if row else None
        except sqlite3.DatabaseError as e:
            logger.warning("REVEL lookup error: %s", e)
            return None

    def lookup_alphamissense(
        self,
        chrom: str | None,
        pos: int | None,
        ref: str | None,
        alt: str | None,
        transcript: str | None = None,
    ) -> float | None:
        """Genomic-coordinate lookup against the SQLite cache.

        AlphaMissense scores live at chr/pos/ref/alt x transcript granularity.
        We try (chrom,pos,ref,alt,transcript) first, then fall back to the
        first matching transcript at that locus.
        """
        if not self.alphamissense_db.exists():
            logger.debug("AlphaMissense db not present; skip")
            return None
        if not (chrom and pos and ref and alt):
            return None
        # removeprefix drops only a literal leading "chr"; lstrip("chr") would
        # strip any run of the characters c/h/r from the contig name.
        bare_chrom = chrom.removeprefix("chr")
        try:
            if transcript:
                row = self._first_row(
                    self.alphamissense_db,
                    "SELECT score FROM alphamissense WHERE chrom = ? AND pos = ? AND ref = ? AND alt = ? AND transcript = ?",
                    (bare_chrom, pos, ref, alt, transcript),
                )
                if row:
                    return float(row[0])
            row = self._first_row(
                self.alphamissense_db,
                "SELECT score FROM alphamissense WHERE chrom = ? AND pos = ? AND ref = ? AND alt = ? LIMIT 1",
                (bare_chrom, pos, ref, alt),
            )
            return float(row[0]) if row else None
        except sqlite3.DatabaseError as e:
            logger.warning("AlphaMissense lookup error: %s", e)
            return None

    @retry(stop=stop_after_attempt(2), wait=wait_exponential(min=1, max=5), reraise=True)
    async def _fetch_spliceai(self, hgvs_genomic: str) -> dict:
        """GET the SpliceAI lookup payload; retried once on failure.

        The retry decorator sits on this helper because the public method
        catches HTTP errors itself, which would otherwise hide every failure
        from the retry logic.
        """
        async with httpx.AsyncClient(timeout=15.0) as client:
            r = await client.get(
                f"{self.spliceai_url}/api",
                params={"hg": "38", "distance": "50", "mask": "0", "variant": hgvs_genomic},
            )
            r.raise_for_status()
            return r.json()

    async def lookup_spliceai(self, hgvs_genomic: str) -> float | None:
        """Return the maximum SpliceAI delta score, or ``None`` on any failure.

        The maximum is taken over DS_AG/DS_AL/DS_DG/DS_DL of every entry in
        the response's ``scores`` list.
        """
        try:
            data = await self._fetch_spliceai(hgvs_genomic)
            scores = data.get("scores") or []
            if not scores:
                return None
            ds = [
                float(score.get(key, 0))
                for score in scores
                for key in ("DS_AG", "DS_AL", "DS_DG", "DS_DL")
            ]
            return max(ds)
        except (httpx.HTTPError, httpx.TimeoutException, ValueError, TypeError) as e:
            # TypeError: a delta score present but null makes float(None) fail.
            logger.warning("SpliceAI lookup failed: %s", e)
            return None

    async def assess(
        self,
        chrom: str | None,
        pos: int | None,
        ref: str | None,
        alt: str | None,
        transcript: str | None,
        hgvs_genomic: str | None,
    ) -> InSilicoResult:
        """Gather all available predictor scores and evaluate PP3 / BP4.

        A predictor with no data casts no vote. PP3 fires when at least one
        predictor votes pathogenic and none votes benign; BP4 is the mirror
        image. The ``concordant_*`` flags additionally require at least two
        predictors with data and unanimity among them.
        """
        revel = (
            self.lookup_revel(chrom, pos, ref, alt)
            if chrom and pos and ref and alt
            else None
        )
        am = self.lookup_alphamissense(chrom, pos, ref, alt, transcript)
        splice = await self.lookup_spliceai(hgvs_genomic) if hgvs_genomic else None

        path_votes = sum(
            [
                revel is not None and revel >= REVEL_PATHOGENIC_THRESHOLD,
                am is not None and am >= ALPHAMISSENSE_PATHOGENIC,
                splice is not None and splice >= SPLICEAI_PATHOGENIC,
            ]
        )
        # NOTE(review): unlike REVEL/AlphaMissense, SpliceAI has no separate
        # benign cut-off here, so any score below the pathogenic threshold is
        # a benign vote and blocks PP3 for a missense call -- confirm intended.
        benign_votes = sum(
            [
                revel is not None and revel <= REVEL_BENIGN_THRESHOLD,
                am is not None and am <= ALPHAMISSENSE_BENIGN,
                splice is not None and splice < SPLICEAI_PATHOGENIC,
            ]
        )
        total_with_data = sum([revel is not None, am is not None, splice is not None])

        # ClinGen SVI 2022 -- fire if at least one strong predictor agrees
        # AND no predictor strongly contradicts. The strict "unanimous"
        # rule was rejecting BP4 whenever REVEL was middling, which
        # missed real benign missense calls.
        pp3 = path_votes >= 1 and benign_votes == 0 and total_with_data >= 1
        bp4 = benign_votes >= 1 and path_votes == 0 and total_with_data >= 1

        return InSilicoResult(
            revel=revel,
            alphamissense=am,
            spliceai_max=splice,
            concordant_pathogenic=total_with_data >= 2 and path_votes == total_with_data,
            concordant_benign=total_with_data >= 2 and benign_votes == total_with_data,
            pp3_triggered=pp3,
            bp4_triggered=bp4,
        )
|
backend/app/services/llm/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from backend.app.services.llm.prompts import build_user_prompt, get_system_prompt
|
| 2 |
+
from backend.app.services.llm.reasoner import ClaudeReasoner
|
| 3 |
+
from backend.app.services.llm.synthesizer import EvidenceSynthesizer
|
| 4 |
+
|
| 5 |
+
__all__ = ["ClaudeReasoner", "EvidenceSynthesizer", "build_user_prompt", "get_system_prompt"]
|
backend/app/services/llm/prompts.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hallucination-suppressed prompt templates for literature-dependent ACMG criteria.
|
| 3 |
+
|
| 4 |
+
Mirrors the AI CURA strategy (Chung, Ma et al. 2025): Claude is allowed to reason
|
| 5 |
+
ONLY over the retrieved chunks. Every output must cite a PMID present in the
|
| 6 |
+
context. Output is structured JSON; no free text.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
|
| 11 |
+
from backend.app.schemas.evidence import LiteratureChunk
|
| 12 |
+
|
| 13 |
+
# System prompt shared by every literature-reasoning call. It restricts the
# model to the retrieved chunks and fixes the JSON output schema. Rule 5 uses
# a real em dash; the previous text carried a mis-decoded character there.
SYSTEM_PROMPT = """You are a clinical genetics variant curator assistant working within an ACMG/AMP framework. Your role is to extract structured evidence from the provided literature context ONLY.

CRITICAL RULES:
1. Do NOT use any knowledge from your training data about this variant, gene, or disease beyond standard biology background. All claims about specific findings must come from the provided context chunks.
2. Only cite evidence that appears verbatim in the provided context chunks.
3. If the context does not contain sufficient evidence for a criterion, output: "triggered": false, "evidence": "insufficient evidence in provided literature".
4. For each criterion you assess, cite the specific PMID and quote the relevant sentence(s) from the chunk text.
5. Output structured JSON only — no free text, no markdown, no preamble.
6. Flag any ambiguous phasing, uncertain phenotype matches, or potential ascertainment bias in the "caveat" field.
7. If a chunk's PMID is not in the context, do NOT cite it. Cited PMIDs MUST appear in the metadata of a provided chunk.

OUTPUT SCHEMA per criterion (JSON object):
{
"criterion": "PM3" | "PP1" | "PS3" | "BS3" | "PS4" | "PP4" | "PS2" | "PM6" | "PP5" | "BP6",
"triggered": true | false,
"strength": "supporting" | "moderate" | "strong" | "very_strong",
"evidence": "<exact quote from a context chunk>",
"pmid": "<PMID from chunk metadata>",
"confidence": "high" | "medium" | "low",
"caveat": "<optional text or null>"
}
Return a JSON array of one object per requested criterion."""
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Per-criterion instructions inserted into the user prompt, keyed by ACMG
# code. The dash and ">=" characters are real Unicode ("—", "≥"); the
# previous text carried mis-decoded bytes in their place.
CRITERION_GUIDANCE: dict[str, str] = {
    "PM3": (
        "PM3 — observed in trans with another pathogenic/likely-pathogenic variant. "
        "Look for explicit statements of compound heterozygosity, in-trans observation, "
        "or biallelic occurrence with parental confirmation."
    ),
    "PP1": (
        "PP1 — co-segregation with disease in multiple affected family members. "
        "Count distinct affected segregating individuals; require ≥3 for moderate, ≥7 for strong."
    ),
    "PS3": (
        "PS3 — well-established in vitro or in vivo functional studies show a deleterious effect. "
        "Penalize assays with poor controls, single replicates, or non-physiological systems."
    ),
    "BS3": (
        "BS3 — well-established functional studies show no measurable effect."
    ),
    "PS4": (
        "PS4 — variant prevalence in cases significantly increased over controls. "
        "Extract case counts and odds ratios where present."
    ),
    "PP4": (
        "PP4 — patient phenotype highly specific for a disease with single genetic etiology. "
        "Require explicit phenotype description, not generic disease name."
    ),
    "PS2": "PS2 — confirmed de novo with parental confirmation.",
    "PM6": "PM6 — assumed de novo without parental confirmation.",
    "PP5": "PP5 — reputable source recently reports as pathogenic.",
    "BP6": "BP6 — reputable source recently reports as benign.",
}
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def get_system_prompt() -> str:
    """Return the module-level ``SYSTEM_PROMPT`` text unchanged."""
    prompt: str = SYSTEM_PROMPT
    return prompt
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def build_user_prompt(
    variant_hgvs: str,
    gene: str,
    disease: str | None,
    auto_scored: list[dict],
    chunks: list[LiteratureChunk],
    criteria: list[str],
) -> str:
    """Assemble the user message for one literature-reasoning call.

    Args:
        variant_hgvs: Variant description shown to the model.
        gene: Gene symbol shown to the model.
        disease: Disease name; rendered as ``unspecified`` when falsy.
        auto_scored: Already-scored database criteria, embedded as JSON for
            information only.
        chunks: Retrieved literature chunks; each is rendered with its PMID,
            year, title, hint criteria and text.
        criteria: Criterion codes to assess; codes without an entry in
            ``CRITERION_GUIDANCE`` are listed by their bare code.

    Returns:
        The full prompt text. With no chunks, the literature section tells
        the model to report insufficient evidence for every criterion.
    """
    chunk_blocks = [
        f"--- Chunk #{number} ---\n"
        f"PMID: {chunk.pmid}\n"
        f"Year: {chunk.year or 'unknown'}\n"
        f"Title: {chunk.title or 'n/a'}\n"
        # `or []` keeps a chunk without hint criteria from breaking join().
        f"Hint criteria: {', '.join(chunk.criteria_relevance or []) or 'none'}\n"
        f"Text:\n{chunk.chunk_text}\n"
        for number, chunk in enumerate(chunks, start=1)
    ]
    chunks_str = "\n".join(chunk_blocks) or "(no literature retrieved — output insufficient evidence for all criteria)"

    guidance_str = "\n".join(
        f"- {CRITERION_GUIDANCE.get(code, code)}" for code in criteria
    )

    return (
        f"Variant: {variant_hgvs}\n"
        f"Gene: {gene}\n"
        f"Disease: {disease or 'unspecified'}\n\n"
        f"PRE-SCORED DATABASE CRITERIA (do not re-evaluate these — informational only):\n"
        f"{json.dumps(auto_scored, indent=2)}\n\n"
        f"CRITERIA TO ASSESS FROM LITERATURE ONLY:\n"
        f"{guidance_str}\n\n"
        f"LITERATURE CONTEXT:\n"
        f"{chunks_str}\n\n"
        f"Output a JSON array with one entry per criterion in the order: {criteria}. "
        f"Cite only PMIDs that appear in the context above."
    )
|
backend/app/services/llm/reasoner.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
from typing import Any, cast
|
| 4 |
+
|
| 5 |
+
import anthropic
|
| 6 |
+
import httpx
|
| 7 |
+
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
|
| 8 |
+
|
| 9 |
+
from backend.app.config import get_settings
|
| 10 |
+
from backend.app.schemas.evidence import ACMGCriterion, LiteratureChunk
|
| 11 |
+
from backend.app.services.llm.prompts import build_user_prompt, get_system_prompt
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
settings = get_settings()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class ClaudeReasoner:
    """Ask an LLM to score literature-dependent ACMG criteria, then verify it.

    The model is either the Anthropic API or a local chat endpoint, chosen by
    ``settings.use_local_llm``. Every triggered criterion is accepted only if
    its PMID belongs to a supplied chunk and its quote occurs in that chunk;
    anything else is replaced by a non-triggered fallback criterion.
    """

    def __init__(self, api_key: str | None = None, model: str | None = None) -> None:
        """Resolve model and credentials from arguments or settings.

        ``self.client`` stays ``None`` in local mode or when no API key is
        available; ``_call`` reports the latter case with ``RuntimeError``.
        """
        self.api_key = api_key or settings.anthropic_api_key
        self.use_local_llm = settings.use_local_llm
        self.model = model or (settings.local_llm_model if self.use_local_llm else settings.anthropic_model)
        if self.use_local_llm or not self.api_key:
            self.client = None
        else:
            self.client = anthropic.Anthropic(api_key=self.api_key)

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(min=2, max=20),
        retry=retry_if_exception_type((anthropic.APIError, anthropic.RateLimitError, httpx.HTTPError)),
        reraise=True,
    )
    def _call(self, system: list[dict[str, Any]], user: str) -> str:
        """Send one request and return the first text block of the reply.

        Raises:
            RuntimeError: No Anthropic client is configured (not retried).
            anthropic.APIError, httpx.HTTPError: After three failed attempts.
        """
        if self.use_local_llm:
            return self._call_local(system, user)
        if self.client is None:
            raise RuntimeError("ANTHROPIC_API_KEY not set; cannot call Claude")
        response = self.client.messages.create(
            model=self.model,
            max_tokens=settings.anthropic_max_tokens,
            system=cast(Any, system),
            messages=[{"role": "user", "content": user}],
        )
        for block in response.content:
            if block.type == "text":
                return block.text
        return ""

    def _call_local(self, system: list[dict[str, Any]], user: str) -> str:
        """POST the conversation to the local ``/api/chat`` endpoint.

        Raises:
            httpx.HTTPError: Transport failure or non-success status.
            RuntimeError: The reply has no string ``message.content``.
        """
        system_text = "\n".join(str(part.get("text", "")) for part in system)
        payload = {
            "model": self.model,
            "stream": False,
            "format": "json",
            "messages": [
                {"role": "system", "content": system_text},
                {"role": "user", "content": user},
            ],
            "options": {"temperature": 0},
        }
        response = httpx.post(
            f"{settings.local_llm_base_url.rstrip('/')}/api/chat",
            json=payload,
            timeout=120,
        )
        response.raise_for_status()
        data = response.json()
        message = data.get("message", {})
        content = message.get("content")
        if not isinstance(content, str):
            raise RuntimeError("local LLM response did not include message.content")
        return content

    def reason_over_criteria(
        self,
        variant_hgvs: str,
        gene: str,
        disease: str | None,
        auto_scored_summary: list[dict[str, Any]],
        chunks: list[LiteratureChunk],
        criteria: list[str],
    ) -> list[ACMGCriterion]:
        """Score ``criteria`` from ``chunks`` and return one result per code.

        The result always has exactly one entry per requested criterion, in
        request order. A criterion becomes a non-triggered fallback when there
        are no chunks, the model call fails, the output cannot be parsed even
        after one repair attempt, the entry is malformed or missing, or a
        triggered entry fails PMID/quote verification. Codes the model
        returns without being asked are discarded.
        """
        if not chunks:
            return [self._fallback_criterion(c, "insufficient evidence in provided literature") for c in criteria]

        # Mark the long, variant-independent system prompt as cacheable so
        # repeated calls in a session can reuse it.
        system = [
            {
                "type": "text",
                "text": get_system_prompt(),
                "cache_control": {"type": "ephemeral"},
            }
        ]
        user = build_user_prompt(
            variant_hgvs=variant_hgvs,
            gene=gene,
            disease=disease,
            auto_scored=auto_scored_summary,
            chunks=chunks,
            criteria=criteria,
        )

        try:
            raw = self._call(system, user)
        except (anthropic.APIError, httpx.HTTPError, RuntimeError) as e:
            logger.error("Claude call failed: %s", e)
            return [self._fallback_criterion(c, str(e)) for c in criteria]

        try:
            parsed = self._parse_json(raw)
        except ValueError as e:
            logger.warning("Claude output JSON malformed; retrying with repair prompt: %s", e)
            try:
                raw = self._call(
                    system,
                    user
                    + "\n\nYour previous output failed JSON validation. Return ONLY a valid JSON array matching the schema.",
                )
                parsed = self._parse_json(raw)
            except (ValueError, anthropic.APIError, httpx.HTTPError, RuntimeError) as e2:
                # RuntimeError: the local endpoint may answer without content.
                logger.error("Claude repair attempt failed: %s", e2)
                return [self._fallback_criterion(c, "LLM output unparseable") for c in criteria]

        chunks_by_pmid: dict[str, list[str]] = {}
        for chunk in chunks:
            chunks_by_pmid.setdefault(str(chunk.pmid), []).append(chunk.chunk_text)
        valid_pmids = set(chunks_by_pmid)
        requested = set(criteria)

        accepted: dict[str, ACMGCriterion] = {}
        for entry in parsed:
            try:
                code = entry["criterion"]
                if code not in requested:
                    logger.warning("Ignoring unrequested criterion from LLM output: %s", code)
                    continue
                if code in accepted:
                    continue  # first entry per code wins
                pmid = self._normalize_pmid(entry.get("pmid"))
                # `or ""` keeps a JSON null from turning into the text "None".
                evidence_text = str(entry.get("evidence") or "").strip()
                if entry.get("triggered"):
                    rejection = self._trigger_rejection_reason(pmid, evidence_text, chunks_by_pmid, valid_pmids)
                    if rejection:
                        logger.warning("Suppressing %s from LLM output: %s", code, rejection)
                        accepted[code] = self._fallback_criterion(code, rejection)
                        continue
                accepted[code] = ACMGCriterion(
                    code=code,
                    triggered=bool(entry.get("triggered", False)),
                    strength=entry.get("strength") or "supporting",
                    source=f"PMID:{pmid}" if pmid else "literature",
                    evidence_text=evidence_text or "insufficient evidence in provided literature",
                    confidence=entry.get("confidence") or "medium",
                    caveat=entry.get("caveat"),
                    pmid=pmid,
                )
            except (KeyError, TypeError, AttributeError, ValueError) as e:
                # ValueError also covers schema-validation failures raised
                # while building ACMGCriterion from model-supplied values.
                logger.warning("malformed entry from Claude: %s — %s", entry, e)

        return [
            accepted[c] if c in accepted else self._fallback_criterion(c, "criterion missing from LLM output")
            for c in criteria
        ]

    @staticmethod
    def _normalize_pmid(pmid: Any) -> str | None:
        """Coerce a model-supplied PMID to a bare string, or ``None`` if empty.

        Accepts numbers and an optional ``PMID:`` prefix so a correct citation
        is not rejected merely for its formatting.
        """
        if pmid is None:
            return None
        text = str(pmid).strip()
        if text.upper().startswith("PMID:"):
            text = text[5:].strip()
        return text or None

    @staticmethod
    def _trigger_rejection_reason(
        pmid: Any,
        evidence_text: str,
        chunks_by_pmid: dict[str, list[str]],
        valid_pmids: set[str],
    ) -> str | None:
        """Explain why a triggered entry must be rejected, or return ``None``.

        Checks, in order: a PMID is present, it belongs to a supplied chunk,
        a quote is present, and the quote occurs in one of that PMID's chunks
        after collapsing whitespace and lower-casing both sides.
        """
        if not pmid:
            return "triggered literature criterion missing PMID"
        if pmid not in valid_pmids:
            return "fabricated PMID rejected"
        if not evidence_text:
            return "triggered literature criterion missing evidence quote"
        normalized_evidence = " ".join(evidence_text.split()).lower()
        normalized_chunks = [" ".join(text.split()).lower() for text in chunks_by_pmid[pmid]]
        if not any(normalized_evidence in chunk for chunk in normalized_chunks):
            return "evidence quote not found verbatim in cited PMID chunk"
        return None

    @staticmethod
    def _parse_json(raw: str) -> list[dict[str, Any]]:
        """Parse the model reply into a list of criterion objects.

        Strips an optional Markdown code fence. A single criterion object is
        wrapped in a list, and an object with exactly one list-valued field is
        unwrapped, since JSON-object output modes cannot emit a bare array.

        Raises:
            ValueError: Invalid JSON, or no array can be recovered from it.
        """
        text = raw.strip()
        if text.startswith("```"):
            text = text.split("```")[1]
            if text.startswith("json"):
                text = text[4:]
        text = text.strip()
        data = json.loads(text)
        if isinstance(data, dict):
            if "criterion" in data:
                data = [data]
            else:
                list_fields = [value for value in data.values() if isinstance(value, list)]
                if len(list_fields) == 1:
                    data = list_fields[0]
        if not isinstance(data, list):
            raise ValueError("expected JSON array")
        return data

    @staticmethod
    def _fallback_criterion(code: str, reason: str) -> ACMGCriterion:
        """Build a non-triggered, low-confidence criterion carrying ``reason``."""
        return ACMGCriterion(
            code=code,
            triggered=False,
            strength="supporting",
            source="LLM",
            evidence_text=f"insufficient evidence — {reason}",
            confidence="low",
            caveat=reason,
        )
|
backend/app/services/llm/synthesizer.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
from backend.app.schemas.classification import ClassificationResult
|
| 4 |
+
from backend.app.schemas.evidence import ACMGCriterion, EvidenceBundle, LiteratureChunk
|
| 5 |
+
from backend.app.schemas.variant import NormalizedVariant
|
| 6 |
+
from backend.app.services.acmg.combiner import combine_criteria
|
| 7 |
+
from backend.app.services.acmg.rules import RuleEngine
|
| 8 |
+
from backend.app.services.llm.reasoner import ClaudeReasoner
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
LITERATURE_CRITERIA = ["PM3", "PP1", "PS3", "BS3", "PS4", "PP4"]
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class EvidenceSynthesizer:
    """Combine rule-engine criteria with LLM literature criteria into a call."""

    def __init__(
        self,
        rule_engine: RuleEngine | None = None,
        reasoner: ClaudeReasoner | None = None,
    ) -> None:
        """Use the supplied collaborators, or build default instances."""
        self.rule_engine = rule_engine or RuleEngine()
        self.reasoner = reasoner or ClaudeReasoner()

    @staticmethod
    def _unique_chunks(
        retrieved_chunks: dict[str, list[LiteratureChunk]],
    ) -> list[LiteratureChunk]:
        """Flatten the per-key chunk lists, keeping first occurrences only.

        Two chunks count as the same when their PMID and the first 100
        characters of their text match.
        """
        kept: list[LiteratureChunk] = []
        fingerprints = set()
        for group in retrieved_chunks.values():
            for chunk in group:
                fingerprint = (chunk.pmid, chunk.chunk_text[:100])
                if fingerprint in fingerprints:
                    continue
                fingerprints.add(fingerprint)
                kept.append(chunk)
        return kept

    def synthesize(
        self,
        variant: NormalizedVariant,
        evidence: EvidenceBundle,
        retrieved_chunks: dict[str, list[LiteratureChunk]] | None = None,
        disease: str | None = None,
    ) -> ClassificationResult:
        """Score, merge and classify the evidence for one variant.

        Database criteria come from the rule engine. Literature criteria are
        requested from the reasoner only when ``retrieved_chunks`` is
        non-empty, and a reasoner failure is logged and treated as "no
        literature criteria". On a code collision the database criterion is
        kept. ``evidence.criteria`` is overwritten with the merged list.
        """
        # Step 1: criteria derived from database evidence.
        db_criteria = self.rule_engine.score_all(evidence)

        # Step 2: criteria derived from literature through the LLM.
        llm_criteria: list[ACMGCriterion] = []
        if retrieved_chunks:
            all_chunks = self._unique_chunks(retrieved_chunks)

            auto_summary = []
            for scored in db_criteria:
                auto_summary.append(
                    {
                        "criterion": scored.code,
                        "triggered": scored.triggered,
                        "source": scored.source,
                        "evidence": scored.evidence_text,
                    }
                )

            try:
                llm_criteria = self.reasoner.reason_over_criteria(
                    variant_hgvs=variant.hgvs_coding or variant.raw_input,
                    gene=variant.gene_symbol or "unknown",
                    disease=disease,
                    auto_scored_summary=auto_summary,
                    chunks=all_chunks,
                    criteria=LITERATURE_CRITERIA,
                )
            except Exception as e:
                logger.error("LLM reasoning failed: %s", e)

        # Step 3: merge by code; a database criterion is never replaced by
        # a literature criterion with the same code.
        merged: dict[str, ACMGCriterion] = {}
        for scored in db_criteria:
            merged[scored.code] = scored
        for inferred in llm_criteria:
            if inferred.code not in merged:
                merged[inferred.code] = inferred

        all_criteria = list(merged.values())
        evidence.criteria = all_criteria

        return ClassificationResult(
            variant=variant,
            evidence=evidence,
            classification=combine_criteria(all_criteria),
        )
|
backend/app/services/normalization.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
import httpx
|
| 5 |
+
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
|
| 6 |
+
|
| 7 |
+
from backend.app.config import get_settings
|
| 8 |
+
from backend.app.schemas.variant import NormalizedVariant, VariantInput
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
settings = get_settings()
|
| 12 |
+
|
| 13 |
+
# Reference-sequence-qualified HGVS: accession, colon, coordinate-system letter.
# Covers the RefSeq mRNA/ncRNA/genomic/protein prefixes (curated NM_/NR_/NG_/NC_/NP_
# and model XM_/XR_) plus Ensembl transcript/protein IDs.
HGVS_PATTERN = re.compile(r"^(NM_|NR_|NG_|NC_|NP_|XM_|XR_|ENST|ENSP)[\d.]+:[cgpnm]\.")

# VCF-style "chrom-pos-ref-alt" (also ':' separated), optional "chr" prefix,
# case-insensitive. The chromosome alternation accepts exactly what
# GRCH38_CHROM_TO_NC can resolve: 1-2 digit autosomes, X, Y, M and MT.
# (A character class such as [\dXYM]+ can never match "MT".)
VCF_PATTERN = re.compile(
    r"^(chr)?(?:\d{1,2}|X|Y|MT?)[-:]\d+[-:][ACGT]+[-:][ACGT]+$",
    re.IGNORECASE,
)

# Bare protein-level change without a reference accession.
PROTEIN_PATTERN = re.compile(
    r"^p\.[A-Z][a-z]{2}\d+([A-Z][a-z]{2}|\*|Ter)$"  # 3-letter ref + 3-letter alt OR stop
    r"|^p\.[A-Z]\d+[A-Z*]$"  # 1-letter ref + 1-letter alt
)
|
| 19 |
+
|
| 20 |
+
# GRCh38 RefSeq chromosome accessions, keyed by bare chromosome name (no
# "chr" prefix, upper case). Mutalyzer rejects `chr17:g.` and requires the
# canonical NC_ identifier for genomic descriptions. "M" and "MT" are both
# accepted spellings of the mitochondrial genome and map to the same record.
GRCH38_CHROM_TO_NC: dict[str, str] = {
    # Autosomes
    "1": "NC_000001.11",
    "2": "NC_000002.12",
    "3": "NC_000003.12",
    "4": "NC_000004.12",
    "5": "NC_000005.10",
    "6": "NC_000006.12",
    "7": "NC_000007.14",
    "8": "NC_000008.11",
    "9": "NC_000009.12",
    "10": "NC_000010.11",
    "11": "NC_000011.10",
    "12": "NC_000012.12",
    "13": "NC_000013.11",
    "14": "NC_000014.9",
    "15": "NC_000015.10",
    "16": "NC_000016.10",
    "17": "NC_000017.11",
    "18": "NC_000018.10",
    "19": "NC_000019.10",
    "20": "NC_000020.11",
    "21": "NC_000021.9",
    "22": "NC_000022.11",
    # Sex chromosomes
    "X": "NC_000023.11",
    "Y": "NC_000024.10",
    # Mitochondrial genome (two aliases)
    "M": "NC_012920.1",
    "MT": "NC_012920.1",
}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class NormalizationError(Exception):
    """Raised when a variant description cannot be parsed or normalized."""
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class VariantNormalizer:
    """Turn a user-supplied variant description into a `NormalizedVariant`.

    HGVS and VCF-style input is sent to the Mutalyzer ``/normalize`` endpoint
    for canonicalization; bare protein changes and unrecognized strings are
    passed through unchanged with a warning. Any Mutalyzer failure degrades
    to a "passthrough" result instead of raising.
    """

    def __init__(self, base_url: str | None = None, timeout: float = 10.0) -> None:
        """Store the Mutalyzer endpoint and per-request timeout.

        Args:
            base_url: Mutalyzer API root; falls back to
                ``settings.mutalyzer_base_url`` when omitted or empty.
            timeout: Timeout (seconds) handed to ``httpx.AsyncClient``.
        """
        self.base_url = base_url or settings.mutalyzer_base_url
        self.timeout = timeout

    def detect_notation(self, raw: str) -> str:
        """Classify *raw* as ``"hgvs"``, ``"vcf"``, ``"protein"`` or ``"unknown"``.

        Surrounding whitespace is ignored. Patterns are tried in that order
        and the first match wins.
        """
        s = raw.strip()
        if HGVS_PATTERN.match(s):
            return "hgvs"
        if VCF_PATTERN.match(s):
            return "vcf"
        if PROTEIN_PATTERN.match(s):
            return "protein"
        return "unknown"

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=8),
        # TransportError is the common base of timeouts and connection /
        # protocol failures, so transient network errors are retried too.
        retry=retry_if_exception_type((httpx.HTTPStatusError, httpx.TransportError)),
        reraise=True,
    )
    async def _call_mutalyzer(self, hgvs: str) -> dict:
        """GET ``{base_url}/normalize/{hgvs}`` and return the decoded JSON object.

        Raises:
            httpx.HTTPStatusError: non-2xx response (after retries).
            httpx.TransportError: timeout / connection failure (after retries).
            NormalizationError: the response body is not a JSON object.
        """
        # Function-scope import so this block stays self-contained.
        from urllib.parse import quote

        # HGVS may contain characters that are structural in a URL ('?', '#',
        # '%', whitespace). Escape those; leave the characters that are legal
        # in a path segment untouched so the request path stays readable.
        encoded = quote(hgvs, safe=":@!$&'()*+,;=")
        url = f"{self.base_url.rstrip('/')}/normalize/{encoded}"
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            r = await client.get(url)
            r.raise_for_status()
            try:
                payload = r.json()
            except ValueError as e:
                # Not retried: a malformed body is not a transient condition.
                raise NormalizationError(f"Mutalyzer returned a non-JSON response: {e}") from e
        if not isinstance(payload, dict):
            raise NormalizationError("Mutalyzer returned an unexpected JSON payload")
        return payload

    async def normalize(self, raw_input: VariantInput) -> NormalizedVariant:
        """Normalize *raw_input*, never raising for Mutalyzer/network failures.

        ``raw_input.notation == "auto"`` triggers `detect_notation`. On any
        HTTP, transport or payload error the method returns a
        ``normalization_source="passthrough"`` variant carrying the warning
        text, plus chrom/pos/ref/alt when the input was a parseable VCF string.
        """
        raw = raw_input.raw
        notation = (
            raw_input.notation if raw_input.notation != "auto" else self.detect_notation(raw)
        )
        warnings: list[str] = []
        vcf_parts: tuple[str, int, str, str] | None = None

        # For VCF input, lock in chrom/pos/ref/alt up front so the score-DB
        # lookups (REVEL, AlphaMissense, gnomAD) always have what they need —
        # even if the Mutalyzer enrichment call fails.
        hgvs_for_mutalyzer: str | None = None
        if notation == "vcf":
            try:
                hgvs_for_mutalyzer, vcf_parts = self._vcf_to_hgvs_with_parts(raw)
            except NormalizationError as e:
                warnings.append(f"VCF parse failed: {e}")

        try:
            if notation == "hgvs":
                # detect_notation() ignores surrounding whitespace; so must the request.
                data = await self._call_mutalyzer(raw.strip())
                return self._parse_mutalyzer(raw, data, warnings)
            if notation == "vcf" and hgvs_for_mutalyzer and vcf_parts:
                data = await self._call_mutalyzer(hgvs_for_mutalyzer)
                v = self._parse_mutalyzer(raw, data, warnings)
                chrom, pos, ref, alt = vcf_parts
                return v.model_copy(update={
                    "chromosome": chrom, "position": pos, "ref": ref, "alt": alt,
                    "hgvs_genomic": hgvs_for_mutalyzer,
                    "gene_symbol": v.gene_symbol or raw_input.gene_symbol,
                })
            if notation == "protein":
                warnings.append("protein-only input — coding HGVS unavailable without back-translation")
                return NormalizedVariant(
                    raw_input=raw,
                    hgvs_protein=raw,
                    gene_symbol=raw_input.gene_symbol,
                    normalization_source="passthrough",
                    warnings=warnings,
                )
            # Unknown notation, or a "vcf" string that failed to parse above.
            warnings.append(f"unknown notation; passing through: {raw}")
            return NormalizedVariant(
                raw_input=raw,
                gene_symbol=raw_input.gene_symbol,
                normalization_source="passthrough",
                warnings=warnings,
            )
        except (httpx.HTTPError, NormalizationError) as e:
            # httpx.HTTPError is the shared base of HTTPStatusError, timeouts
            # and connection errors, so an unreachable Mutalyzer also lands
            # here instead of propagating to the caller.
            logger.warning("Mutalyzer normalization failed for %s: %s", raw, e)
            warnings.append(f"mutalyzer failed: {e}; using passthrough")
            chrom = pos = ref = alt = None
            if vcf_parts:
                chrom, pos, ref, alt = vcf_parts
            return NormalizedVariant(
                raw_input=raw,
                hgvs_coding=raw if notation == "hgvs" else None,
                hgvs_genomic=hgvs_for_mutalyzer,
                gene_symbol=raw_input.gene_symbol,
                chromosome=chrom,
                position=pos,
                ref=ref,
                alt=alt,
                normalization_source="passthrough",
                warnings=warnings,
            )

    def _vcf_to_hgvs(self, vcf: str) -> str:
        """Return only the HGVS string produced by `_vcf_to_hgvs_with_parts`."""
        return self._vcf_to_hgvs_with_parts(vcf)[0]

    def _vcf_to_hgvs_with_parts(self, vcf: str) -> tuple[str, tuple[str, int, str, str]]:
        """Convert ``chrom-pos-ref-alt`` (or ':' separated) to genomic HGVS.

        Returns:
            ``(hgvs, (chrom, pos, ref, alt))`` where *chrom* is upper-cased
            without any ``chr`` prefix, *pos* is an ``int`` and the alleles
            are upper-cased.

        Raises:
            NormalizationError: wrong number of fields, unknown chromosome,
                non-numeric or non-positive position, or an empty allele.
        """
        parts = [field.strip() for field in re.split(r"[-:]", vcf.strip())]
        if len(parts) != 4:
            raise NormalizationError(f"malformed VCF string: {vcf}")
        chrom, pos_text, ref, alt = parts

        # VCF_PATTERN is case-insensitive, so "CHR17"/"Chr17" must resolve too.
        chrom = re.sub(r"^chr", "", chrom, flags=re.IGNORECASE).upper()
        nc_acc = GRCH38_CHROM_TO_NC.get(chrom)
        if not nc_acc:
            raise NormalizationError(
                f"unknown chromosome {chrom!r}; expected 1-22, X, Y, M, or MT"
            )

        # Reachable with garbage when the caller forces notation="vcf";
        # report it as a parse failure rather than leaking ValueError.
        if not re.fullmatch(r"\d+", pos_text) or int(pos_text) < 1:
            raise NormalizationError(f"invalid position {pos_text!r} in VCF string: {vcf}")
        pos = int(pos_text)

        ref, alt = ref.upper(), alt.upper()
        if not ref or not alt:
            raise NormalizationError(f"missing ref/alt allele in VCF string: {vcf}")

        # HGVS uses the "m." coordinate system for the mitochondrial genome.
        coord = "m" if chrom in ("M", "MT") else "g"
        if len(ref) == 1 and len(alt) == 1:
            change = f"{pos}{ref}>{alt}"
        else:
            # "REF>ALT" is only valid HGVS for single-nucleotide substitutions.
            # Describe indels as a delins over the reference span and let
            # Mutalyzer reduce it to the canonical del/ins/dup form.
            end = pos + len(ref) - 1
            span = str(pos) if end == pos else f"{pos}_{end}"
            change = f"{span}delins{alt}"
        return f"{nc_acc}:{coord}.{change}", (chrom, pos, ref, alt)

    def _parse_mutalyzer(self, raw: str, data: dict, warnings: list[str]) -> NormalizedVariant:
        """Parse the Mutalyzer v3 API response.

        Fields read from *data*:
        - `normalized_description` (fallback `corrected_description`) — canonical HGVS string
        - `protein.description` — canonical p. HGVS string
        - `gene_id` — stored as the gene symbol
        - `infos[*].details` (fallback `code`) — appended to *warnings*

        *warnings* is extended in place and also attached to the result.
        Genomic coordinates are not read from the response; callers that
        need chr/pos/ref/alt should pass VCF input.
        """
        coding = data.get("normalized_description") or data.get("corrected_description")
        protein = (data.get("protein") or {}).get("description")
        gene = data.get("gene_id")

        # NOTE(review): for genomic input the prefix before ':' is a chromosome
        # accession rather than a transcript — confirm downstream expectations.
        transcript: str | None = None
        if coding and ":" in coding:
            transcript = coding.split(":")[0]

        for info in data.get("infos") or []:
            details = info.get("details") or info.get("code", "")
            if details:
                warnings.append(details)

        consequence = self._infer_consequence(coding or "", protein or "")

        return NormalizedVariant(
            raw_input=raw,
            hgvs_coding=coding,
            hgvs_protein=protein,
            transcript=transcript,
            gene_symbol=gene,
            consequence=consequence,
            normalization_source="mutalyzer",
            warnings=warnings,
        )

    @staticmethod
    def _infer_consequence(coding: str, protein: str) -> str | None:
        """Map a Mutalyzer-normalized variant to a SO consequence term.

        Heuristic — covers the cases the rule engine cares about (PVS1
        and PM4). For full annotation switch to VEP at the ingest boundary.

        Protein-level evidence is checked first, most specific first:
        extension descriptions (``...extTer17``) also contain ``Ter``/``*``
        and must be recognized before the stop-gained test, and an unchanged
        stop (``Ter110=``) is not a gained stop. In-frame calls rely solely
        on the absence of ``fs`` in the protein description.

        Returns:
            A Sequence Ontology term, or ``None`` when nothing can be inferred.
        """
        p = protein.lower()
        c = coding.lower()
        change = c.split(":")[-1]  # description without the reference accession

        if "fs" in p:
            return "frameshift_variant"
        if "ext" in p:
            return "stop_lost"
        if "met1" in p and "?" in p:
            return "start_lost"
        if ("ter" in p or "*" in p) and "=" not in p:
            return "stop_gained"

        # Intronic offsets look like "123+1" / "124-2": a digit on both sides
        # of the sign. A leading sign ("c.-12A>G") or "*" marks a UTR position
        # and must not be mistaken for a splice-region variant.
        if "splice" in c or re.search(r"\d[+-]\d", change):
            return "splice_region_variant"

        has_del = "del" in change
        has_ins = "ins" in change
        if has_del and not has_ins:
            return "inframe_deletion"  # frameshifts already returned above
        if "dup" in change or (has_ins and not has_del):
            return "inframe_insertion"

        if protein and ">" in change:
            if "=" in p:
                # No amino-acid change: synonymous when the position is in
                # the CDS (c.<digit>...), otherwise undetermined (UTR/genomic).
                return "synonymous_variant" if re.match(r"c\.\d", change) else None
            return "missense_variant"
        return None
|
backend/app/services/pvs1.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
from backend.app.schemas.evidence import AutoPVS1Result, AutoPVS1Step
|
| 5 |
+
from backend.app.schemas.variant import NormalizedVariant
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
# Sequence Ontology consequence terms treated as putative loss-of-function
# ("null") variants by the PVS1 heuristic.
LOF_CONSEQUENCES = {
    "frameshift_variant",
    "splice_acceptor_variant",
    "splice_donor_variant",
    "start_lost",
    "stop_gained",
}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class PVS1Assessor:
    """
    Heuristic PVS1 assessment.

    A real deployment should wrap the autoPVS1 package (https://github.com/JiguangPeng/autoPVS1)
    for the full LOF-mechanism / 3'-end / NMD / alternative-splicing logic from
    Tayoun et al. 2018. This wrapper records the rule path for the audit trail.
    """

    def assess(self, variant: NormalizedVariant) -> AutoPVS1Result:
        """Decide whether *variant* looks like a null variant and log each step.

        The call is driven by ``variant.consequence`` (membership in
        ``LOF_CONSEQUENCES``) and by truncation markers in
        ``variant.hgvs_protein`` (``fs``, ``Ter``, ``*``). Protein
        descriptions that merely *mention* a stop codon — extensions
        (``...extTer17``, i.e. stop-lost) and unchanged stops (``Ter110=``) —
        are not counted as truncating.

        Returns:
            An ``AutoPVS1Result`` whose ``reasoning`` lists the evaluated
            steps; ``triggered`` is True only for null variants.
        """
        consequence = (variant.consequence or "").lower()
        protein = variant.hgvs_protein or ""
        protein_lc = protein.lower()

        # "Ter"/"*" also occur in stop-lost extensions and silent stop changes.
        mentions_stop_only = "ext" in protein_lc or bool(
            re.search(r"(?:ter|\*)\d*=", protein_lc)
        )
        has_frameshift = "fs" in protein_lc
        has_stop = not mentions_stop_only and (
            "ter" in protein_lc or bool(re.search(r"p\..*\*", protein))
        )
        # A stop_gained label contradicted by an extension/silent-stop protein
        # description is not trusted as loss-of-function evidence.
        consequence_is_lof = consequence in LOF_CONSEQUENCES and not (
            consequence == "stop_gained" and mentions_stop_only
        )
        is_null = consequence_is_lof or has_stop or has_frameshift

        steps: list[AutoPVS1Step] = []

        # Step 1 — variant type. Frameshift is tested before stop-gained
        # because every "fsTer" description also contains "Ter".
        if has_frameshift or "frameshift" in consequence:
            variant_type = "Frameshift"
        elif has_stop or (consequence == "stop_gained" and not mentions_stop_only):
            variant_type = "Stop-gained"
        elif "splice" in consequence:
            variant_type = "Splice site"
        elif "start_lost" in consequence:
            variant_type = "Start-lost"
        else:
            variant_type = f"Other ({consequence or 'unknown'})"
        steps.append(AutoPVS1Step(
            step=1, label="Variant type", value=variant_type, **{"pass": is_null}
        ))

        if not is_null:
            steps.append(AutoPVS1Step(
                step=2, label="Predicted consequence",
                value="No protein-truncating effect inferred",
                **{"pass": False},
            ))
            return AutoPVS1Result(
                triggered=False,
                strength="very_strong",
                rule="PVS1",
                reasoning=steps,
                conclusion="PVS1 not triggered — variant is not null",
                source="autoPVS1-heuristic",
            )

        # Step 2 — predicted consequence
        steps.append(AutoPVS1Step(
            step=2, label="Predicted consequence",
            value=f"Premature stop / truncation ({protein or 'inferred'})",
            **{"pass": True},
        ))

        # Step 3 — NMD prediction (heuristic): any premature stop signal in
        # the protein description, whether written as "Ter" or "*".
        nmd_predicted = has_frameshift or has_stop
        steps.append(AutoPVS1Step(
            step=3, label="NMD predicted",
            value="Yes — assumed NMD competent (verify against last-exon distance)" if nmd_predicted
            else "Unknown — verify manually",
            **{"pass": nmd_predicted},
        ))

        # Step 4 — last exon exception (heuristic placeholder)
        steps.append(AutoPVS1Step(
            step=4, label="Last exon exception",
            value="Not assessed — requires transcript exon table",
            **{"pass": True},
        ))

        # Step 5 — gene LOF mechanism (heuristic placeholder)
        steps.append(AutoPVS1Step(
            step=5, label="Gene LOF mechanism",
            value="Assumed — verify against gene LOF tolerance (gnomAD pLI / OMIM)",
            **{"pass": True},
        ))

        caveats: list[str] = []
        if "?" in protein or not protein:
            caveats.append("protein change ambiguous — verify NMD prediction")
        if not variant.transcript:
            caveats.append("transcript not specified — multiple-transcript caveat applies")

        return AutoPVS1Result(
            triggered=True,
            strength="very_strong",
            rule="PVS1",
            reasoning=steps,
            conclusion="PVS1 triggered at Very Strong strength (heuristic — manual verification recommended)",
            source="autoPVS1-heuristic",
            caveats=caveats,
        )
|
backend/app/services/rag/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from backend.app.services.rag.chunker import ChunkBuilder
|
| 2 |
+
from backend.app.services.rag.embedder import Embedder
|
| 3 |
+
from backend.app.services.rag.fetcher import LiteratureFetcher
|
| 4 |
+
from backend.app.services.rag.retriever import LiteratureRetriever
|
| 5 |
+
|
| 6 |
+
__all__ = ["ChunkBuilder", "Embedder", "LiteratureFetcher", "LiteratureRetriever"]
|
backend/app/services/rag/chunker.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from backend.app.config import get_settings
|
| 2 |
+
from backend.app.services.rag.fetcher import Paper
|
| 3 |
+
|
| 4 |
+
settings = get_settings()
|
| 5 |
+
|
| 6 |
+
# Substrings whose presence in a chunk (compared case-insensitively by
# ChunkBuilder.detect_criteria) marks it as potentially relevant to the given
# ACMG criterion. Dict order determines the order of the returned hints.
CRITERION_KEYWORDS: dict[str, list[str]] = {
    "PM3": [
        "in trans",
        "compound heterozygous",
        "biallelic",
        "homozygous",
    ],
    "PP1": [
        "segregation",
        "co-segregat",
        "family",
        "affected",
    ],
    "PS3": [
        "functional",
        "in vitro",
        "in vivo",
        "assay",
        "expression",
    ],
    "BS3": [
        "no effect",
        "wild type",
        "wild-type",
        "indistinguishable",
    ],
    "PS4": [
        "case",
        "prevalence",
        "odds ratio",
        "controls",
    ],
    "PP4": [
        "phenotype",
        "clinical features",
        "presentation",
        "presented with",
    ],
    "PP5": [
        "pathogenic",
        "likely pathogenic",
        "ClinVar",
        "submission",
    ],
    "BP6": [
        "benign",
        "likely benign",
        "ClinVar",
    ],
}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class Chunk:
|
| 19 |
+
def __init__(
|
| 20 |
+
self,
|
| 21 |
+
text: str,
|
| 22 |
+
pmid: str,
|
| 23 |
+
year: int | None,
|
| 24 |
+
title: str,
|
| 25 |
+
criteria_hint: list[str],
|
| 26 |
+
) -> None:
|
| 27 |
+
self.text = text
|
| 28 |
+
self.pmid = pmid
|
| 29 |
+
self.year = year
|
| 30 |
+
self.title = title
|
| 31 |
+
self.criteria_hint = criteria_hint
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class ChunkBuilder:
    """Split paper text into overlapping fixed-size windows tagged with criterion hints."""

    def __init__(self, chunk_size: int | None = None, overlap: int | None = None) -> None:
        """Configure window size and overlap (both in approximate tokens).

        Args:
            chunk_size: Window size; ``None`` (or 0) falls back to
                ``settings.rag_chunk_size``.
            overlap: Overlap between consecutive windows; ``None`` falls back
                to ``settings.rag_chunk_overlap``. An explicit ``0`` is
                honoured and disables overlap.
        """
        self.chunk_size = chunk_size or settings.rag_chunk_size
        # `is None` rather than `or`: 0 is a legitimate overlap value.
        self.overlap = settings.rag_chunk_overlap if overlap is None else overlap

    def detect_criteria(self, chunk_text: str) -> list[str]:
        """Return the criteria whose keywords occur (case-insensitively) in *chunk_text*.

        The result follows the key order of ``CRITERION_KEYWORDS``.
        """
        text_lower = chunk_text.lower()
        return [
            crit
            for crit, keywords in CRITERION_KEYWORDS.items()
            if any(kw.lower() in text_lower for kw in keywords)
        ]

    def chunk_paper(self, paper: Paper) -> list[Chunk]:
        """Cut ``paper.text`` into overlapping windows.

        Returns an empty list when the paper has no text. Consecutive windows
        start ``window - overlap`` characters apart; the overlap is clamped
        so that every window advances by at least one character.
        """
        text = paper.text
        if not text:
            return []

        # Approx 4 chars per token
        char_size = max(self.chunk_size * 4, 1)
        # An overlap >= the window size would stop the window from advancing
        # (endless loop); a negative one would silently skip text.
        char_overlap = min(max(self.overlap * 4, 0), char_size - 1)
        stride = char_size - char_overlap

        chunks: list[Chunk] = []
        for start in range(0, len(text), stride):
            end = min(start + char_size, len(text))
            window = text[start:end]
            chunks.append(
                Chunk(
                    text=window,
                    pmid=paper.pmid,
                    year=paper.year,
                    title=paper.title,
                    criteria_hint=self.detect_criteria(window),
                )
            )
            if end >= len(text):
                # This window reached the end; later starts would only
                # produce suffixes already covered by it.
                break
        return chunks
|
backend/app/services/rag/embedder.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import TYPE_CHECKING
|
| 3 |
+
|
| 4 |
+
from backend.app.config import get_settings
|
| 5 |
+
from backend.app.services.rag.chunker import Chunk
|
| 6 |
+
|
| 7 |
+
if TYPE_CHECKING:
|
| 8 |
+
from chromadb.api import ClientAPI
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
settings = get_settings()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class Embedder:
    """Embed text chunks and store/query them in a persistent Chroma collection.

    The sentence-transformers model and the Chroma client are imported and
    created lazily on first use.
    """

    def __init__(self, model_name: str | None = None, persist_dir: str | None = None) -> None:
        """Record configuration; nothing heavy is loaded here.

        Args:
            model_name: Embedding model; defaults to ``settings.embedding_model``.
            persist_dir: Chroma storage path; defaults to
                ``settings.chroma_persist_dir``.
        """
        self.model_name = model_name or settings.embedding_model
        self.persist_dir = persist_dir or str(settings.chroma_persist_dir)
        self.collection_name = settings.chroma_collection
        self._model = None
        self._client: ClientAPI | None = None
        self._collection = None

    def _ensure_model(self):
        """Load the SentenceTransformer on first call and cache it."""
        if self._model is None:
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer(self.model_name, device=settings.embedding_device)
        return self._model

    def _ensure_collection(self):
        """Open (or create) the Chroma collection on first call and cache it."""
        if self._collection is None:
            import chromadb
            self._client = chromadb.PersistentClient(path=self.persist_dir)
            self._collection = self._client.get_or_create_collection(self.collection_name)
        return self._collection

    def encode(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding (as a plain list of floats) per input text."""
        model = self._ensure_model()
        return model.encode(texts, show_progress_bar=False, convert_to_numpy=True).tolist()

    def index_chunks(self, chunks: list[Chunk], variant_id: str, gene: str) -> int:
        """Embed *chunks* and write them to the collection.

        IDs are ``"{variant_id}:{pmid}:{position}"``, so indexing the same
        variant again produces the same IDs; ``upsert`` makes that re-run
        overwrite the stored entries rather than collide with them.

        Returns:
            The number of chunks written (0 for an empty input).
        """
        if not chunks:
            return 0
        coll = self._ensure_collection()
        documents = [c.text for c in chunks]
        embeddings = self.encode(documents)
        ids = [f"{variant_id}:{c.pmid}:{i}" for i, c in enumerate(chunks)]
        metadatas = [
            {
                "pmid": c.pmid,
                "year": c.year or 0,  # 0 stands in for "unknown year"
                "title": c.title,
                "variant_id": variant_id,
                "gene": gene,
                "criteria_hint": ",".join(c.criteria_hint),
            }
            for c in chunks
        ]
        coll.upsert(
            ids=ids,
            documents=documents,
            embeddings=embeddings,
            metadatas=metadatas,
        )
        return len(chunks)

    def query(
        self, query_text: str, variant_id: str, top_k: int, criteria: list[str] | None = None
    ) -> list[dict]:
        """Return up to *top_k* stored chunks for *variant_id* nearest to *query_text*.

        Each hit is ``{"text", "metadata", "score"}`` where ``score`` is the
        value Chroma reports under ``distances``. *criteria* is accepted for
        interface compatibility but is not used for filtering.
        """
        if top_k <= 0:
            return []
        coll = self._ensure_collection()
        emb = self.encode([query_text])[0]
        where: dict = {"variant_id": variant_id}
        results = coll.query(query_embeddings=[emb], n_results=top_k, where=where)

        # Each field is a list with one entry per query; any of them may be
        # missing, None or empty, so unpack defensively.
        documents = (results.get("documents") or [[]])[0] or []
        metadatas = (results.get("metadatas") or [[]])[0] or []
        distances = (results.get("distances") or [[]])[0] or []

        out = []
        for i, doc in enumerate(documents):
            meta = (metadatas[i] if i < len(metadatas) else None) or {}
            score = distances[i] if i < len(distances) else None
            out.append({"text": doc, "metadata": meta, "score": score})
        return out
|
backend/app/services/rag/fetcher.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import xml.etree.ElementTree as ET
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
import httpx
|
| 6 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 7 |
+
|
| 8 |
+
from backend.app.config import get_settings
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
settings = get_settings()
|
| 12 |
+
|
| 13 |
+
EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
|
| 14 |
+
PMC_FULLTEXT = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi"
|
| 15 |
+
|
| 16 |
+
# Extra PubMed search clauses, AND-ed onto the base variant query by
# LiteratureFetcher.fetch_for_variant when evidence for the given ACMG
# criterion is requested. Criteria without an entry add no extra search.
CRITERION_QUERY_AUGMENTS: dict[str, str] = {
    "PM3": (
        '"in trans" OR "compound heterozygous" OR "biallelic"'
    ),
    "PP1": (
        '"segregation" OR "affected family members" OR "co-segregates"'
    ),
    "PS3": (
        '"functional" OR "in vitro" OR "in vivo" OR "assay"'
    ),
    "BS3": (
        '"functional" OR "no effect" OR "wild type"'
    ),
    "PS4": (
        '"cases" OR "prevalence" OR "odds ratio"'
    ),
    "PP4": (
        '"phenotype" OR "clinical features" OR "presentation"'
    ),
}
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class Paper:
|
| 27 |
+
def __init__(self, pmid: str, title: str, abstract: str, year: int | None, body: str | None = None) -> None:
|
| 28 |
+
self.pmid = pmid
|
| 29 |
+
self.title = title
|
| 30 |
+
self.abstract = abstract
|
| 31 |
+
self.year = year
|
| 32 |
+
self.body = body
|
| 33 |
+
|
| 34 |
+
@property
|
| 35 |
+
def text(self) -> str:
|
| 36 |
+
return self.body or self.abstract
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class LiteratureFetcher:
    """Find and download PubMed abstracts for a variant via NCBI E-utilities."""

    def __init__(self, max_results: int | None = None, fetch_fulltext: bool | None = None) -> None:
        """Read limits and NCBI credentials.

        Args:
            max_results: Cap on PMIDs per search and per variant; defaults to
                ``settings.rag_max_papers_per_variant``.
            fetch_fulltext: Whether `fetch_full_texts` contacts PMC; ``None``
                defers to ``settings.rag_fetch_fulltext``.
        """
        self.max_results = max_results or settings.rag_max_papers_per_variant
        self.fetch_fulltext = settings.rag_fetch_fulltext if fetch_fulltext is None else fetch_fulltext
        self.api_key = settings.ncbi_api_key
        self.email = settings.ncbi_email

    def _params(self, **extra: Any) -> dict[str, Any]:
        """Merge the common E-utilities parameters with request-specific ones."""
        p = {"tool": "VariantLens", "email": self.email}
        if self.api_key:
            p["api_key"] = self.api_key
        return {**p, **extra}

    def build_query(self, gene: str, hgvs: str, protein: str | None) -> str:
        """Build ``("GENE") AND ("hgvs" OR "protein")``; the protein term is optional."""
        variant_terms = [f'"{hgvs}"']
        if protein:
            variant_terms.append(f'"{protein}"')
        variant_clause = " OR ".join(variant_terms)
        return f'("{gene}") AND ({variant_clause})'

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10), reraise=True)
    async def search_pubmed(self, query: str) -> list[str]:
        """Run an ESearch and return up to ``max_results`` PMIDs in response order."""
        async with httpx.AsyncClient(timeout=20.0) as client:
            r = await client.get(
                f"{EUTILS}/esearch.fcgi",
                params=self._params(db="pubmed", term=query, retmax=self.max_results, retmode="json"),
            )
            r.raise_for_status()
            return r.json().get("esearchresult", {}).get("idlist", [])

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10), reraise=True)
    async def fetch_abstracts(self, pmids: list[str]) -> list[Paper]:
        """EFetch the given PMIDs and parse them into `Paper` objects."""
        if not pmids:
            return []
        async with httpx.AsyncClient(timeout=30.0) as client:
            r = await client.get(
                f"{EUTILS}/efetch.fcgi",
                params=self._params(db="pubmed", id=",".join(pmids), rettype="abstract", retmode="xml"),
            )
            r.raise_for_status()
            return self._parse_pubmed_xml(r.text)

    @staticmethod
    def _flatten_text(element: ET.Element | None) -> str:
        """Return all text inside *element*, including text nested in child tags.

        ``Element.text`` stops at the first child element, so inline markup
        such as ``<i>`` or ``<sub>`` would otherwise truncate the string.
        Runs of whitespace are collapsed to single spaces.
        """
        if element is None:
            return ""
        return " ".join("".join(element.itertext()).split())

    @staticmethod
    def _publication_year(article: ET.Element) -> int | None:
        """Read the year from ``PubDate/Year``, else from ``PubDate/MedlineDate``.

        For ``MedlineDate`` the first four-digit token is used.
        """
        year_el = article.find(".//PubDate/Year")
        if year_el is not None and year_el.text and year_el.text.strip().isdigit():
            return int(year_el.text.strip())
        medline_el = article.find(".//PubDate/MedlineDate")
        if medline_el is not None and medline_el.text:
            for token in medline_el.text.replace("-", " ").split():
                if len(token) == 4 and token.isdigit():
                    return int(token)
        return None

    def _parse_pubmed_xml(self, xml_text: str) -> list[Paper]:
        """Parse an EFetch XML document; records without a PMID are dropped.

        Returns an empty list (and logs a warning) when the XML is malformed.
        """
        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError as e:
            logger.warning("PubMed XML parse failed: %s", e)
            return []
        papers: list[Paper] = []
        for art in root.iter("PubmedArticle"):
            pmid = self._flatten_text(art.find(".//PMID"))
            if not pmid:
                continue
            title = self._flatten_text(art.find(".//ArticleTitle"))
            # Structured abstracts have several AbstractText sections.
            abstract = " ".join(
                self._flatten_text(section)
                for section in art.findall(".//Abstract/AbstractText")
            )
            papers.append(
                Paper(pmid=pmid, title=title, abstract=abstract, year=self._publication_year(art))
            )
        return papers

    async def fetch_full_texts(self, papers: list[Paper]) -> list[Paper]:
        """Probe the PMC OA service for each paper; currently never sets ``body``.

        Returns *papers* unchanged. Request failures are logged at debug
        level and ignored.
        """
        if not self.fetch_fulltext:
            return papers
        async with httpx.AsyncClient(timeout=30.0) as client:
            for p in papers:
                try:
                    # NOTE(review): a PubMed ID is sent as `id` here — confirm
                    # which identifier type the OA endpoint expects.
                    r = await client.get(PMC_FULLTEXT, params={"id": p.pmid, "format": "tgz"})
                    if r.status_code == 200 and "tgz" in r.headers.get("content-type", "").lower():
                        # Parsing tar.gz -> XML -> body is non-trivial; skip for MVP
                        # and rely on abstract. Implementation can extend here.
                        pass
                except (httpx.HTTPError, httpx.TimeoutException) as e:
                    logger.debug("full-text fetch skipped for %s: %s", p.pmid, e)
        return papers

    async def fetch_for_variant(
        self, gene: str, hgvs: str, protein: str | None, criteria: list[str] | None = None
    ) -> list[Paper]:
        """Collect papers for a variant: base search plus one search per criterion.

        Search failures are logged and skipped. PMIDs are de-duplicated in
        first-seen order (base query first), so the ``max_results`` cap keeps
        the same papers on every run and prefers the base search's hits.
        """
        base_query = self.build_query(gene, hgvs, protein)
        # A dict serves as an insertion-ordered set; slicing a plain set
        # would keep an arbitrary, run-dependent subset.
        ordered_pmids: dict[str, None] = {}
        try:
            ordered_pmids.update(dict.fromkeys(await self.search_pubmed(base_query)))
        except (httpx.HTTPError, httpx.TimeoutException) as e:
            logger.warning("base PubMed search failed: %s", e)

        for crit in criteria or []:
            aug = CRITERION_QUERY_AUGMENTS.get(crit)
            if not aug:
                continue
            try:
                hits = await self.search_pubmed(f"{base_query} AND ({aug})")
            except (httpx.HTTPError, httpx.TimeoutException) as e:
                logger.warning("criterion-augmented search failed for %s: %s", crit, e)
                continue
            ordered_pmids.update(dict.fromkeys(hits))

        capped = list(ordered_pmids)[: self.max_results]
        papers = await self.fetch_abstracts(capped)
        return await self.fetch_full_texts(papers)
|
backend/app/services/rag/retriever.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
from backend.app.config import get_settings
|
| 4 |
+
from backend.app.schemas.evidence import LiteratureChunk
|
| 5 |
+
from backend.app.services.rag.chunker import ChunkBuilder
|
| 6 |
+
from backend.app.services.rag.embedder import Embedder
|
| 7 |
+
from backend.app.services.rag.fetcher import LiteratureFetcher
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
settings = get_settings()
|
| 11 |
+
|
| 12 |
+
# Natural-language retrieval questions per ACMG criterion. "{variant}" is
# filled in with the variant's HGVS string before the question is embedded;
# criteria without a template yield no retrieval results.
CRITERION_QUERY_TEMPLATES: dict[str, str] = {
    "PM3": (
        "Was {variant} observed in trans with another pathogenic variant or compound heterozygous?"
    ),
    "PP1": (
        "Did {variant} co-segregate with disease in affected family members?"
    ),
    "PS3": (
        "What functional studies have been performed on {variant} and what do they show?"
    ),
    "BS3": (
        "Do functional studies show {variant} has no measurable effect?"
    ),
    "PS4": (
        "How prevalent is {variant} in cases compared to controls? Is there an odds ratio?"
    ),
    "PP4": (
        "Is the patient phenotype highly specific for the disease associated with {variant}?"
    ),
}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class LiteratureRetriever:
    """Tie fetcher, chunker and embedder together for per-criterion literature lookup."""

    def __init__(
        self,
        fetcher: LiteratureFetcher | None = None,
        chunker: ChunkBuilder | None = None,
        embedder: Embedder | None = None,
    ) -> None:
        """Use the supplied collaborators, constructing defaults for any omitted."""
        self.fetcher = fetcher or LiteratureFetcher()
        self.chunker = chunker or ChunkBuilder()
        self.embedder = embedder or Embedder()

    async def index_for_variant(
        self, variant_id: str, gene: str, hgvs: str, protein: str | None, criteria: list[str]
    ) -> int:
        """Fetch papers for the variant, chunk them and index the chunks.

        Returns whatever ``Embedder.index_chunks`` reports for the batch.
        """
        papers = await self.fetcher.fetch_for_variant(gene, hgvs, protein, criteria)
        collected = []
        for paper in papers:
            collected.extend(self.chunker.chunk_paper(paper))
        return self.embedder.index_chunks(collected, variant_id=variant_id, gene=gene)

    def retrieve_for_criterion(
        self, variant_id: str, hgvs: str, criterion: str, top_k: int | None = None
    ) -> list[LiteratureChunk]:
        """Return the indexed chunks best matching the criterion's question.

        Criteria without a query template yield an empty list. *top_k*
        defaults to ``settings.rag_top_k`` when omitted or 0.
        """
        template = CRITERION_QUERY_TEMPLATES.get(criterion)
        if not template:
            return []

        question = template.format(variant=hgvs)
        limit = top_k or settings.rag_top_k
        hits = self.embedder.query(query_text=question, variant_id=variant_id, top_k=limit)

        found: list[LiteratureChunk] = []
        for hit in hits:
            found.append(
                LiteratureChunk(
                    pmid=hit["metadata"].get("pmid", "unknown"),
                    # A falsy stored year (e.g. 0) is reported as None.
                    year=hit["metadata"].get("year") or None,
                    title=hit["metadata"].get("title"),
                    chunk_text=hit["text"],
                    criteria_relevance=[criterion],
                    score=hit.get("score"),
                )
            )
        return found

    def retrieve_for_criteria(
        self, variant_id: str, hgvs: str, criteria: list[str], top_k: int | None = None
    ) -> dict[str, list[LiteratureChunk]]:
        """Run `retrieve_for_criterion` for each criterion, keyed by criterion code."""
        by_criterion: dict[str, list[LiteratureChunk]] = {}
        for code in criteria:
            by_criterion[code] = self.retrieve_for_criterion(variant_id, hgvs, code, top_k=top_k)
        return by_criterion
|
backend/app/services/repository.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Persistence layer that bridges Pydantic schemas <-> SQLAlchemy records.
|
| 2 |
+
|
| 3 |
+
The repository is the single point that writes a `ClassificationResult` to
|
| 4 |
+
the audit-trail database and reads it back. Keeping it isolated from the
|
| 5 |
+
pipeline means the pipeline can run dry (no DB) for tests and one-off CLI
|
| 6 |
+
runs, while the FastAPI router persists every successful classification.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from datetime import UTC, datetime
|
| 12 |
+
|
| 13 |
+
from sqlalchemy.orm import Session
|
| 14 |
+
|
| 15 |
+
from backend.app.models.classification import ClassificationRecord, CriterionRecord
|
| 16 |
+
from backend.app.models.variant import VariantRecord
|
| 17 |
+
from backend.app.schemas.classification import ClassificationResult
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class ClassificationRepository:
|
| 21 |
+
def __init__(self, db: Session) -> None:
|
| 22 |
+
self.db = db
|
| 23 |
+
|
| 24 |
+
def save(self, result: ClassificationResult) -> ClassificationResult:
|
| 25 |
+
v = result.variant
|
| 26 |
+
variant_record = VariantRecord(
|
| 27 |
+
raw_input=v.raw_input,
|
| 28 |
+
hgvs_genomic=v.hgvs_genomic,
|
| 29 |
+
hgvs_coding=v.hgvs_coding,
|
| 30 |
+
hgvs_protein=v.hgvs_protein,
|
| 31 |
+
transcript=v.transcript,
|
| 32 |
+
gene_symbol=v.gene_symbol,
|
| 33 |
+
chromosome=v.chromosome,
|
| 34 |
+
position=v.position,
|
| 35 |
+
normalization_source=v.normalization_source,
|
| 36 |
+
warnings=v.warnings,
|
| 37 |
+
)
|
| 38 |
+
self.db.add(variant_record)
|
| 39 |
+
self.db.flush() # populate variant_record.id
|
| 40 |
+
|
| 41 |
+
cls = result.classification
|
| 42 |
+
record = ClassificationRecord(
|
| 43 |
+
variant_id=variant_record.id,
|
| 44 |
+
significance=cls.significance,
|
| 45 |
+
confidence=cls.confidence,
|
| 46 |
+
triggered_criteria=list(cls.triggered_criteria),
|
| 47 |
+
conflicting_evidence=cls.conflicting_evidence,
|
| 48 |
+
ruleset_version=result.ruleset_version,
|
| 49 |
+
rationale=cls.rationale,
|
| 50 |
+
)
|
| 51 |
+
self.db.add(record)
|
| 52 |
+
self.db.flush()
|
| 53 |
+
|
| 54 |
+
for c in result.evidence.criteria:
|
| 55 |
+
self.db.add(CriterionRecord(
|
| 56 |
+
classification_id=record.id,
|
| 57 |
+
code=c.code,
|
| 58 |
+
triggered=c.triggered,
|
| 59 |
+
strength=c.strength,
|
| 60 |
+
source=c.source,
|
| 61 |
+
evidence_text=c.evidence_text,
|
| 62 |
+
confidence=c.confidence,
|
| 63 |
+
pmid=c.pmid,
|
| 64 |
+
caveat=c.caveat,
|
| 65 |
+
curator_override=c.curator_override,
|
| 66 |
+
override_justification=c.override_justification,
|
| 67 |
+
))
|
| 68 |
+
|
| 69 |
+
self.db.commit()
|
| 70 |
+
self.db.refresh(record)
|
| 71 |
+
|
| 72 |
+
return result.model_copy(update={
|
| 73 |
+
"id": record.id,
|
| 74 |
+
"analysed_at": record.created_at.replace(tzinfo=UTC).isoformat()
|
| 75 |
+
if record.created_at
|
| 76 |
+
else datetime.now(UTC).isoformat(),
|
| 77 |
+
})
|
| 78 |
+
|
| 79 |
+
def get(self, classification_id: str) -> ClassificationRecord | None:
|
| 80 |
+
return self.db.get(ClassificationRecord, classification_id)
|
backend/app/services/vep.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Ensembl VEP REST client — enriches HGVS-coding input with genomic coords.
|
| 2 |
+
|
| 3 |
+
Mutalyzer v3 normalizes the c./p. forms but returns nothing for chr/pos/ref/alt.
|
| 4 |
+
Without those fields, REVEL/AlphaMissense/gnomAD all silently no-op, leaving
|
| 5 |
+
the rule engine blind to common pathogenicity signals (PP3/BP4/PM2 from AF).
|
| 6 |
+
|
| 7 |
+
VEP's REST API solves this for free (no key, ~3 req/s polite-use cap).
|
| 8 |
+
For each HGVS coding string, it returns:
|
| 9 |
+
- chrom, position, allele_string (ref/alt for SNVs)
|
| 10 |
+
- most_severe_consequence (Sequence Ontology term)
|
| 11 |
+
- per-transcript hgvsc, hgvsp, gene_symbol, transcript_id
|
| 12 |
+
|
| 13 |
+
We treat VEP as best-effort — if it fails we still have whatever Mutalyzer
|
| 14 |
+
already populated, and the pipeline continues.
|
| 15 |
+
"""
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import logging
|
| 19 |
+
|
| 20 |
+
import httpx
|
| 21 |
+
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
|
| 22 |
+
|
| 23 |
+
from backend.app.schemas.variant import NormalizedVariant
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
VEP_BASE = "https://rest.ensembl.org"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class VEPClient:
    """Best-effort async client for the Ensembl VEP REST ``hgvs`` endpoint."""

    def __init__(self, base_url: str | None = None, timeout: float = 15.0) -> None:
        """Store the endpoint root (defaults to ``VEP_BASE``) and request timeout."""
        self.base_url = base_url or VEP_BASE
        self.timeout = timeout

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(min=1, max=8),
        retry=retry_if_exception_type((httpx.HTTPStatusError, httpx.TimeoutException)),
        reraise=True,
    )
    async def annotate_hgvs(self, hgvs: str) -> dict | None:
        """Request VEP annotation for one HGVS string.

        Returns:
            The first element of the JSON list VEP returns, or None when VEP
            answers 400 or the payload is not a non-empty list.

        Raises:
            httpx.HTTPStatusError: for any other non-success status, after
                the retry attempts are exhausted.
            httpx.TimeoutException: when every attempt times out.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import quote

        # HGVS may legitimately contain "?", "#" or "/" (e.g. "p.Met1?"). Left
        # raw, those would be parsed as query/fragment/path separators.
        path_segment = quote(hgvs, safe=":")
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            r = await client.get(
                f"{self.base_url}/vep/human/hgvs/{path_segment}",
                # Both flags default to off on the REST endpoint. Without them
                # the `canonical` marker and `hgvsp` that enrich() reads are
                # never present in the response.
                params={"canonical": 1, "hgvs": 1},
                headers={"Accept": "application/json"},
            )
            if r.status_code == 400:
                # VEP can't parse some normalized forms (e.g. complex indels) — give up gracefully
                logger.debug("VEP rejected %s: %s", hgvs, r.text[:200])
                return None
            r.raise_for_status()
            data = r.json()
            return data[0] if isinstance(data, list) and data else None

    @staticmethod
    def _split_alleles(allele_string: str | None) -> tuple[str | None, str | None]:
        """Split VEP's `allele_string` into (ref, alt).

        Format examples:
            'G/A'   → ('G', 'A')   — SNV
            'TC/T'  → ('TC', 'T')  — deletion
            'T/TC'  → ('T', 'TC')  — insertion
            '-/C'   → ('', 'C')    — pure insertion (rare)
            'C/-'   → ('C', '')    — pure deletion (rare)
            'AT/CG' → ('AT', 'CG') — MNV

        Returns (None, None) when the string is empty or has no '/'. Only the
        first '/' splits, so a multi-allelic 'G/A/T' yields alt 'A/T'.
        """
        if not allele_string or "/" not in allele_string:
            return None, None
        ref, alt = allele_string.split("/", 1)
        ref = "" if ref == "-" else ref
        alt = "" if alt == "-" else alt
        return ref, alt

    async def enrich(self, normalized: NormalizedVariant) -> NormalizedVariant:
        """Enrich a NormalizedVariant with chrom/pos/ref/alt + transcript info.

        Only fills fields that are currently None — never overrides values
        Mutalyzer or the VCF parser already filled in. Any transport error,
        invalid URL or undecodable response is logged and the input is
        returned unchanged.

        Returns:
            The input unchanged when nothing could be added. Otherwise a copy
            with the new fields and an extra warning naming them.
        """
        # Choose the best HGVS string to send to VEP
        hgvs = normalized.hgvs_coding or normalized.hgvs_genomic or normalized.raw_input
        if not hgvs:
            return normalized
        try:
            data = await self.annotate_hgvs(hgvs)
        except (httpx.HTTPError, httpx.InvalidURL, ValueError) as e:
            # httpx.HTTPError covers status, transport and timeout errors;
            # ValueError covers a response body that is not valid JSON.
            logger.warning("VEP annotation failed for %s: %s", hgvs, e)
            return normalized
        if not data:
            return normalized

        updates: dict = {}
        if normalized.chromosome is None and data.get("seq_region_name"):
            updates["chromosome"] = str(data["seq_region_name"])
        if normalized.position is None and data.get("start"):
            updates["position"] = int(data["start"])

        ref, alt = self._split_alleles(data.get("allele_string"))
        if normalized.ref is None and ref is not None:
            updates["ref"] = ref
        if normalized.alt is None and alt is not None:
            updates["alt"] = alt

        if normalized.consequence is None and data.get("most_severe_consequence"):
            updates["consequence"] = data["most_severe_consequence"]

        # Pick the canonical transcript if available, else the first
        transcripts = data.get("transcript_consequences") or []
        if transcripts:
            canonical = next((t for t in transcripts if t.get("canonical")), transcripts[0])
            if normalized.gene_symbol is None and canonical.get("gene_symbol"):
                updates["gene_symbol"] = canonical["gene_symbol"]
            if normalized.transcript is None and canonical.get("transcript_id"):
                updates["transcript"] = canonical["transcript_id"]
            if normalized.hgvs_protein is None and canonical.get("hgvsp"):
                updates["hgvs_protein"] = canonical["hgvsp"]

        if not updates:
            return normalized
        # Copy the list so the input object's warnings are not mutated.
        warnings = list(normalized.warnings)
        warnings.append(f"VEP enriched: {', '.join(updates.keys())}")
        return normalized.model_copy(update={**updates, "warnings": warnings})
|
backend/app/worker.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from celery import Celery
|
| 2 |
+
|
| 3 |
+
from backend.app.config import get_settings
|
| 4 |
+
|
| 5 |
+
# Application settings, resolved once when this module is imported.
settings = get_settings()
|
| 6 |
+
|
| 7 |
+
# Celery application named "variantlens"; the broker and result-backend URLs
# are read from the application settings.
celery_app = Celery(
    "variantlens",
    backend=settings.celery_result_backend,
    broker=settings.celery_broker_url,
)

celery_app.conf.update(
    # JSON is the only accepted and emitted payload format.
    accept_content=["json"],
    task_serializer="json",
    result_serializer="json",
    # Schedule and timestamps are kept in UTC.
    enable_utc=True,
    timezone="UTC",
    # Report tasks as started once a worker begins executing them.
    task_track_started=True,
)
|
backend/tests/__init__.py
ADDED
|
File without changes
|