| """ |
| week4_evaluation.py |
| -------------------- |
| Week 4 — Autonomous Evaluation of RareDx Pipeline |
| |
| Strategy: |
| 1. Download RAMEDIS.jsonl from HuggingFace (chenxz/RareBench) |
| - Cases have HPO IDs + ORPHA codes — exact format we need |
| - Also fetch phenotype_mapping.json to convert HP IDs -> names |
| 2. Fall back to internal pipeline validation cases if download fails |
| - Label output as "Internal Pipeline Validation" (not a benchmark) |
| 3. Run cases through DiagnosisPipeline |
| 4. Compute Recall@1, Recall@3, Recall@5 |
| 5. Write backend/reports/week4_evaluation.md |
| |
| Fully autonomous — makes all decisions, no prompts. |
| """ |

from __future__ import annotations

import io
import json
import random
import sys
import time
import urllib.request
import zipfile
from datetime import datetime
from pathlib import Path
from typing import Optional

# Force UTF-8 console output; guard against streams that have already been
# wrapped and therefore expose no .buffer attribute.
if hasattr(sys.stdout, "buffer"):
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
if hasattr(sys.stderr, "buffer"):
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")

# Repo root, assuming this file sits at backend/scripts/; make backend
# modules importable regardless of the working directory.
ROOT = Path(__file__).parents[2]
sys.path.insert(0, str(ROOT / "backend" / "scripts"))
sys.path.insert(0, str(ROOT / "backend" / "api"))
sys.path.insert(0, str(ROOT / "backend"))

REPORTS_DIR = ROOT / "backend" / "reports"
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Published baselines on RAMEDIS, as cited in write_report below.
# Used only for the report's comparison table.
DEEPRARE_METRICS = {
    "DeepRare": {"R@1": 0.37, "R@3": 0.54, "R@5": 0.62},
    "LIRICAL": {"R@1": 0.29, "R@3": 0.46, "R@5": 0.54},
    "Phrank": {"R@1": 0.22, "R@3": 0.38, "R@5": 0.47},
    "AMELIE": {"R@1": 0.19, "R@3": 0.33, "R@5": 0.41},
    "Phenomizer": {"R@1": 0.14, "R@3": 0.25, "R@5": 0.33},
}

# RareBench download endpoints.
HF_DATA_ZIP = "https://huggingface.co/datasets/chenxz/RareBench/resolve/main/data.zip"
HF_PHEN_MAP = "https://raw.githubusercontent.com/chenxz1111/RareBench/main/mapping/phenotype_mapping.json"
HF_DIS_MAP = "https://raw.githubusercontent.com/chenxz1111/RareBench/main/mapping/disease_mapping.json"
RAMEDIS_FILE = "data/RAMEDIS.jsonl"


def _fetch_bytes(url: str, timeout: int = 30) -> Optional[bytes]:
    """GET a URL and return raw bytes, or None on any failure."""
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "RareDx/1.0"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read()
    except Exception as exc:
        print(f" [warn] {url[:70]} → {exc}")
        return None


def fetch_phenotype_map() -> dict[str, str]:
    """HP:XXXXXXX -> human-readable term name."""
    print(" Fetching phenotype_mapping.json...")
    raw = _fetch_bytes(HF_PHEN_MAP)
    if raw:
        data = json.loads(raw.decode("utf-8"))
        print(f" Phenotype map: {len(data):,} HPO entries.")
        return data
    print(" Phenotype map unavailable; will use raw HP IDs in notes.")
    return {}


def fetch_disease_map() -> dict[str, str]:
    """ORPHA:XXXX -> disease name (first alias before '/')."""
    print(" Fetching disease_mapping.json...")
    raw = _fetch_bytes(HF_DIS_MAP)
    if raw:
        raw_map: dict = json.loads(raw.decode("utf-8"))

        result = {}
        for k, v in raw_map.items():
            if k.startswith("ORPHA:"):
                orpha_num = k.replace("ORPHA:", "")
                result[orpha_num] = v.split("/")[0].strip()
        print(f" Disease map: {len(result):,} ORPHA entries.")
        return result
    print(" Disease map unavailable.")
    return {}


def fetch_ramedis_cases(
    phen_map: dict[str, str],
    dis_map: dict[str, str],
    max_cases: int = 30,
) -> Optional[list[dict]]:
    """
    Download RAMEDIS.jsonl from the HuggingFace data.zip.

    Each JSONL record:
        Phenotype: [HP:0001522, HP:0001942, ...]
        RareDisease: [OMIM:251000, ORPHA:27, ...]
        Department: str | None

    Uses stratified sampling — one case per unique ORPHA code — to avoid
    the sample being dominated by a single high-frequency disease.

    Returns list[{note, orpha_code, disease_name, hpo_ids, source}]
    or None on failure.
    """
    print(" Downloading RareBench data.zip from HuggingFace...")
    raw = _fetch_bytes(HF_DATA_ZIP, timeout=60)
    if not raw:
        return None

    try:
        zf = zipfile.ZipFile(io.BytesIO(raw))
    except Exception as exc:
        print(f" [warn] Could not open zip: {exc}")
        return None

    if RAMEDIS_FILE not in zf.namelist():
        print(f" [warn] {RAMEDIS_FILE} not found in zip. Contents: {zf.namelist()}")
        return None

    lines = zf.read(RAMEDIS_FILE).decode("utf-8").strip().split("\n")
    print(f" RAMEDIS.jsonl: {len(lines)} raw cases.")

    # Group usable cases by ORPHA code for stratified sampling.
    by_disease: dict[str, list[dict]] = {}
    skipped = 0
    for line in lines:
        rec = json.loads(line)
        hpo_ids = rec.get("Phenotype", [])
        disease_codes = rec.get("RareDisease", [])

        # Keep the first ORPHA code; records without one are unusable here.
        orpha_code = None
        for code in disease_codes:
            if str(code).startswith("ORPHA:"):
                orpha_code = str(code).replace("ORPHA:", "")
                break
        if not orpha_code or not hpo_ids:
            skipped += 1
            continue

        # Resolve HP IDs to names; fall back to the raw ID when unmapped.
        term_names = [phen_map.get(h, h) for h in hpo_ids]
        note = ", ".join(term_names)
        disease_name = dis_map.get(orpha_code, f"ORPHA:{orpha_code}")

        entry = {
            "note": note,
            "orpha_code": orpha_code,
            "disease_name": disease_name,
            "hpo_ids": hpo_ids,
            "source": "RareBench-RAMEDIS",
        }
        by_disease.setdefault(orpha_code, []).append(entry)

    if skipped:
        print(f" Skipped {skipped} cases (no ORPHA code or no phenotypes).")

    unique_diseases = len(by_disease)
    total_usable = sum(len(v) for v in by_disease.values())
    print(f" {total_usable} usable cases across {unique_diseases} unique diseases.")

    # One random case per disease, fixed seed for reproducibility,
    # then truncate to max_cases.
    random.seed(42)
    one_per_disease = [random.choice(v) for v in by_disease.values()]
    random.shuffle(one_per_disease)
    cases = one_per_disease[:max_cases]

    print(
        f" Stratified sample: {len(cases)} cases "
        f"({len(cases)} unique diseases, max 1 case each)."
    )
    return cases if cases else None


def build_internal_cases(n: int = 28) -> list[dict]:
    """
    Fallback: build synthetic validation cases from the graph store.
    Cases are labelled 'internal' so the report is framed honestly.
    """
    from graph_store import LocalGraphStore

    print(" Building internal validation cases from graph store...")
    store = LocalGraphStore()

    # Collect diseases that have enough high-frequency phenotype edges.
    qualified: list[tuple[str, str, list[str]]] = []
    for nid, attrs in store.graph.nodes(data=True):
        if attrs.get("type") != "Disease":
            continue
        orpha_code = attrs.get("orpha_code", "")
        name = attrs.get("name", "")
        if not orpha_code or not name:
            continue

        freq_terms: list[tuple[int, str]] = []
        for nbr, edge_data in store.graph[nid].items():
            nbr_attrs = store.graph.nodes[nbr]
            if (
                nbr_attrs.get("type") == "HPOTerm"
                and edge_data.get("label") == "MANIFESTS_AS"
                # frequency_order <= 2 keeps only very-frequent/frequent
                # terms (see the report methodology section below)
                and edge_data.get("frequency_order", 9) <= 2
            ):
                term_name = nbr_attrs.get("term") or nbr_attrs.get("name", "")
                if term_name:
                    freq_terms.append((edge_data.get("frequency_order", 9), term_name))

        if len(freq_terms) >= 5:
            freq_terms.sort(key=lambda x: x[0])
            term_names = [t for _, t in freq_terms[:10]]
            qualified.append((str(orpha_code), name, term_names))

    print(f" {len(qualified)} diseases qualify (>=5 very/frequent HPO terms).")
    random.seed(42)
    sampled = random.sample(qualified, min(n, len(qualified)))

    cases = []
    for orpha_code, name, terms in sampled:
        cases.append({
            "note": ", ".join(terms[:8]),
            "orpha_code": orpha_code,
            "disease_name": name,
            "source": "internal",
        })

    print(f" Built {len(cases)} internal validation cases.")
    return cases


def recall_at_k(candidates: list[dict], true_code: str, k: int) -> bool:
    """True if the ground-truth ORPHA code appears in the top-k candidates."""
    for c in candidates[:k]:
        if str(c.get("orpha_code", "")) == str(true_code):
            return True
    return False
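

# Worked example (illustrative candidate list):
#   candidates = [{"orpha_code": "27"}, {"orpha_code": "79254"}]
#   recall_at_k(candidates, "79254", k=1)  -> False  (true code is ranked 2nd)
#   recall_at_k(candidates, "79254", k=3)  -> True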


def run_evaluation(cases: list[dict], pipeline) -> dict:
    """Run every case through the pipeline and tally Recall@{1,3,5}."""
    hits = {1: 0, 3: 0, 5: 0}
    total = len(cases)
    results_detail = []

    print(f"\n Running {total} cases through pipeline...")
    for i, case in enumerate(cases, 1):
        true_code = str(case["orpha_code"])
        note = case["note"]
        label = case.get("disease_name", f"ORPHA:{true_code}")

        t0 = time.time()
        try:
            result = pipeline.diagnose(note, top_n=10, threshold=0.50)
            candidates = result.get("candidates", [])
            elapsed = round(time.time() - t0, 2)

            r1 = recall_at_k(candidates, true_code, 1)
            r3 = recall_at_k(candidates, true_code, 3)
            r5 = recall_at_k(candidates, true_code, 5)

            if r1:
                hits[1] += 1
            if r3:
                hits[3] += 1
            if r5:
                hits[5] += 1

            # 1-based rank of the true disease, or None if not in the top 10.
            found_rank = next(
                (j for j, c in enumerate(candidates, 1)
                 if str(c.get("orpha_code", "")) == true_code),
                None,
            )
            top_name = candidates[0].get("name", "—") if candidates else "—"
            status = "HIT@1" if r1 else ("HIT@3" if r3 else ("HIT@5" if r5 else "MISS"))

            print(
                f" [{i:>2}/{total}] {status:<7} rank={str(found_rank or '-'):>2} "
                f"{label[:40]:<40} ({elapsed}s)"
            )
            results_detail.append({
                "case_id": i,
                "orpha_code": true_code,
                "disease_name": label,
                "source": case.get("source", ""),
                "note_preview": note[:100],
                "found_rank": found_rank,
                "hit_at_1": r1,
                "hit_at_3": r3,
                "hit_at_5": r5,
                "top_pred": top_name,
                "elapsed_s": elapsed,
                "hpo_count": len(result.get("hpo_matches", [])),
            })
        except Exception as exc:
            elapsed = round(time.time() - t0, 2)
            print(f" [{i:>2}/{total}] ERROR {label[:40]:<40} {exc}")
            results_detail.append({
                "case_id": i,
                "orpha_code": true_code,
                "disease_name": label,
                "source": case.get("source", ""),
                "error": str(exc),
                "elapsed_s": elapsed,
            })

    return {
        "total": total,
        "R@1": round(hits[1] / total, 4) if total else 0.0,
        "R@3": round(hits[3] / total, 4) if total else 0.0,
        "R@5": round(hits[5] / total, 4) if total else 0.0,
        "hits_1": hits[1],
        "hits_3": hits[3],
        "hits_5": hits[5],
        "detail": results_detail,
    }
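

# Illustrative shape of run_evaluation()'s return value (numbers hypothetical,
# but internally consistent: 9/30 = 0.3, 15/30 = 0.5, 18/30 = 0.6):
#   {"total": 30, "R@1": 0.3, "R@3": 0.5, "R@5": 0.6,
#    "hits_1": 9, "hits_3": 15, "hits_5": 18, "detail": [...]}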


def write_report(metrics: dict, cases: list[dict]) -> Path:
    """Render the markdown report and return its path."""
    now = datetime.now().strftime("%Y-%m-%d %H:%M")
    total = metrics["total"]
    r1, r3, r5 = metrics["R@1"], metrics["R@3"], metrics["R@5"]
    h1, h3, h5 = metrics["hits_1"], metrics["hits_3"], metrics["hits_5"]

    source_tag = cases[0].get("source", "") if cases else "unknown"
    is_rarebench = source_tag == "RareBench-RAMEDIS"

    def bar(v: float, width: int = 20) -> str:
        filled = round(v * width)
        return "█" * filled + "░" * (width - filled)
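    # e.g. bar(0.37) -> "███████░░░░░░░░░░░░░"  (round(0.37 * 20) = 7 filled)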

    def pct(v: float) -> str:
        return f"{v * 100:.1f}%"

    # Title, evaluation-set blurb, caveat, and methodology depend on the source.
    if is_rarebench:
        title = "# RareDx — Week 4 Evaluation Report (RareBench-RAMEDIS)"
        eval_set_blurb = (
            f"**Evaluation set:** {total} cases sampled from "
            f"[RareBench-RAMEDIS](https://huggingface.co/datasets/chenxz/RareBench) "
            f"(624 total cases, 74 rare diseases)\n"
            f"**Case format:** HPO term names → ORPHA ground-truth code\n"
            f"**Source:** Feng et al., "
            f"ACM KDD 2024 — real clinician-recorded phenotypes"
        )
        comparison_caveat = (
            "> **Comparison note:** DeepRare and the baselines were evaluated on the full RAMEDIS set "
            "(382–624 cases, depending on the evaluation) using gene + variant data in addition to "
            "phenotype, giving them a significant advantage. RareDx uses phenotype-only input. "
            f"This run uses {total} randomly sampled cases; results may vary vs. full-set evaluation."
        )
        methodology_section = f"""**RareBench-RAMEDIS methodology:**
Each case provides a list of HPO term IDs representing a real patient's documented phenotype.
Ground truth is the corresponding Orphanet disease code.

Clinical notes were built by resolving HP IDs to human-readable term names via the
RareBench phenotype mapping ({HF_PHEN_MAP}).
The pipeline ingests these term names exactly as it would a free-text clinical note.

**Limitations:**
- {total} of 624 RAMEDIS cases used (random sample, seed=42)
- HP term names are the *only* input — no free-text narrative context
- DeepRare baselines use gene panel + phenotype; direct Recall@k comparison is indicative
- Full-set evaluation on all 624 cases is future work
"""
    else:
        title = "# RareDx — Week 4: Internal Pipeline Validation"
        eval_set_blurb = (
            f"**Evaluation type:** Internal pipeline validation — **NOT** an external benchmark\n"
            f"**Cases:** {total} synthetic cases built from the Orphanet knowledge graph\n"
            f"**Status:** RareBench-RAMEDIS was unavailable; external evaluation is future work"
        )
        comparison_caveat = (
            "> **Important:** The RareBench-RAMEDIS dataset could not be downloaded. "
            "The numbers below reflect internal self-consistency testing, not external generalisation. "
            "The benchmark comparison table is shown for structural reference only — "
            "**do not interpret these results as comparable to published numbers.**"
        )
        methodology_section = """**Internal pipeline validation methodology:**
Cases were built by sampling diseases with ≥5 very-frequent or frequent HPO terms from
the Orphanet knowledge graph. Clinical notes consist of up to 8 HPO term names sorted
by frequency — the classic features of each disease.

**Why this inflates Recall@k:**
Test notes are derived from the same knowledge source used for retrieval (Orphanet HPO
associations → graph store → ChromaDB embeddings). The pipeline effectively retrieves
what it was indexed on. This is a *pipeline integration test* — it verifies that the
embedding, graph traversal, RRF fusion, and hallucination guard work together correctly,
but it does not measure generalisation to unseen clinical notes.

**External evaluation (future work):**
Run against RareBench-RAMEDIS (HuggingFace: `chenxz/RareBench`, 624 real cases)
once network access is confirmed, or against LIRICAL / HMS datasets for cross-benchmark coverage.
"""

    # Per-case table rows.
    case_rows = []
    for d in metrics["detail"]:
        if "error" in d:
            case_rows.append(
                f"| {d['case_id']:>3} | {d['orpha_code']:<8} | "
                f"{d['disease_name'][:35]:<35} | ERR | ERR | ERR | — | {d.get('error', '')[:30]} |"
            )
        else:
            h1s = "✓" if d["hit_at_1"] else " "
            h3s = "✓" if d["hit_at_3"] else " "
            h5s = "✓" if d["hit_at_5"] else " "
            rk = str(d["found_rank"]) if d["found_rank"] else "—"
            case_rows.append(
                f"| {d['case_id']:>3} | {d['orpha_code']:<8} | "
                f"{d['disease_name'][:35]:<35} "
                f"| {h1s:^3} | {h3s:^3} | {h5s:^3} | {rk:>2} | {d['top_pred'][:30]} |"
            )

    # Cases missed entirely (true disease not in the top 5).
    misses = [d for d in metrics["detail"] if not d.get("hit_at_5") and "error" not in d]
    miss_section = ""
    if misses:
        miss_lines = [
            f"- **ORPHA:{m['orpha_code']}** {m['disease_name']} "
            f"→ predicted: *{m.get('top_pred', '—')}*"
            for m in misses[:15]
        ]
        # Trailing blank line so the following "## Pipeline Configuration"
        # heading is not glued to the horizontal rule.
        miss_section = "### Missed Cases (not in top 5)\n\n" + "\n".join(miss_lines) + "\n\n---\n\n"

    # Benchmark comparison rows, with our system bolded.
    all_systems = {"RareDx (ours)": {"R@1": r1, "R@3": r3, "R@5": r5}, **DEEPRARE_METRICS}
    bench_rows = []
    for sys_name, m in all_systems.items():
        bold = "**" if sys_name == "RareDx (ours)" else ""
        bench_rows.append(
            f"| {bold}{sys_name}{bold} | {bold}{pct(m['R@1'])}{bold} "
            f"| {bold}{pct(m['R@3'])}{bold} | {bold}{pct(m['R@5'])}{bold} |"
        )

    report = f"""{title}

**Generated:** {now}
**Pipeline:** DiagnosisPipeline v3.1 (BioLORD-2023 + LocalGraphStore + FusionNode)
{eval_set_blurb}
**Threshold:** 0.50 | **Top-N:** 10

---

## Results

| Metric | Value | Hits / Total | Visual |
|--------|-------|-------------|--------|
| Recall@1 | **{pct(r1)}** | {h1}/{total} | `{bar(r1)}` |
| Recall@3 | **{pct(r3)}** | {h3}/{total} | `{bar(r3)}` |
| Recall@5 | **{pct(r5)}** | {h5}/{total} | `{bar(r5)}` |

---

## Benchmark Comparison

{comparison_caveat}

> DeepRare, LIRICAL, Phrank, AMELIE, Phenomizer: Feng et al. (ACM KDD 2024), RAMEDIS dataset (382 cases).

| System | Recall@1 | Recall@3 | Recall@5 |
|--------|----------|----------|----------|
"""
    report += "\n".join(bench_rows)

    if is_rarebench:
        lir = DEEPRARE_METRICS["LIRICAL"]
        gap1 = r1 - lir["R@1"]
        gap5 = r5 - lir["R@5"]
        # Blank line before the heading so markdown does not attach it
        # to the preceding table row.
        gap_str = (
            f"\n\n### vs LIRICAL (closest phenotype-only baseline)\n\n"
            f"- Recall@1: {'ahead' if gap1 >= 0 else 'behind'} by **{abs(gap1) * 100:.1f} pp** "
            f"({'+' if gap1 >= 0 else ''}{gap1 * 100:.1f})\n"
            f"- Recall@5: {'ahead' if gap5 >= 0 else 'behind'} by **{abs(gap5) * 100:.1f} pp** "
            f"({'+' if gap5 >= 0 else ''}{gap5 * 100:.1f})\n"
        )
        report += gap_str

    report += f"""

---

## Per-Case Results

| # | ORPHA | Disease | @1 | @3 | @5 | Rank | Top Prediction |
|---|-------|---------|----|----|----|----|----------------|
"""
    report += "\n".join(case_rows)
    report += f"""

---

{miss_section}## Pipeline Configuration

| Component | Detail |
|-----------|--------|
| Embedding model | FremyCompany/BioLORD-2023 (768-dim) |
| HPO index | 8,701 terms |
| Graph store | LocalGraphStore — 11,456 diseases, 115,839 MANIFESTS_AS edges |
| ChromaDB | Persistent embedded (HPO-enriched embeddings) |
| Symptom parser threshold | 0.55 (multi-word), 0.82 (single-word) |
| RRF K | 60 |
| Hallucination guard | FusionNode (min_graph=2, min_sim=0.65, require_frequent=True) |

---

## Methodology

{methodology_section}
---

*Generated by week4_evaluation.py — RareDx Week 4*
"""

    out_path = REPORTS_DIR / "week4_evaluation.md"
    out_path.write_text(report, encoding="utf-8")
    return out_path


def main() -> None:
    print("=" * 70)
    print("RareDx — Week 4 Autonomous Evaluation")
    print("=" * 70)

    print("\n[1/4] Fetching phenotype and disease name mappings...")
    phen_map = fetch_phenotype_map()
    dis_map = fetch_disease_map()

    print("\n[2/4] Acquiring evaluation cases...")
    cases = fetch_ramedis_cases(phen_map, dis_map, max_cases=30)
    if cases:
        source_label = f"RareBench-RAMEDIS ({len(cases)} cases)"
    else:
        print(" RareBench unavailable — falling back to internal validation.")
        cases = build_internal_cases(n=28)
        source_label = f"Internal validation ({len(cases)} cases)"

    print("\n[3/4] Loading DiagnosisPipeline...")
    from api.pipeline import DiagnosisPipeline
    pipeline = DiagnosisPipeline()

    print("\n[4/4] Running evaluation...")
    t0 = time.time()
    metrics = run_evaluation(cases, pipeline)
    elapsed = round(time.time() - t0, 1)

    out_path = write_report(metrics, cases)

    total = metrics["total"]
    print("\n" + "=" * 70)
    print("RESULTS")
    print("=" * 70)
    print(f" Source : {source_label}")
    print(f" Cases evaluated : {total}")
    print(f" Recall@1 : {metrics['R@1'] * 100:.1f}% ({metrics['hits_1']}/{total})")
    print(f" Recall@3 : {metrics['R@3'] * 100:.1f}% ({metrics['hits_3']}/{total})")
    print(f" Recall@5 : {metrics['R@5'] * 100:.1f}% ({metrics['hits_5']}/{total})")
    print(f" Elapsed : {elapsed}s")
    print(f"\n Report : {out_path}")
    print()
    print(" DeepRare (gene+phen, RAMEDIS): R@1=37% R@3=54% R@5=62%")
    print(" LIRICAL (phen-only, RAMEDIS): R@1=29% R@3=46% R@5=54%")
    print()


if __name__ == "__main__":
    main()
|