| """ |
| symptom_parser.py |
| ----------------- |
| Maps free-text clinical symptoms to HPO term IDs using BioLORD-2023 |
| semantic similarity — no string matching, no exact-name lookup. |
| |
| Algorithm: |
| 1. Build an HPO embedding index: embed all 8,701 HPO terms with BioLORD. |
| 2. Segment the clinical note into candidate phrases. |
| 3. Embed each phrase and find the nearest HPO term by cosine similarity. |
| 4. Return matches above a confidence threshold. |
| |
| The index is cached to disk so it only needs to be built once. |
| |
| Can be used as a module (SymptomParser class) or as a CLI: |
| python symptom_parser.py "tall stature, displaced lens, heart murmur" |
| """ |
|
|
| import io |
| import json |
| import sys |
| import re |
| from dataclasses import dataclass |
| from pathlib import Path |
|
|
| import numpy as np |
| from sentence_transformers import SentenceTransformer |
| from dotenv import load_dotenv |
|
|
| load_dotenv(Path(__file__).parents[2] / ".env") |
|
|
# On-disk cache for the HPO embedding index (built once by build_hpo_index).
INDEX_DIR = Path(__file__).parents[2] / "data" / "hpo_index"
# float32 embedding matrix, one row per HPO term (saved via np.save).
EMBED_FILE = INDEX_DIR / "embeddings.npy"
# JSON list of {"hpo_id": ..., "term": ...} dicts, parallel to the embedding rows.
TERMS_FILE = INDEX_DIR / "terms.json"

# Minimum cosine similarity for a multi-word phrase to count as an HPO match.
DEFAULT_THRESHOLD = 0.55

# Single words are ambiguous, so they are held to a stricter similarity cutoff
# (see SymptomParser.parse).
SINGLE_WORD_THRESHOLD = 0.82
|
|
|
|
@dataclass
class HPOMatch:
    """One phrase-to-HPO-term match produced by SymptomParser.parse()."""

    # The candidate phrase extracted from the clinical note.
    phrase: str
    # HPO identifier of the best-matching term.
    hpo_id: str
    # Human-readable name of the matched HPO term.
    term: str
    # Cosine similarity between phrase and term embeddings, rounded to 4 decimals.
    score: float
|
|
|
|
| |
| |
| |
|
|
def build_hpo_index(model: SentenceTransformer) -> tuple[np.ndarray, list[dict]]:
    """
    Embed all HPOTerm nodes from the graph store and cache the index to disk.

    Parameters
    ----------
    model : SentenceTransformer
        Sentence-embedding model (BioLORD) used to encode the term names.

    Returns
    -------
    tuple[np.ndarray, list[dict]]
        (embeddings, terms): a float32 [N, D] matrix of L2-normalized term
        embeddings, and the parallel list of {"hpo_id": ..., "term": ...} dicts.

    Raises
    ------
    RuntimeError
        If the graph store contains no HPOTerm nodes.
    """
    # Local import: graph_store sits next to this file and may not be on
    # sys.path when this module is run as a script.
    sys.path.insert(0, str(Path(__file__).parent))
    from graph_store import LocalGraphStore

    store = LocalGraphStore()
    terms = [
        {"hpo_id": attrs["hpo_id"], "term": attrs["term"]}
        for _, attrs in store.graph.nodes(data=True)
        if attrs.get("type") == "HPOTerm"
    ]

    if not terms:
        raise RuntimeError("No HPOTerm nodes in graph store. Run ingest_hpo.py first.")

    print(f" Building HPO index for {len(terms):,} terms...")
    texts = [t["term"] for t in terms]
    embeddings = model.encode(
        texts,
        batch_size=128,
        show_progress_bar=True,
        # Unit-length rows make a plain dot product equal cosine similarity.
        normalize_embeddings=True,
    )
    # Cast to float32 once; the original cast twice, materializing two copies
    # (one for the save, one for the return).
    embeddings = np.asarray(embeddings, dtype=np.float32)

    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    np.save(str(EMBED_FILE), embeddings)
    TERMS_FILE.write_text(json.dumps(terms, ensure_ascii=False), encoding="utf-8")
    print(f" Index saved to {INDEX_DIR}")

    return embeddings, terms
|
|
|
|
def load_hpo_index(model: SentenceTransformer, force_rebuild: bool = False):
    """Return (embeddings, terms), preferring the on-disk cache.

    Reads EMBED_FILE / TERMS_FILE when both exist; otherwise (or when a
    rebuild is explicitly requested) delegates to build_hpo_index().
    """
    cache_ready = EMBED_FILE.exists() and TERMS_FILE.exists()
    if force_rebuild or not cache_ready:
        return build_hpo_index(model)

    vectors = np.load(str(EMBED_FILE))
    term_list = json.loads(TERMS_FILE.read_text(encoding="utf-8"))
    return vectors, term_list
|
|
|
|
| |
| |
| |
|
|
| |
| |
# Phrase boundaries: punctuation plus a few coordinating words.
_SPLIT_RE = re.compile(r"[,;]|\band\b|\bwith\b|\bplus\b", re.IGNORECASE)

# Whole-phrase filler we never want to embed: ages ("18 year old"),
# demographics, narration verbs, stop words, and laterality tokens.
# Anchored ^...$ so it only removes phrases consisting entirely of one
# such token (or an age expression).
_SKIP_RE = re.compile(
    r"^\s*("
    r"\d+[\s-]*(year|month|week|day|yr|mo)s?[\s-]*(old)?"
    r"|male|female|man|woman|boy|girl"
    r"|patient|presents?|has|have|had|history|noted"
    r"|found|showing|revealed|demonstrated"
    r"|with|and|the|a|an|of|in|on|at|to|by"
    r"|left|right|bilateral|unilateral"
    r")\s*$",
    re.IGNORECASE,
)


def segment_note(note: str) -> list[str]:
    """
    Split a clinical note into candidate symptom phrases.

    Single words are allowed through (unlike before) but will be held to
    a higher BioLORD similarity threshold in SymptomParser.parse().
    Demographic / filler tokens are still stripped by _SKIP_RE.

    Fix: whitespace is re-stripped after dropping trailing periods, so an
    input like "heart murmur ." yields "heart murmur" rather than a phrase
    with a trailing space.
    """
    phrases = []
    for raw in _SPLIT_RE.split(note):
        phrase = raw.strip().rstrip(".").strip()
        if phrase and not _SKIP_RE.match(phrase):
            phrases.append(phrase)
    return phrases
|
|
|
|
| |
| |
| |
|
|
class SymptomParser:
    """
    Maps free-text clinical notes to HPO term matches using BioLORD embeddings.

    Usage:
        parser = SymptomParser(model)
        matches = parser.parse("tall stature, displaced lens, heart murmur")
    """

    def __init__(
        self,
        model: SentenceTransformer,
        threshold: float = DEFAULT_THRESHOLD,
        force_rebuild: bool = False,
    ) -> None:
        self.model = model
        self.threshold = threshold
        print("Loading HPO embedding index...")
        self.embeddings, self.terms = load_hpo_index(model, force_rebuild)
        print(f" Index ready: {len(self.terms):,} HPO terms, "
              f"dim={self.embeddings.shape[1]}")

    def parse(self, clinical_note: str) -> list[HPOMatch]:
        """
        Parse a clinical note and return HPO matches above threshold.
        Deduplicates by HPO ID (keeps highest-scoring match per term).
        """
        candidates = segment_note(clinical_note)
        if not candidates:
            return []

        # Both sides are unit-normalized, so the dot product below is
        # cosine similarity.
        phrase_matrix = self.model.encode(
            candidates,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        similarity = phrase_matrix @ self.embeddings.T

        best_per_id: dict[str, HPOMatch] = {}
        for row, phrase in enumerate(candidates):
            col = int(np.argmax(similarity[row]))
            score = float(similarity[row, col])

            # Lone words are ambiguous; hold them to the stricter cutoff.
            if len(phrase.split()) == 1:
                cutoff = SINGLE_WORD_THRESHOLD
            else:
                cutoff = self.threshold
            if score < cutoff:
                continue

            entry = self.terms[col]
            candidate = HPOMatch(
                phrase=phrase,
                hpo_id=entry["hpo_id"],
                term=entry["term"],
                score=round(score, 4),
            )
            # Keep only the highest-scoring match per HPO ID.
            current = best_per_id.get(candidate.hpo_id)
            if current is None or current.score < candidate.score:
                best_per_id[candidate.hpo_id] = candidate

        return sorted(best_per_id.values(), key=lambda m: m.score, reverse=True)
|
|
|
|
| |
| |
| |
|
|
def main() -> None:
    """CLI entry point: parse a note from argv (or a demo note) and print matches."""
    # Rewrap stdout so non-ASCII output never raises on narrow console encodings.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")

    import os
    embed_model = os.getenv("EMBED_MODEL", "FremyCompany/BioLORD-2023")

    if len(sys.argv) > 1:
        note = " ".join(sys.argv[1:])
    else:
        note = (
            "18 year old male, extremely tall, displaced lens in left eye, "
            "heart murmur, flexible joints, scoliosis"
        )

    banner = "=" * 60
    print(banner)
    print("RareDx Symptom Parser — HPO Semantic Matching")
    print(banner)
    print(f"\nInput: {note}\n")

    model = SentenceTransformer(embed_model)
    parser = SymptomParser(model)
    matches = parser.parse(note)

    print(f"\nMatched {len(matches)} HPO terms:\n")
    print(f" {'Score':>6} {'HPO ID':<12} {'Term':<40} Phrase")
    print(f" {'-'*6} {'-'*12} {'-'*40} {'-'*30}")
    for m in matches:
        print(f" {m.score:>6.4f} {m.hpo_id:<12} {m.term:<40} \"{m.phrase}\"")


if __name__ == "__main__":
    main()
|
|