Update mcp/nlp.py
Browse files- mcp/nlp.py +35 -52
mcp/nlp.py
CHANGED
|
@@ -1,55 +1,38 @@
|
|
| 1 |
# mcp/nlp.py
|
| 2 |
-
|
| 3 |
-
#!/usr/bin/env python3
|
| 4 |
-
"""MedGenesis – spaCy helper for lightweight keyword extraction.
|
| 5 |
-
|
| 6 |
-
Features
|
| 7 |
-
~~~~~~~~
|
| 8 |
-
* Lazy‑loads **`en_core_web_sm`** at first call; cached thereafter.
|
| 9 |
-
* If model missing, raises actionable RuntimeError — Dockerfile must
|
| 10 |
-
install via `python -m spacy download en_core_web_sm` (already in Dockerfile).
|
| 11 |
-
* `extract_keywords` returns **unique named‑entity strings** (>2 chars)
|
| 12 |
-
stripped of whitespace, preserving original casing.
|
| 13 |
-
* Adds fallback to simple noun‑chunk extraction when no entities found –
|
| 14 |
-
helps very short abstracts.
|
| 15 |
-
"""
|
| 16 |
-
from __future__ import annotations
|
| 17 |
-
|
| 18 |
import spacy
|
| 19 |
-
|
| 20 |
-
from
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
# ---------------------------------------------------------------------
|
| 40 |
-
# Public API
|
| 41 |
-
# ---------------------------------------------------------------------
|
| 42 |
-
|
| 43 |
-
def extract_keywords(text: str, *, min_len: int = 3) -> List[str]:
|
| 44 |
-
"""Return de‑duplicated entity keywords (fallback noun chunks)."""
|
| 45 |
-
nlp = _load_model()
|
| 46 |
doc = nlp(text)
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# mcp/nlp.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import functools

import spacy
import scispacy
from scispacy.linking import EntityLinker
|
| 5 |
+
|
| 6 |
+
# Load a powerful biomedical model + UMLS linker
|
| 7 |
+
@spacy.util.cache_dir("~/.cache/scispacy")
|
| 8 |
+
def load_model():
|
| 9 |
+
nlp = spacy.load("en_core_sci_scibert")
|
| 10 |
+
# Resolve abbreviations then link to UMLS
|
| 11 |
+
linker = EntityLinker(name="umls", resolve_abbreviations=True, threshold=0.75)
|
| 12 |
+
nlp.add_pipe(linker)
|
| 13 |
+
return nlp
|
| 14 |
+
|
| 15 |
+
nlp = load_model()
|
| 16 |
+
|
| 17 |
+
def extract_umls_concepts(text: str):
|
| 18 |
+
"""
|
| 19 |
+
Returns a list of {cui, concept_name, score, semantic_types}.
|
| 20 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
doc = nlp(text)
|
| 22 |
+
concepts = []
|
| 23 |
+
for ent in doc.ents:
|
| 24 |
+
for cui, score in ent._.umls_ents:
|
| 25 |
+
meta = nlp.get_pipe("scispacy_linker").kb.cui_to_entity[cui]
|
| 26 |
+
concepts.append({
|
| 27 |
+
"cui": cui,
|
| 28 |
+
"name": meta.canonical_name,
|
| 29 |
+
"score": float(score),
|
| 30 |
+
"types": meta.types # list of semantic type strings
|
| 31 |
+
})
|
| 32 |
+
# Deduplicate by CUI, keep highest score
|
| 33 |
+
seen = {}
|
| 34 |
+
for c in concepts:
|
| 35 |
+
prev = seen.get(c["cui"])
|
| 36 |
+
if not prev or c["score"] > prev["score"]:
|
| 37 |
+
seen[c["cui"]] = c
|
| 38 |
+
return list(seen.values())
|