Update mcp/nlp.py
Browse files- mcp/nlp.py +35 -52
mcp/nlp.py
CHANGED
|
@@ -1,55 +1,38 @@
|
|
| 1 |
# mcp/nlp.py
|
| 2 |
-
|
| 3 |
-
#!/usr/bin/env python3
|
| 4 |
-
"""MedGenesis – spaCy helper for lightweight keyword extraction.
|
| 5 |
-
|
| 6 |
-
Features
|
| 7 |
-
~~~~~~~~
|
| 8 |
-
* Lazy‑loads **`en_core_web_sm`** at first call; cached thereafter.
|
| 9 |
-
* If model missing, raises actionable RuntimeError — Dockerfile must
|
| 10 |
-
install via `python -m spacy download en_core_web_sm` (already in Dockerfile).
|
| 11 |
-
* `extract_keywords` returns **unique named‑entity strings** (>2 chars)
|
| 12 |
-
stripped of whitespace, preserving original casing.
|
| 13 |
-
* Adds fallback to simple noun‑chunk extraction when no entities found –
|
| 14 |
-
helps very short abstracts.
|
| 15 |
-
"""
|
| 16 |
-
from __future__ import annotations
|
| 17 |
-
|
| 18 |
import spacy
|
| 19 |
-
|
| 20 |
-
from
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
# ---------------------------------------------------------------------
|
| 40 |
-
# Public API
|
| 41 |
-
# ---------------------------------------------------------------------
|
| 42 |
-
|
| 43 |
-
def extract_keywords(text: str, *, min_len: int = 3) -> List[str]:
|
| 44 |
-
"""Return de‑duplicated entity keywords (fallback noun chunks)."""
|
| 45 |
-
nlp = _load_model()
|
| 46 |
doc = nlp(text)
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# mcp/nlp.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import functools

import spacy
import scispacy
from scispacy.linking import EntityLinker
|
| 5 |
+
|
| 6 |
+
# Load a powerful biomedical model + UMLS linker
|
| 7 |
+
@spacy.util.cache_dir("~/.cache/scispacy")
|
| 8 |
+
def load_model():
|
| 9 |
+
nlp = spacy.load("en_core_sci_scibert")
|
| 10 |
+
# Resolve abbreviations then link to UMLS
|
| 11 |
+
linker = EntityLinker(name="umls", resolve_abbreviations=True, threshold=0.75)
|
| 12 |
+
nlp.add_pipe(linker)
|
| 13 |
+
return nlp
|
| 14 |
+
|
| 15 |
+
nlp = load_model()
|
| 16 |
+
|
| 17 |
+
def extract_umls_concepts(text: str):
|
| 18 |
+
"""
|
| 19 |
+
Returns a list of {cui, concept_name, score, semantic_types}.
|
| 20 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
doc = nlp(text)
|
| 22 |
+
concepts = []
|
| 23 |
+
for ent in doc.ents:
|
| 24 |
+
for cui, score in ent._.umls_ents:
|
| 25 |
+
meta = nlp.get_pipe("scispacy_linker").kb.cui_to_entity[cui]
|
| 26 |
+
concepts.append({
|
| 27 |
+
"cui": cui,
|
| 28 |
+
"name": meta.canonical_name,
|
| 29 |
+
"score": float(score),
|
| 30 |
+
"types": meta.types # list of semantic type strings
|
| 31 |
+
})
|
| 32 |
+
# Deduplicate by CUI, keep highest score
|
| 33 |
+
seen = {}
|
| 34 |
+
for c in concepts:
|
| 35 |
+
prev = seen.get(c["cui"])
|
| 36 |
+
if not prev or c["score"] > prev["score"]:
|
| 37 |
+
seen[c["cui"]] = c
|
| 38 |
+
return list(seen.values())
|