Spaces:

IFMedTech
/

ner

Paused

App Files Files Community

IFMedTechdemo committed on Nov 21, 2025

Commit

7617d00

verified ·

1 Parent(s): 63499c7

Create clinical_ner.py

Browse files

Files changed (1) hide show

clinical_ner.py +216 -0

clinical_ner.py ADDED Viewed

	@@ -0,0 +1,216 @@

+from transformers import pipeline
+import spacy
class ClinicalNERProcessor:
    """
    Clinical NER, anatomy NER and POS tagging, with optional Prolog output.

    Three independent components are wrapped:

    * ``ner_pipeline``     - clinical token-classification pipeline (always loaded)
    * ``anatomy_pipeline`` - anatomy token-classification pipeline (optional)
    * ``nlp``              - spaCy pipeline used for POS tagging (optional)

    Each analysis is available as plain Python data (lists of dicts) and as
    newline-separated Prolog facts (``prolog_*`` methods).
    """

    # Characters that must be escaped inside a single-quoted Prolog atom.
    # A single translate() pass is used so that the backslashes introduced
    # for one escape are never re-escaped by another.
    _PROLOG_ESCAPES = str.maketrans({
        "\\": "\\\\",
        "'": "\\'",
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    })

    def __init__(self, use_pos=True, use_anatomy=True):
        """
        Load the models.

        Args:
            use_pos: When true, try to load the spaCy ``en_core_web_sm``
                model. If it is not installed a warning is printed and
                ``self.nlp`` stays ``None``.
            use_anatomy: When true, try to load the anatomy NER model. On any
                failure a warning is printed and ``self.anatomy_pipeline``
                stays ``None``.
        """
        # Clinical NER pipeline
        self.ner_pipeline = pipeline(
            "ner",
            model="samrawal/bert-base-uncased_clinical-ner",
            aggregation_strategy="simple"
        )

        # Anatomy NER pipeline
        # Available models (choose based on your needs):
        # - OpenMed/OpenMed-NER-AnatomyDetect-BioPatient-108M (smallest, fastest)
        # - OpenMed/OpenMed-NER-AnatomyDetect-ModernClinical-149M (balanced)
        # - OpenMed/OpenMed-NER-AnatomyDetect-ElectraMed-560M (most accurate)
        self.anatomy_pipeline = None
        if use_anatomy:
            # Best effort: the anatomy model is optional, so a load failure
            # only disables the anatomy methods instead of aborting.
            try:
                self.anatomy_pipeline = pipeline(
                    "ner",
                    model="OpenMed/OpenMed-NER-AnatomyDetect-BioPatient-108M",
                    aggregation_strategy="simple"
                )
            except Exception as e:
                print(f"Warning: Could not load anatomy model: {e}")

        # Load spaCy model for POS tagging
        self.nlp = None
        if use_pos:
            try:
                self.nlp = spacy.load("en_core_web_sm")
            except OSError:
                print("Warning: spaCy model 'en_core_web_sm' not found.")
                print("Install it with: python -m spacy download en_core_web_sm")

    @classmethod
    def _prolog_atom(cls, value):
        """Return ``value`` as a single-quoted Prolog atom with escapes applied."""
        return "'" + str(value).translate(cls._PROLOG_ESCAPES) + "'"

    def _entity_facts(self, functor, entities):
        """
        Render merged entities as ``functor(Id, Type, Word, Start, End, Score).``

        Args:
            functor: Predicate name to emit (e.g. ``"entity"``).
            entities: Entity dicts with ``entity_group``, ``word``, ``start``,
                ``end`` and ``score`` keys.

        Returns:
            The facts joined by newlines; an empty string for no entities.
        """
        quote = self._prolog_atom
        return "\n".join(
            f"{functor}({idx}, {quote(ent['entity_group'])}, "
            f"{quote(ent['word'])}, {ent['start']}, "
            f"{ent['end']}, {ent['score']:.4f})."
            for idx, ent in enumerate(entities)
        )

    def _merge_subwords(self, entities):
        """
        Glue ``##``-prefixed word pieces onto the entity that precedes them.

        A piece is merged only when its ``entity_group`` equals that of the
        entity currently being built. The merged entity keeps every field of
        its first piece (including ``score``); only ``word`` and ``end`` are
        extended. Input dicts are not modified.

        Args:
            entities: Entity dicts as returned by the NER pipelines, or a
                falsy value.

        Returns:
            A new list of entity dicts.
        """
        if not entities:
            return []

        merged = []
        for piece in entities:
            is_continuation = (
                merged
                and piece['word'].startswith('##')
                and piece['entity_group'] == merged[-1]['entity_group']
            )
            if is_continuation:
                # Drop the "##" marker and extend the entity being built.
                merged[-1]['word'] += piece['word'][2:]
                merged[-1]['end'] = piece['end']
            else:
                # Copy so the caller's dicts stay untouched.
                merged.append(piece.copy())
        return merged

    def basic_ner(self, text):
        """Clinical NER only: return the merged clinical entities for ``text``."""
        return self._merge_subwords(self.ner_pipeline(text))

    def prolog_ner(self, text):
        """
        Clinical NER as Prolog facts.

        Format: ``entity(Id, Type, Word, Start, End, Score).`` one per line.
        """
        return self._entity_facts("entity", self.basic_ner(text))

    def anatomy_ner(self, text):
        """
        Anatomy NER only: return the merged anatomy entities for ``text``.

        Raises:
            RuntimeError: If the anatomy pipeline was not loaded.
        """
        if self.anatomy_pipeline is None:
            raise RuntimeError("Anatomy NER pipeline not initialized.")
        return self._merge_subwords(self.anatomy_pipeline(text))

    def prolog_anatomy(self, text):
        """
        Anatomy NER as Prolog facts.

        Format: ``anatomy(Id, Type, Word, Start, End, Score).`` one per line.

        Raises:
            RuntimeError: If the anatomy pipeline was not loaded.
        """
        return self._entity_facts("anatomy", self.anatomy_ner(text))

    def pos_tagging(self, text):
        """
        POS tagging only.

        Returns:
            One dict per token with keys ``token``, ``lemma``, ``pos``
            (universal tag), ``tag`` (fine-grained tag), ``dep`` (dependency
            relation), ``start`` and ``end`` (character offsets).

        Raises:
            RuntimeError: If the spaCy model was not loaded.
        """
        if self.nlp is None:
            raise RuntimeError("POS tagger not initialized. Install spaCy model: python -m spacy download en_core_web_sm")
        return [
            {
                'token': token.text,
                'lemma': token.lemma_,
                'pos': token.pos_,  # Universal POS tag
                'tag': token.tag_,  # Fine-grained POS tag
                'dep': token.dep_,  # Dependency relation
                'start': token.idx,
                'end': token.idx + len(token.text),
            }
            for token in self.nlp(text)
        ]

    def prolog_pos(self, text):
        """
        POS tagging as Prolog facts.

        Format: ``pos(Id, Token, Lemma, POS, Tag, Dep, Start, End).`` one per
        line. Every textual field is escaped, not just token and lemma, so a
        token or tag containing quotes, backslashes or line breaks still
        yields a single well-formed fact.

        Raises:
            RuntimeError: If the spaCy model was not loaded.
        """
        quote = self._prolog_atom
        # pos_tagging() performs the "not initialized" check.
        return "\n".join(
            f"pos({idx}, {quote(info['token'])}, {quote(info['lemma'])}, "
            f"{quote(info['pos'])}, {quote(info['tag'])}, "
            f"{quote(info['dep'])}, {info['start']}, {info['end']})."
            for idx, info in enumerate(self.pos_tagging(text))
        )

    def combined_analysis(self, text):
        """
        Combined analysis: Clinical NER + Anatomy NER + POS tagging.

        Returns:
            Dict with keys ``clinical_entities``, ``anatomy_entities`` and
            ``pos_tags``. The latter two are empty lists when the
            corresponding component is not loaded.
        """
        result = {
            'clinical_entities': self.basic_ner(text),
            'anatomy_entities': [],
            'pos_tags': [],
        }
        if self.anatomy_pipeline is not None:
            result['anatomy_entities'] = self.anatomy_ner(text)
        if self.nlp is not None:
            result['pos_tags'] = self.pos_tagging(text)
        return result

    def prolog_combined(self, text):
        """
        Combined Prolog output: Clinical NER + Anatomy NER + POS tagging.

        Each non-empty group of facts is preceded by a ``%`` comment header;
        groups are separated by a blank line. Components that are not loaded
        are skipped.
        """
        sections = []

        # Clinical NER
        clinical_facts = self.prolog_ner(text)
        if clinical_facts:
            sections.append(f"% Clinical Entities\n{clinical_facts}")

        # Anatomy NER
        if self.anatomy_pipeline is not None:
            anatomy_facts = self.prolog_anatomy(text)
            if anatomy_facts:
                sections.append(f"% Anatomy Entities\n{anatomy_facts}")

        # POS tagging
        if self.nlp is not None:
            pos_facts = self.prolog_pos(text)
            if pos_facts:
                sections.append(f"% POS Tags\n{pos_facts}")

        return "\n\n".join(sections)