|
|
from transformers import pipeline |
|
|
import spacy |
|
|
|
|
|
class ClinicalNERProcessor:
    """Named Entity Recognition and POS tagging for clinical text.

    Wraps up to three components:

    * a clinical NER transformer pipeline (always loaded),
    * an optional anatomy NER transformer pipeline (``use_anatomy``),
    * an optional spaCy model for POS tagging (``use_pos``).

    Each analysis is available both as plain Python structures and as
    Prolog facts (one fact per entity/token, newline-separated).
    """

    def __init__(self, use_pos=True, use_anatomy=True):
        """Load the underlying models.

        Args:
            use_pos: If True, attempt to load the spaCy ``en_core_web_sm``
                model for POS tagging. On failure, ``self.nlp`` stays None
                and a warning is printed (POS methods then raise at call time).
            use_anatomy: If True, attempt to load the anatomy NER model.
                On failure, ``self.anatomy_pipeline`` stays None and a
                warning is printed (anatomy methods then raise at call time).
        """
        # Clinical NER is mandatory; let load errors propagate to the caller.
        self.ner_pipeline = pipeline(
            "ner",
            model="samrawal/bert-base-uncased_clinical-ner",
            aggregation_strategy="simple"
        )

        # Anatomy NER is best-effort: keep the processor usable without it.
        self.anatomy_pipeline = None
        if use_anatomy:
            try:
                self.anatomy_pipeline = pipeline(
                    "ner",
                    model="OpenMed/OpenMed-NER-AnatomyDetect-BioPatient-108M",
                    aggregation_strategy="simple"
                )
            except Exception as e:
                print(f"Warning: Could not load anatomy model: {e}")

        # POS tagging is best-effort as well.
        self.nlp = None
        if use_pos:
            try:
                self.nlp = spacy.load("en_core_web_sm")
            except OSError:
                print("Warning: spaCy model 'en_core_web_sm' not found.")
                print("Install it with: python -m spacy download en_core_web_sm")

    @staticmethod
    def _escape_atom(text):
        """Escape a string for embedding inside a single-quoted Prolog atom.

        Backslashes must be escaped BEFORE quotes; otherwise the backslash
        introduced for a quote would itself get doubled.
        """
        return text.replace("\\", "\\\\").replace("'", "\\'")

    def _merge_subwords(self, entities):
        """Merge WordPiece ``##`` continuation tokens into their head entity.

        Consecutive entities whose word starts with ``##`` and whose
        ``entity_group`` matches the head entity are folded into it: the
        ``##`` prefix is stripped, the text concatenated, and ``end``
        extended. ``start`` and ``score`` of the head entity are kept.

        Args:
            entities: List of dicts with keys ``word``, ``entity_group``,
                ``start``, ``end``, ``score`` (pipeline output format).

        Returns:
            New list of merged entity dicts; input dicts are not mutated.
        """
        if not entities:
            return []

        merged = []
        i = 0
        while i < len(entities):
            head = entities[i].copy()
            word = head['word']
            end = head['end']

            # Absorb every immediately-following subword of the same group.
            j = i + 1
            while j < len(entities):
                candidate = entities[j]
                if (candidate['word'].startswith('##') and
                        candidate['entity_group'] == head['entity_group']):
                    word += candidate['word'][2:]
                    end = candidate['end']
                    j += 1
                else:
                    break

            head['word'] = word
            head['end'] = end
            merged.append(head)
            i = j

        return merged

    def _entity_prolog_facts(self, entities, functor):
        """Render merged entities as Prolog facts with the given functor.

        Produces one fact per entity:
        ``functor(Index, 'Group', 'Word', Start, End, Score).``
        """
        facts = []
        for i, entity in enumerate(entities):
            word = self._escape_atom(entity['word'])
            facts.append(
                f"{functor}({i}, '{entity['entity_group']}', "
                f"'{word}', {entity['start']}, "
                f"{entity['end']}, {entity['score']:.4f})."
            )
        return "\n".join(facts)

    def basic_ner(self, text):
        """Clinical NER only: return merged entity dicts for *text*."""
        return self._merge_subwords(self.ner_pipeline(text))

    def prolog_ner(self, text):
        """Clinical NER as Prolog ``entity/6`` facts (newline-separated)."""
        merged = self._merge_subwords(self.ner_pipeline(text))
        return self._entity_prolog_facts(merged, "entity")

    def anatomy_ner(self, text):
        """Anatomy NER only: return merged entity dicts for *text*.

        Raises:
            RuntimeError: If the anatomy pipeline was not loaded.
        """
        if self.anatomy_pipeline is None:
            raise RuntimeError("Anatomy NER pipeline not initialized.")

        return self._merge_subwords(self.anatomy_pipeline(text))

    def prolog_anatomy(self, text):
        """Anatomy NER as Prolog ``anatomy/6`` facts (newline-separated).

        Raises:
            RuntimeError: If the anatomy pipeline was not loaded.
        """
        if self.anatomy_pipeline is None:
            raise RuntimeError("Anatomy NER pipeline not initialized.")

        merged = self._merge_subwords(self.anatomy_pipeline(text))
        return self._entity_prolog_facts(merged, "anatomy")

    def pos_tagging(self, text):
        """POS tagging only: return per-token dicts for *text*.

        Each dict carries ``token``, ``lemma``, ``pos``, ``tag``, ``dep``,
        and character offsets ``start``/``end``.

        Raises:
            RuntimeError: If the spaCy model was not loaded.
        """
        if self.nlp is None:
            raise RuntimeError("POS tagger not initialized. Install spaCy model: python -m spacy download en_core_web_sm")

        doc = self.nlp(text)
        return [
            {
                'token': token.text,
                'lemma': token.lemma_,
                'pos': token.pos_,
                'tag': token.tag_,
                'dep': token.dep_,
                'start': token.idx,
                'end': token.idx + len(token.text)
            }
            for token in doc
        ]

    def prolog_pos(self, text):
        """POS tagging as Prolog ``pos/8`` facts (newline-separated).

        Raises:
            RuntimeError: If the spaCy model was not loaded.
        """
        if self.nlp is None:
            raise RuntimeError("POS tagger not initialized. Install spaCy model: python -m spacy download en_core_web_sm")

        facts = []
        for i, info in enumerate(self.pos_tagging(text)):
            token = self._escape_atom(info['token'])
            lemma = self._escape_atom(info['lemma'])
            facts.append(
                f"pos({i}, '{token}', '{lemma}', '{info['pos']}', "
                f"'{info['tag']}', '{info['dep']}', "
                f"{info['start']}, {info['end']})."
            )
        return "\n".join(facts)

    def combined_analysis(self, text):
        """Combined analysis: Clinical NER + Anatomy NER + POS tagging.

        Components that failed to load contribute an empty list.
        """
        result = {
            'clinical_entities': self.basic_ner(text),
            'anatomy_entities': [],
            'pos_tags': []
        }

        if self.anatomy_pipeline:
            result['anatomy_entities'] = self.anatomy_ner(text)

        if self.nlp:
            result['pos_tags'] = self.pos_tagging(text)

        return result

    def prolog_combined(self, text):
        """Combined Prolog output: Clinical NER + Anatomy NER + POS tagging.

        Sections are headed by a ``%`` comment line and separated by a
        blank line; empty sections and unloaded components are skipped.
        """
        sections = []

        clinical_facts = self.prolog_ner(text)
        if clinical_facts:
            sections.append(f"% Clinical Entities\n{clinical_facts}")

        if self.anatomy_pipeline:
            anatomy_facts = self.prolog_anatomy(text)
            if anatomy_facts:
                sections.append(f"% Anatomy Entities\n{anatomy_facts}")

        if self.nlp:
            pos_facts = self.prolog_pos(text)
            if pos_facts:
                sections.append(f"% POS Tags\n{pos_facts}")

        return "\n\n".join(sections)