IFMedTechdemo committed on
Commit
7617d00
·
verified ·
1 Parent(s): 63499c7

Create clinical_ner.py

Browse files
Files changed (1) hide show
  1. clinical_ner.py +216 -0
clinical_ner.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import spacy
3
+
4
class ClinicalNERProcessor:
    """
    Clinical text analysis combining three components:

    - clinical NER  (HuggingFace ``samrawal/bert-base-uncased_clinical-ner``)
    - anatomy NER   (HuggingFace OpenMed anatomy-detection model, optional)
    - POS tagging   (spaCy ``en_core_web_sm``, optional)

    Each component is also exposed as Prolog facts
    (``entity/6``, ``anatomy/6``, ``pos/8``) for downstream symbolic reasoning.
    """

    def __init__(self, use_pos=True, use_anatomy=True):
        """
        Initialize the processors.

        Parameters
        ----------
        use_pos : bool
            Load the spaCy POS tagger. If the model is missing, a warning is
            printed and POS methods will raise ``RuntimeError`` when called.
        use_anatomy : bool
            Load the anatomy NER pipeline. On load failure a warning is
            printed and anatomy methods will raise ``RuntimeError`` when called.
        """
        # Clinical NER pipeline. aggregation_strategy="simple" groups raw
        # token predictions into entity spans with an 'entity_group' key.
        self.ner_pipeline = pipeline(
            "ner",
            model="samrawal/bert-base-uncased_clinical-ner",
            aggregation_strategy="simple"
        )

        # Anatomy NER pipeline (optional).
        # Available models (choose based on your needs):
        # - OpenMed/OpenMed-NER-AnatomyDetect-BioPatient-108M (smallest, fastest)
        # - OpenMed/OpenMed-NER-AnatomyDetect-ModernClinical-149M (balanced)
        # - OpenMed/OpenMed-NER-AnatomyDetect-ElectraMed-560M (most accurate)
        self.anatomy_pipeline = None
        if use_anatomy:
            try:
                self.anatomy_pipeline = pipeline(
                    "ner",
                    model="OpenMed/OpenMed-NER-AnatomyDetect-BioPatient-108M",
                    aggregation_strategy="simple"
                )
            except Exception as e:
                # Best-effort: anatomy support is optional, so degrade with a
                # warning rather than failing construction.
                print(f"Warning: Could not load anatomy model: {e}")

        # spaCy model for POS tagging (optional).
        self.nlp = None
        if use_pos:
            try:
                self.nlp = spacy.load("en_core_web_sm")
            except OSError:
                print("Warning: spaCy model 'en_core_web_sm' not found.")
                print("Install it with: python -m spacy download en_core_web_sm")

    def _merge_subwords(self, entities):
        """
        Merge WordPiece subword tokens (``##...``) left over after pipeline
        aggregation into their preceding entity of the same entity_group.

        The merged entity keeps the first token's 'score' and 'start'; 'word'
        and 'end' are extended across the absorbed subwords. Input dicts are
        not mutated (the head entity is copied).
        """
        if not entities:
            return []

        merged = []
        i = 0
        while i < len(entities):
            head = entities[i].copy()
            word = head['word']
            end = head['end']

            # Absorb any immediately following ##-subwords of the same group.
            j = i + 1
            while j < len(entities):
                nxt = entities[j]
                if (nxt['word'].startswith('##') and
                        nxt['entity_group'] == head['entity_group']):
                    word += nxt['word'][2:]  # drop the '##' continuation marker
                    end = nxt['end']
                    j += 1
                else:
                    break

            head['word'] = word
            head['end'] = end
            merged.append(head)
            i = j  # skip the tokens we just merged

        return merged

    def _to_prolog_facts(self, entities, functor):
        """
        Render merged entity dicts as Prolog facts, one per line.

        Format: ``functor(Id, Type, Word, Start, End, Score).`` with the score
        fixed to 4 decimal places and single quotes in words backslash-escaped.
        """
        facts = []
        for i, entity in enumerate(entities):
            # Escape single quotes so the quoted atom stays valid Prolog.
            word = entity['word'].replace("'", "\\'")
            facts.append(
                f"{functor}({i}, '{entity['entity_group']}', "
                f"'{word}', {entity['start']}, "
                f"{entity['end']}, {entity['score']:.4f})."
            )
        return "\n".join(facts)

    def basic_ner(self, text):
        """Clinical NER only: list of merged entity dicts."""
        return self._merge_subwords(self.ner_pipeline(text))

    def prolog_ner(self, text):
        """Clinical NER as Prolog ``entity/6`` facts (newline-joined string)."""
        return self._to_prolog_facts(self.basic_ner(text), "entity")

    def anatomy_ner(self, text):
        """Anatomy NER only: list of merged entity dicts.

        Raises
        ------
        RuntimeError
            If the anatomy pipeline was disabled or failed to load.
        """
        if self.anatomy_pipeline is None:
            raise RuntimeError("Anatomy NER pipeline not initialized.")
        return self._merge_subwords(self.anatomy_pipeline(text))

    def prolog_anatomy(self, text):
        """Anatomy NER as Prolog ``anatomy/6`` facts (newline-joined string)."""
        return self._to_prolog_facts(self.anatomy_ner(text), "anatomy")

    def pos_tagging(self, text):
        """
        POS tagging only: list of per-token dicts with keys
        token, lemma, pos (universal tag), tag (fine-grained), dep
        (dependency relation), start, end (character offsets).

        Raises
        ------
        RuntimeError
            If the spaCy model was disabled or is not installed.
        """
        if self.nlp is None:
            raise RuntimeError("POS tagger not initialized. Install spaCy model: python -m spacy download en_core_web_sm")

        doc = self.nlp(text)
        return [
            {
                'token': token.text,
                'lemma': token.lemma_,
                'pos': token.pos_,    # Universal POS tag
                'tag': token.tag_,    # Fine-grained POS tag
                'dep': token.dep_,    # Dependency relation
                'start': token.idx,
                'end': token.idx + len(token.text)
            }
            for token in doc
        ]

    def prolog_pos(self, text):
        """
        POS tagging as Prolog ``pos/8`` facts (newline-joined string).

        Format: ``pos(Id, Token, Lemma, POS, Tag, Dep, Start, End).``
        """
        # pos_tagging() performs the self.nlp initialization check.
        prolog_facts = []
        for i, token_info in enumerate(self.pos_tagging(text)):
            # Escape single quotes so quoted atoms stay valid Prolog.
            token = token_info['token'].replace("'", "\\'")
            lemma = token_info['lemma'].replace("'", "\\'")
            prolog_facts.append(
                f"pos({i}, '{token}', '{lemma}', '{token_info['pos']}', "
                f"'{token_info['tag']}', '{token_info['dep']}', "
                f"{token_info['start']}, {token_info['end']})."
            )
        return "\n".join(prolog_facts)

    def combined_analysis(self, text):
        """
        Combined analysis: clinical NER + anatomy NER + POS tagging.

        Returns a dict with keys 'clinical_entities', 'anatomy_entities',
        'pos_tags'; the latter two are empty lists when the corresponding
        component is unavailable.
        """
        result = {
            'clinical_entities': self.basic_ner(text),
            'anatomy_entities': [],
            'pos_tags': []
        }
        if self.anatomy_pipeline:
            result['anatomy_entities'] = self.anatomy_ner(text)
        if self.nlp:
            result['pos_tags'] = self.pos_tagging(text)
        return result

    def prolog_combined(self, text):
        """
        Combined Prolog output: clinical NER + anatomy NER + POS tagging.

        Non-empty sections are prefixed with a ``%`` comment header and joined
        by blank lines; unavailable components are silently skipped.
        """
        sections = []

        clinical_facts = self.prolog_ner(text)
        if clinical_facts:
            sections.append(f"% Clinical Entities\n{clinical_facts}")

        if self.anatomy_pipeline:
            anatomy_facts = self.prolog_anatomy(text)
            if anatomy_facts:
                sections.append(f"% Anatomy Entities\n{anatomy_facts}")

        if self.nlp:
            pos_facts = self.prolog_pos(text)
            if pos_facts:
                sections.append(f"% POS Tags\n{pos_facts}")

        return "\n\n".join(sections)