Statistical-Impossibility committed on
Commit
26b59fd
·
verified ·
1 Parent(s): fb3b7fc

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +258 -0
app.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from transformers import pipeline
import spacy
import re
import unicodedata
import sys
import subprocess


def _load_spacy_model(name="en_core_web_sm"):
    """Load the spaCy model, downloading it on first run if it is missing."""
    try:
        return spacy.load(name)
    except OSError:
        print("Downloading spaCy model...")
        subprocess.run([sys.executable, "-m", "spacy", "download", name], check=True)
        return spacy.load(name)


nlp = _load_spacy_model()
# Sentence boundaries are needed for chunking below.
nlp.add_pipe("sentencizer")

# Token-classification model; "simple" aggregation merges subword pieces
# into whole-entity spans with character offsets.
model_id = "Statistical-Impossibility/Feline-NER-Test"
ner_pipeline = pipeline("token-classification", model=model_id, aggregation_strategy="simple")
def clean_text(text):
    """Aggressive cleaning for PDF/HTML paste artifacts."""
    # Normalize Unicode first (ligatures, full-width characters, etc.)
    # so the regexes below see canonical text.
    cleaned = unicodedata.normalize('NFKC', text)

    # Ordered substitutions — later rules assume the earlier ones already ran.
    substitutions = (
        (r'<[^>]+>', ''),                     # strip HTML/XML tags
        (r'(\w+)-\s*\n\s*(\w+)', r'\1\2'),    # re-join words hyphenated across line breaks
        (r'\n{3,}', '\n\n'),                  # squash runs of blank lines
        (r'\s+', ' '),                        # collapse all whitespace to single spaces
        (r'-\s+', ''),                        # drop leftover dangling hyphens
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
def expand_to_word_boundaries(text, start, end):
    """
    Expand entity boundaries to complete words.
    Prevents highlighting fragments like "itis" from "abnormalities".
    """
    def _in_word(ch):
        # Hyphens and apostrophes are treated as word-internal characters.
        return ch.isalnum() or ch in ('-', "'")

    left, right = start, end

    # Walk left while the preceding character is still part of a word.
    while left > 0 and _in_word(text[left - 1]):
        left -= 1

    # Walk right while the current character is still part of a word.
    while right < len(text) and _in_word(text[right]):
        right += 1

    return left, right
def is_valid_entity(text, start, end):
    """
    Filter out garbage entity spans.

    Returns False when the stripped span text is:
    - Too short (< 2 chars) — this also rejects single-letter fragments
    - All punctuation or digits (no alphabetic character)
    - A bare WordPiece subword fragment (starts with '##')
    """
    entity_text = text[start:end].strip()

    # Too short. (The original also had a separate `len == 1` check at the
    # end; it was unreachable after this guard and has been removed.)
    if len(entity_text) < 2:
        return False

    # All punctuation or numbers — no alphabetic content at all.
    if not any(c.isalpha() for c in entity_text):
        return False

    # Subword marker leaked through (shouldn't happen after boundary
    # expansion, but guard anyway).
    if entity_text.startswith('##'):
        return False

    return True
def ner_predict(text):
    """
    Run NER over arbitrary-length text.

    Returns a tuple ``(highlighted_html, entity_list)`` where the first
    element is the input with entities wrapped in colored ``<mark>`` tags
    and the second is a plain-text listing of detected entities.

    Pipeline: clean the pasted text, split into sentences with spaCy,
    pack sentences into overlapping chunks that fit the model's context
    window, run token classification per chunk, then merge/deduplicate
    the entities and render them.
    """
    import html  # stdlib; used to escape raw text before HTML interpolation

    if not text.strip():
        return "<p>No text provided</p>", "No entities"

    if len(text) > 100000:
        return "<p style='color:red;'>Text too long (max 100k characters)</p>", ""

    # Clean PDF/HTML paste artifacts; all offsets below refer to this text.
    text = clean_text(text)

    # spaCy sentence splitting with exact character offsets into `text`.
    doc = nlp(text)
    sentences = [
        {"text": sent.text, "start": sent.start_char, "end": sent.end_char}
        for sent in doc.sents
    ]

    if not sentences:
        return "<p>No sentences detected</p>", ""

    # Pack sentences into chunks of at most `max_tokens` model tokens,
    # overlapping by up to two sentences so entities near chunk edges
    # are not missed.
    max_tokens = 450
    chunks = []

    i = 0
    while i < len(sentences):
        chunk_sents = []
        chunk_text = ""

        for j in range(i, len(sentences)):
            candidate = chunk_text + " " + sentences[j]["text"] if chunk_text else sentences[j]["text"]
            tokens = ner_pipeline.tokenizer.tokenize(candidate)

            if len(tokens) > max_tokens and chunk_sents:
                break

            chunk_sents.append(sentences[j])
            chunk_text = candidate

        if chunk_sents:
            chunks.append({
                "text": chunk_text,
                "offset": chunk_sents[0]["start"],
                "sentence_count": len(chunk_sents)
            })

        # Advance, keeping up to two sentences of overlap with the next chunk.
        sentences_to_skip = max(1, len(chunk_sents) - 2)
        i += sentences_to_skip

    # Predict on chunks, mapping offsets back to global positions.
    all_entities = []

    for chunk in chunks:
        try:
            results = ner_pipeline(chunk["text"])

            for r in results:
                if r['score'] > 0.50:  # threshold filters low-confidence noise
                    # Adjust offsets to global position in `text`.
                    r['start'] += chunk["offset"]
                    r['end'] += chunk["offset"]

                    # Expand to word boundaries so we never highlight
                    # fragments like "itis" out of "abnormalities".
                    r['start'], r['end'] = expand_to_word_boundaries(
                        text, r['start'], r['end']
                    )

                    if is_valid_entity(text, r['start'], r['end']):
                        all_entities.append(r)
        except Exception as e:
            # Best effort: one failing chunk shouldn't kill the whole request.
            print(f"Chunk processing error: {e}")
            continue

    # Sort by position (ties broken by descending confidence), then drop
    # overlapping entities, preferring the higher-confidence span.
    all_entities = sorted(all_entities, key=lambda x: (x['start'], -x['score']))

    final_entities = []
    for ent in all_entities:
        if not final_entities or ent['start'] >= final_entities[-1]['end']:
            final_entities.append(ent)
        elif ent['score'] > final_entities[-1]['score']:
            # Replace only if higher confidence AND a different span.
            if ent['end'] > final_entities[-1]['end'] or ent['start'] < final_entities[-1]['start']:
                final_entities[-1] = ent

    # Generate highlighted HTML.
    highlighted = ""
    last_idx = 0

    color_map = {
        "SYMPTOM": "#FFD700",
        "DISEASE": "#FF6B6B",
        "MEDICATION": "#90EE90",
        "PROCEDURE": "#87CEEB",
        "ANATOMY": "#FFB347"
    }

    label_display = {
        "DISEASE": "pathology",
        "SYMPTOM": "symptom",
        "MEDICATION": "medication",
        "PROCEDURE": "procedure",
        "ANATOMY": "anatomy"
    }

    for ent in final_entities:
        start, end = ent['start'], ent['end']
        label = ent['entity_group']
        score = ent['score']

        # Bounds check — boundary expansion could in principle misbehave.
        if start >= len(text) or end > len(text) or start < 0 or end < 0:
            continue

        # Skip reversed spans.
        if start >= end:
            continue

        # FIX: escape raw text before interpolating into HTML. clean_text only
        # strips *complete* tags, so a stray '<' or '&' in the input would
        # otherwise corrupt (or inject into) the rendered markup.
        highlighted += html.escape(text[last_idx:start])

        color = color_map.get(label, "#E0E0E0")
        display_label = label_display.get(label, label.lower())
        entity_text = html.escape(text[start:end])

        highlighted += (
            f'<mark style="background-color:{color}; padding:2px 4px; '
            f'border-radius:3px; font-weight:500;" '
            f'title="{display_label} ({score:.2f})">'
            f'{entity_text} <sup style="font-size:0.65em; color:#666;">/{display_label}</sup>'
            f'</mark>'
        )

        last_idx = end

    highlighted += html.escape(text[last_idx:])
    highlighted = f'<div style="line-height:1.8; font-family:sans-serif; white-space:pre-wrap;">{highlighted}</div>'

    # Plain-text entity listing (goes into a textbox, so no escaping needed).
    if final_entities:
        entity_list = "\n".join([
            f"{label_display.get(e['entity_group'], e['entity_group'])}: "
            f"{text[e['start']:e['end']]} ({e['score']:.2f})"
            for e in final_entities
        ])
    else:
        entity_list = "No entities detected"

    return highlighted, entity_list
with gr.Blocks(title="Feline Veterinary NER") as demo:
    # Header and description.
    gr.Markdown("# 🐱 Feline Veterinary NER System")
    gr.Markdown(
        "Extracts **pathologies**, **symptoms**, **medications**, **procedures**, "
        "and **anatomy** from veterinary literature. Handles PDF/HTML paste artifacts."
    )

    # Input area for pasted article text.
    article_box = gr.Textbox(
        label="Input Text",
        lines=15,
        placeholder="Paste article text here (handles complex scientific formatting)..."
    )

    run_button = gr.Button("🔬 Analyze", variant="primary", size="lg")

    # Outputs: annotated HTML view plus a plain-text entity listing.
    annotated_view = gr.HTML(label="📄 Annotated Text")
    entity_listing = gr.Textbox(label="📋 Detected Entities", lines=10)

    run_button.click(ner_predict, article_box, [annotated_view, entity_listing])

    # Clickable example inputs.
    gr.Examples(
        examples=[
            ["Chronic kidney disease was diagnosed. The cat received meloxicam and subcutaneous fluids."],
            ["Ultrasound revealed a renal mass. FIV infection was confirmed by PCR in blood samples."],
            ["The patient presented with vomiting, lethargy, and dehydration. Blood work showed elevated creatinine."]
        ],
        inputs=article_box
    )

demo.launch()