Spaces:

Statistical-Impossibility
/

Feline-NER-Demo

Sleeping

App Files Files Community

Statistical-Impossibility committed on Jan 16

Commit

26b59fd

verified ·

1 Parent(s): fb3b7fc

Upload app.py

Browse files

Files changed (1) hide show

app.py +258 -0

app.py ADDED Viewed

	@@ -0,0 +1,258 @@

+import gradio as gr
+from transformers import pipeline
+import spacy
+import re
+import unicodedata
+import sys
+import subprocess
def _load_spacy_model(name):
    """Return the spaCy pipeline *name*, installing it first if loading fails.

    If ``spacy.load`` raises ``OSError`` the package is downloaded through
    ``python -m spacy download`` (using the running interpreter) and the load
    is retried once. A failed download raises ``CalledProcessError``.
    """
    try:
        return spacy.load(name)
    except OSError:
        print("Downloading spaCy model...")
        download_cmd = [sys.executable, "-m", "spacy", "download", name]
        subprocess.run(download_cmd, check=True)
        return spacy.load(name)


# Sentence splitter used by ner_predict.
nlp = _load_spacy_model("en_core_web_sm")
nlp.add_pipe("sentencizer")

# Token-classification model; "simple" aggregation makes the pipeline return
# grouped entities rather than one prediction per token.
model_id = "Statistical-Impossibility/Feline-NER-Test"
ner_pipeline = pipeline(
    "token-classification",
    model=model_id,
    aggregation_strategy="simple",
)
def clean_text(text):
    """Normalise pasted PDF/HTML text into one whitespace-collapsed string.

    Steps, in order:

    1. Unicode NFKC normalisation (ligatures, full-width forms, ...).
    2. Removal of markup: elements (``<p>``, ``</p>``, ``<br/>``), comments,
       doctype declarations and processing instructions. Only ``<`` that is
       immediately followed by a tag-like start is treated as markup, so
       comparisons such as ``creatinine < 1.6 and urea > 10`` are kept.
    3. Every whitespace run (newlines included) becomes a single space.
    4. A hyphen followed by whitespace is deleted, which re-joins words that
       were hyphenated across a line break ("abnor-" + newline + "malities"
       becomes "abnormalities").

    Returns the cleaned text with leading/trailing whitespace stripped.
    """
    text = unicodedata.normalize('NFKC', text)
    # Tags must start with a letter (optionally after "/", "!" or "?");
    # a bare "<" in running text is therefore left alone.
    text = re.sub(
        r'<(?:!--.*?--|[!?/]?[A-Za-z][^<>]*)>', '', text, flags=re.DOTALL
    )
    text = re.sub(r'\s+', ' ', text)
    # NOTE(review): this also removes spaced dashes ("a - b" -> "a b") and
    # suspended hyphens ("pre- and post-"); kept as-is because it is the
    # documented "aggressive" behaviour of this cleaner.
    text = re.sub(r'-\s+', '', text)
    return text.strip()
def expand_to_word_boundaries(text, start, end):
    """Grow the span ``text[start:end]`` so it covers whole words.

    The span is extended left and right across alphanumeric characters,
    hyphens and apostrophes, so a sub-word fragment such as "ies" inside
    "abnormalities" becomes the full word, and compounds/possessives such as
    "non-steroidal" or "cat's" stay in one piece.

    Hyphens or apostrophes picked up at the very edge of the expanded span
    (e.g. the quotes around ``'vomiting'``) are trimmed again; characters
    that were inside the caller's original span are never removed.

    Returns the new ``(start, end)`` pair.
    """
    joiners = ('-', "'")
    orig_start, orig_end = start, end

    # Walk outwards while the neighbouring character still belongs to a word.
    while start > 0 and (text[start - 1].isalnum() or text[start - 1] in joiners):
        start -= 1
    while end < len(text) and (text[end].isalnum() or text[end] in joiners):
        end += 1

    # Drop joiners that ended up on the outside of the word; only the part
    # gained by expansion may be trimmed.
    while start < orig_start and text[start] in joiners:
        start += 1
    while end > orig_end and text[end - 1] in joiners:
        end -= 1
    return start, end
def is_valid_entity(text, start, end):
    """Decide whether ``text[start:end]`` is worth reporting as an entity.

    The span is stripped of surrounding whitespace and rejected when it is:

    - shorter than 2 characters (this includes empty and single-letter spans),
    - prefixed with the ``##`` sub-word marker, or
    - free of alphabetic characters (pure punctuation and/or digits).

    Returns True for an acceptable span, False otherwise.
    """
    candidate = text[start:end].strip()
    if len(candidate) < 2:
        return False
    # Should not occur once spans are expanded to whole words, but a raw
    # sub-word piece must never be shown to the user.
    if candidate.startswith('##'):
        return False
    return any(ch.isalpha() for ch in candidate)
# Highlight colour per model label; labels missing here are rendered grey.
_ENTITY_COLORS = {
    "SYMPTOM": "#FFD700",
    "DISEASE": "#FF6B6B",
    "MEDICATION": "#90EE90",
    "PROCEDURE": "#87CEEB",
    "ANATOMY": "#FFB347",
}
# User-facing name per model label.
_ENTITY_DISPLAY_NAMES = {
    "DISEASE": "pathology",
    "SYMPTOM": "symptom",
    "MEDICATION": "medication",
    "PROCEDURE": "procedure",
    "ANATOMY": "anatomy",
}
_MAX_INPUT_CHARS = 100000   # raw input longer than this is refused
_MAX_CHUNK_TOKENS = 450     # token budget per model call
_OVERLAP_SENTENCES = 2      # sentences shared by consecutive chunks
_MIN_SCORE = 0.50           # predictions at or below this score are dropped


def _display_name(label):
    """Return the user-facing name for a model label (lower-cased fallback)."""
    return _ENTITY_DISPLAY_NAMES.get(label, label.lower())


def _chunk_sentences(text, sentences, count_tokens,
                     max_tokens=_MAX_CHUNK_TOKENS, overlap=_OVERLAP_SENTENCES):
    """Group consecutive sentences into overlapping chunks within a token budget.

    ``sentences`` is a list of dicts with ``text``/``start``/``end`` (character
    offsets into ``text``). Each chunk is a dict with the chunk ``text``, its
    ``offset`` into ``text`` and its ``sentence_count``. A chunk always holds
    at least one sentence, even if that sentence alone exceeds ``max_tokens``.
    """
    # Tokenise every sentence once instead of re-tokenising the growing chunk.
    costs = [count_tokens(s["text"]) for s in sentences]
    total = len(sentences)
    chunks = []
    first = 0
    while first < total:
        stop = first
        used = 0
        while stop < total:
            if stop > first and used + costs[stop] > max_tokens:
                break
            used += costs[stop]
            stop += 1
        head, tail = sentences[first], sentences[stop - 1]
        chunks.append({
            # Slice the source text so chunk-relative offsets map back to
            # global offsets by adding "offset", whatever separates sentences.
            "text": text[head["start"]:tail["end"]],
            "offset": head["start"],
            "sentence_count": stop - first,
        })
        if stop >= total:
            # The last sentence is covered; further chunks would only repeat
            # the tail of this one.
            break
        first += max(1, (stop - first) - overlap)
    return chunks


def _resolve_overlaps(entities):
    """Return entities sorted by position with overlapping spans reduced to one.

    Among overlapping spans the earlier one is kept unless a later span has a
    strictly higher score and reaches further to the right, in which case it
    replaces the kept one.
    """
    ordered = sorted(entities, key=lambda e: (e['start'], -e['score']))
    kept = []
    for ent in ordered:
        if not kept or ent['start'] >= kept[-1]['end']:
            kept.append(ent)
            continue
        prev = kept[-1]
        # ent starts at or after prev (sort order), so "different span" can
        # only mean that it ends later.
        if ent['score'] > prev['score'] and ent['end'] > prev['end']:
            kept[-1] = ent
    return kept


def _render_highlight(text, entities):
    """Build the annotated HTML for ``text`` with each entity wrapped in <mark>."""
    from html import escape  # local import: keeps this block self-contained

    parts = []
    cursor = 0
    for ent in entities:
        start, end = ent['start'], ent['end']
        # Skip spans that fall outside the text or are empty/reversed.
        if start < 0 or end > len(text) or start >= end:
            continue
        label = ent['entity_group']
        color = _ENTITY_COLORS.get(label, "#E0E0E0")
        display_label = escape(_display_name(label))
        # All user-supplied text is escaped so it cannot inject markup.
        parts.append(escape(text[cursor:start], quote=False))
        parts.append(
            f'<mark style="background-color:{color}; padding:2px 4px; '
            f'border-radius:3px; font-weight:500;" '
            f'title="{display_label} ({ent["score"]:.2f})">'
            f'{escape(text[start:end], quote=False)} '
            f'<sup style="font-size:0.65em; color:#666;">/{display_label}</sup>'
            f'</mark>'
        )
        cursor = end
    parts.append(escape(text[cursor:], quote=False))
    body = "".join(parts)
    return (
        '<div style="line-height:1.8; font-family:sans-serif; '
        f'white-space:pre-wrap;">{body}</div>'
    )


def ner_predict(text):
    """Run entity recognition on ``text`` and return ``(html, entity_list)``.

    The text is cleaned with ``clean_text``, split into sentences with the
    module-level spaCy ``nlp`` object, grouped into overlapping chunks and
    passed chunk by chunk to ``ner_pipeline``. Predictions scoring above 0.50
    are mapped back to offsets in the cleaned text, widened to whole words,
    validated and de-duplicated.

    Returns a 2-tuple of strings: an HTML fragment with highlighted entities,
    and a newline-separated "label: text (score)" list. Empty/``None`` input,
    over-long input (> 100000 characters) and text without sentences return a
    short HTML message instead.
    """
    if not text or not text.strip():
        return "<p>No text provided</p>", "No entities"
    if len(text) > _MAX_INPUT_CHARS:
        return "<p style='color:red;'>Text too long (max 100k characters)</p>", ""

    text = clean_text(text)

    # Sentence boundaries as character offsets into the cleaned text.
    sentences = [
        {"text": sent.text, "start": sent.start_char, "end": sent.end_char}
        for sent in nlp(text).sents
    ]
    if not sentences:
        return "<p>No sentences detected</p>", ""

    tokenize = ner_pipeline.tokenizer.tokenize
    chunks = _chunk_sentences(text, sentences, lambda s: len(tokenize(s)))

    all_entities = []
    for chunk in chunks:
        # Best effort: a failing chunk is reported and skipped so the rest of
        # the document is still annotated.
        try:
            for pred in ner_pipeline(chunk["text"]):
                if pred['score'] <= _MIN_SCORE:
                    continue
                start, end = expand_to_word_boundaries(
                    text,
                    pred['start'] + chunk["offset"],
                    pred['end'] + chunk["offset"],
                )
                if is_valid_entity(text, start, end):
                    all_entities.append({**pred, 'start': start, 'end': end})
        except Exception as e:
            print(f"Chunk processing error: {e}")
            continue

    final_entities = _resolve_overlaps(all_entities)
    highlighted = _render_highlight(text, final_entities)

    if final_entities:
        entity_list = "\n".join(
            f"{_display_name(e['entity_group'])}: "
            f"{text[e['start']:e['end']]} ({e['score']:.2f})"
            for e in final_entities
        )
    else:
        entity_list = "No entities detected"
    return highlighted, entity_list
# Introductory blurb shown under the page title.
_INTRO_MARKDOWN = (
    "Extracts **pathologies**, **symptoms**, **medications**, **procedures**, "
    "and **anatomy** from veterinary literature. Handles PDF/HTML paste artifacts."
)
# One-click sample inputs; each inner list holds the value for input_text.
_EXAMPLE_TEXTS = [
    ["Chronic kidney disease was diagnosed. The cat received meloxicam and subcutaneous fluids."],
    ["Ultrasound revealed a renal mass. FIV infection was confirmed by PCR in blood samples."],
    ["The patient presented with vomiting, lethargy, and dehydration. Blood work showed elevated creatinine."],
]

# Page layout: title, blurb, input box, button, the two outputs, then examples.
with gr.Blocks(title="Feline Veterinary NER") as demo:
    gr.Markdown("# 🐱 Feline Veterinary NER System")
    gr.Markdown(_INTRO_MARKDOWN)

    input_text = gr.Textbox(
        label="Input Text",
        lines=15,
        placeholder="Paste article text here (handles complex scientific formatting)...",
    )
    analyze_btn = gr.Button("🔬 Analyze", variant="primary", size="lg")

    output_html = gr.HTML(label="📄 Annotated Text")
    output_list = gr.Textbox(label="📋 Detected Entities", lines=10)

    # ner_predict returns (html, entity_list), matching the two outputs.
    analyze_btn.click(
        fn=ner_predict,
        inputs=input_text,
        outputs=[output_html, output_list],
    )

    gr.Examples(examples=_EXAMPLE_TEXTS, inputs=input_text)

demo.launch()