# app.py — Sentence Splitter + NER demo (Hugging Face Space "NLP" by MNGames, commit 3d6f597)
from transformers import pipeline
import gradio as gr
import re

# Load the token-classification (NER) pipeline once at module import.
# aggregation_strategy="simple" merges sub-word tokens into whole-entity
# spans (e.g. "Washing" + "##ton" -> one "Washington" entity).
# NOTE(review): model weights are downloaded on first run — requires network.
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
# --- Safe Sentence Splitter ---

# Abbreviations whose trailing periods must NOT be treated as sentence ends.
_PROTECTED_TERMS = [
    "Mr.", "Mrs.", "Ms.", "Dr.", "Jr.", "Sr.",
    "U.S.", "D.C.", "vs.", "Lt.", "St.", "Prof.", "Inc.", "Ltd.", "etc.",
]
# Each abbreviation mapped to a placeholder form with '.' swapped for '<DOT>'.
# Built once at import time instead of on every call.
_PROTECTED_MAP = {term: term.replace(".", "<DOT>") for term in _PROTECTED_TERMS}
# Sentence boundary: '.', '!' or '?' followed by whitespace and a letter or
# opening quote. Fixed-width lookbehind avoids variable-length lookbehind
# errors; compiled once and hoisted out of the function.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=[A-Z'‘“a-z])")


def split_sentences(text):
    """Split *text* into sentences, protecting common abbreviations.

    Periods inside abbreviations like 'U.S.' or 'Dr.' are temporarily
    replaced with a '<DOT>' placeholder so the boundary regex does not
    split on them, then restored afterwards.

    Args:
        text: Raw input text (str).

    Returns:
        List of non-empty, stripped sentence strings; [] for empty input.
    """
    # Shield abbreviations from the splitter.
    for original, safe in _PROTECTED_MAP.items():
        text = text.replace(original, safe)

    parts = _SENTENCE_BOUNDARY.split(text.strip())

    # Undo the placeholder substitution in each sentence.
    restored = []
    for part in parts:
        for original, safe in _PROTECTED_MAP.items():
            part = part.replace(safe, original)
        restored.append(part.strip())

    return [s for s in restored if s]
# --- API Function ---
def analyze_text(text):
    """Split *text* into sentences and run NER on each one.

    Args:
        text: Raw input text (str).

    Returns:
        Dict with "sentences" (list of {sentence_number, sentence, entities}
        dicts, 1-indexed) and "total_sentences" (int).
    """
    sentences = split_sentences(text)
    results = [
        {
            "sentence_number": idx,
            "sentence": sent,
            "entities": ner(sent),
        }
        for idx, sent in enumerate(sentences, start=1)
    ]
    return {"sentences": results, "total_sentences": len(sentences)}
# --- Gradio Interface (API Style) ---
# JSON output so the Space can be consumed as an API as well as a UI.
demo = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(label="Input Text", lines=6, placeholder="Paste your article..."),
    outputs=gr.JSON(label="NER + Sentence Output"),
    title="Sentence Splitter + NER API",
    description="Splits text into sentences (protects abbreviations) and runs Named Entity Recognition (dslim/bert-base-NER).",
)

# Launch only when executed as a script (the guard body was unindented in
# the original paste, which is a SyntaxError — restored here).
if __name__ == "__main__":
    demo.launch()