|
|
from transformers import pipeline |
|
|
import gradio as gr |
|
|
import re |
|
|
|
|
|
|
|
|
# Module-level token-classification pipeline (downloads/loads the model once at
# import time).  aggregation_strategy="simple" groups sub-word pieces into
# whole-entity spans per the transformers pipeline docs — TODO confirm the
# exact grouping behavior against the installed transformers version.
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
|
|
|
|
|
|
|
|
# Abbreviations whose trailing periods must NOT be treated as sentence
# terminators.  Hoisted to module level (together with the map and the
# compiled regex below) so they are built once instead of on every call.
_PROTECTED_TERMS = [
    "Mr.", "Mrs.", "Ms.", "Dr.", "Jr.", "Sr.",
    "U.S.", "D.C.", "vs.", "Lt.", "St.", "Prof.", "Inc.", "Ltd.", "etc."
]

# term -> masked form with every '.' replaced (e.g. "U.S." -> "U<DOT>S<DOT>").
_PROTECTED_MAP = {term: term.replace(".", "<DOT>") for term in _PROTECTED_TERMS}

# Sentence boundary: whitespace that follows .!? and precedes a letter or an
# opening quote.  Both lookarounds are fixed-width, so this avoids the
# variable-length-lookbehind errors the original docstring mentions.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=[A-Z'‘“a-z])")


def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences while protecting abbreviations like 'U.S.'.

    Periods inside known abbreviations are temporarily masked with a
    placeholder so the boundary regex does not split on them, then restored
    in each resulting sentence.

    Args:
        text: Raw input text (may be empty).

    Returns:
        Stripped, non-empty sentence strings in their original order; an
        empty list for empty/whitespace-only input.
    """
    # Mask the dots of protected abbreviations so the splitter ignores them.
    for original, safe in _PROTECTED_MAP.items():
        text = text.replace(original, safe)

    parts = _SENTENCE_BOUNDARY.split(text.strip())

    # Restore the masked dots and drop empty fragments.
    restored = []
    for sentence in parts:
        for original, safe in _PROTECTED_MAP.items():
            sentence = sentence.replace(safe, original)
        restored.append(sentence.strip())

    return [s for s in restored if s]
|
|
|
|
|
|
|
|
def analyze_text(text):
    """Split *text* into sentences and run NER on each one.

    Returns:
        A dict with:
          - "sentences": one entry per sentence, carrying its 1-based
            "sentence_number", the "sentence" text, and the "entities"
            found by the module-level ``ner`` pipeline.
          - "total_sentences": the number of sentences found.
    """
    sentences = split_sentences(text)
    annotated = [
        {
            "sentence_number": index,
            "sentence": sentence,
            "entities": ner(sentence),
        }
        for index, sentence in enumerate(sentences, start=1)
    ]
    return {"sentences": annotated, "total_sentences": len(sentences)}
|
|
|
|
|
|
|
|
# Gradio UI: single multi-line textbox in, JSON out, wired to analyze_text.
demo = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(label="Input Text", lines=6, placeholder="Paste your article..."),
    outputs=gr.JSON(label="NER + Sentence Output"),
    title="Sentence Splitter + NER API",
    description="Splits text into sentences (protects abbreviations) and runs Named Entity Recognition (dslim/bert-base-NER)."
)

# Start the local web server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|
|
|