File size: 2,102 Bytes
e19d562
 
 
 
 
 
 
3d6f597
e19d562
3d6f597
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e19d562
 
 
 
 
 
 
 
 
 
 
 
 
 
3d6f597
e19d562
 
 
 
 
3d6f597
e19d562
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from transformers import pipeline
import gradio as gr
import re

# Build the token-classification pipeline once at import time so that every
# call to analyze_text reuses the same loaded object.
ner = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
)

# --- Safe Sentence Splitter ---
# Abbreviations whose trailing period must not be treated as a sentence end.
_PROTECTED_TERMS = (
    "Mr.", "Mrs.", "Ms.", "Dr.", "Jr.", "Sr.",
    "U.S.", "D.C.", "vs.", "Lt.", "St.", "Prof.", "Inc.", "Ltd.", "etc.",
)

# Candidate boundary: whitespace preceded by '.', '!' or '?' and followed by
# an ASCII letter or an opening quote/apostrophe.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=[A-Z'‘“a-z])")


def _ends_with_protected_term(text, end):
    """Return True if text[:end] ends with a stand-alone protected abbreviation."""
    for term in _PROTECTED_TERMS:
        if not text.endswith(term, 0, end):
            continue
        begin = end - len(term)
        # Only honour the abbreviation when it is not the tail of a longer
        # word: "Ms." protects "Ms. Lee" but must not protect "ATMs.".
        if begin == 0 or not text[begin - 1].isalnum():
            return True
    return False


def split_sentences(text):
    """
    Split text into sentences while protecting abbreviations like 'U.S.' or 'Dr.'.

    A split happens at whitespace that follows '.', '!' or '?' and precedes a
    letter or opening quote, unless the text right before that whitespace is
    one of the protected abbreviations standing on its own (i.e. not glued to
    a preceding letter or digit). The input is never rewritten with
    placeholders, so its characters are returned exactly as given.

    Known limitation: a protected abbreviation that genuinely ends a sentence
    (e.g. "... apples, etc. Then ...") still suppresses the split.

    Args:
        text: The raw text to split. None or an empty string yields [].

    Returns:
        A list of non-empty sentences with surrounding whitespace stripped.
    """
    if not text:
        return []

    text = text.strip()
    pieces = []
    start = 0
    for boundary in _SENTENCE_BOUNDARY.finditer(text):
        # Skip candidate boundaries that directly follow an abbreviation.
        if _ends_with_protected_term(text, boundary.start()):
            continue
        pieces.append(text[start:boundary.start()])
        start = boundary.end()
    # Whatever follows the last accepted boundary is the final sentence.
    pieces.append(text[start:])

    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]

# --- API Function ---
def analyze_text(text):
    """
    Split text into sentences and run the NER pipeline on each one.

    Returns a dict with two keys:
      - "sentences": one entry per sentence, holding its 1-based
        "sentence_number", the "sentence" string, and "entities" (whatever
        the module-level `ner` callable returns for that sentence).
      - "total_sentences": the number of sentences found.
    """
    sentence_list = split_sentences(text)

    per_sentence = [
        {
            "sentence_number": position,
            "sentence": current,
            "entities": ner(current),
        }
        for position, current in enumerate(sentence_list, start=1)
    ]

    return {"sentences": per_sentence, "total_sentences": len(sentence_list)}

# --- Gradio Interface (API Style) ---
# The input and output widgets are created first, then wired to analyze_text.
_article_box = gr.Textbox(
    label="Input Text",
    lines=6,
    placeholder="Paste your article...",
)
_result_view = gr.JSON(label="NER + Sentence Output")

demo = gr.Interface(
    fn=analyze_text,
    inputs=_article_box,
    outputs=_result_view,
    title="Sentence Splitter + NER API",
    description="Splits text into sentences (protects abbreviations) and runs Named Entity Recognition (dslim/bert-base-NER).",
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()