Spaces:

MNGames
/

NLP

Sleeping

App Files Files Community

MNGames committed on Oct 11, 2025

Commit

3d6f597

verified ·

1 Parent(s): e19d562

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -12

app.py CHANGED Viewed

@@ -5,17 +5,37 @@ import re
 # Load NER pipeline
 ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
-# Custom sentence splitting function
 def split_sentences(text):
-    # Avoid splitting after abbreviations
-    protected = r"\b(?:[A-Z]\.|D\.C\.|U\.S\.|Mr\.|Mrs\.|Dr\.|Jr\.|Sr\.|vs\.|Inc\.|Ltd\.|etc\.)"
-    # Split on ., ?, ! followed by a space + capital/lowercase/’ (not part of abbreviation)
-    pattern = re.compile(rf"(?<!{protected})(?<=[.!?])\s+(?=[A-Z'‘“a-z])")
-    sentences = re.split(pattern, text.strip())
-    return [s.strip() for s in sentences if s.strip()]
-# API function
 def analyze_text(text):
     sentences = split_sentences(text)
     results = []
@@ -30,13 +50,13 @@ def analyze_text(text):
     return {"sentences": results, "total_sentences": len(sentences)}
-# Gradio interface (API-style)
 demo = gr.Interface(
     fn=analyze_text,
     inputs=gr.Textbox(label="Input Text", lines=6, placeholder="Paste your article..."),
     outputs=gr.JSON(label="NER + Sentence Output"),
     title="Sentence Splitter + NER API",
-    description="Splits text into sentences (with abbreviation protection) and runs NER using dslim/bert-base-NER."
 )
 if __name__ == "__main__":

 # Load NER pipeline
 ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
+# --- Safe Sentence Splitter ---
 # Abbreviations whose trailing period must not be treated as a sentence end.
 _PROTECTED_TERMS = (
     "Mr.", "Mrs.", "Ms.", "Dr.", "Jr.", "Sr.",
     "U.S.", "D.C.", "vs.", "Lt.", "St.", "Prof.", "Inc.", "Ltd.", "etc.",
 )
 # Candidate break: whitespace that follows ., ! or ? and precedes a letter
 # or an opening quote/apostrophe.
 _SENTENCE_GAP = re.compile(r"(?<=[.!?])\s+(?=[A-Z'‘“a-z])")


 def _ends_with_abbreviation(text, end):
     """Return True if text[:end] ends with a protected abbreviation.

     The abbreviation must start at the beginning of the text or right after
     a non-alphanumeric character, so "Ms." protects "Ms. Lee" but does not
     match the tail of "ATMs.".
     """
     for term in _PROTECTED_TERMS:
         if text.endswith(term, 0, end):
             begin = end - len(term)
             if begin == 0 or not text[begin - 1].isalnum():
                 return True
     return False


 def split_sentences(text):
     """Split text into sentences while protecting abbreviations like 'U.S.' or 'Dr.'.

     A break is taken at whitespace that follows '.', '!' or '?' and precedes
     a letter or an opening quote/apostrophe, unless the text before the break
     ends with one of the protected abbreviations. The input is never rewritten
     with placeholders, so the returned sentences are exact slices of it.

     Args:
         text: The text to split. Empty or None input yields an empty list.

     Returns:
         A list of non-empty sentences with surrounding whitespace removed.
     """
     if not text:
         return []
     text = text.strip()
     sentences = []
     start = 0
     for gap in _SENTENCE_GAP.finditer(text):
         # A period that closes a protected abbreviation is not a sentence end.
         if _ends_with_abbreviation(text, gap.start()):
             continue
         piece = text[start:gap.start()].strip()
         if piece:
             sentences.append(piece)
         start = gap.end()
     tail = text[start:].strip()
     if tail:
         sentences.append(tail)
     return sentences
+# --- API Function ---
 def analyze_text(text):
     sentences = split_sentences(text)
     results = []
     return {"sentences": results, "total_sentences": len(sentences)}
+# --- Gradio Interface (API Style) ---
 demo = gr.Interface(
     fn=analyze_text,
     inputs=gr.Textbox(label="Input Text", lines=6, placeholder="Paste your article..."),
     outputs=gr.JSON(label="NER + Sentence Output"),
     title="Sentence Splitter + NER API",
+    description="Splits text into sentences (protects abbreviations) and runs Named Entity Recognition (dslim/bert-base-NER)."
 )
 if __name__ == "__main__":