MNGames committed on
Commit
3d6f597
·
verified ·
1 Parent(s): e19d562

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -12
app.py CHANGED
@@ -5,17 +5,37 @@ import re
5
  # Load NER pipeline
6
  ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
7
 
8
- # Custom sentence splitting function
9
  def split_sentences(text):
10
- # Avoid splitting after abbreviations
11
- protected = r"\b(?:[A-Z]\.|D\.C\.|U\.S\.|Mr\.|Mrs\.|Dr\.|Jr\.|Sr\.|vs\.|Inc\.|Ltd\.|etc\.)"
12
-
13
- # Split on ., ?, ! followed by a space + capital/lowercase/’ (not part of abbreviation)
14
- pattern = re.compile(rf"(?<!{protected})(?<=[.!?])\s+(?=[A-Z'‘“a-z])")
15
- sentences = re.split(pattern, text.strip())
16
- return [s.strip() for s in sentences if s.strip()]
17
-
18
- # API function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def analyze_text(text):
20
  sentences = split_sentences(text)
21
  results = []
@@ -30,13 +50,13 @@ def analyze_text(text):
30
 
31
  return {"sentences": results, "total_sentences": len(sentences)}
32
 
33
- # Gradio interface (API-style)
34
  demo = gr.Interface(
35
  fn=analyze_text,
36
  inputs=gr.Textbox(label="Input Text", lines=6, placeholder="Paste your article..."),
37
  outputs=gr.JSON(label="NER + Sentence Output"),
38
  title="Sentence Splitter + NER API",
39
- description="Splits text into sentences (with abbreviation protection) and runs NER using dslim/bert-base-NER."
40
  )
41
 
42
  if __name__ == "__main__":
 
5
  # Load NER pipeline
6
  ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
7
 
8
# --- Safe Sentence Splitter ---
# Abbreviations whose trailing period must never end a sentence.
_PROTECTED_TERMS = (
    "Mr.", "Mrs.", "Ms.", "Dr.", "Jr.", "Sr.",
    "U.S.", "D.C.", "vs.", "Lt.", "St.", "Prof.", "Inc.", "Ltd.", "etc.",
)
_LONGEST_TERM = max(len(term) for term in _PROTECTED_TERMS)

# Candidate boundary: whitespace preceded by ., ? or ! and followed by a
# letter or an opening quote/apostrophe.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=[A-Z'‘“a-z])")

# A protected term ending exactly at the candidate boundary. The (?<!\w)
# guard (fixed width, so legal in a lookbehind) keeps ordinary words that
# merely end in a protected term ("devs.", "DMs.") from being mistaken
# for abbreviations.
_PROTECTED_TAIL = re.compile(
    r"(?<!\w)(?:" + "|".join(re.escape(term) for term in _PROTECTED_TERMS) + r")\Z"
)


def split_sentences(text):
    """
    Split text into sentences while protecting abbreviations like 'U.S.' or 'Dr.'.

    A split happens at whitespace that follows '.', '?' or '!' and precedes a
    letter or opening quote, unless the text right before that whitespace is
    one of the protected abbreviations standing as its own word.

    No placeholder substitution is used, so the input is never rewritten and
    there is no variable-length lookbehind.

    Args:
        text: The raw input string. ``None`` or an empty/blank string is
            accepted and yields an empty list.

    Returns:
        A list of non-empty, whitespace-stripped sentence strings in their
        original order.
    """
    if not text:
        return []

    text = text.strip()
    sentences = []
    start = 0

    for boundary in _SENTENCE_BOUNDARY.finditer(text):
        end = boundary.start()
        # Only the last few characters can hold an abbreviation; one extra
        # character is included so the word-boundary guard can see it.
        window_start = max(start, end - _LONGEST_TERM - 1)
        if _PROTECTED_TAIL.search(text, window_start, end):
            continue  # the period belongs to an abbreviation, not a sentence end
        sentences.append(text[start:end].strip())
        start = boundary.end()

    sentences.append(text[start:].strip())
    return [s for s in sentences if s]
37
+
38
+ # --- API Function ---
39
  def analyze_text(text):
40
  sentences = split_sentences(text)
41
  results = []
 
50
 
51
  return {"sentences": results, "total_sentences": len(sentences)}
52
 
53
# --- Gradio Interface (API Style) ---
# One multi-line textbox in, one JSON panel out: the submitted text is passed
# to analyze_text and the dict it returns is displayed as JSON.
demo = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(
        label="Input Text",
        lines=6,
        placeholder="Paste your article...",
    ),
    outputs=gr.JSON(label="NER + Sentence Output"),
    title="Sentence Splitter + NER API",
    description=(
        "Splits text into sentences (protects abbreviations) "
        "and runs Named Entity Recognition (dslim/bert-base-NER)."
    ),
)
61
 
62
  if __name__ == "__main__":