Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,32 +7,34 @@ from textblob import TextBlob
|
|
| 7 |
from transformers import pipeline
|
| 8 |
|
| 9 |
# ---------------------------------------------------------
|
| 10 |
-
#
|
| 11 |
# ---------------------------------------------------------
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
|
| 16 |
-
#
|
|
|
|
|
|
|
|
|
|
| 17 |
deberta_ner = pipeline(
|
| 18 |
"token-classification",
|
| 19 |
model="geckos/deberta-base-fine-tuned-ner",
|
| 20 |
aggregation_strategy="simple"
|
| 21 |
)
|
| 22 |
|
| 23 |
-
|
| 24 |
-
stopwords = nlp.Defaults.stop_words
|
| 25 |
|
| 26 |
|
| 27 |
# ---------------------------------------------------------
|
| 28 |
-
#
|
| 29 |
# ---------------------------------------------------------
|
| 30 |
|
| 31 |
-
def clean_text(text
|
| 32 |
return text.strip()
|
| 33 |
|
| 34 |
|
| 35 |
-
def get_word_freq(text
|
| 36 |
words = re.findall(r"\b\w+\b", text.lower())
|
| 37 |
words = [w for w in words if w not in stopwords]
|
| 38 |
counts = Counter(words).most_common(10)
|
|
@@ -41,7 +43,7 @@ def get_word_freq(text: str) -> str:
|
|
| 41 |
return "\n".join(f"{w}: {c}" for w, c in counts)
|
| 42 |
|
| 43 |
|
| 44 |
-
def get_sentiment(text
|
| 45 |
sentiment = TextBlob(text).sentiment
|
| 46 |
return (
|
| 47 |
f"Polarity: {sentiment.polarity:.3f}\n"
|
|
@@ -49,46 +51,36 @@ def get_sentiment(text: str) -> str:
|
|
| 49 |
)
|
| 50 |
|
| 51 |
|
| 52 |
-
def run_spacy_entities(
|
| 53 |
-
doc = nlp(text)
|
| 54 |
ents = []
|
| 55 |
for ent in doc.ents:
|
| 56 |
ents.append({"text": ent.text, "label": ent.label_})
|
| 57 |
return ents
|
| 58 |
|
| 59 |
|
| 60 |
-
def
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
def categorize_entities(spacy_ents, deberta_ents) -> str:
|
| 69 |
-
"""
|
| 70 |
-
Merge entities from spaCy + DeBERTa into:
|
| 71 |
-
- People
|
| 72 |
-
- Organizations
|
| 73 |
-
- Countries/Locations
|
| 74 |
-
- Misc
|
| 75 |
-
"""
|
| 76 |
|
|
|
|
| 77 |
people = set()
|
| 78 |
orgs = set()
|
| 79 |
locations = set()
|
| 80 |
misc = set()
|
| 81 |
|
| 82 |
-
def norm(t):
|
| 83 |
-
return t.strip()
|
| 84 |
|
| 85 |
-
# -------------------------
|
| 86 |
# spaCy mapping
|
| 87 |
-
# -------------------------
|
| 88 |
for ent in spacy_ents:
|
| 89 |
text = norm(ent["text"])
|
| 90 |
label = ent["label"]
|
| 91 |
-
|
| 92 |
if label == "PERSON":
|
| 93 |
people.add(text)
|
| 94 |
elif label == "ORG":
|
|
@@ -98,13 +90,10 @@ def categorize_entities(spacy_ents, deberta_ents) -> str:
|
|
| 98 |
else:
|
| 99 |
misc.add(text)
|
| 100 |
|
| 101 |
-
#
|
| 102 |
-
# DeBERTa mapping (PER/ORG/LOC/MISC)
|
| 103 |
-
# -------------------------
|
| 104 |
for ent in deberta_ents:
|
| 105 |
text = norm(ent["text"])
|
| 106 |
label = ent["label"]
|
| 107 |
-
|
| 108 |
if label == "PER":
|
| 109 |
people.add(text)
|
| 110 |
elif label == "ORG":
|
|
@@ -114,38 +103,28 @@ def categorize_entities(spacy_ents, deberta_ents) -> str:
|
|
| 114 |
else:
|
| 115 |
misc.add(text)
|
| 116 |
|
| 117 |
-
# -------------------------
|
| 118 |
-
# Format output
|
| 119 |
-
# -------------------------
|
| 120 |
def fmt(title, items):
|
| 121 |
if not items:
|
| 122 |
return f"{title}:\n (none)"
|
| 123 |
items = sorted(items, key=lambda x: x.lower())
|
| 124 |
return f"{title}:\n - " + "\n - ".join(items)
|
| 125 |
|
| 126 |
-
|
| 127 |
fmt("People", people),
|
| 128 |
fmt("Organizations", orgs),
|
| 129 |
fmt("Countries/Locations", locations),
|
| 130 |
fmt("Misc", misc),
|
| 131 |
-
]
|
| 132 |
-
|
| 133 |
-
return "\n\n".join(sections)
|
| 134 |
|
| 135 |
|
| 136 |
# ---------------------------------------------------------
|
| 137 |
-
# MAIN ANALYSIS
|
| 138 |
# ---------------------------------------------------------
|
| 139 |
|
| 140 |
-
def analyze_text(text
|
| 141 |
text = clean_text(text)
|
| 142 |
if not text:
|
| 143 |
-
return (
|
| 144 |
-
"No words found.",
|
| 145 |
-
"No sentiment detected.",
|
| 146 |
-
"No entities detected.",
|
| 147 |
-
"Please enter some text."
|
| 148 |
-
)
|
| 149 |
|
| 150 |
# Word frequency
|
| 151 |
word_freq_str = get_word_freq(text)
|
|
@@ -153,46 +132,59 @@ def analyze_text(text: str):
|
|
| 153 |
# Sentiment
|
| 154 |
sentiment_str = get_sentiment(text)
|
| 155 |
|
| 156 |
-
#
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
entities_str = categorize_entities(spacy_ents, deberta_ents)
|
| 168 |
|
| 169 |
-
return (
|
| 170 |
-
word_freq_str,
|
| 171 |
-
sentiment_str,
|
| 172 |
-
entities_str,
|
| 173 |
-
"Analysis complete."
|
| 174 |
-
)
|
| 175 |
|
| 176 |
|
| 177 |
# ---------------------------------------------------------
|
| 178 |
-
#
|
| 179 |
# ---------------------------------------------------------
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
|
|
|
|
|
|
| 194 |
)
|
| 195 |
-
)
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
demo.launch()
|
|
|
|
| 7 |
from transformers import pipeline
|
| 8 |
|
| 9 |
# ---------------------------------------------------------
# LOAD MODELS
# ---------------------------------------------------------

# Accurate mode model (spaCy transformer) — slower to load and run,
# higher-quality entities.
nlp_trf = spacy.load("en_core_web_trf")

# Fast mode model (spaCy small) — quick, lower-accuracy entities.
nlp_sm = spacy.load("en_core_web_sm")

# DeBERTa NER pipeline (used only in Accurate mode).
# aggregation_strategy="simple" merges sub-word tokens into whole-entity
# spans with an "entity_group" label (PER/ORG/LOC/MISC).
deberta_ner = pipeline(
    "token-classification",
    model="geckos/deberta-base-fine-tuned-ner",
    aggregation_strategy="simple",
)

# Stop-word set for word-frequency filtering; spaCy's defaults are the
# same for both pipelines, so take them from either one.
stopwords = nlp_trf.Defaults.stop_words
|
|
|
| 27 |
|
| 28 |
|
| 29 |
# ---------------------------------------------------------
# HELPERS
# ---------------------------------------------------------

def clean_text(text):
    """Strip surrounding whitespace from *text*; treat None as empty.

    A Gradio Textbox value can arrive as None (e.g. a cleared component),
    and the original ``text.strip()`` would raise AttributeError in that
    case.  Returning "" instead lets analyze_text() take its normal
    "no input" path.
    """
    return text.strip() if text else ""
| 35 |
|
| 36 |
|
| 37 |
+
def get_word_freq(text):
|
| 38 |
words = re.findall(r"\b\w+\b", text.lower())
|
| 39 |
words = [w for w in words if w not in stopwords]
|
| 40 |
counts = Counter(words).most_common(10)
|
|
|
|
| 43 |
return "\n".join(f"{w}: {c}" for w, c in counts)
|
| 44 |
|
| 45 |
|
| 46 |
+
def get_sentiment(text):
|
| 47 |
sentiment = TextBlob(text).sentiment
|
| 48 |
return (
|
| 49 |
f"Polarity: {sentiment.polarity:.3f}\n"
|
|
|
|
| 51 |
)
|
| 52 |
|
| 53 |
|
| 54 |
+
def run_spacy_entities(doc):
    """Return the entities of a spaCy ``Doc`` as ``{"text", "label"}`` dicts.

    Order follows ``doc.ents`` (document order); one dict per entity span.
    """
    return [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
|
| 59 |
|
| 60 |
|
| 61 |
+
def run_deberta_batched(text):
    """Split *text* into sentences and run DeBERTa NER on all of them at once.

    The original looped and called the pipeline once per sentence, so
    despite its name nothing was actually batched.  Passing the whole
    sentence list in a single call lets the pipeline batch the forward
    passes, which is the speed-up the docstring promised.

    Returns a flat list of ``{"text", "label"}`` dicts, where ``label`` is
    the aggregated entity group (PER/ORG/LOC/MISC).
    """
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    if not sentences:
        # The pipeline rejects an empty input list; the original returned
        # [] here (empty loop), so preserve that.
        return []
    results = []
    # With a list input the pipeline returns one result list per sentence.
    for sentence_entities in deberta_ner(sentences):
        for ent in sentence_entities:
            results.append({"text": ent["word"], "label": ent["entity_group"]})
    return results
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
+
def categorize_entities(spacy_ents, deberta_ents):
|
| 73 |
people = set()
|
| 74 |
orgs = set()
|
| 75 |
locations = set()
|
| 76 |
misc = set()
|
| 77 |
|
| 78 |
+
def norm(t): return t.strip()
|
|
|
|
| 79 |
|
|
|
|
| 80 |
# spaCy mapping
|
|
|
|
| 81 |
for ent in spacy_ents:
|
| 82 |
text = norm(ent["text"])
|
| 83 |
label = ent["label"]
|
|
|
|
| 84 |
if label == "PERSON":
|
| 85 |
people.add(text)
|
| 86 |
elif label == "ORG":
|
|
|
|
| 90 |
else:
|
| 91 |
misc.add(text)
|
| 92 |
|
| 93 |
+
# DeBERTa mapping
|
|
|
|
|
|
|
| 94 |
for ent in deberta_ents:
|
| 95 |
text = norm(ent["text"])
|
| 96 |
label = ent["label"]
|
|
|
|
| 97 |
if label == "PER":
|
| 98 |
people.add(text)
|
| 99 |
elif label == "ORG":
|
|
|
|
| 103 |
else:
|
| 104 |
misc.add(text)
|
| 105 |
|
|
|
|
|
|
|
|
|
|
| 106 |
def fmt(title, items):
|
| 107 |
if not items:
|
| 108 |
return f"{title}:\n (none)"
|
| 109 |
items = sorted(items, key=lambda x: x.lower())
|
| 110 |
return f"{title}:\n - " + "\n - ".join(items)
|
| 111 |
|
| 112 |
+
return "\n\n".join([
|
| 113 |
fmt("People", people),
|
| 114 |
fmt("Organizations", orgs),
|
| 115 |
fmt("Countries/Locations", locations),
|
| 116 |
fmt("Misc", misc),
|
| 117 |
+
])
|
|
|
|
|
|
|
| 118 |
|
| 119 |
|
| 120 |
# ---------------------------------------------------------
# MAIN ANALYSIS
# ---------------------------------------------------------

def analyze_text(text, mode):
    """Run the full analysis pipeline on *text*.

    mode: "Fast" uses the small spaCy model only; any other value (the UI
    sends "Accurate") uses the spaCy transformer model plus batched
    DeBERTa NER.

    Returns a 3-tuple: (word_freq_str, sentiment_str, entities_str).

    The original duplicated the entity-extraction and return logic across
    the two mode branches; here only the model choice differs and the
    rest of the path is shared.
    """
    text = clean_text(text)
    if not text:
        return ("No words found.", "No sentiment detected.", "No entities detected.")

    # Word frequency
    word_freq_str = get_word_freq(text)

    # Sentiment
    sentiment_str = get_sentiment(text)

    # Entity extraction: pick the models per mode, then share the rest.
    if mode == "Fast":
        doc = nlp_sm(text)
        deberta_ents = []          # Fast mode skips DeBERTa entirely.
    else:
        doc = nlp_trf(text)
        deberta_ents = run_deberta_batched(text)

    spacy_ents = run_spacy_entities(doc)
    entities_str = categorize_entities(spacy_ents, deberta_ents)

    return (word_freq_str, sentiment_str, entities_str)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
|
| 151 |
# ---------------------------------------------------------
# UI
# ---------------------------------------------------------

with gr.Blocks(title="🗳️ Text & Speech Analyzer") as demo:

    # Header and mode explanation.
    gr.Markdown("## 🗳️ Text & Speech Analyzer (Fast + Accurate Modes)")
    gr.Markdown(
        "Analyze political speeches, news, or press releases.\n\n"
        "**Fast Mode** → spaCy small (1–2 seconds)\n\n"
        "**Accurate Mode** → spaCy transformer + DeBERTa (8–12 seconds)"
    )

    # Mode selector — defaults to the slower, higher-quality pipeline.
    mode = gr.Radio(["Fast", "Accurate"], value="Accurate", label="Choose Mode")

    input_box = gr.Textbox(
        lines=12,
        label="Paste text here",
        placeholder="Enter a speech, article, or paragraph...",
    )

    # One tab per result pane; analyze_text returns values in this order.
    with gr.Tabs():
        with gr.Tab("Word Frequency"):
            out_words = gr.Textbox(lines=10, label="Most Common Words")

        with gr.Tab("Sentiment"):
            out_sent = gr.Textbox(lines=3, label="Sentiment")

        with gr.Tab("Entities"):
            out_ents = gr.Textbox(lines=10, label="Entities (People / Orgs / Locations)")

    analyze_btn = gr.Button("Analyze")

    # Wire the button to the analysis function.
    analyze_btn.click(
        analyze_text,
        inputs=[input_box, mode],
        outputs=[out_words, out_sent, out_ents],
    )

demo.launch()
|