# speech-analyser / app.py
# Author: ad180 — "Update app.py" (commit 3dd79a1, verified)
import re
from collections import Counter
import gradio as gr
import spacy
from textblob import TextBlob
from transformers import pipeline
# ---------------------------------------------------------
# LOAD MODELS
# ---------------------------------------------------------
# All models are loaded once at import time so each analysis request
# only pays inference cost, not load cost.
# Accurate mode model (spaCy transformer)
nlp_trf = spacy.load("en_core_web_trf")
# Fast mode model (spaCy small)
nlp_sm = spacy.load("en_core_web_sm")
# DeBERTa NER (used only in Accurate mode).
# aggregation_strategy="simple" merges sub-word tokens into whole-word
# entity spans, so downstream code sees "word"/"entity_group" records.
deberta_ner = pipeline(
    "token-classification",
    model="geckos/deberta-base-fine-tuned-ner",
    aggregation_strategy="simple"
)
# Stop-word set used by get_word_freq to drop filler words.
stopwords = nlp_trf.Defaults.stop_words
# ---------------------------------------------------------
# HELPERS
# ---------------------------------------------------------
def clean_text(text):
    """Normalize raw user input: strip surrounding whitespace.

    Returns an empty string for ``None`` or empty input — Gradio can hand
    the callback ``None`` when the textbox is cleared, and the original
    ``text.strip()`` would raise AttributeError in that case.
    """
    return text.strip() if text else ""
def get_word_freq(text, top_n=10):
    """Return the ``top_n`` most frequent non-stopword tokens as display text.

    Parameters
    ----------
    text : str
        Input text; lowercased and tokenized with a simple ``\\b\\w+\\b`` regex.
    top_n : int, optional
        How many entries to report (default 10, matching the original
        hard-coded behavior).

    Returns
    -------
    str
        One ``word: count`` line per entry, or ``"No words found."`` when
        nothing survives stopword filtering.
    """
    words = re.findall(r"\b\w+\b", text.lower())
    # Drop stopwords so the report surfaces content words, not filler.
    words = [w for w in words if w not in stopwords]
    counts = Counter(words).most_common(top_n)
    if not counts:
        return "No words found."
    return "\n".join(f"{w}: {c}" for w, c in counts)
def get_sentiment(text):
    """Format TextBlob polarity and subjectivity scores for display."""
    scores = TextBlob(text).sentiment
    lines = [
        f"Polarity: {scores.polarity:.3f}",
        f"Subjectivity: {scores.subjectivity:.3f}",
    ]
    return "\n".join(lines)
def run_spacy_entities(doc):
    """Collect every named entity in *doc* as ``{"text", "label"}`` dicts."""
    return [{"text": span.text, "label": span.label_} for span in doc.ents]
def run_deberta_batched(text):
    """Run DeBERTa NER over *text*, sentence-split and sent as ONE batch.

    The original implementation split into sentences but then called the
    pipeline once per sentence in a Python loop, so no batching actually
    happened. Hugging Face pipelines accept a list of strings and return
    one result list per input, which lets the model process the sentences
    together — the speed-up the docstring promised.

    Returns a flat list of ``{"text", "label"}`` dicts.
    """
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    if not sentences:
        # Pipelines reject empty input lists; nothing to tag anyway.
        return []
    # One call with the whole list -> list of per-sentence result lists.
    batched = deberta_ner(sentences)
    return [
        {"text": r["word"], "label": r["entity_group"]}
        for sentence_out in batched
        for r in sentence_out
    ]
def categorize_entities(spacy_ents, deberta_ents):
people = set()
orgs = set()
locations = set()
misc = set()
def norm(t): return t.strip()
# spaCy mapping
for ent in spacy_ents:
text = norm(ent["text"])
label = ent["label"]
if label == "PERSON":
people.add(text)
elif label == "ORG":
orgs.add(text)
elif label in ("GPE", "LOC"):
locations.add(text)
else:
misc.add(text)
# DeBERTa mapping
for ent in deberta_ents:
text = norm(ent["text"])
label = ent["label"]
if label == "PER":
people.add(text)
elif label == "ORG":
orgs.add(text)
elif label == "LOC":
locations.add(text)
else:
misc.add(text)
def fmt(title, items):
if not items:
return f"{title}:\n (none)"
items = sorted(items, key=lambda x: x.lower())
return f"{title}:\n - " + "\n - ".join(items)
return "\n\n".join([
fmt("People", people),
fmt("Organizations", orgs),
fmt("Countries/Locations", locations),
fmt("Misc", misc),
])
# ---------------------------------------------------------
# MAIN ANALYSIS
# ---------------------------------------------------------
def analyze_text(text, mode):
    """Run frequency, sentiment, and entity analysis on *text*.

    ``mode`` selects the NER path: "Fast" uses only the small spaCy model;
    anything else takes the Accurate path (spaCy transformer + DeBERTa).
    Returns a 3-tuple of display strings: (word frequency, sentiment,
    entities).
    """
    text = clean_text(text)
    if not text:
        return ("No words found.", "No sentiment detected.", "No entities detected.")

    word_freq_str = get_word_freq(text)
    sentiment_str = get_sentiment(text)

    if mode == "Fast":
        # Fast path: small spaCy model only, no transformer work.
        spacy_ents = run_spacy_entities(nlp_sm(text))
        deberta_ents = []
    else:
        # Accurate path: transformer spaCy model plus DeBERTa NER.
        spacy_ents = run_spacy_entities(nlp_trf(text))
        deberta_ents = run_deberta_batched(text)

    entities_str = categorize_entities(spacy_ents, deberta_ents)
    return (word_freq_str, sentiment_str, entities_str)
# ---------------------------------------------------------
# UI
# ---------------------------------------------------------
# Gradio Blocks layout: mode selector + text input on top, three tabbed
# output boxes below, wired to analyze_text via a single button.
with gr.Blocks(title="πŸ—³οΈ Text & Speech Analyzer") as demo:
    # Header and mode explanation shown above the controls.
    gr.Markdown("## πŸ—³οΈ Text & Speech Analyzer (Fast + Accurate Modes)")
    gr.Markdown(
        "Analyze political speeches, news, or press releases.\n\n"
        "**Fast Mode** β†’ spaCy small (1–2 seconds)\n\n"
        "**Accurate Mode** β†’ spaCy transformer + DeBERTa (8–12 seconds)"
    )
    # Inputs: analysis mode (defaults to Accurate) and free-form text.
    mode = gr.Radio(["Fast", "Accurate"], value="Accurate", label="Choose Mode")
    input_box = gr.Textbox(
        lines=12,
        label="Paste text here",
        placeholder="Enter a speech, article, or paragraph..."
    )
    # Outputs: one tab per result string returned by analyze_text.
    with gr.Tabs():
        with gr.Tab("Word Frequency"):
            out_words = gr.Textbox(lines=10, label="Most Common Words")
        with gr.Tab("Sentiment"):
            out_sent = gr.Textbox(lines=3, label="Sentiment")
        with gr.Tab("Entities"):
            out_ents = gr.Textbox(lines=10, label="Entities (People / Orgs / Locations)")
    # Button click runs the full analysis; output order matches the tabs.
    analyze_btn = gr.Button("Analyze")
    analyze_btn.click(
        analyze_text,
        inputs=[input_box, mode],
        outputs=[out_words, out_sent, out_ents]
    )
demo.launch()