# speech-analyser / app.py
# Author: ad180 — "Update app.py" (commit 3dd79a1, verified)
import re
from collections import Counter
import gradio as gr
import spacy
from textblob import TextBlob
from transformers import pipeline
# ---------------------------------------------------------
# LOAD MODELS
# ---------------------------------------------------------
# All models are loaded once at import time so each analysis request
# only pays inference cost, not load cost.
# Accurate mode model (spaCy transformer)
nlp_trf = spacy.load("en_core_web_trf")
# Fast mode model (spaCy small)
nlp_sm = spacy.load("en_core_web_sm")
# DeBERTa NER (used only in Accurate mode).
# aggregation_strategy="simple" merges sub-word tokens into whole-word
# entity spans, so downstream code sees "word"/"entity_group" records.
deberta_ner = pipeline(
    "token-classification",
    model="geckos/deberta-base-fine-tuned-ner",
    aggregation_strategy="simple"
)
# Stop-word set used by get_word_freq to drop filler words.
stopwords = nlp_trf.Defaults.stop_words
# ---------------------------------------------------------
# HELPERS
# ---------------------------------------------------------
def clean_text(text):
    """Normalize raw user input: strip surrounding whitespace.

    Returns an empty string for ``None`` or empty input — Gradio can hand
    the callback ``None`` when the textbox is cleared, and the original
    ``text.strip()`` would raise AttributeError in that case.
    """
    return text.strip() if text else ""
def get_word_freq(text, top_n=10):
    """Return the ``top_n`` most frequent non-stopword tokens as display text.

    Parameters
    ----------
    text : str
        Input text; lowercased and tokenized with a simple ``\\b\\w+\\b`` regex.
    top_n : int, optional
        How many entries to report (default 10, matching the original
        hard-coded behavior).

    Returns
    -------
    str
        One ``word: count`` line per entry, or ``"No words found."`` when
        nothing survives stopword filtering.
    """
    words = re.findall(r"\b\w+\b", text.lower())
    # Drop stopwords so the report surfaces content words, not filler.
    words = [w for w in words if w not in stopwords]
    counts = Counter(words).most_common(top_n)
    if not counts:
        return "No words found."
    return "\n".join(f"{w}: {c}" for w, c in counts)
def get_sentiment(text):
    """Format TextBlob polarity and subjectivity scores for display."""
    scores = TextBlob(text).sentiment
    lines = [
        f"Polarity: {scores.polarity:.3f}",
        f"Subjectivity: {scores.subjectivity:.3f}",
    ]
    return "\n".join(lines)
def run_spacy_entities(doc):
    """Collect every named entity in *doc* as ``{"text", "label"}`` dicts."""
    return [{"text": span.text, "label": span.label_} for span in doc.ents]
def run_deberta_batched(text):
    """Run DeBERTa NER over *text*, sentence-split and sent as ONE batch.

    The original implementation split into sentences but then called the
    pipeline once per sentence in a Python loop, so no batching actually
    happened. Hugging Face pipelines accept a list of strings and return
    one result list per input, which lets the model process the sentences
    together — the speed-up the docstring promised.

    Returns a flat list of ``{"text", "label"}`` dicts.
    """
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    if not sentences:
        # Pipelines reject empty input lists; nothing to tag anyway.
        return []
    # One call with the whole list -> list of per-sentence result lists.
    batched = deberta_ner(sentences)
    return [
        {"text": r["word"], "label": r["entity_group"]}
        for sentence_out in batched
        for r in sentence_out
    ]
def categorize_entities(spacy_ents, deberta_ents):
people = set()
orgs = set()
locations = set()
misc = set()
def norm(t): return t.strip()
# spaCy mapping
for ent in spacy_ents:
text = norm(ent["text"])
label = ent["label"]
if label == "PERSON":
people.add(text)
elif label == "ORG":
orgs.add(text)
elif label in ("GPE", "LOC"):
locations.add(text)
else:
misc.add(text)
# DeBERTa mapping
for ent in deberta_ents:
text = norm(ent["text"])
label = ent["label"]
if label == "PER":
people.add(text)
elif label == "ORG":
orgs.add(text)
elif label == "LOC":
locations.add(text)
else:
misc.add(text)
def fmt(title, items):
if not items:
return f"{title}:\n (none)"
items = sorted(items, key=lambda x: x.lower())
return f"{title}:\n - " + "\n - ".join(items)
return "\n\n".join([
fmt("People", people),
fmt("Organizations", orgs),
fmt("Countries/Locations", locations),
fmt("Misc", misc),
])
# ---------------------------------------------------------
# MAIN ANALYSIS
# ---------------------------------------------------------
def analyze_text(text, mode):
    """Run frequency, sentiment, and entity analysis on *text*.

    ``mode`` selects the NER path: "Fast" uses only the small spaCy model;
    anything else takes the Accurate path (spaCy transformer + DeBERTa).
    Returns a 3-tuple of display strings: (word frequency, sentiment,
    entities).
    """
    text = clean_text(text)
    if not text:
        return ("No words found.", "No sentiment detected.", "No entities detected.")

    word_freq_str = get_word_freq(text)
    sentiment_str = get_sentiment(text)

    if mode == "Fast":
        # Fast path: small spaCy model only, no transformer work.
        spacy_ents = run_spacy_entities(nlp_sm(text))
        deberta_ents = []
    else:
        # Accurate path: transformer spaCy model plus DeBERTa NER.
        spacy_ents = run_spacy_entities(nlp_trf(text))
        deberta_ents = run_deberta_batched(text)

    entities_str = categorize_entities(spacy_ents, deberta_ents)
    return (word_freq_str, sentiment_str, entities_str)
# ---------------------------------------------------------
# UI
# ---------------------------------------------------------
# Gradio Blocks layout: mode selector + text input on top, three tabbed
# output boxes below, wired to analyze_text via a single button.
with gr.Blocks(title="πŸ—³οΈ Text & Speech Analyzer") as demo:
    # Header and mode explanation shown above the controls.
    gr.Markdown("## πŸ—³οΈ Text & Speech Analyzer (Fast + Accurate Modes)")
    gr.Markdown(
        "Analyze political speeches, news, or press releases.\n\n"
        "**Fast Mode** β†’ spaCy small (1–2 seconds)\n\n"
        "**Accurate Mode** β†’ spaCy transformer + DeBERTa (8–12 seconds)"
    )
    # Inputs: analysis mode (defaults to Accurate) and free-form text.
    mode = gr.Radio(["Fast", "Accurate"], value="Accurate", label="Choose Mode")
    input_box = gr.Textbox(
        lines=12,
        label="Paste text here",
        placeholder="Enter a speech, article, or paragraph..."
    )
    # Outputs: one tab per result string returned by analyze_text.
    with gr.Tabs():
        with gr.Tab("Word Frequency"):
            out_words = gr.Textbox(lines=10, label="Most Common Words")
        with gr.Tab("Sentiment"):
            out_sent = gr.Textbox(lines=3, label="Sentiment")
        with gr.Tab("Entities"):
            out_ents = gr.Textbox(lines=10, label="Entities (People / Orgs / Locations)")
    # Button click runs the full analysis; output order matches the tabs.
    analyze_btn = gr.Button("Analyze")
    analyze_btn.click(
        analyze_text,
        inputs=[input_box, mode],
        outputs=[out_words, out_sent, out_ents]
    )
demo.launch()