"""Gradio app: word frequency, sentiment, and named-entity analysis.

Two modes:
  * Fast     -> spaCy small model only (quick, less accurate NER)
  * Accurate -> spaCy transformer + DeBERTa token-classification NER
"""

import re
from collections import Counter

import gradio as gr
import spacy
from textblob import TextBlob
from transformers import pipeline

# ---------------------------------------------------------
# LOAD MODELS
# ---------------------------------------------------------

# Accurate mode model (spaCy transformer)
nlp_trf = spacy.load("en_core_web_trf")

# Fast mode model (spaCy small)
nlp_sm = spacy.load("en_core_web_sm")

# DeBERTa NER (used only in Accurate mode)
deberta_ner = pipeline(
    "token-classification",
    model="geckos/deberta-base-fine-tuned-ner",
    aggregation_strategy="simple",
)

stopwords = nlp_trf.Defaults.stop_words

# ---------------------------------------------------------
# HELPERS
# ---------------------------------------------------------


def clean_text(text):
    """Normalize raw user input (currently just trims surrounding whitespace)."""
    return text.strip()


def get_word_freq(text):
    """Return the 10 most common non-stopword tokens as 'word: count' lines.

    Tokenization is a simple \\b\\w+\\b regex over the lower-cased text;
    stopwords come from the spaCy transformer model's defaults.
    """
    words = re.findall(r"\b\w+\b", text.lower())
    words = [w for w in words if w not in stopwords]
    counts = Counter(words).most_common(10)
    if not counts:
        return "No words found."
    return "\n".join(f"{w}: {c}" for w, c in counts)


def get_sentiment(text):
    """Return TextBlob polarity/subjectivity formatted for display."""
    sentiment = TextBlob(text).sentiment
    return (
        f"Polarity: {sentiment.polarity:.3f}\n"
        f"Subjectivity: {sentiment.subjectivity:.3f}"
    )


def run_spacy_entities(doc):
    """Extract entities from a spaCy Doc as a list of {'text', 'label'} dicts."""
    return [{"text": ent.text, "label": ent.label_} for ent in doc.ents]


def run_deberta_batched(text):
    """Split text into sentences and batch them for faster NER.

    FIX: the original called the pipeline once per sentence inside a Python
    loop, so despite the docstring nothing was actually batched. Passing the
    whole sentence list in a single call lets the HF pipeline batch the
    forward passes internally.
    """
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    if not sentences:
        return []
    # One pipeline call with list input -> one result list per sentence.
    outputs = deberta_ner(sentences)
    results = []
    for sent_out in outputs:
        for r in sent_out:
            results.append({"text": r["word"], "label": r["entity_group"]})
    return results


def categorize_entities(spacy_ents, deberta_ents):
    """Merge spaCy and DeBERTa entities into a display string.

    spaCy labels PERSON/ORG/GPE/LOC and DeBERTa labels PER/ORG/LOC are mapped
    into People / Organizations / Countries-Locations buckets; anything else
    lands in Misc. Duplicates are removed via sets, and each section is
    sorted case-insensitively.
    """
    people = set()
    orgs = set()
    locations = set()
    misc = set()

    def norm(t):
        return t.strip()

    # spaCy mapping
    for ent in spacy_ents:
        text = norm(ent["text"])
        label = ent["label"]
        if label == "PERSON":
            people.add(text)
        elif label == "ORG":
            orgs.add(text)
        elif label in ("GPE", "LOC"):
            locations.add(text)
        else:
            misc.add(text)

    # DeBERTa mapping (CoNLL-style tags)
    for ent in deberta_ents:
        text = norm(ent["text"])
        label = ent["label"]
        if label == "PER":
            people.add(text)
        elif label == "ORG":
            orgs.add(text)
        elif label == "LOC":
            locations.add(text)
        else:
            misc.add(text)

    def fmt(title, items):
        if not items:
            return f"{title}:\n (none)"
        items = sorted(items, key=lambda x: x.lower())
        return f"{title}:\n - " + "\n - ".join(items)

    return "\n\n".join(
        [
            fmt("People", people),
            fmt("Organizations", orgs),
            fmt("Countries/Locations", locations),
            fmt("Misc", misc),
        ]
    )


# ---------------------------------------------------------
# MAIN ANALYSIS
# ---------------------------------------------------------


def analyze_text(text, mode):
    """Run frequency, sentiment, and NER analysis; returns a 3-tuple of strings.

    mode == "Fast" uses the small spaCy model only; any other mode runs the
    spaCy transformer plus batched DeBERTa NER.
    """
    text = clean_text(text)
    if not text:
        return ("No words found.", "No sentiment detected.", "No entities detected.")

    # Word frequency
    word_freq_str = get_word_freq(text)

    # Sentiment
    sentiment_str = get_sentiment(text)

    # Fast mode -> spaCy small only
    if mode == "Fast":
        doc = nlp_sm(text)
        spacy_ents = run_spacy_entities(doc)
        entities_str = categorize_entities(spacy_ents, [])
        return (word_freq_str, sentiment_str, entities_str)

    # Accurate mode -> spaCy transformer + DeBERTa (batched)
    doc = nlp_trf(text)
    spacy_ents = run_spacy_entities(doc)
    deberta_ents = run_deberta_batched(text)
    entities_str = categorize_entities(spacy_ents, deberta_ents)
    return (word_freq_str, sentiment_str, entities_str)


# ---------------------------------------------------------
# UI
# ---------------------------------------------------------

with gr.Blocks(title="🗳️ Text & Speech Analyzer") as demo:
    gr.Markdown("## 🗳️ Text & Speech Analyzer (Fast + Accurate Modes)")
    gr.Markdown(
        "Analyze political speeches, news, or press releases.\n\n"
        "**Fast Mode** → spaCy small (1–2 seconds)\n\n"
        "**Accurate Mode** → spaCy transformer + DeBERTa (8–12 seconds)"
    )

    mode = gr.Radio(["Fast", "Accurate"], value="Accurate", label="Choose Mode")

    input_box = gr.Textbox(
        lines=12,
        label="Paste text here",
        placeholder="Enter a speech, article, or paragraph...",
    )

    with gr.Tabs():
        with gr.Tab("Word Frequency"):
            out_words = gr.Textbox(lines=10, label="Most Common Words")
        with gr.Tab("Sentiment"):
            out_sent = gr.Textbox(lines=3, label="Sentiment")
        with gr.Tab("Entities"):
            out_ents = gr.Textbox(lines=10, label="Entities (People / Orgs / Locations)")

    analyze_btn = gr.Button("Analyze")

    analyze_btn.click(
        analyze_text,
        inputs=[input_box, mode],
        outputs=[out_words, out_sent, out_ents],
    )

# Guard the launch so importing this module (e.g. for tests) does not
# start a server; `python app.py` behaves exactly as before.
if __name__ == "__main__":
    demo.launch()