# Hugging Face Space app (status banner "Spaces: Sleeping" removed from code).
import re
from collections import Counter

import gradio as gr
import spacy
from textblob import TextBlob
from transformers import pipeline
# ---------------------------------------------------------
# LOAD MODELS
# ---------------------------------------------------------

# Accurate mode: spaCy transformer pipeline.
nlp_trf = spacy.load("en_core_web_trf")

# Fast mode: small spaCy pipeline.
nlp_sm = spacy.load("en_core_web_sm")

# DeBERTa token-classification model; only exercised on the Accurate path.
deberta_ner = pipeline(
    "token-classification",
    model="geckos/deberta-base-fine-tuned-ner",
    aggregation_strategy="simple",
)

# spaCy's default English stop-word set, shared by the frequency helper.
stopwords = nlp_trf.Defaults.stop_words
# ---------------------------------------------------------
# HELPERS
# ---------------------------------------------------------
def clean_text(text):
    """Normalize raw user input by trimming surrounding whitespace."""
    stripped = text.strip()
    return stripped
def get_word_freq(text, top_n=10, stop=None):
    """Report the most frequent non-stop-word tokens in *text*.

    Args:
        text: Raw input text; tokenized case-insensitively on ``\\b\\w+\\b``
            word boundaries.
        top_n: How many of the top entries to include (default 10, matching
            the original hard-coded behavior).
        stop: Optional iterable of stop words to filter out. Defaults to the
            module-level spaCy stop-word set.

    Returns:
        A newline-separated ``word: count`` report, or ``"No words found."``
        when nothing remains after filtering.
    """
    stop_set = stopwords if stop is None else set(stop)
    words = [w for w in re.findall(r"\b\w+\b", text.lower()) if w not in stop_set]
    counts = Counter(words).most_common(top_n)
    if not counts:
        return "No words found."
    return "\n".join(f"{w}: {c}" for w, c in counts)
def get_sentiment(text):
    """Format TextBlob polarity and subjectivity for *text* as display text."""
    scores = TextBlob(text).sentiment
    polarity = scores.polarity
    subjectivity = scores.subjectivity
    return f"Polarity: {polarity:.3f}\nSubjectivity: {subjectivity:.3f}"
def run_spacy_entities(doc):
    """Collect the named entities of a spaCy *doc* as plain dicts.

    Returns a list of ``{"text", "label"}`` dicts, one per entity span,
    in document order.
    """
    return [{"text": span.text, "label": span.label_} for span in doc.ents]
def run_deberta_batched(text):
    """Run DeBERTa NER over *text* with one pipeline call for all sentences.

    The original looped over sentences and invoked the pipeline once per
    sentence despite the "batched" name; passing the whole sentence list in a
    single call lets the HF pipeline batch tokenization/inference.

    Returns:
        A list of ``{"text", "label"}`` dicts, one per detected entity span.
    """
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    if not sentences:
        return []
    # A list input yields one result list per sentence.
    outputs = deberta_ner(sentences)
    results = []
    for sentence_entities in outputs:
        for r in sentence_entities:
            results.append({"text": r["word"], "label": r["entity_group"]})
    return results
def categorize_entities(spacy_ents, deberta_ents):
    """Merge spaCy and DeBERTa entities into a grouped, readable report.

    Entities from both sources are deduplicated per category and rendered
    as four titled sections (People / Organizations / Countries/Locations /
    Misc), each listing its items alphabetically or "(none)".
    """
    buckets = {
        "People": set(),
        "Organizations": set(),
        "Countries/Locations": set(),
        "Misc": set(),
    }

    # Label -> bucket tables; unknown labels fall through to Misc.
    spacy_map = {
        "PERSON": "People",
        "ORG": "Organizations",
        "GPE": "Countries/Locations",
        "LOC": "Countries/Locations",
    }
    deberta_map = {
        "PER": "People",
        "ORG": "Organizations",
        "LOC": "Countries/Locations",
    }

    for ents, mapping in ((spacy_ents, spacy_map), (deberta_ents, deberta_map)):
        for ent in ents:
            bucket = mapping.get(ent["label"], "Misc")
            buckets[bucket].add(ent["text"].strip())

    def render(title, names):
        # Render one titled section, case-insensitively sorted.
        if not names:
            return f"{title}:\n (none)"
        ordered = sorted(names, key=str.lower)
        return f"{title}:\n - " + "\n - ".join(ordered)

    return "\n\n".join(render(title, names) for title, names in buckets.items())
# ---------------------------------------------------------
# MAIN ANALYSIS
# ---------------------------------------------------------
def analyze_text(text, mode):
    """Run word-frequency, sentiment, and entity analysis on *text*.

    Args:
        text: Raw user input from the textbox.
        mode: "Fast" runs only the small spaCy model; any other value takes
            the Accurate path (spaCy transformer + batched DeBERTa NER).

    Returns:
        A ``(word_freq_str, sentiment_str, entities_str)`` tuple of display
        strings for the three output tabs.
    """
    text = clean_text(text)
    if not text:
        return ("No words found.", "No sentiment detected.", "No entities detected.")

    word_freq_str = get_word_freq(text)
    sentiment_str = get_sentiment(text)

    if mode == "Fast":
        # Fast path: one small-model pass, no DeBERTa.
        spacy_ents = run_spacy_entities(nlp_sm(text))
        deberta_ents = []
    else:
        # Accurate path: transformer pass plus DeBERTa over all sentences.
        spacy_ents = run_spacy_entities(nlp_trf(text))
        deberta_ents = run_deberta_batched(text)

    entities_str = categorize_entities(spacy_ents, deberta_ents)
    return (word_freq_str, sentiment_str, entities_str)
# ---------------------------------------------------------
# UI
# ---------------------------------------------------------
# NOTE(review): the original strings were mojibake ("π³οΈ", "β") — restored
# here to the intended emoji/dashes; confirm against the deployed Space.
with gr.Blocks(title="🗳️ Text & Speech Analyzer") as demo:
    gr.Markdown("## 🗳️ Text & Speech Analyzer (Fast + Accurate Modes)")
    gr.Markdown(
        "Analyze political speeches, news, or press releases.\n\n"
        "**Fast Mode** — spaCy small (1–2 seconds)\n\n"
        "**Accurate Mode** — spaCy transformer + DeBERTa (8–12 seconds)"
    )

    mode = gr.Radio(["Fast", "Accurate"], value="Accurate", label="Choose Mode")

    input_box = gr.Textbox(
        lines=12,
        label="Paste text here",
        placeholder="Enter a speech, article, or paragraph..."
    )

    with gr.Tabs():
        with gr.Tab("Word Frequency"):
            out_words = gr.Textbox(lines=10, label="Most Common Words")
        with gr.Tab("Sentiment"):
            out_sent = gr.Textbox(lines=3, label="Sentiment")
        with gr.Tab("Entities"):
            out_ents = gr.Textbox(lines=10, label="Entities (People / Orgs / Locations)")

    analyze_btn = gr.Button("Analyze")
    analyze_btn.click(
        analyze_text,
        inputs=[input_box, mode],
        outputs=[out_words, out_sent, out_ents]
    )

demo.launch()