Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,32 +7,34 @@ from textblob import TextBlob
|
|
| 7 |
from transformers import pipeline
|
| 8 |
|
| 9 |
# ---------------------------------------------------------
|
| 10 |
-
#
|
| 11 |
# ---------------------------------------------------------
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
|
| 16 |
-
#
|
|
|
|
|
|
|
|
|
|
| 17 |
deberta_ner = pipeline(
|
| 18 |
"token-classification",
|
| 19 |
model="geckos/deberta-base-fine-tuned-ner",
|
| 20 |
aggregation_strategy="simple"
|
| 21 |
)
|
| 22 |
|
| 23 |
-
|
| 24 |
-
stopwords = nlp.Defaults.stop_words
|
| 25 |
|
| 26 |
|
| 27 |
# ---------------------------------------------------------
|
| 28 |
-
#
|
| 29 |
# ---------------------------------------------------------
|
| 30 |
|
| 31 |
-
def clean_text(text
|
| 32 |
return text.strip()
|
| 33 |
|
| 34 |
|
| 35 |
-
def get_word_freq(text
|
| 36 |
words = re.findall(r"\b\w+\b", text.lower())
|
| 37 |
words = [w for w in words if w not in stopwords]
|
| 38 |
counts = Counter(words).most_common(10)
|
|
@@ -41,7 +43,7 @@ def get_word_freq(text: str) -> str:
|
|
| 41 |
return "\n".join(f"{w}: {c}" for w, c in counts)
|
| 42 |
|
| 43 |
|
| 44 |
-
def get_sentiment(text
|
| 45 |
sentiment = TextBlob(text).sentiment
|
| 46 |
return (
|
| 47 |
f"Polarity: {sentiment.polarity:.3f}\n"
|
|
@@ -49,46 +51,36 @@ def get_sentiment(text: str) -> str:
|
|
| 49 |
)
|
| 50 |
|
| 51 |
|
| 52 |
-
def run_spacy_entities(
|
| 53 |
-
doc = nlp(text)
|
| 54 |
ents = []
|
| 55 |
for ent in doc.ents:
|
| 56 |
ents.append({"text": ent.text, "label": ent.label_})
|
| 57 |
return ents
|
| 58 |
|
| 59 |
|
| 60 |
-
def
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
def categorize_entities(spacy_ents, deberta_ents) -> str:
|
| 69 |
-
"""
|
| 70 |
-
Merge entities from spaCy + DeBERTa into:
|
| 71 |
-
- People
|
| 72 |
-
- Organizations
|
| 73 |
-
- Countries/Locations
|
| 74 |
-
- Misc
|
| 75 |
-
"""
|
| 76 |
|
|
|
|
| 77 |
people = set()
|
| 78 |
orgs = set()
|
| 79 |
locations = set()
|
| 80 |
misc = set()
|
| 81 |
|
| 82 |
-
def norm(t):
|
| 83 |
-
return t.strip()
|
| 84 |
|
| 85 |
-
# -------------------------
|
| 86 |
# spaCy mapping
|
| 87 |
-
# -------------------------
|
| 88 |
for ent in spacy_ents:
|
| 89 |
text = norm(ent["text"])
|
| 90 |
label = ent["label"]
|
| 91 |
-
|
| 92 |
if label == "PERSON":
|
| 93 |
people.add(text)
|
| 94 |
elif label == "ORG":
|
|
@@ -98,13 +90,10 @@ def categorize_entities(spacy_ents, deberta_ents) -> str:
|
|
| 98 |
else:
|
| 99 |
misc.add(text)
|
| 100 |
|
| 101 |
-
#
|
| 102 |
-
# DeBERTa mapping (PER/ORG/LOC/MISC)
|
| 103 |
-
# -------------------------
|
| 104 |
for ent in deberta_ents:
|
| 105 |
text = norm(ent["text"])
|
| 106 |
label = ent["label"]
|
| 107 |
-
|
| 108 |
if label == "PER":
|
| 109 |
people.add(text)
|
| 110 |
elif label == "ORG":
|
|
@@ -114,38 +103,28 @@ def categorize_entities(spacy_ents, deberta_ents) -> str:
|
|
| 114 |
else:
|
| 115 |
misc.add(text)
|
| 116 |
|
| 117 |
-
# -------------------------
|
| 118 |
-
# Format output
|
| 119 |
-
# -------------------------
|
| 120 |
def fmt(title, items):
|
| 121 |
if not items:
|
| 122 |
return f"{title}:\n (none)"
|
| 123 |
items = sorted(items, key=lambda x: x.lower())
|
| 124 |
return f"{title}:\n - " + "\n - ".join(items)
|
| 125 |
|
| 126 |
-
|
| 127 |
fmt("People", people),
|
| 128 |
fmt("Organizations", orgs),
|
| 129 |
fmt("Countries/Locations", locations),
|
| 130 |
fmt("Misc", misc),
|
| 131 |
-
]
|
| 132 |
-
|
| 133 |
-
return "\n\n".join(sections)
|
| 134 |
|
| 135 |
|
| 136 |
# ---------------------------------------------------------
|
| 137 |
-
# MAIN ANALYSIS
|
| 138 |
# ---------------------------------------------------------
|
| 139 |
|
| 140 |
-
def analyze_text(text
|
| 141 |
text = clean_text(text)
|
| 142 |
if not text:
|
| 143 |
-
return (
|
| 144 |
-
"No words found.",
|
| 145 |
-
"No sentiment detected.",
|
| 146 |
-
"No entities detected.",
|
| 147 |
-
"Please enter some text."
|
| 148 |
-
)
|
| 149 |
|
| 150 |
# Word frequency
|
| 151 |
word_freq_str = get_word_freq(text)
|
|
@@ -153,46 +132,59 @@ def analyze_text(text: str):
|
|
| 153 |
# Sentiment
|
| 154 |
sentiment_str = get_sentiment(text)
|
| 155 |
|
| 156 |
-
#
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
entities_str = categorize_entities(spacy_ents, deberta_ents)
|
| 168 |
|
| 169 |
-
return (
|
| 170 |
-
word_freq_str,
|
| 171 |
-
sentiment_str,
|
| 172 |
-
entities_str,
|
| 173 |
-
"Analysis complete."
|
| 174 |
-
)
|
| 175 |
|
| 176 |
|
| 177 |
# ---------------------------------------------------------
|
| 178 |
-
#
|
| 179 |
# ---------------------------------------------------------
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
|
|
|
|
|
|
| 194 |
)
|
| 195 |
-
)
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
demo.launch()
|
|
|
|
| 7 |
from transformers import pipeline
|
| 8 |
|
| 9 |
# ---------------------------------------------------------
# LOAD MODELS
# ---------------------------------------------------------

# Accurate mode model (spaCy transformer) — slower to load and run,
# higher-quality entities.
nlp_trf = spacy.load("en_core_web_trf")

# Fast mode model (spaCy small) — quick, lower-accuracy entities.
nlp_sm = spacy.load("en_core_web_sm")

# DeBERTa NER pipeline (used only in Accurate mode).
# aggregation_strategy="simple" merges sub-word tokens into whole-entity
# spans with an "entity_group" label (PER/ORG/LOC/MISC).
deberta_ner = pipeline(
    "token-classification",
    model="geckos/deberta-base-fine-tuned-ner",
    aggregation_strategy="simple",
)

# Stop-word set for word-frequency filtering; spaCy's defaults are the
# same for both pipelines, so take them from either one.
stopwords = nlp_trf.Defaults.stop_words
|
|
|
| 27 |
|
| 28 |
|
| 29 |
# ---------------------------------------------------------
# HELPERS
# ---------------------------------------------------------

def clean_text(text):
    """Strip surrounding whitespace from *text*; treat None as empty.

    A Gradio Textbox value can arrive as None (e.g. a cleared component),
    and the original ``text.strip()`` would raise AttributeError in that
    case.  Returning "" instead lets analyze_text() take its normal
    "no input" path.
    """
    return text.strip() if text else ""
| 35 |
|
| 36 |
|
| 37 |
+
def get_word_freq(text):
|
| 38 |
words = re.findall(r"\b\w+\b", text.lower())
|
| 39 |
words = [w for w in words if w not in stopwords]
|
| 40 |
counts = Counter(words).most_common(10)
|
|
|
|
| 43 |
return "\n".join(f"{w}: {c}" for w, c in counts)
|
| 44 |
|
| 45 |
|
| 46 |
+
def get_sentiment(text):
|
| 47 |
sentiment = TextBlob(text).sentiment
|
| 48 |
return (
|
| 49 |
f"Polarity: {sentiment.polarity:.3f}\n"
|
|
|
|
| 51 |
)
|
| 52 |
|
| 53 |
|
| 54 |
+
def run_spacy_entities(doc):
    """Return the entities of a spaCy ``Doc`` as ``{"text", "label"}`` dicts.

    Order follows ``doc.ents`` (document order); one dict per entity span.
    """
    return [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
|
| 59 |
|
| 60 |
|
| 61 |
+
def run_deberta_batched(text):
    """Split *text* into sentences and run DeBERTa NER on all of them at once.

    The original looped and called the pipeline once per sentence, so
    despite its name nothing was actually batched.  Passing the whole
    sentence list in a single call lets the pipeline batch the forward
    passes, which is the speed-up the docstring promised.

    Returns a flat list of ``{"text", "label"}`` dicts, where ``label`` is
    the aggregated entity group (PER/ORG/LOC/MISC).
    """
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    if not sentences:
        # The pipeline rejects an empty input list; the original returned
        # [] here (empty loop), so preserve that.
        return []
    results = []
    # With a list input the pipeline returns one result list per sentence.
    for sentence_entities in deberta_ner(sentences):
        for ent in sentence_entities:
            results.append({"text": ent["word"], "label": ent["entity_group"]})
    return results
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
+
def categorize_entities(spacy_ents, deberta_ents):
|
| 73 |
people = set()
|
| 74 |
orgs = set()
|
| 75 |
locations = set()
|
| 76 |
misc = set()
|
| 77 |
|
| 78 |
+
def norm(t): return t.strip()
|
|
|
|
| 79 |
|
|
|
|
| 80 |
# spaCy mapping
|
|
|
|
| 81 |
for ent in spacy_ents:
|
| 82 |
text = norm(ent["text"])
|
| 83 |
label = ent["label"]
|
|
|
|
| 84 |
if label == "PERSON":
|
| 85 |
people.add(text)
|
| 86 |
elif label == "ORG":
|
|
|
|
| 90 |
else:
|
| 91 |
misc.add(text)
|
| 92 |
|
| 93 |
+
# DeBERTa mapping
|
|
|
|
|
|
|
| 94 |
for ent in deberta_ents:
|
| 95 |
text = norm(ent["text"])
|
| 96 |
label = ent["label"]
|
|
|
|
| 97 |
if label == "PER":
|
| 98 |
people.add(text)
|
| 99 |
elif label == "ORG":
|
|
|
|
| 103 |
else:
|
| 104 |
misc.add(text)
|
| 105 |
|
|
|
|
|
|
|
|
|
|
| 106 |
def fmt(title, items):
|
| 107 |
if not items:
|
| 108 |
return f"{title}:\n (none)"
|
| 109 |
items = sorted(items, key=lambda x: x.lower())
|
| 110 |
return f"{title}:\n - " + "\n - ".join(items)
|
| 111 |
|
| 112 |
+
return "\n\n".join([
|
| 113 |
fmt("People", people),
|
| 114 |
fmt("Organizations", orgs),
|
| 115 |
fmt("Countries/Locations", locations),
|
| 116 |
fmt("Misc", misc),
|
| 117 |
+
])
|
|
|
|
|
|
|
| 118 |
|
| 119 |
|
| 120 |
# ---------------------------------------------------------
# MAIN ANALYSIS
# ---------------------------------------------------------

def analyze_text(text, mode):
    """Run the full analysis pipeline on *text*.

    mode: "Fast" uses the small spaCy model only; any other value (the UI
    sends "Accurate") uses the spaCy transformer model plus batched
    DeBERTa NER.

    Returns a 3-tuple: (word_freq_str, sentiment_str, entities_str).

    The original duplicated the entity-extraction and return logic across
    the two mode branches; here only the model choice differs and the
    rest of the path is shared.
    """
    text = clean_text(text)
    if not text:
        return ("No words found.", "No sentiment detected.", "No entities detected.")

    # Word frequency
    word_freq_str = get_word_freq(text)

    # Sentiment
    sentiment_str = get_sentiment(text)

    # Entity extraction: pick the models per mode, then share the rest.
    if mode == "Fast":
        doc = nlp_sm(text)
        deberta_ents = []          # Fast mode skips DeBERTa entirely.
    else:
        doc = nlp_trf(text)
        deberta_ents = run_deberta_batched(text)

    spacy_ents = run_spacy_entities(doc)
    entities_str = categorize_entities(spacy_ents, deberta_ents)

    return (word_freq_str, sentiment_str, entities_str)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
|
| 151 |
# ---------------------------------------------------------
# UI
# ---------------------------------------------------------

with gr.Blocks(title="🗳️ Text & Speech Analyzer") as demo:

    # Header and mode explanation.
    gr.Markdown("## 🗳️ Text & Speech Analyzer (Fast + Accurate Modes)")
    gr.Markdown(
        "Analyze political speeches, news, or press releases.\n\n"
        "**Fast Mode** → spaCy small (1–2 seconds)\n\n"
        "**Accurate Mode** → spaCy transformer + DeBERTa (8–12 seconds)"
    )

    # Mode selector — defaults to the slower, higher-quality pipeline.
    mode = gr.Radio(["Fast", "Accurate"], value="Accurate", label="Choose Mode")

    input_box = gr.Textbox(
        lines=12,
        label="Paste text here",
        placeholder="Enter a speech, article, or paragraph...",
    )

    # One tab per result pane; analyze_text returns values in this order.
    with gr.Tabs():
        with gr.Tab("Word Frequency"):
            out_words = gr.Textbox(lines=10, label="Most Common Words")

        with gr.Tab("Sentiment"):
            out_sent = gr.Textbox(lines=3, label="Sentiment")

        with gr.Tab("Entities"):
            out_ents = gr.Textbox(lines=10, label="Entities (People / Orgs / Locations)")

    analyze_btn = gr.Button("Analyze")

    # Wire the button to the analysis function.
    analyze_btn.click(
        analyze_text,
        inputs=[input_box, mode],
        outputs=[out_words, out_sent, out_ents],
    )

demo.launch()
|