ad180 committed on
Commit
3dd79a1
·
verified ·
1 Parent(s): c9fff8b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -83
app.py CHANGED
@@ -7,32 +7,34 @@ from textblob import TextBlob
7
  from transformers import pipeline
8
 
9
  # ---------------------------------------------------------
10
- # MODEL LOADING
11
  # ---------------------------------------------------------
12
 
13
- # 1) spaCy transformer model
14
- nlp = spacy.load("en_core_web_trf")
15
 
16
- # 2) DeBERTa fallback NER model (general-purpose PER/ORG/LOC/MISC)
 
 
 
17
  deberta_ner = pipeline(
18
  "token-classification",
19
  model="geckos/deberta-base-fine-tuned-ner",
20
  aggregation_strategy="simple"
21
  )
22
 
23
- # spaCy stopwords
24
- stopwords = nlp.Defaults.stop_words
25
 
26
 
27
  # ---------------------------------------------------------
28
- # HELPER FUNCTIONS
29
  # ---------------------------------------------------------
30
 
31
- def clean_text(text: str) -> str:
32
  return text.strip()
33
 
34
 
35
- def get_word_freq(text: str) -> str:
36
  words = re.findall(r"\b\w+\b", text.lower())
37
  words = [w for w in words if w not in stopwords]
38
  counts = Counter(words).most_common(10)
@@ -41,7 +43,7 @@ def get_word_freq(text: str) -> str:
41
  return "\n".join(f"{w}: {c}" for w, c in counts)
42
 
43
 
44
- def get_sentiment(text: str) -> str:
45
  sentiment = TextBlob(text).sentiment
46
  return (
47
  f"Polarity: {sentiment.polarity:.3f}\n"
@@ -49,46 +51,36 @@ def get_sentiment(text: str) -> str:
49
  )
50
 
51
 
52
- def run_spacy_entities(text: str):
53
- doc = nlp(text)
54
  ents = []
55
  for ent in doc.ents:
56
  ents.append({"text": ent.text, "label": ent.label_})
57
  return ents
58
 
59
 
60
- def run_deberta_entities(text: str):
61
- results = deberta_ner(text)
62
- ents = []
63
- for r in results:
64
- ents.append({"text": r["word"], "label": r["entity_group"]})
65
- return ents
66
-
 
 
67
 
68
- def categorize_entities(spacy_ents, deberta_ents) -> str:
69
- """
70
- Merge entities from spaCy + DeBERTa into:
71
- - People
72
- - Organizations
73
- - Countries/Locations
74
- - Misc
75
- """
76
 
 
77
  people = set()
78
  orgs = set()
79
  locations = set()
80
  misc = set()
81
 
82
- def norm(t):
83
- return t.strip()
84
 
85
- # -------------------------
86
  # spaCy mapping
87
- # -------------------------
88
  for ent in spacy_ents:
89
  text = norm(ent["text"])
90
  label = ent["label"]
91
-
92
  if label == "PERSON":
93
  people.add(text)
94
  elif label == "ORG":
@@ -98,13 +90,10 @@ def categorize_entities(spacy_ents, deberta_ents) -> str:
98
  else:
99
  misc.add(text)
100
 
101
- # -------------------------
102
- # DeBERTa mapping (PER/ORG/LOC/MISC)
103
- # -------------------------
104
  for ent in deberta_ents:
105
  text = norm(ent["text"])
106
  label = ent["label"]
107
-
108
  if label == "PER":
109
  people.add(text)
110
  elif label == "ORG":
@@ -114,38 +103,28 @@ def categorize_entities(spacy_ents, deberta_ents) -> str:
114
  else:
115
  misc.add(text)
116
 
117
- # -------------------------
118
- # Format output
119
- # -------------------------
120
  def fmt(title, items):
121
  if not items:
122
  return f"{title}:\n (none)"
123
  items = sorted(items, key=lambda x: x.lower())
124
  return f"{title}:\n - " + "\n - ".join(items)
125
 
126
- sections = [
127
  fmt("People", people),
128
  fmt("Organizations", orgs),
129
  fmt("Countries/Locations", locations),
130
  fmt("Misc", misc),
131
- ]
132
-
133
- return "\n\n".join(sections)
134
 
135
 
136
  # ---------------------------------------------------------
137
- # MAIN ANALYSIS FUNCTION
138
  # ---------------------------------------------------------
139
 
140
- def analyze_text(text: str):
141
  text = clean_text(text)
142
  if not text:
143
- return (
144
- "No words found.",
145
- "No sentiment detected.",
146
- "No entities detected.",
147
- "Please enter some text."
148
- )
149
 
150
  # Word frequency
151
  word_freq_str = get_word_freq(text)
@@ -153,46 +132,59 @@ def analyze_text(text: str):
153
  # Sentiment
154
  sentiment_str = get_sentiment(text)
155
 
156
- # Entities from spaCy + DeBERTa
157
- try:
158
- spacy_ents = run_spacy_entities(text)
159
- except Exception:
160
- spacy_ents = []
161
-
162
- try:
163
- deberta_ents = run_deberta_entities(text)
164
- except Exception:
165
- deberta_ents = []
166
-
167
  entities_str = categorize_entities(spacy_ents, deberta_ents)
168
 
169
- return (
170
- word_freq_str,
171
- sentiment_str,
172
- entities_str,
173
- "Analysis complete."
174
- )
175
 
176
 
177
  # ---------------------------------------------------------
178
- # GRADIO UI
179
  # ---------------------------------------------------------
180
 
181
- demo = gr.Interface(
182
- fn=analyze_text,
183
- inputs=gr.Textbox(lines=10, label="Paste a speech, news article, or press release"),
184
- outputs=[
185
- gr.Textbox(lines=12, label="Most Common Words"),
186
- gr.Textbox(lines=12, label="Sentiment"),
187
- gr.Textbox(lines=20, label="Entities (People / Orgs / Locations)"),
188
- gr.Textbox(lines=3, label="Status")
189
- ],
190
- title="🗳️ Text & Speech Analyzer (spaCy + DeBERTa)",
191
- description=(
192
- "Analyze political speeches, news, or press releases.\n"
193
- "NER powered by spaCy transformer + DeBERTa."
 
 
194
  )
195
- )
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  demo.launch()
 
7
  from transformers import pipeline
8
 
9
# ---------------------------------------------------------
# LOAD MODELS
# ---------------------------------------------------------

# Accurate-mode NER model (spaCy transformer pipeline).
nlp_trf = spacy.load("en_core_web_trf")

# Fast-mode NER model (small spaCy pipeline; much quicker, less accurate).
nlp_sm = spacy.load("en_core_web_sm")

# DeBERTa NER pipeline (used only in Accurate mode).
# aggregation_strategy="simple" merges word-piece tokens into whole-entity
# spans, so results carry "word" and "entity_group" keys.
deberta_ner = pipeline(
    "token-classification",
    model="geckos/deberta-base-fine-tuned-ner",
    aggregation_strategy="simple"
)

# Stop-word set used by the word-frequency helper; taken from the
# transformer model's defaults (same default list as the small model).
stopwords = nlp_trf.Defaults.stop_words
 
27
 
28
 
29
  # ---------------------------------------------------------
30
+ # HELPERS
31
  # ---------------------------------------------------------
32
 
33
def clean_text(text):
    """Return *text* with leading and trailing whitespace removed."""
    trimmed = text.strip()
    return trimmed
35
 
36
 
37
+ def get_word_freq(text):
38
  words = re.findall(r"\b\w+\b", text.lower())
39
  words = [w for w in words if w not in stopwords]
40
  counts = Counter(words).most_common(10)
 
43
  return "\n".join(f"{w}: {c}" for w, c in counts)
44
 
45
 
46
+ def get_sentiment(text):
47
  sentiment = TextBlob(text).sentiment
48
  return (
49
  f"Polarity: {sentiment.polarity:.3f}\n"
 
51
  )
52
 
53
 
54
def run_spacy_entities(doc):
    """Extract entities from a processed spaCy Doc.

    Returns a list of {"text": ..., "label": ...} dicts, one per entity
    span, in document order.
    """
    return [{"text": span.text, "label": span.label_} for span in doc.ents]
59
 
60
 
61
def run_deberta_batched(text):
    """Run DeBERTa NER over *text*, split into sentences, in one batched call.

    The previous implementation looped over sentences and invoked the
    pipeline once per sentence, so nothing was actually batched despite the
    docstring. Passing the whole sentence list to the HF pipeline in a
    single call lets it batch the forward passes.

    Returns a list of {"text": surface form, "label": entity group} dicts,
    in sentence order.
    """
    # Naive sentence split on sentence-final punctuation followed by whitespace.
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    if not sentences:
        return []  # guard: nothing to classify (pipeline rejects empty input)

    # One call with a list input -> one result list per sentence.
    per_sentence = deberta_ner(sentences)
    results = []
    for sentence_ents in per_sentence:
        for r in sentence_ents:
            results.append({"text": r["word"], "label": r["entity_group"]})
    return results
70
 
 
 
 
 
 
 
 
 
71
 
72
+ def categorize_entities(spacy_ents, deberta_ents):
73
  people = set()
74
  orgs = set()
75
  locations = set()
76
  misc = set()
77
 
78
+ def norm(t): return t.strip()
 
79
 
 
80
  # spaCy mapping
 
81
  for ent in spacy_ents:
82
  text = norm(ent["text"])
83
  label = ent["label"]
 
84
  if label == "PERSON":
85
  people.add(text)
86
  elif label == "ORG":
 
90
  else:
91
  misc.add(text)
92
 
93
+ # DeBERTa mapping
 
 
94
  for ent in deberta_ents:
95
  text = norm(ent["text"])
96
  label = ent["label"]
 
97
  if label == "PER":
98
  people.add(text)
99
  elif label == "ORG":
 
103
  else:
104
  misc.add(text)
105
 
 
 
 
106
  def fmt(title, items):
107
  if not items:
108
  return f"{title}:\n (none)"
109
  items = sorted(items, key=lambda x: x.lower())
110
  return f"{title}:\n - " + "\n - ".join(items)
111
 
112
+ return "\n\n".join([
113
  fmt("People", people),
114
  fmt("Organizations", orgs),
115
  fmt("Countries/Locations", locations),
116
  fmt("Misc", misc),
117
+ ])
 
 
118
 
119
 
120
  # ---------------------------------------------------------
121
+ # MAIN ANALYSIS
122
  # ---------------------------------------------------------
123
 
124
def analyze_text(text, mode):
    """Run the full analysis pipeline on *text*.

    Parameters:
        text: raw user input.
        mode: "Fast" -> spaCy small model only; any other value ->
              Accurate mode (spaCy transformer + DeBERTa).

    Returns:
        (word_freq_str, sentiment_str, entities_str) — three display strings.
    """
    text = clean_text(text)
    if not text:
        return ("No words found.", "No sentiment detected.", "No entities detected.")

    # Word frequency
    word_freq_str = get_word_freq(text)

    # Sentiment
    sentiment_str = get_sentiment(text)

    # Fast mode: spaCy small model only.
    if mode == "Fast":
        doc = nlp_sm(text)
        spacy_ents = run_spacy_entities(doc)
        entities_str = categorize_entities(spacy_ents, [])
        return (word_freq_str, sentiment_str, entities_str)

    # Accurate mode: spaCy transformer + DeBERTa (batched).
    # NER failures degrade to partial/empty entity lists instead of
    # crashing the UI handler (this guard existed before the refactor
    # and was dropped; restored here).
    try:
        doc = nlp_trf(text)
        spacy_ents = run_spacy_entities(doc)
    except Exception:
        spacy_ents = []

    try:
        deberta_ents = run_deberta_batched(text)
    except Exception:
        deberta_ents = []

    entities_str = categorize_entities(spacy_ents, deberta_ents)

    return (word_freq_str, sentiment_str, entities_str)
 
 
 
 
 
149
 
150
 
151
  # ---------------------------------------------------------
152
+ # UI
153
  # ---------------------------------------------------------
154
 
155
# Build the Gradio UI: mode selector, input box, tabbed outputs.
# Fix: the description string was inconsistent — "Fast Mode" had an arrow
# but "Accurate Mode" did not.
with gr.Blocks(title="🗳️ Text & Speech Analyzer") as demo:

    gr.Markdown("## 🗳️ Text & Speech Analyzer (Fast + Accurate Modes)")
    gr.Markdown(
        "Analyze political speeches, news, or press releases.\n\n"
        "**Fast Mode** → spaCy small (1–2 seconds)\n\n"
        "**Accurate Mode** → spaCy transformer + DeBERTa (8–12 seconds)"
    )

    mode = gr.Radio(["Fast", "Accurate"], value="Accurate", label="Choose Mode")

    input_box = gr.Textbox(
        lines=12,
        label="Paste text here",
        placeholder="Enter a speech, article, or paragraph..."
    )

    # One tab per analysis output.
    with gr.Tabs():
        with gr.Tab("Word Frequency"):
            out_words = gr.Textbox(lines=10, label="Most Common Words")

        with gr.Tab("Sentiment"):
            out_sent = gr.Textbox(lines=3, label="Sentiment")

        with gr.Tab("Entities"):
            out_ents = gr.Textbox(lines=10, label="Entities (People / Orgs / Locations)")

    analyze_btn = gr.Button("Analyze")

    # Wire the button to the analysis pipeline.
    analyze_btn.click(
        analyze_text,
        inputs=[input_box, mode],
        outputs=[out_words, out_sent, out_ents]
    )

demo.launch()