DivYonko commited on
Commit
e765d56
·
1 Parent(s): 9004cb2

Fix duplicate widget key - pages now import from shared.py not app.py

Browse files
Files changed (3) hide show
  1. pages/comments.py +2 -2
  2. pages/stats.py +2 -2
  3. shared.py +616 -0
pages/comments.py CHANGED
@@ -12,8 +12,8 @@ import time
12
  import sys
13
  import os
14
  sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
15
- from app import (
16
- store_lrange, store_llen, load_stream_data,
17
  clean_sentiment, clean_topic, csv_download,
18
  TOPIC_LABELS, TOPIC_COLOR, SENT_COLORS, STREAM_NAMES,
19
  )
 
12
  import sys
13
  import os
14
  sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
15
+ from shared import (
16
+ store_llen, load_stream_data,
17
  clean_sentiment, clean_topic, csv_download,
18
  TOPIC_LABELS, TOPIC_COLOR, SENT_COLORS, STREAM_NAMES,
19
  )
pages/stats.py CHANGED
@@ -13,8 +13,8 @@ import time
13
  import sys
14
  import os
15
  sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
16
- from app import (
17
- store_lrange, store_llen, load_stream_data,
18
  clean_sentiment, clean_topic, csv_download, plotly_layout,
19
  compute_velocity, build_heatmap_data, check_alert, compute_engagement,
20
  compute_top_contributors, compute_word_freq, check_spam_alert, detect_repeat_spammers,
 
13
  import sys
14
  import os
15
  sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
16
+ from shared import (
17
+ store_llen, load_stream_data,
18
  clean_sentiment, clean_topic, csv_download, plotly_layout,
19
  compute_velocity, build_heatmap_data, check_alert, compute_engagement,
20
  compute_top_contributors, compute_word_freq, check_spam_alert, detect_repeat_spammers,
shared.py ADDED
@@ -0,0 +1,616 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # shared.py
2
+ # Pure infrastructure, helpers, and analytics functions.
3
+ # No Streamlit UI rendering — safe to import from any page without
4
+ # triggering widget re-execution.
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ import logging
9
+ import os
10
+ import re
11
+ import sqlite3
12
+ import threading
13
+ from collections import defaultdict
14
+ from datetime import datetime
15
+
16
+ import pandas as pd
17
+ import streamlit as st
18
+
19
+ # ── ML imports ────────────────────────────────────────────────────────────────
20
+ from ml.sentiment_model import predict_sentiment
21
+ from ml.topic_model import predict_topic, VALID_TOPICS
22
+ from ml.action_type_model import predict_action_type, VALID_ACTION_TYPES
23
+
24
+ # ── SQLite store ──────────────────────────────────────────────────────────────
25
+ DB_PATH = "/tmp/livepulse.db"
26
+ MAX_STORE_MESSAGES = 100_000
27
+
28
+ _DB_LOCK = threading.Lock()
29
+ _META: dict[str, str] = {}
30
+
31
+ _SCRAPER_THREADS: dict[str, threading.Thread] = {}
32
+ _SCRAPER_STOP: dict[str, threading.Event] = {}
33
+
34
+
35
+ def _get_db() -> sqlite3.Connection:
36
+ conn = sqlite3.connect(DB_PATH, check_same_thread=False)
37
+ conn.execute("""
38
+ CREATE TABLE IF NOT EXISTS messages (
39
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
40
+ key TEXT NOT NULL,
41
+ value TEXT NOT NULL
42
+ )
43
+ """)
44
+ conn.execute("CREATE INDEX IF NOT EXISTS idx_key ON messages(key)")
45
+ conn.commit()
46
+ return conn
47
+
48
+
49
+ _db_conn = _get_db()
50
+
51
+
52
+ def store_lrange(key: str, start: int, end: int) -> list[str]:
53
+ with _DB_LOCK:
54
+ rows = _db_conn.execute(
55
+ "SELECT value FROM messages WHERE key=? ORDER BY id ASC", (key,)
56
+ ).fetchall()
57
+ values = [r[0] for r in rows]
58
+ n = len(values)
59
+ if n == 0:
60
+ return []
61
+ if start < 0:
62
+ start = max(n + start, 0)
63
+ if end < 0:
64
+ end = n + end
65
+ end = min(end, n - 1)
66
+ if start > end:
67
+ return []
68
+ return values[start: end + 1]
69
+
70
+
71
+ def store_llen(key: str) -> int:
72
+ with _DB_LOCK:
73
+ row = _db_conn.execute(
74
+ "SELECT COUNT(*) FROM messages WHERE key=?", (key,)
75
+ ).fetchone()
76
+ return row[0] if row else 0
77
+
78
+
79
+ def store_delete(key: str) -> None:
80
+ with _DB_LOCK:
81
+ _db_conn.execute("DELETE FROM messages WHERE key=?", (key,))
82
+ _db_conn.commit()
83
+
84
+
85
+ def store_rpush(key: str, value: str) -> None:
86
+ with _DB_LOCK:
87
+ _db_conn.execute(
88
+ "INSERT INTO messages (key, value) VALUES (?, ?)", (key, value)
89
+ )
90
+ _db_conn.execute("""
91
+ DELETE FROM messages WHERE key=? AND id NOT IN (
92
+ SELECT id FROM messages WHERE key=? ORDER BY id DESC LIMIT ?
93
+ )
94
+ """, (key, key, MAX_STORE_MESSAGES))
95
+ _db_conn.commit()
96
+
97
+
98
+ # ── Config ────────────────────────────────────────────────────────────────────
99
+ VIDEO_ID = os.getenv("VIDEO_ID", "")
100
+
101
+ # ── Logging ───────────────────────────────────────────────────────────────────
102
+ logging.basicConfig(
103
+ level=logging.INFO,
104
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
105
+ force=True,
106
+ )
107
+ logger = logging.getLogger("app.scraper")
108
+
109
+ # ── Constants ─────────────────────────────────────────────────────────────────
110
+ MAX_STREAMS = 5
111
+ STREAM_COLORS = ["#7c3aed", "#10b981", "#f59e0b", "#3b82f6", "#ec4899"]
112
+ STREAM_NAMES = ["A", "B", "C", "D", "E"]
113
+
114
+ TOPIC_LABELS = ["Appreciation", "Question", "Request/Feedback", "Promo", "Spam", "General", "MCQ Answer"]
115
+ TOPIC_COLOR = {
116
+ "Appreciation": "#f59e0b", "Question": "#3b82f6",
117
+ "Request/Feedback": "#8b5cf6",
118
+ "Promo": "#ec4899", "Spam": "#ef4444", "General": "#6b7280",
119
+ "MCQ Answer": "#10b981",
120
+ }
121
+ SENT_COLORS = {"Positive": "#22c55e", "Neutral": "#eab308", "Negative": "#ef4444"}
122
+
123
+ # ── Scraper helpers ───────────────────────────────────────────────────────────
124
+
125
+ def _safe_sentiment(text: str):
126
+ try:
127
+ return predict_sentiment(text)
128
+ except Exception as exc:
129
+ logger.error("predict_sentiment failed: %s", exc)
130
+ return "Neutral", 0.50
131
+
132
+
133
+ def _safe_topic(text: str):
134
+ try:
135
+ topic, conf = predict_topic(text)
136
+ if topic not in VALID_TOPICS:
137
+ return "General", 0.50
138
+ return topic, conf
139
+ except Exception as exc:
140
+ logger.error("predict_topic failed: %s", exc)
141
+ return "General", 0.50
142
+
143
+
144
+ def _safe_action_type(text: str):
145
+ try:
146
+ action_type, conf = predict_action_type(text)
147
+ if action_type not in VALID_ACTION_TYPES:
148
+ return "N/A", 0.50
149
+ return action_type, conf
150
+ except Exception as exc:
151
+ logger.error("predict_action_type failed: %s", exc)
152
+ return "N/A", 0.50
153
+
154
+
155
+ def _get_live_chat_id(video_id: str, api_key: str) -> str | None:
156
+ import urllib.request
157
+ import urllib.parse
158
+ import urllib.error
159
+
160
+ url = (
161
+ "https://www.googleapis.com/youtube/v3/videos"
162
+ f"?part=liveStreamingDetails&id={urllib.parse.quote(video_id)}&key={api_key}"
163
+ )
164
+ try:
165
+ with urllib.request.urlopen(url, timeout=10) as resp:
166
+ data = json.loads(resp.read())
167
+ items = data.get("items", [])
168
+ if not items:
169
+ logger.error("No video found for id=%s", video_id)
170
+ return None
171
+ live_details = items[0].get("liveStreamingDetails", {})
172
+ chat_id = live_details.get("activeLiveChatId")
173
+ if not chat_id:
174
+ logger.error("No activeLiveChatId for video=%s", video_id)
175
+ return chat_id
176
+ except urllib.error.HTTPError as exc:
177
+ body = exc.read().decode("utf-8", errors="replace")[:500]
178
+ logger.error("HTTP %d from YouTube API for video=%s: %s", exc.code, video_id, body)
179
+ return None
180
+ except Exception as exc:
181
+ logger.error("Failed to get liveChatId: %s", exc)
182
+ return None
183
+
184
+
185
+ def _fetch_chat_messages(live_chat_id: str, api_key: str, page_token: str | None = None):
186
+ import urllib.request
187
+ import urllib.parse
188
+
189
+ params = {
190
+ "part": "snippet,authorDetails",
191
+ "liveChatId": live_chat_id,
192
+ "key": api_key,
193
+ "maxResults": "200",
194
+ }
195
+ if page_token:
196
+ params["pageToken"] = page_token
197
+
198
+ url = "https://www.googleapis.com/youtube/v3/liveChat/messages?" + urllib.parse.urlencode(params)
199
+ try:
200
+ with urllib.request.urlopen(url, timeout=10) as resp:
201
+ data = json.loads(resp.read())
202
+ messages = data.get("items", [])
203
+ next_token = data.get("nextPageToken")
204
+ poll_interval = data.get("pollingIntervalMillis", 5000)
205
+ return messages, next_token, poll_interval
206
+ except Exception as exc:
207
+ logger.error("Failed to fetch chat messages: %s", exc)
208
+ return [], None, 5000
209
+
210
+
211
+ def _scraper_thread_fn(video_id: str, redis_key: str, stop_event: threading.Event) -> None:
212
+ api_key = os.getenv("YOUTUBE_API_KEY", "")
213
+ if not api_key:
214
+ msg = "YOUTUBE_API_KEY env var not set. Cannot start scraper."
215
+ logger.error(msg)
216
+ _META["scraper_error"] = msg
217
+ return
218
+
219
+ _META.pop("scraper_error", None)
220
+ live_chat_id = _get_live_chat_id(video_id, api_key)
221
+ if not live_chat_id:
222
+ msg = f"No active live chat found for video '{video_id}'. Make sure the stream is currently LIVE."
223
+ logger.error(msg)
224
+ _META["scraper_error"] = msg
225
+ return
226
+
227
+ page_token = None
228
+ seen_ids: set = set()
229
+ is_first_page = True
230
+
231
+ while not stop_event.is_set():
232
+ messages, page_token, poll_ms = _fetch_chat_messages(live_chat_id, api_key, page_token)
233
+
234
+ new_msgs = []
235
+ for item in messages:
236
+ if stop_event.is_set():
237
+ break
238
+ msg_id = item.get("id", "")
239
+ if msg_id in seen_ids:
240
+ continue
241
+ seen_ids.add(msg_id)
242
+ snippet = item.get("snippet", {})
243
+ if snippet.get("type") != "textMessageEvent":
244
+ continue
245
+ text = snippet.get("displayMessage", "").strip()
246
+ import emoji as _emoji
247
+ text = _emoji.emojize(text, language="alias")
248
+ author = item.get("authorDetails", {}).get("displayName", "Unknown")
249
+ if not text:
250
+ continue
251
+ new_msgs.append((msg_id, text, author))
252
+
253
+ if is_first_page and new_msgs:
254
+ for _, text, author in new_msgs:
255
+ message_data = {
256
+ "author": author, "text": text,
257
+ "sentiment": "Neutral", "confidence": 0.5,
258
+ "topic": "General", "topic_conf": 0.5,
259
+ "action_type": "N/A", "action_type_conf": 0.5,
260
+ "time": datetime.now().isoformat(),
261
+ }
262
+ store_rpush(redis_key, json.dumps(message_data))
263
+ is_first_page = False
264
+ else:
265
+ for _, text, author in new_msgs:
266
+ if stop_event.is_set():
267
+ break
268
+ try:
269
+ sentiment, s_conf = _safe_sentiment(text)
270
+ topic, t_conf = _safe_topic(text)
271
+ action_type, at_conf = _safe_action_type(text)
272
+ except Exception as exc:
273
+ logger.error("ML inference failed: %s", exc)
274
+ sentiment, s_conf = "Neutral", 0.5
275
+ topic, t_conf = "General", 0.5
276
+ action_type, at_conf = "N/A", 0.5
277
+
278
+ message_data = {
279
+ "author": author, "text": text,
280
+ "sentiment": sentiment, "confidence": round(s_conf, 3),
281
+ "topic": topic, "topic_conf": round(t_conf, 3),
282
+ "action_type": action_type, "action_type_conf": round(at_conf, 3),
283
+ "time": datetime.now().isoformat(),
284
+ }
285
+ store_rpush(redis_key, json.dumps(message_data))
286
+
287
+ if len(seen_ids) > 5000:
288
+ seen_ids = set(list(seen_ids)[-2000:])
289
+
290
+ wait_s = max(poll_ms / 1000, 3.0)
291
+ stop_event.wait(timeout=wait_s)
292
+
293
+
294
+ def start_scraper(slot_idx: int, video_id: str, redis_key: str) -> None:
295
+ key = str(slot_idx)
296
+ stop_scraper(slot_idx)
297
+ stop_event = threading.Event()
298
+ t = threading.Thread(
299
+ target=_scraper_thread_fn,
300
+ args=(video_id, redis_key, stop_event),
301
+ daemon=True,
302
+ name=f"scraper-{slot_idx}",
303
+ )
304
+ _SCRAPER_STOP[key] = stop_event
305
+ _SCRAPER_THREADS[key] = t
306
+ t.start()
307
+
308
+
309
+ def stop_scraper(slot_idx: int) -> None:
310
+ key = str(slot_idx)
311
+ ev = _SCRAPER_STOP.get(key)
312
+ if ev:
313
+ ev.set()
314
+
315
+
316
+ def is_scraper_running(slot_idx: int) -> bool:
317
+ key = str(slot_idx)
318
+ t = _SCRAPER_THREADS.get(key)
319
+ return t is not None and t.is_alive()
320
+
321
+
322
+ # ── UI helpers ────────────────────────────────────────────────────────────────
323
+
324
+ def extract_video_id(url_or_id: str) -> str:
325
+ url_or_id = url_or_id.strip()
326
+ match = re.search(r"(?:v=|/live/|youtu\.be/)([A-Za-z0-9_-]{11})", url_or_id)
327
+ if match:
328
+ return match.group(1)
329
+ if re.match(r"^[A-Za-z0-9_-]{11}$", url_or_id):
330
+ return url_or_id
331
+ return url_or_id
332
+
333
+
334
+ def fetch_video_title(video_id: str) -> str | None:
335
+ try:
336
+ import urllib.request
337
+ url = f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json"
338
+ with urllib.request.urlopen(url, timeout=5) as resp:
339
+ return json.loads(resp.read())["title"]
340
+ except Exception:
341
+ return None
342
+
343
+
344
+ def clean_topic(val) -> str:
345
+ if pd.isna(val) or str(val).strip() == "" or str(val).strip().lower() == "nan":
346
+ return "General"
347
+ return str(val).strip()
348
+
349
+
350
+ def clean_sentiment(val) -> str:
351
+ if str(val).strip() in ("Positive", "Negative", "Neutral"):
352
+ return str(val).strip()
353
+ return "Neutral"
354
+
355
+
356
+ def plotly_layout(height: int = 280) -> dict:
357
+ return dict(
358
+ paper_bgcolor="rgba(0,0,0,0)",
359
+ plot_bgcolor="rgba(0,0,0,0)",
360
+ height=height,
361
+ margin=dict(l=10, r=10, t=10, b=10),
362
+ font=dict(family="Space Grotesk"),
363
+ xaxis=dict(showgrid=False, zeroline=False, showline=False,
364
+ tickfont=dict(size=11), title=None),
365
+ yaxis=dict(showgrid=True, gridcolor="rgba(128,128,128,0.12)",
366
+ zeroline=False, showline=False, tickfont=dict(size=11), title=None),
367
+ showlegend=False,
368
+ hoverlabel=dict(font_family="Space Grotesk", font_size=12),
369
+ )
370
+
371
+
372
+ def csv_download(df_export, label: str, filename: str) -> None:
373
+ csv = df_export.to_csv(index=False).encode("utf-8")
374
+ st.download_button(label=f"\u2b07 {label}", data=csv,
375
+ file_name=filename, mime="text/csv", key=filename)
376
+
377
+
378
+ def load_stream_data(redis_key: str, limit: int | None = None) -> list[dict]:
379
+ if limit:
380
+ raws = store_lrange(redis_key, -limit, -1)
381
+ else:
382
+ raws = store_lrange(redis_key, 0, -1)
383
+ data = []
384
+ for raw in raws:
385
+ try:
386
+ data.append(json.loads(raw))
387
+ except Exception:
388
+ pass
389
+ return data
390
+
391
+
392
+ # ── Analytics (cached) ────────────────────────────────────────────────────────
393
+
394
+ @st.cache_data(ttl=10, show_spinner=False)
395
+ def compute_velocity(df_all_json: str, window: int = 20) -> dict:
396
+ import json as _json
397
+ sentiments = [m.get("sentiment", "Neutral") for m in _json.loads(df_all_json)]
398
+ n = len(sentiments)
399
+ if n < window * 2:
400
+ return {"direction": "\u2192", "delta": 0.0, "label": "Stable", "color": "#eab308"}
401
+ recent = sentiments[-window:]
402
+ prev = sentiments[-window*2:-window]
403
+ r_pos = sum(1 for s in recent if s == "Positive") / window
404
+ p_pos = sum(1 for s in prev if s == "Positive") / window
405
+ delta = r_pos - p_pos
406
+ if delta > 0.08:
407
+ return {"direction": "\u2191", "delta": delta, "label": "Rising", "color": "#22c55e"}
408
+ elif delta < -0.08:
409
+ return {"direction": "\u2193", "delta": delta, "label": "Falling", "color": "#ef4444"}
410
+ return {"direction": "\u2192", "delta": delta, "label": "Stable", "color": "#eab308"}
411
+
412
+
413
+ @st.cache_data(ttl=10, show_spinner=False)
414
+ def build_heatmap_data(df_all_json: str, bucket_minutes: int = 1) -> pd.DataFrame:
415
+ import json as _json
416
+ records = _json.loads(df_all_json)
417
+ if not records:
418
+ return pd.DataFrame()
419
+ df_t = pd.DataFrame(records)
420
+ if "time" not in df_t.columns:
421
+ return pd.DataFrame()
422
+ df_t["time"] = pd.to_datetime(df_t["time"], errors="coerce")
423
+ df_t = df_t.dropna(subset=["time"])
424
+ if df_t.empty:
425
+ return pd.DataFrame()
426
+ df_t["bucket"] = df_t["time"].dt.floor(f"{bucket_minutes}min")
427
+ grouped = df_t.groupby(["bucket", "sentiment"]).size().unstack(fill_value=0)
428
+ for col in ["Positive", "Neutral", "Negative"]:
429
+ if col not in grouped.columns:
430
+ grouped[col] = 0
431
+ grouped = grouped.reset_index()
432
+ grouped.columns.name = None
433
+ return grouped[["bucket", "Positive", "Neutral", "Negative"]]
434
+
435
+
436
+ def check_alert(df_all: pd.DataFrame, threshold: float = 0.4, window: int = 15) -> dict | None:
437
+ if len(df_all) < window:
438
+ return None
439
+ recent = df_all.iloc[-window:]
440
+ neg_ratio = (recent["sentiment"] == "Negative").mean()
441
+ if neg_ratio >= threshold:
442
+ return {
443
+ "neg_ratio": neg_ratio,
444
+ "count": int((recent["sentiment"] == "Negative").sum()),
445
+ "window": window,
446
+ }
447
+ return None
448
+
449
+
450
+ @st.cache_data(ttl=10, show_spinner=False)
451
+ def compute_engagement(all_data_json: str, window: int = 50) -> dict:
452
+ import json as _j
453
+ msgs = _j.loads(all_data_json)
454
+ if not msgs:
455
+ return {"score": 0, "rate": 0.0, "pos_ratio": 0.0, "q_density": 0.0, "grade": "\u2014"}
456
+ recent = msgs[-window:]
457
+ n = len(recent)
458
+ rate = 0.0
459
+ try:
460
+ t0 = datetime.fromisoformat(recent[0]["time"])
461
+ t1 = datetime.fromisoformat(recent[-1]["time"])
462
+ elapsed = max((t1 - t0).total_seconds() / 60, 0.1)
463
+ rate = round(n / elapsed, 1)
464
+ except Exception:
465
+ rate = float(n)
466
+ pos_ratio = sum(1 for m in recent if m.get("sentiment") == "Positive") / max(n, 1)
467
+ q_density = sum(1 for m in recent if m.get("topic") == "Question") / max(n, 1)
468
+ rate_norm = min(rate / 60, 1.0)
469
+ score = round((rate_norm * 0.4 + pos_ratio * 0.4 + q_density * 0.2) * 100)
470
+ if score >= 70: grade = "\U0001f525 High"
471
+ elif score >= 40: grade = "\u26a1 Medium"
472
+ else: grade = "\U0001f4a4 Low"
473
+ return {"score": score, "rate": rate, "pos_ratio": pos_ratio, "q_density": q_density, "grade": grade}
474
+
475
+
476
+ @st.cache_data(ttl=10, show_spinner=False)
477
+ def compute_top_contributors(all_data_json: str, top_n: int = 10) -> list[dict]:
478
+ import json as _j
479
+ msgs = _j.loads(all_data_json)
480
+ if not msgs:
481
+ return []
482
+ TOPICS = ["Appreciation", "Question", "Request/Feedback", "Promo", "Spam", "General", "MCQ Answer"]
483
+ author_data: dict[str, dict] = {}
484
+ for m in msgs:
485
+ a = m.get("author", "Unknown")
486
+ if a not in author_data:
487
+ author_data[a] = {"count": 0, "Positive": 0, "Neutral": 0, "Negative": 0,
488
+ **{t: 0 for t in TOPICS}}
489
+ author_data[a]["count"] += 1
490
+ s = m.get("sentiment", "Neutral")
491
+ if s in ("Positive", "Neutral", "Negative"):
492
+ author_data[a][s] += 1
493
+ t = m.get("topic", "General")
494
+ if t not in TOPICS:
495
+ t = "General"
496
+ author_data[a][t] += 1
497
+ sorted_authors = sorted(author_data.items(), key=lambda x: x[1]["count"], reverse=True)[:top_n]
498
+ result = []
499
+ for author, d in sorted_authors:
500
+ total = max(d["count"], 1)
501
+ result.append({
502
+ "author": author, "count": d["count"],
503
+ "pos_pct": round(d["Positive"] / total * 100),
504
+ "neu_pct": round(d["Neutral"] / total * 100),
505
+ "neg_pct": round(d["Negative"] / total * 100),
506
+ "t_appr": round(d["Appreciation"] / total * 100),
507
+ "t_ques": round(d["Question"] / total * 100),
508
+ "t_rf": round(d["Request/Feedback"] / total * 100),
509
+ "t_promo": round(d["Promo"] / total * 100),
510
+ "t_spam": round(d["Spam"] / total * 100),
511
+ "t_gen": round(d["General"] / total * 100),
512
+ "t_mcq": round(d["MCQ Answer"] / total * 100),
513
+ })
514
+ return result
515
+
516
+
517
+ @st.cache_data(ttl=10, show_spinner=False)
518
+ def compute_word_freq(all_data_json: str, sentiment_filter: str = "All",
519
+ topic_filter: str = "All", top_n: int = 60) -> list[tuple[str, int]]:
520
+ import json as _j
521
+ from collections import Counter
522
+ STOPWORDS = {
523
+ "the","a","an","is","it","in","on","at","to","of","and","or","but","for",
524
+ "with","this","that","are","was","be","as","by","from","have","has","had",
525
+ "not","no","so","if","do","did","will","can","just","i","you","he","she",
526
+ "we","they","my","your","his","her","our","their","me","him","us","them",
527
+ "what","how","why","when","where","who","which","there","here","been",
528
+ "would","could","should","may","might","shall","than","then","now","also",
529
+ "more","very","too","up","out","about","into","over","after","before",
530
+ "yaar","bhi","hai","hain","ho","kar","ke","ki","ka","ko","se","ne","ye",
531
+ "vo","woh","aur","nahi","nhi","toh","koi","kuch","ab","ek","hi",
532
+ }
533
+ msgs = _j.loads(all_data_json)
534
+ words: list[str] = []
535
+ for m in msgs:
536
+ if sentiment_filter != "All" and m.get("sentiment") != sentiment_filter:
537
+ continue
538
+ if topic_filter != "All" and m.get("topic") != topic_filter:
539
+ continue
540
+ text = re.sub(r"[^\w\s]", " ", m.get("text", "").lower())
541
+ for w in text.split():
542
+ if len(w) > 2 and w not in STOPWORDS and not w.isdigit():
543
+ words.append(w)
544
+ return Counter(words).most_common(top_n)
545
+
546
+
547
+ def check_spam_alert(df_all: pd.DataFrame, threshold: float = 0.3, window: int = 20) -> dict | None:
548
+ if "topic" not in df_all.columns or len(df_all) < window:
549
+ return None
550
+ recent = df_all.iloc[-window:]
551
+ spam_ratio = (recent["topic"] == "Spam").mean()
552
+ if spam_ratio >= threshold:
553
+ return {
554
+ "spam_ratio": spam_ratio,
555
+ "count": int((recent["topic"] == "Spam").sum()),
556
+ "window": window,
557
+ }
558
+ return None
559
+
560
+
561
+ @st.cache_data(ttl=10, show_spinner=False)
562
+ def detect_repeat_spammers(all_data_json: str, window_sec: int = 15, min_repeats: int = 2) -> list[dict]:
563
+ import json as _j
564
+ msgs = _j.loads(all_data_json)
565
+ if not msgs:
566
+ return []
567
+
568
+ def _normalize(t: str) -> str:
569
+ return re.sub(r"[^\w]", "", t.lower().strip())
570
+
571
+ bursts: dict[tuple, dict] = {}
572
+ for m in msgs:
573
+ author = m.get("author", "Unknown")
574
+ text = m.get("text", "").strip()
575
+ if not text:
576
+ continue
577
+ norm = _normalize(text)
578
+ if len(norm) < 4:
579
+ continue
580
+ ts_str = m.get("time", "")
581
+ try:
582
+ ts = datetime.fromisoformat(ts_str)
583
+ except Exception:
584
+ continue
585
+ key = (author, norm)
586
+ if key not in bursts:
587
+ bursts[key] = {
588
+ "author": author, "text": text,
589
+ "topic": m.get("topic", "General"),
590
+ "sentiment": m.get("sentiment", "Neutral"),
591
+ "timestamps": [],
592
+ }
593
+ bursts[key]["timestamps"].append(ts)
594
+
595
+ results = []
596
+ for key, burst in bursts.items():
597
+ times = sorted(burst["timestamps"])
598
+ max_in_window = 1
599
+ for i in range(len(times)):
600
+ count_in_window = sum(
601
+ 1 for t in times[i:]
602
+ if (t - times[i]).total_seconds() <= window_sec
603
+ )
604
+ max_in_window = max(max_in_window, count_in_window)
605
+ if max_in_window >= min_repeats:
606
+ results.append({
607
+ "author": burst["author"],
608
+ "text": burst["text"],
609
+ "topic": burst["topic"],
610
+ "sentiment": burst["sentiment"],
611
+ "count": len(times),
612
+ "max_burst": max_in_window,
613
+ "first_seen": times[0].strftime("%H:%M:%S"),
614
+ "last_seen": times[-1].strftime("%H:%M:%S"),
615
+ })
616
+ return sorted(results, key=lambda x: x["max_burst"], reverse=True)