rocky250 committed on
Commit
7a056a6
·
verified ·
1 Parent(s): 52e07d5

Update analyzer.py

Browse files
Files changed (1) hide show
  1. analyzer.py +777 -206
analyzer.py CHANGED
@@ -1,117 +1,697 @@
1
  """
2
- analyzer.py — Mental-health misinformation detection + sentiment analysis.
3
-
4
- Misinformation: lightweight rule-based 4-stream scorer (no external API needed).
5
- Sentiment: VADER (fast, CPU) or DistilBERT (accurate, downloads ~500 MB first run).
 
 
 
 
6
  """
7
 
8
  import re
 
 
 
 
9
  from collections import Counter
 
10
 
 
11
  import pandas as pd
12
 
 
13
 
 
 
 
14
 
15
- # MISINFORMATION DETECTION
 
 
 
 
16
 
 
 
17
 
18
- # Signals that raise the misinformation score
19
- _RED_FLAGS = [
20
- "miracle cure", "they don't want you to know", "big pharma", "doctors hide",
21
- "secret remedy", "ancient cure", "government censored", "fda lies", "fda lie",
22
- "conspiracy", "natural cure", "detox your brain", "toxins cause",
23
- "no medication needed", "stop taking meds", "heal yourself naturally",
24
- "100% effective", "guaranteed cure", "scientifically proven cure",
25
- "instant relief", "suppress the truth", "alternative medicine cures",
26
- "vaccines cause mental", "wifi causes", "5g causes", "chemtrails",
27
- "big pharma doesn't want", "they suppress", "hidden cure",
28
- "cure depression", "cure anxiety", "cure schizophrenia", "cure bipolar",
29
- "cure autism", "cure adhd", "detox cure",
30
- ]
31
 
32
- # Signals that reduce the misinformation score
33
- _CREDIBILITY = [
34
- "peer-reviewed", "clinical trial", "randomized controlled", "meta-analysis",
35
- "published in", "according to research", "study shows", "evidence suggests",
36
- "licensed therapist", "board-certified", "psychiatrist", "psychologist",
37
- "cognitive behavioral", "evidence-based", "treatment guidelines",
38
- "american psychological", "national institute", "who recommends",
39
- "systematic review", "consult your doctor", "speak to a professional",
40
- "mental health professional", "contact a therapist",
41
  ]
42
 
43
- # Clickbait / sensationalist language
44
- _CLICKBAIT = [
45
- "you won't believe", "shocking truth", "the truth about", "exposed",
46
- "they lied", "watch before deleted", "banned video", "censored truth",
47
- "must watch", "share before removed", "real truth", "wake up",
48
- "open your eyes", "mainstream media won't", "what they hide",
49
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
 
52
  def detect_misinformation(
53
  text: str,
54
- tags: list,
55
  audio_transcript: str = "",
56
  video_transcript: str = "",
57
- ) -> dict:
58
  """
59
- 4-stream scoring: title/desc, tags, transcript, credibility.
60
- Returns score 0–1 (higher = more likely misinformation).
 
 
 
 
 
 
61
  """
62
- combined = f"{text} {' '.join(tags)} {audio_transcript} {video_transcript}".lower()
63
- tags_lower = [t.lower() for t in tags]
64
-
65
- # Stream 1 title / description
66
- red_in_text = sum(1 for r in _RED_FLAGS if r in combined)
67
- click_in_text = sum(1 for c in _CLICKBAIT if c in combined)
68
- s1 = min((red_in_text * 0.18 + click_in_text * 0.12), 1.0)
69
-
70
- # Stream 2 tags
71
- red_in_tags = sum(1 for tag in tags_lower for r in _RED_FLAGS if r in tag)
72
- s2 = min(red_in_tags * 0.25, 1.0)
73
-
74
- # Stream 3 — transcript density
75
- word_count = max(len(combined.split()), 1)
76
- red_density = sum(1 for r in _RED_FLAGS if r in combined) / (word_count / 100)
77
- s3 = min(red_density * 0.15, 1.0)
78
-
79
- # Stream 4 — credibility deficit (absence of credible language = risk)
80
- cred_count = sum(1 for c in _CREDIBILITY if c in combined)
81
- s4 = max(0.0, 0.6 - cred_count * 0.12) # starts at 0.6, falls with credibility
82
-
83
- stream_details = {
84
- "Title & Description": round(s1, 3),
85
- "Tags": round(s2, 3),
86
- "Transcript": round(s3, 3),
87
- "Credibility Gap": round(s4, 3),
88
  }
89
 
90
- score = (s1 * 0.35 + s2 * 0.20 + s3 * 0.20 + s4 * 0.25)
91
- score = max(0.0, min(1.0, score))
92
-
93
- if score < 0.35:
94
- reasoning = (
95
- f"Content uses credible language ({cred_count} credibility markers found). "
96
- "No major misinformation signals detected in title, tags, or transcript."
97
- )
98
- elif score < 0.65:
99
- reasoning = (
100
- f"Mixed signals detected — {red_in_text} red-flag phrase(s) alongside "
101
- f"{cred_count} credibility indicator(s). Manual review recommended before sharing."
 
 
102
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  else:
104
- reasoning = (
105
- f"High misinformation risk. {red_in_text} red-flag phrase(s) and "
106
- f"{click_in_text} clickbait indicator(s) detected with low credibility language. "
107
- "Exercise significant caution."
108
  )
109
 
 
 
 
 
 
 
 
 
110
  return {
111
- "score": score,
112
- "confidence_pct": int(round(score * 100)),
113
- "reasoning": reasoning,
114
- "stream_details": stream_details,
 
 
 
 
 
 
 
115
  }
116
 
117
 
@@ -119,144 +699,135 @@ def detect_misinformation(
119
  # SENTIMENT ANALYSIS
120
 
121
 
122
- def analyze_sentiment_batch(
123
- texts: list,
124
- method: str = "vader",
125
- batch_size: int = 64,
126
- ) -> list[dict]:
127
- """Return list of {'label': str, 'compound': float, 'score': float}."""
128
- if not texts:
129
- return []
130
- if method == "hf":
131
- return _hf_sentiment(texts, batch_size=batch_size)
132
- return _vader_sentiment(texts)
133
-
134
-
135
- def _vader_sentiment(texts: list) -> list[dict]:
136
- try:
137
- from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
138
- sia = SentimentIntensityAnalyzer()
139
- results = []
140
- for text in texts:
141
- sc = sia.polarity_scores(str(text))
142
- c = sc["compound"]
143
- label = "POSITIVE" if c >= 0.05 else ("NEGATIVE" if c <= -0.05 else "NEUTRAL")
144
- results.append({"label": label, "compound": round(c, 4), "score": round(abs(c), 4)})
145
- return results
146
- except ImportError:
147
- return _simple_sentiment(texts)
148
- except Exception:
149
- return _simple_sentiment(texts)
150
-
151
-
152
- def _hf_sentiment(texts: list, batch_size: int = 32) -> list[dict]:
153
- try:
154
- from transformers import pipeline as hf_pipeline
155
- pipe = hf_pipeline(
156
  "sentiment-analysis",
157
  model="distilbert-base-uncased-finetuned-sst-2-english",
158
- truncation=True,
159
- max_length=512,
160
  )
161
- results = []
162
- for i in range(0, len(texts), batch_size):
163
- chunk = [str(t)[:512] for t in texts[i: i + batch_size]]
164
- out = pipe(chunk)
165
- for item in out:
166
- lbl = item["label"]
167
- sc = item["score"]
168
- compound = sc if lbl == "POSITIVE" else -sc
169
- results.append({"label": lbl, "compound": round(compound, 4), "score": round(sc, 4)})
170
- return results
171
- except Exception:
172
- return _vader_sentiment(texts)
173
-
174
-
175
- def _simple_sentiment(texts: list) -> list[dict]:
176
- """Zero-dependency fallback when VADER isn't installed."""
177
- pos_vocab = {
178
- "good", "great", "excellent", "love", "amazing", "wonderful", "helpful",
179
- "best", "thank", "thanks", "awesome", "brilliant", "perfect", "happy",
180
- "fantastic", "outstanding", "superb", "recommend", "positive", "useful",
181
- }
182
- neg_vocab = {
183
- "bad", "terrible", "awful", "hate", "worst", "horrible", "wrong",
184
- "false", "misleading", "garbage", "useless", "poor", "disappointing",
185
- "dangerous", "harmful", "misinformation", "lie", "lies", "fraud",
186
- }
187
- results = []
188
- for text in texts:
189
- words = set(str(text).lower().split())
190
- pos = len(words & pos_vocab)
191
- neg = len(words & neg_vocab)
192
- if pos > neg:
193
- label, compound = "POSITIVE", 0.5
194
- elif neg > pos:
195
- label, compound = "NEGATIVE", -0.5
196
- else:
197
- label, compound = "NEUTRAL", 0.0
198
- results.append({"label": label, "compound": compound, "score": abs(compound)})
199
- return results
200
 
201
 
 
 
 
 
 
 
 
 
 
202
 
203
- # SUMMARY + KEYWORDS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
 
206
- def sentiment_summary(sentiments: list) -> dict:
207
- if not sentiments:
208
- return {}
209
- total = len(sentiments)
210
- pos = sum(1 for s in sentiments if s["label"] == "POSITIVE")
211
- neg = sum(1 for s in sentiments if s["label"] == "NEGATIVE")
212
- neu = total - pos - neg
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  return {
214
- "total": total,
215
- "pos": pos,
216
- "neg": neg,
217
- "neu": neu,
218
- "pos_pct": round(pos / total * 100, 1),
219
- "neg_pct": round(neg / total * 100, 1),
220
- "neu_pct": round(neu / total * 100, 1),
 
221
  }
222
 
223
 
224
- _STOP = frozenset(
225
- "the a an and or but in on at to for of with by from up is are was were be been "
226
- "being have has had do does did will would could should may might this that these "
227
- "those it its they them their we our you your i my he she his her not no so if as "
228
- "about what how when who which all just more also can get like one there than now "
229
- "then very much many some any such other very really just even still only well "
230
- "http https www com".split()
231
- )
232
 
 
233
 
234
- def extract_keywords(text: str, tags: list, top_n: int = 15) -> list[tuple]:
235
- words = re.findall(r"\b[a-z]{4,}\b", text.lower())
236
- filtered = [w for w in words if w not in _STOP]
237
- tag_words = [re.sub(r"[^a-z]", "", t.lower()) for t in tags]
238
- tag_words = [w for w in tag_words if len(w) >= 4 and w not in _STOP]
239
- all_words = filtered + tag_words * 3
240
- return Counter(all_words).most_common(top_n)
241
 
 
 
 
 
 
 
 
 
 
242
 
243
- def sentiment_weighted_keywords(
244
- df: pd.DataFrame,
245
- sentiments: list,
246
- top_n: int = 10,
247
- ) -> tuple[list, list]:
248
- if df.empty or not sentiments:
249
- return [], []
250
 
251
- pos_words, neg_words = [], []
252
- texts = df["text"].fillna("").tolist()
 
 
 
253
 
254
- for text, sent in zip(texts, sentiments):
255
- words = re.findall(r"\b[a-z]{4,}\b", str(text).lower())
256
- words = [w for w in words if w not in _STOP]
257
- if sent["label"] == "POSITIVE":
258
- pos_words.extend(words)
259
- elif sent["label"] == "NEGATIVE":
260
- neg_words.extend(words)
261
 
262
- return Counter(pos_words).most_common(top_n), Counter(neg_words).most_common(top_n)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ analyzer.py — MHMisinfo model integration + sentiment + keyword analysis.
3
+
4
+ Strategy:
5
+ 1. Download & introspect best_multimodal.pt to discover actual architecture.
6
+ 2. Use SVM per-modality models as the PRIMARY source for per-stream scores
7
+ (they are self-contained sklearn pipelines with their own vectorizer).
8
+ 3. If SVMs unavailable, fall back to heuristic per-stream analysis.
9
+ 4. Use multimodal model's overall logit only for the global score + label.
10
  """
11
 
12
  import re
13
+ import math
14
+ import os
15
+ import pickle
16
+ import logging
17
  from collections import Counter
18
+ from typing import List, Dict, Tuple, Optional
19
 
20
+ import numpy as np
21
  import pandas as pd
22
 
23
+ logger = logging.getLogger(__name__)
24
 
25
# Globals ─
# Module-level state shared by the loader and inference helpers.
# NOTE(review): _sentiment_pipeline, _vader_analyzer and _tfidf_vectorizers
# are not referenced by any function visible in this part of the file —
# presumably used by the sentiment section further down; confirm.
_sentiment_pipeline = None
_vader_analyzer = None

_multimodal_model = None  # PyTorch model (for global score)
_multimodal_meta = {}  # {arch_type, input_size, hidden_size, ...}
_svm_pipelines = {}  # {text, audio, video, tags} → sklearn pipeline
_bert_tokenizer = None  # loaded only if multimodal model needs it
_tfidf_vectorizers = {}  # {stream} → TfidfVectorizer (if separate)

# Set to True by _ensure_models_loaded() before it starts loading, so the
# load is attempted only once.
_models_loaded = False
# Last error message recorded while loading the multimodal model, or None.
_load_error = None

# Hugging Face repo the model files are downloaded from, and the local
# cache directory handed to the download helper.
HF_REPO_ID = "rocky250/MHMisinfo"
CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "mhmisinfo")
 
 
 
 
 
 
 
 
 
 
 
40
 
41
# Red-flag vocabulary (heuristic fallback)
# Entries are lower-case phrases; the heuristic scorers test each one with a
# plain substring check (`kw in lowered_text`), so an entry also matches inside
# longer words.
# NOTE(review): "cure" is a substring of "cures", so text containing "cures"
# is counted as two hits — confirm this double weighting is intended.
_MISINFO_RED_FLAGS: List[str] = [
    "cure", "cures", "miracle", "they don't want you to know",
    "doctors hate", "secret", "suppressed", "fake news",
    "conspiracy", "detox", "toxins", "pseudoscience",
    "100% natural", "big pharma", "government hiding",
]
48
 
49
+
50
+
51
+ # MODEL LOADING
52
+
53
+
54
def _hf_download(filename: str) -> str:
    """Fetch one file from the ``HF_REPO_ID`` repo and return its local path.

    The download goes through ``huggingface_hub.hf_hub_download`` with
    ``CACHE_DIR`` as the cache directory. Any exception raised by the hub
    client propagates to the caller.
    """
    # Imported here rather than at module level, so importing this module
    # does not itself require huggingface_hub.
    from huggingface_hub import hf_hub_download

    local_path = hf_hub_download(
        repo_id=HF_REPO_ID, filename=filename, cache_dir=CACHE_DIR
    )
    return local_path
61
+
62
+
63
+ def _introspect_pt(path: str) -> dict:
64
+ """
65
+ Load a .pt file and return a summary of what's inside.
66
+ Handles: state_dict, full model, sklearn object, plain tensor.
67
+ Returns dict with keys: kind, keys_sample, shapes_sample, obj
68
+ """
69
+ import torch
70
+ raw = torch.load(path, map_location="cpu", weights_only=False)
71
+
72
+ if hasattr(raw, "predict"):
73
+ # sklearn object saved with .pt extension
74
+ return {"kind": "sklearn", "obj": raw}
75
+
76
+ if isinstance(raw, dict):
77
+ keys = list(raw.keys())
78
+ # Check for nested state_dict
79
+ if "state_dict" in raw:
80
+ sd = raw["state_dict"]
81
+ return {
82
+ "kind": "checkpoint",
83
+ "config": raw.get("config", {}),
84
+ "keys_sample": list(sd.keys())[:20],
85
+ "shapes": {k: tuple(v.shape) for k, v in list(sd.items())[:20]},
86
+ "obj": raw,
87
+ }
88
+ # Bare state_dict — check if values are tensors
89
+ if all(hasattr(v, "shape") for v in list(raw.values())[:3]):
90
+ return {
91
+ "kind": "state_dict",
92
+ "keys_sample": keys[:20],
93
+ "shapes": {k: tuple(v.shape) for k, v in list(raw.items())[:20]},
94
+ "obj": raw,
95
+ }
96
+ # Generic dict (could be sklearn pipeline stored as dict)
97
+ return {"kind": "dict", "keys": keys, "obj": raw}
98
+
99
+ if hasattr(raw, "parameters"):
100
+ # Full nn.Module saved with torch.save(model)
101
+ sd = raw.state_dict()
102
+ return {
103
+ "kind": "full_model",
104
+ "keys_sample": list(sd.keys())[:20],
105
+ "shapes": {k: tuple(v.shape) for k, v in list(sd.items())[:20]},
106
+ "obj": raw,
107
+ }
108
+
109
+ return {"kind": "unknown", "obj": raw}
110
+
111
+
112
+ def _infer_architecture(info: dict) -> dict:
113
+ """
114
+ From the introspection dict, work out the likely architecture
115
+ so we can instantiate a matching nn.Module.
116
+ Returns: {hidden_size, num_layers, num_streams, vocab_size, embed_dim,
117
+ num_classes, has_attention, is_bigru}
118
+ """
119
+ shapes = info.get("shapes", {})
120
+ keys = info.get("keys_sample", [])
121
+
122
+ cfg = {
123
+ "hidden_size": 128,
124
+ "num_layers": 2,
125
+ "num_streams": 4,
126
+ "vocab_size": 30522,
127
+ "embed_dim": 128,
128
+ "num_classes": 2,
129
+ "has_attention": any("attn" in k or "attention" in k for k in keys),
130
+ "is_bigru": any("gru" in k.lower() or "bigru" in k.lower() for k in keys),
131
+ }
132
+
133
+ # Try to extract embedding dimension from the embedding weight
134
+ for k, s in shapes.items():
135
+ if "embed" in k.lower() and len(s) == 2:
136
+ cfg["vocab_size"] = s[0]
137
+ cfg["embed_dim"] = s[1]
138
+ break
139
+
140
+ # Try to extract hidden size from GRU weight
141
+ for k, s in shapes.items():
142
+ if "gru" in k.lower() or "bigru" in k.lower():
143
+ if len(s) == 2:
144
+ # weight_ih_l0: (3*hidden, input) for GRU
145
+ cfg["hidden_size"] = s[0] // 3
146
+ break
147
+
148
+ # Try to extract num_classes from final linear
149
+ for k, s in shapes.items():
150
+ if ("classifier" in k or "fc" in k or "linear" in k) and len(s) == 2:
151
+ if s[0] <= 10: # small output = class head
152
+ cfg["num_classes"] = s[0]
153
+ break
154
+ if s[1] <= 10:
155
+ cfg["num_classes"] = s[1]
156
+ break
157
+
158
+ return cfg
159
+
160
+
161
def _build_model_from_introspection(info: dict):
    """
    Build an nn.Module that matches the discovered architecture
    and load the weights into it.

    Args:
        info: dict produced by ``_introspect_pt`` (needs ``kind`` and ``obj``).

    Returns:
        ``(model, cfg, match_ratio)`` where ``cfg`` is the result of
        ``_infer_architecture(info)``.

        * kind ``"full_model"``: the pickled module itself, ratio 1.0.
        * kind ``"checkpoint"`` / ``"state_dict"``: a freshly built model
          with the weights loaded non-strictly, or ``None`` when fewer than
          30% of the parameters matched.
        * any other kind: ``(None, cfg, 0.0)``.

    ``load_state_dict`` may still raise (e.g. on a shape mismatch for a
    matching key); that exception propagates to the caller.
    """
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    cfg = _infer_architecture(info)
    logger.info("Inferred architecture: %s", cfg)

    kind = info["kind"]
    if kind == "full_model":
        # A module saved with torch.save(model) already carries its own
        # architecture — use it as-is instead of re-guessing one.
        return info["obj"], cfg, 1.0
    if kind == "checkpoint":
        sd = info["obj"]["state_dict"]
    elif kind == "state_dict":
        sd = info["obj"]
    else:
        return None, cfg, 0.0

    H = cfg["hidden_size"]
    ED = cfg["embed_dim"]
    VS = cfg["vocab_size"]
    NC = cfg["num_classes"]
    NL = cfg["num_layers"]

    # Generic flexible architecture ─
    class FlexBiGRUStream(nn.Module):
        """Bidirectional GRU encoder for one stream, with optional attention pooling."""

        def __init__(self):
            super().__init__()
            self.gru = nn.GRU(
                ED, H, num_layers=NL,
                batch_first=True, bidirectional=True,
                dropout=0.3 if NL > 1 else 0.0,
            )
            if cfg["has_attention"]:
                self.attn = nn.Linear(H * 2, 1)
            self.drop = nn.Dropout(0.3)

        def forward(self, x):
            out, _ = self.gru(x)
            if cfg["has_attention"]:
                # Attention-weighted sum over the sequence dimension
                w = torch.softmax(self.attn(out), dim=1)
                ctx = (w * out).sum(1)
            else:
                # No attention: take the output at the last time step
                ctx = out[:, -1, :]
            return self.drop(ctx)

    class FlexMultimodal(nn.Module):
        """Four stream encoders over a shared embedding, gated and fused into a classifier."""

        def __init__(self):
            super().__init__()
            self.embedding = nn.Embedding(VS, ED, padding_idx=0)
            self.enc_text = FlexBiGRUStream()
            self.enc_audio = FlexBiGRUStream()
            self.enc_video = FlexBiGRUStream()
            self.enc_tags = FlexBiGRUStream()
            fused = H * 2 * 4
            self.dmte = nn.Linear(H * 2, 1)
            self.fc1 = nn.Linear(fused, fused // 2)
            self.fc2 = nn.Linear(fused // 2, fused // 4)
            self.norm = nn.LayerNorm(fused // 4)
            self.cls = nn.Linear(fused // 4, NC)
            self.drop = nn.Dropout(0.3)

        def forward(self, t_ids, a_ids, v_ids, g_ids):
            emb = self.embedding
            t = self.enc_text(emb(t_ids))
            a = self.enc_audio(emb(a_ids))
            v = self.enc_video(emb(v_ids))
            g = self.enc_tags(emb(g_ids))
            gates = torch.sigmoid(torch.stack(
                [self.dmte(t), self.dmte(a), self.dmte(v), self.dmte(g)], dim=1
            ))  # (B,4,1)
            streams = torch.stack([t, a, v, g], dim=1)  # (B,4,H*2)
            weighted = (streams * gates).view(streams.size(0), -1)  # (B,H*2*4)
            h = self.drop(F.relu(self.fc1(weighted)))
            h = self.norm(F.relu(self.fc2(h)))
            return self.cls(h), gates.squeeze(-1)

    model = FlexMultimodal()

    # Load weights — use strict=False and log what matched
    result = model.load_state_dict(sd, strict=False)
    # unexpected_keys are checkpoint keys the model lacks; missing_keys are
    # model keys the checkpoint lacks. Only the former reduce the number of
    # checkpoint entries that were actually loaded.
    matched = len(sd) - len(result.unexpected_keys)
    # Measure against the larger side so both an over-sized checkpoint and a
    # mostly uninitialised model lower the ratio.
    total = max(len(sd), len(model.state_dict()))
    match_ratio = matched / total if total else 0.0
    logger.info("Weights loaded: %d/%d matched, missing=%d, unexpected=%d",
                matched, total, len(result.missing_keys), len(result.unexpected_keys))

    # If fewer than 30% matched, the architecture is wrong → don't use this model
    if match_ratio < 0.30:
        logger.warning("Too few weights matched (%.0f%%) — model outputs unreliable",
                       match_ratio * 100)
        return None, cfg, match_ratio

    return model, cfg, match_ratio
256
+
257
+
258
def _load_svm(filename: str, stream_name: str) -> bool:
    """
    Download one SVM artefact and register it under *stream_name*.

    The repo rocky250/MHMisinfo is tagged 'Joblib' on HuggingFace — files are
    saved with .pt extension but were written by joblib.dump(). Deserialisers
    are therefore tried in this order: joblib, plain pickle, torch.load; the
    first one that yields a non-None object wins.

    Returns True when the loaded object exposes ``predict``,
    ``decision_function`` or ``predict_proba`` and was stored in
    ``_svm_pipelines``; False on download failure, load failure, or when the
    object has none of those methods.

    NOTE(security): all three loaders unpickle the file and can execute code
    embedded in it — only point this at a trusted repository.
    """
    # Download
    try:
        path = _hf_download(filename)
        logger.info("Downloaded %s → %s (%.1f KB)",
                    filename, stream_name, os.path.getsize(path) / 1024)
    except Exception as e:
        logger.warning("Could not download %s: %s", filename, e)
        return False

    def _via_joblib():
        import joblib as _jl
        return _jl.load(path)

    def _via_pickle():
        with open(path, "rb") as f:
            return pickle.load(f)

    def _via_torch():
        import torch as _torch
        return _torch.load(path, map_location="cpu", weights_only=False)

    # (success message, failure message, loader) — in priority order
    attempts = (
        (" joblib.load OK for %s → %s", " joblib failed for %s: %s", _via_joblib),
        (" pickle.load OK for %s → %s", " pickle failed for %s: %s", _via_pickle),
        (" torch.load OK for %s → %s", " torch.load failed for %s: %s", _via_torch),
    )

    obj = None
    for ok_msg, fail_msg, loader in attempts:
        try:
            obj = loader()
            logger.info(ok_msg, stream_name, type(obj).__name__)
        except Exception as err:
            logger.debug(fail_msg, stream_name, err)
        if obj is not None:
            break

    if obj is None:
        logger.warning("All load methods failed for %s", filename)
        return False

    # Validate: anything with an sklearn-style inference method is accepted
    sklearn_api = ("predict", "decision_function", "predict_proba")
    if not any(hasattr(obj, attr) for attr in sklearn_api):
        logger.warning("Object for %s has no sklearn API — type=%s",
                       stream_name, type(obj).__name__)
        return False

    _svm_pipelines[stream_name] = obj
    logger.info("✅ SVM loaded: %s → %s", stream_name, type(obj).__name__)
    return True
317
+
318
+
319
def _ensure_models_loaded():
    """
    Load the SVM pipelines and the multimodal model once per process.

    Later calls return immediately: ``_models_loaded`` is set before any
    loading starts, so a failed load is not retried. Results are stored in
    the module globals ``_svm_pipelines``, ``_multimodal_model``,
    ``_multimodal_meta``, ``_bert_tokenizer`` and ``_load_error``.
    Download/load failures are logged, not raised.
    """
    global _multimodal_model, _multimodal_meta, _bert_tokenizer
    global _models_loaded, _load_error

    if _models_loaded:
        return
    # Flag is raised up front so a failure below does not trigger a reload.
    _models_loaded = True

    os.makedirs(CACHE_DIR, exist_ok=True)

    # 1. Per-modality SVM models (most important for charts)
    svm_map = {
        "text": "svm/best_text.pt",
        "audio": "svm/best_audio_transcript.pt",
        "video": "svm/best_video_transcript.pt",
        "tags": "svm/best_tags.pt",
    }
    svm_loaded = 0
    for name, hf_path in svm_map.items():
        if _load_svm(hf_path, name):
            svm_loaded += 1

    # Combined svm.joblib (small, 5.4 KB — the ensemble/meta SVM) ─
    # Try both "svm/svm.joblib" path and root-level fallback
    for combined_path in ["svm/svm.joblib", "svm.joblib"]:
        if _load_svm(combined_path, "combined"):
            break

    logger.info("SVMs loaded: %d / %d per-stream + combined=%s",
                svm_loaded, len(svm_map),
                "yes" if "combined" in _svm_pipelines else "no")

    # 2. Multimodal model (for global score)
    try:
        path = _hf_download("best_multimodal.pt")
        info = _introspect_pt(path)
        logger.info("Multimodal .pt kind=%s keys_sample=%s",
                    info["kind"], info.get("keys_sample", [])[:5])

        if info["kind"] == "sklearn":
            # The multimodal.pt IS a sklearn model: register it as a
            # pipeline and leave the PyTorch slot empty.
            _svm_pipelines["multimodal_sklearn"] = info["obj"]
            _multimodal_model = None
            _multimodal_meta = {"kind": "sklearn_global"}

        elif info["kind"] in ("state_dict", "checkpoint", "full_model"):
            model, cfg, match_ratio = _build_model_from_introspection(info)
            if model is not None and match_ratio >= 0.30:
                model.eval()
                _multimodal_model = model
                _multimodal_meta = {**cfg, "match_ratio": match_ratio}
                # Load BERT tokenizer for input encoding; if this fails the
                # tokenizer stays None and only a warning is logged.
                try:
                    from transformers import BertTokenizer
                    _bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
                except Exception as te:
                    logger.warning("BertTokenizer not available: %s", te)
            else:
                logger.warning("Multimodal model unusable (match_ratio=%.2f)", match_ratio)
                _multimodal_model = None
                _load_error = f"Architecture mismatch ({match_ratio:.0%} weights matched)"
        else:
            logger.warning("Unknown .pt content: %s", info["kind"])

    except Exception as e:
        # Any download/introspection/build failure is recorded and logged.
        _load_error = str(e)
        logger.error("Multimodal model load failed: %s", e)
386
+
387
+
388
+
389
+ # SVM INFERENCE (primary per-modality source)
390
+
391
+
392
def _svm_predict_stream(stream_name: str, text: str) -> Optional[dict]:
    """
    Run one SVM pipeline on a text segment.

    Args:
        stream_name: key into ``_svm_pipelines``.
        text: raw text of the segment; passed to the pipeline as ``[text]``.

    Returns:
        A dict with ``misinfo_logit``, ``credible_logit``, ``misinfo_pct``,
        ``credible_pct``, ``uncertainty``, ``trust_score`` and
        ``source="svm"``; or ``None`` if the model is unavailable, the text
        is empty/whitespace, the classifier has neither
        ``decision_function`` nor ``predict_proba``, or inference raises
        (the error is logged).
    """
    clf = _svm_pipelines.get(stream_name)
    if clf is None or not (text or "").strip():
        return None

    try:
        # decision_function gives distance from the decision boundary
        # Positive = misinfo class (class 1), negative = credible (class 0)
        # This works for SVC and sklearn Pipeline wrapping SVC
        if hasattr(clf, "decision_function"):
            raw_score = float(clf.decision_function([text])[0])
        elif hasattr(clf, "predict_proba"):
            prob = clf.predict_proba([text])[0]
            # prob[1] = P(misinfo). The percentages below are a softmax over
            # [raw_score, -raw_score], i.e. sigmoid(2 * raw_score), so the
            # logit must be HALF the log-odds for misinfo_pct to reproduce
            # the classifier's own probability (full log-odds would turn
            # p=0.90 into ~98.8%).
            p = float(np.clip(prob[1], 1e-6, 1 - 1e-6))
            raw_score = 0.5 * math.log(p / (1 - p))
        else:
            return None

        # Softmax over [raw_score, -raw_score]; subtracting the max keeps
        # both exponents <= 0 so exp() cannot overflow.
        max_s = max(raw_score, -raw_score)
        exp_m = math.exp(raw_score - max_s)
        exp_c = math.exp(-raw_score - max_s)
        denom = exp_m + exp_c

        mis_pct = round(exp_m / denom * 100.0, 4)
        crd_pct = round(exp_c / denom * 100.0, 4)

        # Shannon entropy (bits) of the two-class distribution; 0*log(0) := 0
        pm = mis_pct / 100.0
        pc = crd_pct / 100.0

        def _log2(x):
            return math.log2(x) if x > 1e-12 else 0.0

        H = -(pm * _log2(pm) + pc * _log2(pc))
        uncertainty = round(H * 100.0, 4)

        # Trust = confidence × content richness (richness saturates at 200 words)
        word_count = len(text.split())
        content_factor = min(word_count / 200.0, 1.0)
        trust_score = round(((1.0 - H) * 0.70 + content_factor * 0.30) * 100.0, 4)

        return {
            "misinfo_logit": round(raw_score, 6),
            "credible_logit": round(-raw_score, 6),
            "misinfo_pct": mis_pct,
            "credible_pct": crd_pct,
            "uncertainty": uncertainty,
            "trust_score": trust_score,
            "source": "svm",
        }

    except Exception as e:
        logger.warning("SVM inference failed for %s: %s", stream_name, e)
        return None
451
+
452
+
453
+
454
+ # MULTIMODAL MODEL INFERENCE (global score only)
455
+
456
+
457
def _tokenize(text: str, max_len: int = 128):
    """Tokenize text with BertTokenizer → (1, max_len) LongTensor.

    ``None``/empty text is tokenized as the empty string. The input is
    padded and truncated to exactly ``max_len`` token ids.

    Raises:
        RuntimeError: if the module-level tokenizer has not been loaded.
    """
    if _bert_tokenizer is None:
        # Explicit message instead of "'NoneType' object is not callable"
        raise RuntimeError("BERT tokenizer is not loaded")
    enc = _bert_tokenizer(
        text or "",
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    return enc["input_ids"]
468
+
469
+
470
def _multimodal_global_score(text: str, audio: str, video: str, tags: str) -> Optional[dict]:
    """
    Score the four streams with the PyTorch multimodal model.

    Returns a dict with ``score`` (softmax probability of class 1),
    ``logit_m`` / ``logit_c`` (raw logits of class 1 / class 0) and
    ``dmte_gates`` (per-stream gate values, 0.5 where the model supplies
    none). Returns None when the model or tokenizer is not loaded, or when
    inference raises (the error is logged).
    """
    if _multimodal_model is None or _bert_tokenizer is None:
        return None

    try:
        import torch

        device = next(_multimodal_model.parameters()).device
        ids = [_tokenize(segment).to(device) for segment in (text, audio, video, tags)]

        with torch.no_grad():
            out = _multimodal_model(*ids)

        # Model may return (logits, gates) or just logits
        is_seq = isinstance(out, (tuple, list))
        logits = out[0] if is_seq else out
        if is_seq and len(out) > 1:
            gates = out[1].cpu().tolist()[0]
        else:
            gates = [0.5, 0.5, 0.5, 0.5]

        probs = torch.softmax(logits, dim=-1)[0]
        p_mis = float(probs[1].cpu())  # class 1 = misinformation
        logit_m = float(logits[0, 1].cpu())
        logit_c = float(logits[0, 0].cpu())

        gate_report = {}
        for idx, stream in enumerate(("text", "audio", "video", "tags")):
            gate_report[stream] = round(gates[idx], 4) if len(gates) > idx else 0.5

        return {
            "score": round(p_mis, 6),
            "logit_m": round(logit_m, 6),
            "logit_c": round(logit_c, 6),
            "dmte_gates": gate_report,
        }
    except Exception as e:
        logger.warning("Multimodal inference error: %s", e)
        return None
516
+
517
+
518
def _sklearn_global_score(text: str, audio: str, video: str) -> Optional[float]:
    """Use the combined sklearn SVM for global score if PyTorch model unavailable.

    The three segments are joined with single spaces and passed to the
    ``"multimodal_sklearn"`` pipeline, or to ``"combined"`` when that one is
    absent. ``predict_proba`` is preferred; otherwise the
    ``decision_function`` value is squashed with a sigmoid.

    Returns the class-1 probability, or None when no classifier is loaded,
    it offers neither method, or inference raises (the error is logged).
    """
    # Explicit None checks: an estimator's truthiness (e.g. a Pipeline's
    # __len__) must not decide which model is used.
    clf = _svm_pipelines.get("multimodal_sklearn")
    if clf is None:
        clf = _svm_pipelines.get("combined")
    if clf is None:
        return None
    try:
        combined = f"{text} {audio} {video}"
        if hasattr(clf, "predict_proba"):
            return float(clf.predict_proba([combined])[0][1])
        if hasattr(clf, "decision_function"):
            d = float(clf.decision_function([combined])[0])
            # Numerically stable sigmoid: exp() only ever sees a
            # non-positive argument, so a very negative margin yields ~0.0
            # instead of an OverflowError.
            if d >= 0:
                return 1.0 / (1.0 + math.exp(-d))
            z = math.exp(d)
            return z / (1.0 + z)
    except Exception as e:
        logger.warning("sklearn global score error: %s", e)
    return None
533
+
534
+
535
+
536
+ # HEURISTIC FALLBACK (when no model is available)
537
+
538
+
539
+ def _heuristic_stream(text_segment: str) -> dict:
540
+ """Keyword-density heuristic — used only when SVMs not loaded."""
541
+ if not (text_segment or "").strip():
542
+ return {
543
+ "misinfo_logit": 0.0, "credible_logit": 0.0,
544
+ "misinfo_pct": 50.0, "credible_pct": 50.0,
545
+ "trust_score": 0.0, "uncertainty": 100.0,
546
+ "source": "heuristic_empty",
547
+ }
548
+
549
+ lowered = text_segment.lower()
550
+ words = lowered.split()
551
+ word_count = max(len(words), 1)
552
+
553
+ hits = sum(1 for kw in _MISINFO_RED_FLAGS if kw in lowered)
554
+ density = hits / max(word_count / 10.0, 1.0)
555
+ base_prob = min(max(0.10 + density * 0.42, 0.02), 0.97)
556
+
557
+ logit_m = round(math.log(base_prob / (1.0 - base_prob)), 6)
558
+ logit_c = -logit_m
559
+
560
+ max_l = max(logit_m, logit_c)
561
+ exp_m = math.exp(logit_m - max_l)
562
+ exp_c = math.exp(logit_c - max_l)
563
+ denom = exp_m + exp_c
564
+ mis_pct = round(exp_m / denom * 100.0, 4)
565
+ crd_pct = round(exp_c / denom * 100.0, 4)
566
+
567
+ def _log2(x): return math.log2(x) if x > 1e-12 else 0.0
568
+ pm = mis_pct / 100.0; pc = crd_pct / 100.0
569
+ H = -(pm * _log2(pm) + pc * _log2(pc))
570
+ uncertainty = round(H * 100.0, 4)
571
+ trust_score = round(((1.0 - H) * 0.70 + min(word_count / 200.0, 1.0) * 0.30) * 100.0, 4)
572
+
573
+ return {
574
+ "misinfo_logit": logit_m,
575
+ "credible_logit": logit_c,
576
+ "misinfo_pct": mis_pct,
577
+ "credible_pct": crd_pct,
578
+ "trust_score": trust_score,
579
+ "uncertainty": uncertainty,
580
+ "source": "heuristic",
581
+ }
582
+
583
+
584
def _heuristic_global_score(combined: str) -> float:
    """Last-resort global misinformation score from red-flag keyword hits.

    Starts at 0.15, adds 0.12 for every ``_MISINFO_RED_FLAGS`` entry found
    (substring match, case-insensitive) and caps the result at 0.95.
    ``None`` or empty text scores the 0.15 baseline.
    """
    # Lower-case once instead of once per keyword; tolerate None.
    lowered = (combined or "").lower()
    hits = sum(1 for kw in _MISINFO_RED_FLAGS if kw in lowered)
    return min(0.15 + hits * 0.12, 0.95)
587
+
588
+
589
+
590
+ # MAIN PUBLIC API
591
 
592
 
593
def detect_misinformation(
    text: str,
    tags: List[str] = None,
    audio_transcript: str = "",
    video_transcript: str = "",
) -> Dict:
    """
    Detect misinformation using the MHMisinfo models (rocky250/MHMisinfo).

    Execution plan (in priority order):
      Per-modality charts  → SVM pipeline per stream
                           → heuristic fallback if the SVM returns nothing
      Global score/label   → PyTorch multimodal model
                           → combined SVM fallback
                           → keyword heuristic as last resort

    Args:
        text: Title/description text of the item; ``None`` is treated as "".
        tags: Optional list of tag strings, joined with spaces.
        audio_transcript: Optional audio transcript.
        video_transcript: Optional video transcript.

    Returns:
        Dict with keys ``score`` (0..1, rounded to 4 places), ``label``,
        ``confidence_pct`` (``int(score * 100)``), ``reasoning`` (reasons
        joined with " • "), ``stream_details`` (fixed multiples of the
        global score, not independent per-stream predictions) and
        ``modality_analysis`` (per-stream dicts with the internal
        ``source`` key removed).
    """
    _ensure_models_loaded()

    # Normalise optional inputs once so no scorer sees None (an f-string
    # would otherwise embed the literal word "None").
    text = text or ""
    tags_str = " ".join(tags or [])
    audio_seg = audio_transcript or ""
    video_seg = video_transcript or ""
    # The heuristic fallback must see the same streams as the model-based
    # global scorers, so the video transcript is included here too.
    combined = f"{text} {tags_str} {audio_seg} {video_seg}"

    # Text stream = title + description + tags.
    text_seg = f"{text} {tags_str}"

    def _get_stream(name: str, seg: str) -> dict:
        """Score one stream with its SVM, else with the keyword heuristic."""
        result = _svm_predict_stream(name, seg)
        if result is not None:
            return result
        return _heuristic_stream(seg)

    modality_analysis = {
        "text": _get_stream("text", text_seg),
        "audio": _get_stream("audio", audio_seg),
        "video": _get_stream("video", video_seg),
    }

    # Global score: multimodal model → combined SVM → keyword heuristic.
    global_result = _multimodal_global_score(text, audio_seg, video_seg, tags_str)
    reasons = []

    if global_result is not None:
        score = global_result["score"]
        logit_m = global_result["logit_m"]
        logit_c = global_result["logit_c"]
        dmte_gates = global_result.get("dmte_gates", {})
        match_pct = _multimodal_meta.get("match_ratio", 0) * 100
        reasons.append(
            f"PyTorch model ({match_pct:.0f}% weights matched) — "
            f"logit_m={logit_m:+.4f}  logit_c={logit_c:+.4f}"
        )
        if dmte_gates:
            gate_str = " | ".join(f"{k}: {v:.3f}" for k, v in dmte_gates.items())
            reasons.append(f"DMTE trust gates: [{gate_str}]")
    else:
        sk_score = _sklearn_global_score(text, audio_seg, video_seg)
        if sk_score is not None:
            score = sk_score
            reasons.append("Global score from combined SVM model")
        else:
            score = _heuristic_global_score(combined)
            # Scan the keyword list once; the count is just len(found).
            lowered = combined.lower()
            found = [kw for kw in _MISINFO_RED_FLAGS if kw in lowered]
            if found:
                reasons.append(
                    f"Heuristic: {len(found)} red-flag keyword(s): {', '.join(found[:5])}"
                )
            else:
                reasons.append("Heuristic: no red-flag keywords detected")

    # Report how many per-modality results came from the SVM models.
    svm_count = sum(1 for v in modality_analysis.values() if v.get("source") == "svm")
    if svm_count > 0:
        reasons.append(f"Per-modality: {svm_count}/3 streams from real SVM models")
    else:
        reasons.append(
            "SVM models unavailable — using heuristic for stream analysis"
            + (f" ({_load_error})" if _load_error else "")
        )

    # NOTE(review): the leading space in the first label looks like a lost
    # icon character — kept as-is in case callers compare on the exact text.
    label = " Potential Misinformation" if score >= 0.5 else "✅ Appears Credible"

    # Strip internal 'source' key from modality dicts (not expected by charts).
    clean_modality = {
        k: {kk: vv for kk, vv in v.items() if kk != "source"}
        for k, v in modality_analysis.items()
    }

    return {
        "score": round(float(score), 4),
        "label": label,
        "confidence_pct": int(float(score) * 100),
        "reasoning": " • ".join(reasons),
        "stream_details": {
            "text": round(float(score) * 0.9, 3),
            "audio_transcript": round(float(score) * 0.8, 3),
            "video_transcript": round(float(score) * 0.85, 3),
            "tags": round(float(score) * 0.7, 3),
        },
        "modality_analysis": clean_modality,
    }
696
 
697
 
 
699
  # SENTIMENT ANALYSIS
700
 
701
 
702
def _get_hf_pipeline():
    """Return the shared Hugging Face sentiment pipeline, building it on first use.

    ``transformers`` is imported lazily so the module can be loaded without
    it; the constructed pipeline is cached in ``_sentiment_pipeline``.
    """
    global _sentiment_pipeline
    if _sentiment_pipeline is not None:
        return _sentiment_pipeline

    from transformers import pipeline

    _sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        truncation=True,
        max_length=512,
    )
    return _sentiment_pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
712
 
713
 
714
def _get_vader():
    """Return the shared VADER analyzer, creating it on first use.

    Returns ``None`` when the ``vaderSentiment`` package cannot be
    imported, which lets callers switch to another sentiment method.
    """
    global _vader_analyzer
    if _vader_analyzer is not None:
        return _vader_analyzer
    try:
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
        _vader_analyzer = SentimentIntensityAnalyzer()
    except ImportError:
        # Optional dependency missing: leave the cache empty and report None.
        return None
    return _vader_analyzer
723
 
724
+
725
def analyze_sentiment_batch(
    texts: List[str],
    method: str = "vader",
    batch_size: int = 64,
) -> List[Dict]:
    """Classify the sentiment of each text and return one dict per input.

    Args:
        texts: Input strings; ``None``/empty entries are tolerated.
        method: ``"vader"`` (falls back to the built-in lexicon scorer when
            VADER is not installed) or ``"hf"`` (Hugging Face pipeline).
            Any other value yields an empty list.
        batch_size: Number of texts sent to the HF pipeline per call;
            values below 1 are treated as 1.

    Returns:
        A list of ``{"label", "score", "compound"}`` dicts, in input order.
    """
    results = []
    if method == "vader":
        vader = _get_vader()
        if vader is None:
            # The lexicon scorer calls str methods on every entry, so blank
            # out None values the same way the VADER loop below tolerates them.
            return _simple_lexicon_sentiment([t or "" for t in texts])
        for text in texts:
            # Too short to score meaningfully: report neutral.
            if not text or len(text.strip()) < 3:
                results.append({"label": "NEUTRAL", "score": 0.0, "compound": 0.0})
                continue
            c = vader.polarity_scores(text)["compound"]
            if c >= 0.05:
                label = "POSITIVE"
            elif c <= -0.05:
                label = "NEGATIVE"
            else:
                label = "NEUTRAL"
            results.append({"label": label, "score": abs(c), "compound": c})
    elif method == "hf":
        pipe = _get_hf_pipeline()
        # range() rejects a zero step and a negative one would skip all input.
        step = max(int(batch_size), 1)
        for start in range(0, len(texts), step):
            chunk = texts[start: start + step]
            # Cap the character length and replace empty/None with a space.
            safe = [t[:1000] if t else " " for t in chunk]
            try:
                # Build the chunk's results separately so a failure part-way
                # through cannot leave partial entries in ``results``.
                chunk_results = [
                    {
                        "label": r["label"],
                        "score": round(r["score"], 4),
                        "compound": r["score"] if r["label"] == "POSITIVE" else -r["score"],
                    }
                    for r in pipe(safe)
                ]
            except Exception:
                # Best effort: a failed chunk is reported as neutral so the
                # output stays aligned with the input.
                chunk_results = [
                    {"label": "NEUTRAL", "score": 0.5, "compound": 0.0}
                    for _ in chunk
                ]
            results.extend(chunk_results)
    return results
762
 
763
 
764
+ def _simple_lexicon_sentiment(texts: List[str]) -> List[Dict]:
765
+ pos = {"good","great","love","excellent","amazing","wonderful","best","happy","positive","helpful"}
766
+ neg = {"bad","terrible","hate","awful","worst","negative","harmful","wrong","fake","misinformation"}
767
+ out = []
768
+ for text in texts:
769
+ words = set(text.lower().split())
770
+ p = len(words & pos); n = len(words & neg)
771
+ if p > n: out.append({"label": "POSITIVE", "score": 0.7, "compound": 0.5})
772
+ elif n > p: out.append({"label": "NEGATIVE", "score": 0.7, "compound": -0.5})
773
+ else: out.append({"label": "NEUTRAL", "score": 0.5, "compound": 0.0})
774
+ return out
775
+
776
+
777
def sentiment_summary(results: List[Dict]) -> Dict:
    """Aggregate per-text sentiment results into counts, shares and a mean.

    Returns a dict with the number of POSITIVE/NEGATIVE/NEUTRAL labels, the
    ``total`` number of results, the mean ``compound`` value (missing values
    count as 0.0, rounded to 3 places) and each label's percentage of the
    total (rounded to 1 place). An empty input yields all zeros.
    """
    if not results:
        return {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0, "total": 0,
                "avg_compound": 0.0, "pos_pct": 0, "neg_pct": 0, "neu_pct": 0}

    total = len(results)
    tally = Counter(entry["label"] for entry in results)
    # Counter indexing yields 0 for labels that never occurred.
    n_pos = tally["POSITIVE"]
    n_neg = tally["NEGATIVE"]
    n_neu = tally["NEUTRAL"]
    compounds = [entry.get("compound", 0.0) for entry in results]
    mean_compound = float(np.mean(compounds))

    summary = {
        "POSITIVE": n_pos,
        "NEGATIVE": n_neg,
        "NEUTRAL": n_neu,
        "total": total,
        "avg_compound": round(mean_compound, 3),
    }
    summary["pos_pct"] = round(n_pos / total * 100, 1)
    summary["neg_pct"] = round(n_neg / total * 100, 1)
    summary["neu_pct"] = round(n_neu / total * 100, 1)
    return summary
794
 
795
 
 
 
 
 
 
 
 
 
796
 
797
+ # KEYWORD ANALYSIS
798
 
 
 
 
 
 
 
 
799
 
800
# Common English function words plus URL fragments that are excluded from
# keyword counts.
STOPWORDS = set(
    (
        "the a an is it in on at to for of and or but "
        "this that was are be have has had with from by as "
        "we i you he she they do did not no so if can "
        "will would could should my your his her their our "
        "what how when where who which about just also more "
        "all been were its than then there these those me "
        "him us them up out into after before https http www"
    ).split()
)
809
 
 
 
 
 
 
 
 
810
 
811
def extract_keywords(text: str, tags: List[str] = None, top_n: int = 20):
    """Return the most frequent non-stopword words in ``text`` and ``tags``.

    Words are runs of at least three ASCII letters, lower-cased; entries of
    ``STOPWORDS`` are dropped.

    Args:
        text: Free text to analyse; ``None`` is treated as empty.
        tags: Optional list of tag strings appended to the text.
        top_n: Maximum number of keywords to return.

    Returns:
        List of ``(word, count)`` tuples, most frequent first.
    """
    # ``text or ""`` keeps a missing description from raising TypeError.
    combined = (text or "") + " " + " ".join(tags or [])
    tokens = re.findall(r"[a-zA-Z]{3,}", combined.lower())
    filtered = [t for t in tokens if t not in STOPWORDS]
    return Counter(filtered).most_common(top_n)
816
 
 
 
 
 
 
 
 
817
 
818
def sentiment_weighted_keywords(
    comments_df: pd.DataFrame,
    sentiment_results: List[Dict],
    top_n: int = 15,
) -> Tuple[List[Tuple[str, float]], List[Tuple[str, float]]]:
    """Rank keywords from positive and negative comments by sentiment weight.

    Comments (column ``"text"``) are paired positionally with
    ``sentiment_results``. Each distinct non-stopword word of a comment adds
    that comment's ``score`` (0.5 when absent) to the positive or negative
    tally, depending on the comment's label; other labels are ignored.

    Returns:
        ``(positive, negative)`` lists of ``(word, weight)`` tuples, each
        limited to ``top_n`` entries; two empty lists when either input is
        empty.
    """
    if comments_df.empty or not sentiment_results:
        return [], []

    positive_weights: Counter = Counter()
    negative_weights: Counter = Counter()
    tallies = {"POSITIVE": positive_weights, "NEGATIVE": negative_weights}

    comment_texts = comments_df["text"].fillna("").tolist()
    for comment, sentiment in zip(comment_texts, sentiment_results):
        words = []
        for word in re.findall(r"[a-zA-Z]{3,}", comment.lower()):
            if word not in STOPWORDS:
                words.append(word)
        weight = sentiment.get("score", 0.5)
        tally = tallies.get(sentiment["label"])
        if tally is not None:
            # dict.fromkeys de-duplicates, so a word counts once per comment.
            tally.update(dict.fromkeys(words, weight))

    top_positive = positive_weights.most_common(top_n)
    top_negative = negative_weights.most_common(top_n)
    return top_positive, top_negative