rocky250 committed on
Commit
c6c2c6c
·
verified ·
1 Parent(s): 27c3779

Update analyzer.py

Browse files
Files changed (1) hide show
  1. analyzer.py +209 -205
analyzer.py CHANGED
@@ -1,258 +1,262 @@
1
  """
2
- analyzer.py — Sentiment analysis, keyword extraction, and misinformation placeholder.
3
- Handles large comment volumes efficiently via batching + caching.
 
 
4
  """
5
 
6
  import re
7
- import math
8
  from collections import Counter
9
- from functools import lru_cache
10
- from typing import List, Dict, Tuple, Optional
11
 
12
- import numpy as np
13
  import pandas as pd
14
 
15
- # ── Lazy imports (heavy) ──────────────────────────────────────────────────────
16
# Module-level singletons: each model object is built on first request and reused.
_sentiment_pipeline = None
_vader_analyzer = None


def _get_hf_pipeline():
    """Return the shared DistilBERT sentiment pipeline, creating it on first call.

    `transformers` is imported inside the function so that importing this
    module does not require it. Any import or construction error propagates.
    """
    global _sentiment_pipeline
    if _sentiment_pipeline is not None:
        return _sentiment_pipeline

    from transformers import pipeline

    _sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        truncation=True,
        max_length=512,
    )
    return _sentiment_pipeline


def _get_vader():
    """Return the shared VADER analyzer, or None if vaderSentiment is missing."""
    global _vader_analyzer
    if _vader_analyzer is not None:
        return _vader_analyzer
    try:
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
        _vader_analyzer = SentimentIntensityAnalyzer()
    except ImportError:
        # Optional dependency: callers treat None as "use the lexicon fallback".
        return None
    return _vader_analyzer
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- # ── Misinformation Detector (PLACEHOLDER β€” plug in your model here) ───────────
45
 
46
def detect_misinformation(
    text: str,
    tags: List[str] = None,
    audio_transcript: str = "",
    video_transcript: str = "",
) -> Dict:
    """Heuristic placeholder scorer based on red-flag keyword matching.

    Replace the body with a real model when one is available; the return
    shape below is what callers receive.

    Args:
        text: Title/description text.
        tags: Optional list of tag strings; ``None`` is treated as no tags.
        audio_transcript: Optional audio transcript text.
        video_transcript: Optional video transcript text.

    Returns:
        A dict with keys ``score`` (0.15-0.95, rounded to 4 places), ``label``,
        ``confidence_pct`` (0-100 int), ``reasoning`` and ``stream_details``
        (fixed fractions of the score per modality).
    """
    red_flags = [
        "cure", "cures", "miracle", "they don't want you to know",
        "doctors hate", "secret", "suppressed", "fake news",
        "conspiracy", "detox", "toxins", "pseudoscience",
        "100% natural", "big pharma", "government hiding",
    ]

    # Scan every modality, including the video transcript. ``None`` inputs are
    # treated as empty so the literal word "None" never leaks into the text.
    combined = " ".join(
        [
            text or "",
            " ".join(tags or []),
            audio_transcript or "",
            video_transcript or "",
        ]
    ).lower()

    # Substring match, done once; the hit count and the listing share the result.
    found = [kw for kw in red_flags if kw in combined]
    hits = len(found)
    score = min(0.15 + hits * 0.12, 0.95)

    label = "⚠️ Potential Misinformation" if score >= 0.5 else "✅ Appears Credible"

    reasons = []
    if hits > 0:
        reasons.append(f"Detected {hits} red-flag keyword(s): {', '.join(found[:5])}")
    else:
        reasons.append("No common misinformation red-flag keywords detected.")
    reasons.append("NOTE: This is a placeholder. Connect your MHMisinfo model for real results.")

    return {
        "score": round(score, 4),
        "label": label,
        # round() before int(): plain int() truncates values such as 28.999999.
        "confidence_pct": int(round(score * 100)),
        "reasoning": " • ".join(reasons),
        "stream_details": {
            "text": round(score * 0.9, 3),
            "audio_transcript": round(score * 0.8, 3),
            "video_transcript": round(score * 0.85, 3),
            "tags": round(score * 0.7, 3),
        },
    }
105
 
106
 
107
- # ── Sentiment Analysis ────────────────────────────────────────────────────────
 
 
108
 
109
def analyze_sentiment_batch(
    texts: List[str],
    method: str = "vader",
    batch_size: int = 64,
) -> List[Dict]:
    """Score each text and return one ``{"label", "score", "compound"}`` dict per input.

    ``method="vader"`` uses the VADER analyzer, falling back to the simple
    lexicon scorer when VADER is unavailable; blank texts and texts shorter
    than 3 characters after stripping are reported as NEUTRAL without scoring.
    ``method="hf"`` runs the Hugging Face pipeline over ``batch_size`` chunks,
    with each text cut to 1000 characters. A chunk whose inference raises is
    filled with NEUTRAL placeholders. Any other ``method`` yields ``[]``.
    """
    if method == "vader":
        analyzer = _get_vader()
        if analyzer is None:
            return _simple_lexicon_sentiment(texts)

        scored = []
        for comment in texts:
            if not comment or len(comment.strip()) < 3:
                scored.append({"label": "NEUTRAL", "score": 0.0, "compound": 0.0})
                continue
            compound = analyzer.polarity_scores(comment)["compound"]
            if compound >= 0.05:
                verdict = "POSITIVE"
            elif compound <= -0.05:
                verdict = "NEGATIVE"
            else:
                verdict = "NEUTRAL"
            scored.append({"label": verdict, "score": abs(compound), "compound": compound})
        return scored

    if method != "hf":
        return []

    classifier = _get_hf_pipeline()
    scored = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start: start + batch_size]
        # The pipeline receives a single space in place of an empty/None text.
        prepared = [item[:1000] if item else " " for item in batch]
        try:
            predictions = classifier(prepared)
            for pred in predictions:
                signed = pred["score"] if pred["label"] == "POSITIVE" else -pred["score"]
                scored.append({
                    "label": pred["label"],
                    "score": round(pred["score"], 4),
                    "compound": signed,
                })
        except Exception:
            for _ in batch:
                scored.append({"label": "NEUTRAL", "score": 0.5, "compound": 0.0})
    return scored
168
-
169
-
170
def _simple_lexicon_sentiment(texts: List[str]) -> List[Dict]:
    """Tiny word-list sentiment fallback used when VADER is not installed.

    Each text is lowercased and split into alphabetic words, so punctuation
    attached to a word ("great!") does not prevent a match. A text with more
    distinct positive than negative words is POSITIVE, the reverse is
    NEGATIVE, and ties (including no matches) are NEUTRAL. ``None`` entries
    count as empty text.

    Returns:
        One ``{"label", "score", "compound"}`` dict per input, in order.
    """
    pos_words = {"good", "great", "love", "excellent", "amazing", "wonderful",
                 "best", "happy", "positive", "helpful"}
    neg_words = {"bad", "terrible", "hate", "awful", "worst", "negative",
                 "harmful", "wrong", "fake", "misinformation"}

    results = []
    for text in texts:
        raw = "" if text is None else str(text)
        # Regex tokenisation instead of str.split(): strips commas, "!", etc.
        words = set(re.findall(r"[a-z]+", raw.lower()))
        pos = len(words & pos_words)
        neg = len(words & neg_words)
        if pos > neg:
            results.append({"label": "POSITIVE", "score": 0.7, "compound": 0.5})
        elif neg > pos:
            results.append({"label": "NEGATIVE", "score": 0.7, "compound": -0.5})
        else:
            results.append({"label": "NEUTRAL", "score": 0.5, "compound": 0.0})
    return results
186
 
187
 
188
def sentiment_summary(results: List[Dict]) -> Dict:
    """Roll per-text sentiment dicts up into counts, percentages and mean compound.

    An empty input returns the same keys with zero values. Entries without a
    ``compound`` key contribute 0.0 to the average.
    """
    if not results:
        return {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0, "total": 0,
                "avg_compound": 0.0, "pos_pct": 0, "neg_pct": 0, "neu_pct": 0}

    total = len(results)
    tally = Counter(entry["label"] for entry in results)
    n_pos = tally.get("POSITIVE", 0)
    n_neg = tally.get("NEGATIVE", 0)
    n_neu = tally.get("NEUTRAL", 0)
    mean_compound = np.mean([entry.get("compound", 0.0) for entry in results])

    def share(count):
        # Percentage of all results, one decimal place.
        return round(count / total * 100, 1)

    return {
        "POSITIVE": n_pos,
        "NEGATIVE": n_neg,
        "NEUTRAL": n_neu,
        "total": total,
        "avg_compound": round(float(mean_compound), 3),
        "pos_pct": share(n_pos),
        "neg_pct": share(n_neg),
        "neu_pct": share(n_neu),
    }
208
 
209
 
210
- # ── Keyword / Tag Analysis ────────────────────────────────────────────────────
 
 
 
 
 
 
 
211
 
212
# Common English function words plus URL fragments, ignored during keyword counting.
STOPWORDS = set(
    (
        "the a an is it in on at to for of and or but "
        "this that was are be have has had with from by as "
        "we i you he she they do did not no so if can "
        "will would could should my your his her their our "
        "what how when where who which about just also more "
        "all been were its than then there these those me "
        "him us them up out into after before https http www"
    ).split()
)
221
 
222
def extract_keywords(
    text: str,
    tags: List[str] = None,
    top_n: int = 20,
) -> List[Tuple[str, int]]:
    """Return the ``top_n`` most frequent non-stopword tokens in text plus tags.

    Tokens are lowercase runs of at least three ASCII letters; the result is a
    list of ``(word, count)`` pairs, most frequent first.
    """
    tag_text = " ".join(tags or [])
    haystack = (text + " " + tag_text).lower()

    counts = Counter()
    for word in re.findall(r"[a-zA-Z]{3,}", haystack):
        if word not in STOPWORDS:
            counts[word] += 1
    return counts.most_common(top_n)
232
 
233
 
234
def sentiment_weighted_keywords(
    comments_df: pd.DataFrame,
    sentiment_results: List[Dict],
    top_n: int = 15,
) -> Tuple[List[Tuple[str, float]], List[Tuple[str, float]]]:
    """Return ``(positive_keywords, negative_keywords)`` as ``[(word, weight), ...]``.

    Each comment in ``comments_df["text"]`` is paired positionally with its
    sentiment dict. Every distinct non-stopword token in a POSITIVE or
    NEGATIVE comment gains that comment's ``score`` (0.5 if absent) once.
    Other labels are ignored. Empty inputs yield ``([], [])``.
    """
    if comments_df.empty or not sentiment_results:
        return [], []

    positive: Counter = Counter()
    negative: Counter = Counter()
    buckets = {"POSITIVE": positive, "NEGATIVE": negative}

    comments = comments_df["text"].fillna("").tolist()
    for comment, verdict in zip(comments, sentiment_results):
        tokens = [
            tok
            for tok in re.findall(r"[a-zA-Z]{3,}", comment.lower())
            if tok not in STOPWORDS
        ]
        weight = verdict.get("score", 0.5)
        bucket = buckets.get(verdict["label"])
        if bucket is not None:
            # dict.fromkeys de-duplicates tokens, so a word counts once per comment.
            bucket.update(dict.fromkeys(tokens, weight))

    return positive.most_common(top_n), negative.most_common(top_n)
 
1
  """
2
+ analyzer.py — Mental-health misinformation detection + sentiment analysis.
3
+
4
+ Misinformation: lightweight rule-based 4-stream scorer (no external API needed).
5
+ Sentiment: VADER (fast, CPU) or DistilBERT (accurate, downloads ~500 MB first run).
6
  """
7
 
8
  import re
 
9
  from collections import Counter
 
 
10
 
 
11
  import pandas as pd
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ # ═══════════════════════════════════════════════════════════════════════════════
15
+ # MISINFORMATION DETECTION
16
+ # ═══════════════════════════════════════════════════════════════════════════════
17
+
18
# Phrases whose presence pushes the misinformation score up.
_RED_FLAGS = [
    # conspiracy / suppression framing
    "miracle cure", "they don't want you to know", "big pharma", "doctors hide",
    "secret remedy", "ancient cure", "government censored",
    "fda lies", "fda lie",
    "conspiracy", "natural cure", "detox your brain", "toxins cause",
    # anti-treatment advice
    "no medication needed", "stop taking meds", "heal yourself naturally",
    # absolute efficacy claims
    "100% effective", "guaranteed cure", "scientifically proven cure",
    "instant relief", "suppress the truth", "alternative medicine cures",
    # pseudo-scientific causes
    "vaccines cause mental", "wifi causes", "5g causes", "chemtrails",
    "big pharma doesn't want", "they suppress", "hidden cure",
    # "cure <condition>" claims
    "cure depression", "cure anxiety", "cure schizophrenia", "cure bipolar",
    "cure autism", "cure adhd", "detox cure",
]

# Phrases whose presence pulls the misinformation score down.
_CREDIBILITY = [
    "peer-reviewed", "clinical trial", "randomized controlled", "meta-analysis",
    "published in", "according to research", "study shows", "evidence suggests",
    "licensed therapist", "board-certified", "psychiatrist", "psychologist",
    "cognitive behavioral", "evidence-based", "treatment guidelines",
    "american psychological", "national institute", "who recommends",
    "systematic review", "consult your doctor", "speak to a professional",
    "mental health professional", "contact a therapist",
]

# Sensationalist / clickbait wording.
_CLICKBAIT = [
    "you won't believe", "shocking truth", "the truth about", "exposed",
    "they lied", "watch before deleted", "banned video", "censored truth",
    "must watch", "share before removed", "real truth", "wake up",
    "open your eyes", "mainstream media won't", "what they hide",
]
50
 
 
51
 
52
def detect_misinformation(
    text: str,
    tags: list = None,
    audio_transcript: str = "",
    video_transcript: str = "",
) -> dict:
    """Rule-based misinformation score from four weighted sub-scores.

    All phrase matching is substring matching on the lowercased concatenation
    of ``text``, the tags and both transcripts. The sub-scores are:

    * "Title & Description": red-flag and clickbait hits in the combined text.
    * "Tags": red-flag hits inside individual tags.
    * "Transcript": red-flag hits per 100 words of the combined text.
    * "Credibility Gap": starts at 0.6 and drops 0.12 per credibility phrase.

    Args:
        text: Title/description text.
        tags: Tag strings; ``None`` means no tags. Items are coerced with ``str``.
        audio_transcript: Optional audio transcript.
        video_transcript: Optional video transcript.

    Returns:
        Dict with ``score`` (0-1, higher = more likely misinformation, rounded
        to 4 places), ``confidence_pct``, ``reasoning`` and ``stream_details``.
    """
    # ``tags`` was optional in the earlier signature; keep accepting None and
    # tolerate non-string items so ``join`` cannot raise.
    tag_list = [str(t) for t in (tags or [])]
    combined = (
        f"{text or ''} {' '.join(tag_list)} "
        f"{audio_transcript or ''} {video_transcript or ''}"
    ).lower()
    tags_lower = [t.lower() for t in tag_list]

    # Stream 1: title / description
    red_in_text = sum(1 for r in _RED_FLAGS if r in combined)
    click_in_text = sum(1 for c in _CLICKBAIT if c in combined)
    s1 = min(red_in_text * 0.18 + click_in_text * 0.12, 1.0)

    # Stream 2: tags
    red_in_tags = sum(1 for tag in tags_lower for r in _RED_FLAGS if r in tag)
    s2 = min(red_in_tags * 0.25, 1.0)

    # Stream 3: red-flag density per 100 words (reuses the stream-1 hit count)
    word_count = max(len(combined.split()), 1)
    red_density = red_in_text / (word_count / 100)
    s3 = min(red_density * 0.15, 1.0)

    # Stream 4: credibility deficit (absence of credible language = risk)
    cred_count = sum(1 for c in _CREDIBILITY if c in combined)
    s4 = max(0.0, 0.6 - cred_count * 0.12)

    stream_details = {
        "Title & Description": round(s1, 3),
        "Tags": round(s2, 3),
        "Transcript": round(s3, 3),
        "Credibility Gap": round(s4, 3),
    }

    score = s1 * 0.35 + s2 * 0.20 + s3 * 0.20 + s4 * 0.25
    # Rounded so callers do not see float noise such as 0.15000000000000002.
    score = round(max(0.0, min(1.0, score)), 4)

    if score < 0.35:
        if cred_count:
            reasoning = (
                f"Content uses credible language ({cred_count} credibility markers found). "
                "No major misinformation signals detected in title, tags, or transcript."
            )
        else:
            # Do not claim credible language when no marker was found.
            reasoning = "No major misinformation signals detected in title, tags, or transcript."
    elif score < 0.65:
        reasoning = (
            f"Mixed signals detected — {red_in_text} red-flag phrase(s) alongside "
            f"{cred_count} credibility indicator(s). Manual review recommended before sharing."
        )
    else:
        reasoning = (
            f"High misinformation risk. {red_in_text} red-flag phrase(s) and "
            f"{click_in_text} clickbait indicator(s) detected with low credibility language. "
            "Exercise significant caution."
        )

    return {
        "score": score,
        "confidence_pct": int(round(score * 100)),
        "reasoning": reasoning,
        "stream_details": stream_details,
    }
116
 
117
 
118
+ # ═══════════════════════════════════════════════════════════════════════════════
119
+ # SENTIMENT ANALYSIS
120
+ # ═══════════════════════════════════════════════════════════════════════════════
121
 
122
def analyze_sentiment_batch(
    texts: list,
    method: str = "vader",
    batch_size: int = 64,
) -> list[dict]:
    """Score ``texts`` and return one ``{'label', 'compound', 'score'}`` dict each.

    ``method="hf"`` selects the Hugging Face scorer (``batch_size`` is passed
    through); any other value selects VADER. Empty input returns ``[]``
    without calling either backend.
    """
    if texts:
        if method == "hf":
            return _hf_sentiment(texts, batch_size=batch_size)
        return _vader_sentiment(texts)
    return []
133
+
134
+
135
# Shared VADER analyzer, created on first successful use and then reused, so
# the analyzer is not rebuilt on every call.
_VADER_ANALYZER = None


def _vader_sentiment(texts: list) -> list[dict]:
    """Score texts with VADER; fall back to ``_simple_sentiment`` on any failure.

    Labels use the compound score: ``>= 0.05`` POSITIVE, ``<= -0.05`` NEGATIVE,
    otherwise NEUTRAL. Each input is coerced with ``str`` before scoring.

    Returns:
        One ``{"label", "compound", "score"}`` dict per input, values rounded
        to 4 places; ``score`` is the absolute compound value.
    """
    global _VADER_ANALYZER
    try:
        if _VADER_ANALYZER is None:
            from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
            _VADER_ANALYZER = SentimentIntensityAnalyzer()

        results = []
        for text in texts:
            compound = _VADER_ANALYZER.polarity_scores(str(text))["compound"]
            if compound >= 0.05:
                label = "POSITIVE"
            elif compound <= -0.05:
                label = "NEGATIVE"
            else:
                label = "NEUTRAL"
            results.append(
                {"label": label, "compound": round(compound, 4), "score": round(abs(compound), 4)}
            )
        return results
    except Exception:
        # Deliberate best-effort: a missing package (ImportError) or any scoring
        # error degrades to the dependency-free scorer instead of failing.
        return _simple_sentiment(texts)
150
+
151
+
152
# Shared Hugging Face pipeline, created on first successful use and then
# reused, so the model is not reloaded on every call.
_HF_PIPELINE = None


def _hf_sentiment(texts: list, batch_size: int = 32) -> list[dict]:
    """Score texts with the DistilBERT SST-2 pipeline; fall back to VADER on failure.

    Texts are coerced with ``str``, cut to 512 characters and sent to the
    pipeline in chunks of ``batch_size`` (values below 1 are treated as 1, so
    a non-positive size can no longer yield an empty result for non-empty
    input).

    Returns:
        One ``{"label", "compound", "score"}`` dict per input. ``compound`` is
        the pipeline score, negated unless the label is "POSITIVE".
    """
    global _HF_PIPELINE
    try:
        step = max(1, int(batch_size))
        if _HF_PIPELINE is None:
            from transformers import pipeline as hf_pipeline
            _HF_PIPELINE = hf_pipeline(
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english",
                truncation=True,
                max_length=512,
            )

        results = []
        for start in range(0, len(texts), step):
            chunk = [str(t)[:512] for t in texts[start: start + step]]
            for item in _HF_PIPELINE(chunk):
                label = item["label"]
                confidence = item["score"]
                compound = confidence if label == "POSITIVE" else -confidence
                results.append(
                    {"label": label, "compound": round(compound, 4), "score": round(confidence, 4)}
                )
        return results
    except Exception:
        # Deliberate best-effort: missing transformers, a failed model load or
        # an inference error all degrade to the VADER scorer.
        return _vader_sentiment(texts)
173
+
174
+
175
def _simple_sentiment(texts: list) -> list[dict]:
    """Zero-dependency sentiment fallback based on two small word lists.

    Each text is coerced with ``str``, lowercased and split into alphabetic
    words, so punctuation attached to a word ("great!", "awful,") no longer
    hides it from the vocabulary lookup. More distinct positive than negative
    words gives POSITIVE (compound 0.5), the reverse NEGATIVE (-0.5), and a
    tie NEUTRAL (0.0).

    Returns:
        One ``{"label", "compound", "score"}`` dict per input; ``score`` is
        ``abs(compound)``.
    """
    pos_vocab = {
        "good", "great", "excellent", "love", "amazing", "wonderful", "helpful",
        "best", "thank", "thanks", "awesome", "brilliant", "perfect", "happy",
        "fantastic", "outstanding", "superb", "recommend", "positive", "useful",
    }
    neg_vocab = {
        "bad", "terrible", "awful", "hate", "worst", "horrible", "wrong",
        "false", "misleading", "garbage", "useless", "poor", "disappointing",
        "dangerous", "harmful", "misinformation", "lie", "lies", "fraud",
    }

    results = []
    for text in texts:
        # Regex tokenisation instead of str.split() strips surrounding punctuation.
        words = set(re.findall(r"[a-z]+", str(text).lower()))
        pos = len(words & pos_vocab)
        neg = len(words & neg_vocab)
        if pos > neg:
            label, compound = "POSITIVE", 0.5
        elif neg > pos:
            label, compound = "NEGATIVE", -0.5
        else:
            label, compound = "NEUTRAL", 0.0
        results.append({"label": label, "compound": compound, "score": abs(compound)})
    return results
200
 
201
 
202
+ # ═══════════════════════════════════════════════════════════════════════════════
203
+ # SUMMARY + KEYWORDS
204
+ # ═══════════════════════════════════════════════════════════════════════════════
 
 
 
 
 
 
205
 
206
def sentiment_summary(sentiments: list) -> dict:
    """Summarise sentiment dicts as counts and percentages.

    Returns ``{}`` for empty input. Otherwise returns ``total``, ``pos``,
    ``neg``, ``neu`` and the matching ``*_pct`` values (one decimal place).
    Any label other than "POSITIVE" or "NEGATIVE" is counted as neutral.
    """
    if not sentiments:
        return {}

    total = len(sentiments)
    tally = Counter(entry["label"] for entry in sentiments)
    pos = tally["POSITIVE"]
    neg = tally["NEGATIVE"]
    neu = total - pos - neg

    summary = {"total": total, "pos": pos, "neg": neg, "neu": neu}
    for key, count in (("pos_pct", pos), ("neg_pct", neg), ("neu_pct", neu)):
        summary[key] = round(count / total * 100, 1)
    return summary
222
 
223
 
224
# Words excluded from keyword counts: English function words, common fillers
# and URL fragments.
_STOP = frozenset(
    " ".join(
        (
            # articles, conjunctions, prepositions, forms of "to be"
            "the a an and or but in on at to for of with by from up",
            "is are was were be been being",
            # auxiliaries and modals
            "have has had do does did will would could should may might",
            # pronouns, determiners, negation
            "this that these those it its they them their we our you your",
            "i my he she his her not no so if as",
            # question words and common fillers
            "about what how when who which all just more also can get like",
            "one there than now then very much many some any such other",
            "really even still only well",
            # URL fragments
            "http https www com",
        )
    ).split()
)
232
 
 
 
 
 
 
 
 
 
 
233
 
234
def extract_keywords(text: str, tags: list = None, top_n: int = 15) -> list[tuple]:
    """Return the ``top_n`` most frequent keywords from text and tags.

    Text words are lowercase runs of at least four ASCII letters that are not
    stopwords. Each tag is lowercased and stripped of every non-letter
    character (so a multi-word tag becomes one word), kept if it has at least
    four letters and is not a stopword, and counted three times so tags
    outweigh body text.

    Args:
        text: Free text; ``None`` is treated as empty.
        tags: Tag values; ``None`` means no tags (it was optional in the
            earlier signature). Items are coerced with ``str``.
        top_n: Maximum number of ``(word, count)`` pairs returned.
    """
    words = re.findall(r"\b[a-z]{4,}\b", (text or "").lower())
    filtered = [w for w in words if w not in _STOP]

    cleaned_tags = [re.sub(r"[^a-z]", "", str(t).lower()) for t in (tags or [])]
    tag_words = [w for w in cleaned_tags if len(w) >= 4 and w not in _STOP]

    # Tags are repeated three times to weight them above ordinary text words.
    return Counter(filtered + tag_words * 3).most_common(top_n)
 
 
 
241
 
242
 
243
def sentiment_weighted_keywords(
    df: pd.DataFrame,
    sentiments: list,
    top_n: int = 10,
) -> tuple[list, list]:
    """Return the most frequent words in positive and in negative comments.

    ``df["text"]`` is paired positionally with ``sentiments``. Words are
    lowercase runs of at least four ASCII letters that are not stopwords.
    Comments labelled neither "POSITIVE" nor "NEGATIVE" are ignored.

    Returns:
        ``(positive, negative)``, each a list of ``(word, count)`` pairs of
        length at most ``top_n``; ``([], [])`` when either input is empty.
    """
    if df.empty or not sentiments:
        return [], []

    positive, negative = Counter(), Counter()
    buckets = {"POSITIVE": positive, "NEGATIVE": negative}

    comments = df["text"].fillna("").tolist()
    for comment, verdict in zip(comments, sentiments):
        tokens = [
            tok
            for tok in re.findall(r"\b[a-z]{4,}\b", str(comment).lower())
            if tok not in _STOP
        ]
        bucket = buckets.get(verdict["label"])
        if bucket is not None:
            bucket.update(tokens)

    return positive.most_common(top_n), negative.most_common(top_n)