rocky250 committed on
Commit
f52d7fe
·
verified ·
1 Parent(s): f0f0ba5

Create analyzer.py

Browse files
Files changed (1) hide show
  1. analyzer.py +258 -0
analyzer.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ analyzer.py — Sentiment analysis, keyword extraction, and misinformation placeholder.
3
+ Handles large comment volumes efficiently via batching + caching.
4
+ """
5
+
6
+ import re
7
+ import math
8
+ from collections import Counter
9
+ from functools import lru_cache
10
+ from typing import List, Dict, Tuple, Optional
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+
15
+ # ── Lazy imports (heavy) ──────────────────────────────────────────────────────
16
# Module-level caches for the heavy sentiment back ends. Both start empty and
# are filled by the loader helpers the first time a back end is requested.
_sentiment_pipeline = _vader_analyzer = None
18
+
19
+
20
def _get_hf_pipeline():
    """Return the shared Hugging Face sentiment pipeline, building it on first use.

    ``transformers`` is imported inside the function so the dependency is
    only loaded when this back end is actually requested. The constructed
    pipeline is stored in the module-level ``_sentiment_pipeline`` cache and
    reused by later calls.
    """
    global _sentiment_pipeline
    if _sentiment_pipeline is not None:
        return _sentiment_pipeline

    from transformers import pipeline as build_pipeline

    options = {
        "model": "distilbert-base-uncased-finetuned-sst-2-english",
        "truncation": True,
        "max_length": 512,
    }
    _sentiment_pipeline = build_pipeline("sentiment-analysis", **options)
    return _sentiment_pipeline
31
+
32
+
33
def _get_vader():
    """Return the shared VADER analyzer, or ``None`` if the package is missing.

    The analyzer is created once and kept in the module-level
    ``_vader_analyzer`` cache. When ``vaderSentiment`` cannot be imported the
    cache stays empty and ``None`` is returned so the caller can fall back.
    """
    global _vader_analyzer
    if _vader_analyzer is not None:
        return _vader_analyzer

    try:
        from vaderSentiment.vaderSentiment import (
            SentimentIntensityAnalyzer as _Analyzer,
        )
        _vader_analyzer = _Analyzer()
    except ImportError:
        # Optional dependency: signal "unavailable" instead of raising.
        return None
    return _vader_analyzer
42
+
43
+
44
+ # ── Misinformation Detector (PLACEHOLDER β€” plug in your model here) ───────────
45
+
46
def detect_misinformation(
    text: str,
    tags: Optional[List[str]] = None,
    audio_transcript: str = "",
    video_transcript: str = "",
) -> Dict:
    """Score content for misinformation using a keyword heuristic.

    PLACEHOLDER: replace the body of this function with a real model
    (e.g. load a checkpoint once and run inference on the four inputs).
    Until then, the score is derived from how many distinct red-flag
    phrases occur, as whole words, in the text, tags, and both transcripts.

    Args:
        text: Main text (title/description). ``None`` is treated as empty.
        tags: Optional list of tag strings.
        audio_transcript: Optional transcript of the audio stream.
        video_transcript: Optional transcript of the video stream.

    Returns:
        A dict with:
            "score": float in [0.15, 0.95], heuristic misinformation score.
            "label": "⚠️ Potential Misinformation" when score >= 0.5,
                otherwise "✅ Appears Credible".
            "confidence_pct": int, the score expressed as a percentage.
            "reasoning": str, human-readable summary joined with " • ".
            "stream_details": dict of per-stream values; in this placeholder
                they are fixed fractions of the overall score.
    """
    # Heuristic placeholder for demo purposes.
    red_flags = (
        "cure", "cures", "miracle", "they don't want you to know",
        "doctors hate", "secret", "suppressed", "fake news",
        "conspiracy", "detox", "toxins", "pseudoscience",
        "100% natural", "big pharma", "government hiding",
    )

    # Scan every modality, including the video transcript.
    parts = [text, " ".join(tags or []), audio_transcript, video_transcript]
    combined = " ".join(part for part in parts if part).lower()

    # Whole-word matching: "secure" must not trigger "cure", and "cures"
    # must count once rather than as both "cure" and "cures".
    found = [
        kw
        for kw in red_flags
        if re.search(r"(?<!\w)" + re.escape(kw) + r"(?!\w)", combined)
    ]
    hits = len(found)
    score = min(0.15 + hits * 0.12, 0.95)

    label = "⚠️ Potential Misinformation" if score >= 0.5 else "✅ Appears Credible"

    reasons = []
    if hits > 0:
        reasons.append(f"Detected {hits} red-flag keyword(s): {', '.join(found[:5])}")
    else:
        reasons.append("No common misinformation red-flag keywords detected.")
    reasons.append("NOTE: This is a placeholder. Connect your MHMisinfo model for real results.")

    return {
        "score": round(score, 4),
        "label": label,
        # round() before int() so float error cannot drop a percentage point.
        "confidence_pct": int(round(score * 100)),
        "reasoning": " • ".join(reasons),
        "stream_details": {
            "text": round(score * 0.9, 3),
            "audio_transcript": round(score * 0.8, 3),
            "video_transcript": round(score * 0.85, 3),
            "tags": round(score * 0.7, 3),
        },
    }
105
+
106
+
107
+ # ── Sentiment Analysis ────────────────────────────────────────────────────────
108
+
109
def analyze_sentiment_batch(
    texts: List[str],
    method: str = "vader",
    batch_size: int = 64,
) -> List[Dict]:
    """Analyze sentiment for a list of texts, one result per input text.

    Back ends:
        "vader" (default): lexicon-based, no model download. A compound
            score >= 0.05 is POSITIVE, <= -0.05 is NEGATIVE, otherwise
            NEUTRAL. If VADER is not installed, the simple built-in lexicon
            fallback is used instead.
        "hf": DistilBERT via the Hugging Face pipeline. Texts are cut to
            1000 characters and sent in groups of ``batch_size``.

    For both back ends, texts that are empty/``None`` or shorter than three
    non-whitespace-trimmed characters skip inference and are reported as
    NEUTRAL with score 0.0.

    Args:
        texts: Input strings (``None`` entries are tolerated).
        method: "vader" or "hf".
        batch_size: Number of texts per HF inference call (must be >= 1;
            ignored by the VADER back end).

    Returns:
        A list the same length as ``texts``; each item is a dict with
        "label", "score" and "compound" (signed score in [-1, 1]).

    Raises:
        ValueError: If ``method`` is unknown, or ``batch_size`` < 1 for "hf".
    """
    if method not in ("vader", "hf"):
        # An unknown back end used to yield an empty list silently, which
        # left callers with results that did not line up with their inputs.
        raise ValueError(
            f"Unknown sentiment method {method!r}; expected 'vader' or 'hf'."
        )
    if method == "hf" and batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")
    if not texts:
        # Nothing to score: avoid loading any back end.
        return []

    if method == "vader":
        vader = _get_vader()
        if vader is None:
            # Fallback: simple lexicon. Normalise None so it cannot crash.
            return _simple_lexicon_sentiment([t or "" for t in texts])

        results: List[Dict] = []
        for text in texts:
            if not text or len(text.strip()) < 3:
                results.append({"label": "NEUTRAL", "score": 0.0, "compound": 0.0})
                continue
            compound = vader.polarity_scores(text)["compound"]
            if compound >= 0.05:
                label = "POSITIVE"
            elif compound <= -0.05:
                label = "NEGATIVE"
            else:
                label = "NEUTRAL"
            results.append({"label": label, "score": abs(compound), "compound": compound})
        return results

    # method == "hf"
    import logging

    pipe = _get_hf_pipeline()

    # Pre-fill every slot as NEUTRAL; only texts worth scoring are replaced,
    # so the output always stays aligned with the input.
    results = [{"label": "NEUTRAL", "score": 0.0, "compound": 0.0} for _ in texts]
    pending = []
    for index, text in enumerate(texts):
        snippet = text[:1000] if text else ""
        if len(snippet.strip()) >= 3:
            pending.append((index, snippet))

    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        try:
            outputs = pipe([snippet for _, snippet in batch])
            scored = [
                {
                    "label": out["label"],
                    "score": round(out["score"], 4),
                    "compound": out["score"] if out["label"] == "POSITIVE" else -out["score"],
                }
                for out in outputs
            ]
            if len(scored) != len(batch):
                raise RuntimeError("pipeline returned an unexpected number of results")
        except Exception:
            # Best effort: one bad batch degrades to NEUTRAL instead of
            # aborting the run, but the failure is no longer silent.
            logging.getLogger(__name__).warning(
                "HF sentiment inference failed for a batch of %d text(s)",
                len(batch),
                exc_info=True,
            )
            scored = [
                {"label": "NEUTRAL", "score": 0.5, "compound": 0.0} for _ in batch
            ]
        for (index, _), item in zip(batch, scored):
            results[index] = item

    return results
168
+
169
+
170
def _simple_lexicon_sentiment(texts: List[str]) -> List[Dict]:
    """Classify texts with a tiny word list; used when VADER is not installed.

    Each text is lower-cased and split into alphabetic words, so punctuation
    attached to a word ("great!") does not hide it. The label is decided by
    whether more distinct positive or negative lexicon words appear; ties
    (including no matches) are NEUTRAL. ``None`` or empty texts are NEUTRAL.

    Args:
        texts: Input strings (``None`` entries are tolerated).

    Returns:
        One dict per input with "label", "score" and "compound". The scores
        are fixed constants per label, not calibrated probabilities.
    """
    pos_words = {"good", "great", "love", "excellent", "amazing", "wonderful",
                 "best", "happy", "positive", "helpful"}
    neg_words = {"bad", "terrible", "hate", "awful", "worst", "negative",
                 "harmful", "wrong", "fake", "misinformation"}
    results = []
    for text in texts:
        # Regex tokenisation strips punctuation; `or ""` guards against None.
        words = set(re.findall(r"[a-z]+", (text or "").lower()))
        pos = len(words & pos_words)
        neg = len(words & neg_words)
        if pos > neg:
            results.append({"label": "POSITIVE", "score": 0.7, "compound": 0.5})
        elif neg > pos:
            results.append({"label": "NEGATIVE", "score": 0.7, "compound": -0.5})
        else:
            results.append({"label": "NEUTRAL", "score": 0.5, "compound": 0.0})
    return results
186
+
187
+
188
def sentiment_summary(results: List[Dict]) -> Dict:
    """Summarise per-text sentiment results as counts, percentages and a mean.

    Args:
        results: Dicts carrying a "label" key and, optionally, "compound"
            (treated as 0.0 when absent).

    Returns:
        A dict with the POSITIVE / NEGATIVE / NEUTRAL counts, "total",
        "avg_compound" (rounded to 3 places) and "pos_pct" / "neg_pct" /
        "neu_pct" (rounded to 1 place). All values are zero for empty input.
    """
    labels = ("POSITIVE", "NEGATIVE", "NEUTRAL")
    pct_keys = ("pos_pct", "neg_pct", "neu_pct")

    if not results:
        return {
            "POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0, "total": 0,
            "avg_compound": 0.0, "pos_pct": 0, "neg_pct": 0, "neu_pct": 0,
        }

    total = len(results)
    tally = Counter(entry["label"] for entry in results)
    mean_compound = float(np.mean([entry.get("compound", 0.0) for entry in results]))

    summary = {name: tally.get(name, 0) for name in labels}
    summary["total"] = total
    summary["avg_compound"] = round(mean_compound, 3)
    for name, key in zip(labels, pct_keys):
        summary[key] = round(tally.get(name, 0) / total * 100, 1)
    return summary
208
+
209
+
210
+ # ── Keyword / Tag Analysis ────────────────────────────────────────────────────
211
+
212
# Common English function words plus URL fragments that are excluded from
# keyword counts.
STOPWORDS = set(
    (
        "the a an is it in on at to for of and or but "
        "this that was are be have has had with from by as "
        "we i you he she they do did not no so if can "
        "will would could should my your his her their our "
        "what how when where who which about just also more "
        "all been were its than then there these those me "
        "him us them up out into after before https http www"
    ).split()
)
221
+
222
def extract_keywords(
    text: str,
    tags: List[str] = None,
    top_n: int = 20,
) -> List[Tuple[str, int]]:
    """Return the ``top_n`` most frequent words found in ``text`` and ``tags``.

    The text and the space-joined tags are lower-cased, split into runs of
    three or more ASCII letters, and stripped of stopwords before counting.

    Returns:
        ``(word, count)`` pairs ordered from most to least frequent.
    """
    tag_text = " ".join(tags) if tags else ""
    haystack = text + " " + tag_text

    counts = Counter()
    for word in re.findall(r"[a-zA-Z]{3,}", haystack.lower()):
        if word in STOPWORDS:
            continue
        counts[word] += 1
    return counts.most_common(top_n)
232
+
233
+
234
def sentiment_weighted_keywords(
    comments_df: pd.DataFrame,
    sentiment_results: List[Dict],
    top_n: int = 15,
) -> Tuple[List[Tuple[str, float]], List[Tuple[str, float]]]:
    """Rank words by the sentiment strength of the comments they appear in.

    Comments (the "text" column, missing values read as empty) are paired
    positionally with ``sentiment_results``. For every POSITIVE or NEGATIVE
    comment, each distinct non-stopword of three or more letters receives
    that comment's "score" (0.5 when absent), accumulated across comments.
    NEUTRAL comments contribute nothing.

    Returns:
        ``(positive_keywords, negative_keywords)``, each a list of
        ``(word, weight)`` pairs holding the ``top_n`` heaviest words.
        Both lists are empty when there are no comments or no results.
    """
    if comments_df.empty or not sentiment_results:
        return [], []

    positive: Counter = Counter()
    negative: Counter = Counter()
    bodies = comments_df["text"].fillna("").tolist()

    for body, verdict in zip(bodies, sentiment_results):
        words = [
            w for w in re.findall(r"[a-zA-Z]{3,}", body.lower())
            if w not in STOPWORDS
        ]
        strength = verdict.get("score", 0.5)
        label = verdict["label"]
        if label == "POSITIVE":
            target = positive
        elif label == "NEGATIVE":
            target = negative
        else:
            continue
        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # word counts once per comment.
        for word in dict.fromkeys(words):
            target[word] += strength

    return positive.most_common(top_n), negative.most_common(top_n)