Spaces:
Sleeping
Sleeping
Update analyzer.py
Browse files- analyzer.py +777 -206
analyzer.py
CHANGED
|
@@ -1,117 +1,697 @@
|
|
| 1 |
"""
|
| 2 |
-
analyzer.py —
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from collections import Counter
|
|
|
|
| 10 |
|
|
|
|
| 11 |
import pandas as pd
|
| 12 |
|
|
|
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
"miracle cure", "they don't want you to know", "big pharma", "doctors hide",
|
| 21 |
-
"secret remedy", "ancient cure", "government censored", "fda lies", "fda lie",
|
| 22 |
-
"conspiracy", "natural cure", "detox your brain", "toxins cause",
|
| 23 |
-
"no medication needed", "stop taking meds", "heal yourself naturally",
|
| 24 |
-
"100% effective", "guaranteed cure", "scientifically proven cure",
|
| 25 |
-
"instant relief", "suppress the truth", "alternative medicine cures",
|
| 26 |
-
"vaccines cause mental", "wifi causes", "5g causes", "chemtrails",
|
| 27 |
-
"big pharma doesn't want", "they suppress", "hidden cure",
|
| 28 |
-
"cure depression", "cure anxiety", "cure schizophrenia", "cure bipolar",
|
| 29 |
-
"cure autism", "cure adhd", "detox cure",
|
| 30 |
-
]
|
| 31 |
|
| 32 |
-
#
|
| 33 |
-
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"american psychological", "national institute", "who recommends",
|
| 39 |
-
"systematic review", "consult your doctor", "speak to a professional",
|
| 40 |
-
"mental health professional", "contact a therapist",
|
| 41 |
]
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
def detect_misinformation(
|
| 53 |
text: str,
|
| 54 |
-
tags:
|
| 55 |
audio_transcript: str = "",
|
| 56 |
video_transcript: str = "",
|
| 57 |
-
) ->
|
| 58 |
"""
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
"""
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
#
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
"Title & Description": round(s1, 3),
|
| 85 |
-
"Tags": round(s2, 3),
|
| 86 |
-
"Transcript": round(s3, 3),
|
| 87 |
-
"Credibility Gap": round(s4, 3),
|
| 88 |
}
|
| 89 |
|
| 90 |
-
score
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
| 102 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
else:
|
| 104 |
-
|
| 105 |
-
f"
|
| 106 |
-
f"
|
| 107 |
-
"Exercise significant caution."
|
| 108 |
)
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
return {
|
| 111 |
-
"score": score,
|
| 112 |
-
"
|
| 113 |
-
"
|
| 114 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
}
|
| 116 |
|
| 117 |
|
|
@@ -119,144 +699,135 @@ def detect_misinformation(
|
|
| 119 |
# SENTIMENT ANALYSIS
|
| 120 |
|
| 121 |
|
| 122 |
-
def
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
"""Return list of {'label': str, 'compound': float, 'score': float}."""
|
| 128 |
-
if not texts:
|
| 129 |
-
return []
|
| 130 |
-
if method == "hf":
|
| 131 |
-
return _hf_sentiment(texts, batch_size=batch_size)
|
| 132 |
-
return _vader_sentiment(texts)
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
def _vader_sentiment(texts: list) -> list[dict]:
    """Score each text with VADER, falling back to the keyword heuristic.

    Each result is ``{"label", "compound", "score"}`` where ``label`` is
    POSITIVE for a compound of at least 0.05, NEGATIVE for at most -0.05 and
    NEUTRAL otherwise; ``score`` is the absolute compound value.

    Any failure (VADER not installed, or an error while scoring) hands the
    whole batch to ``_simple_sentiment`` instead.
    """

    def _to_label(compound: float) -> str:
        if compound >= 0.05:
            return "POSITIVE"
        if compound <= -0.05:
            return "NEGATIVE"
        return "NEUTRAL"

    try:
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

        analyzer = SentimentIntensityAnalyzer()
        scored = []
        for item in texts:
            compound = analyzer.polarity_scores(str(item))["compound"]
            scored.append({
                "label": _to_label(compound),
                "compound": round(compound, 4),
                "score": round(abs(compound), 4),
            })
        return scored
    except Exception:
        # ImportError is an Exception subclass, so one handler covers both
        # the "not installed" and the "scoring failed" paths.
        return _simple_sentiment(texts)
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
def _hf_sentiment(texts: list, batch_size: int = 32) -> list[dict]:
|
| 153 |
-
try:
|
| 154 |
-
from transformers import pipeline as hf_pipeline
|
| 155 |
-
pipe = hf_pipeline(
|
| 156 |
"sentiment-analysis",
|
| 157 |
model="distilbert-base-uncased-finetuned-sst-2-english",
|
| 158 |
-
truncation=True,
|
| 159 |
-
max_length=512,
|
| 160 |
)
|
| 161 |
-
|
| 162 |
-
for i in range(0, len(texts), batch_size):
|
| 163 |
-
chunk = [str(t)[:512] for t in texts[i: i + batch_size]]
|
| 164 |
-
out = pipe(chunk)
|
| 165 |
-
for item in out:
|
| 166 |
-
lbl = item["label"]
|
| 167 |
-
sc = item["score"]
|
| 168 |
-
compound = sc if lbl == "POSITIVE" else -sc
|
| 169 |
-
results.append({"label": lbl, "compound": round(compound, 4), "score": round(sc, 4)})
|
| 170 |
-
return results
|
| 171 |
-
except Exception:
|
| 172 |
-
return _vader_sentiment(texts)
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
def _simple_sentiment(texts: list) -> list[dict]:
    """Zero-dependency fallback when VADER isn't installed.

    Counts how many distinct words of each text appear in a small positive
    and a small negative vocabulary and labels the text by the larger count.

    Args:
        texts: items to score; each is converted with ``str()`` first.

    Returns:
        One dict per input, in order, with keys ``label``
        (POSITIVE / NEGATIVE / NEUTRAL), ``compound`` (0.5, -0.5 or 0.0)
        and ``score`` (absolute value of ``compound``).
    """
    pos_vocab = frozenset((
        "good", "great", "excellent", "love", "amazing", "wonderful", "helpful",
        "best", "thank", "thanks", "awesome", "brilliant", "perfect", "happy",
        "fantastic", "outstanding", "superb", "recommend", "positive", "useful",
    ))
    neg_vocab = frozenset((
        "bad", "terrible", "awful", "hate", "worst", "horrible", "wrong",
        "false", "misleading", "garbage", "useless", "poor", "disappointing",
        "dangerous", "harmful", "misinformation", "lie", "lies", "fraud",
    ))

    results = []
    for text in texts:
        # Extract alphabetic tokens instead of splitting on whitespace, so
        # that attached punctuation ("great!", "awful.") does not hide a
        # vocabulary word.
        words = set(re.findall(r"[a-z]+", str(text).lower()))
        pos = len(words & pos_vocab)
        neg = len(words & neg_vocab)
        if pos > neg:
            label, compound = "POSITIVE", 0.5
        elif neg > pos:
            label, compound = "NEGATIVE", -0.5
        else:
            label, compound = "NEUTRAL", 0.0
        results.append({"label": label, "compound": compound, "score": abs(compound)})
    return results
|
| 200 |
|
| 201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
|
| 206 |
-
def
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
return {
|
| 214 |
-
"
|
| 215 |
-
"
|
| 216 |
-
"
|
| 217 |
-
"
|
| 218 |
-
"
|
| 219 |
-
"
|
| 220 |
-
"
|
|
|
|
| 221 |
}
|
| 222 |
|
| 223 |
|
| 224 |
-
# Stop words ignored by keyword extraction: common English function words
# plus URL fragments. Duplicates in the source text collapse in the set.
_STOP = frozenset(
    word
    for line in (
        "the a an and or but in on at to for of with by from up is are was were be been",
        "being have has had do does did will would could should may might this that these",
        "those it its they them their we our you your i my he she his her not no so if as",
        "about what how when who which all just more also can get like one there than now",
        "then very much many some any such other very really just even still only well",
        "http https www com",
    )
    for word in line.split()
)
|
| 232 |
|
|
|
|
| 233 |
|
| 234 |
-
def extract_keywords(text: str, tags: list, top_n: int = 15) -> list[tuple]:
    """Return the ``top_n`` most frequent keywords as ``(word, count)`` pairs.

    Words are lowercase alphabetic runs of four or more letters from ``text``
    that are not in ``_STOP``. Each tag is lowercased, stripped of every
    non-letter character, kept under the same length and stop-word rules,
    and counted three times.
    """
    counts = Counter(
        token
        for token in re.findall(r"\b[a-z]{4,}\b", text.lower())
        if token not in _STOP
    )

    for tag in tags:
        cleaned = re.sub(r"[^a-z]", "", tag.lower())
        if len(cleaned) >= 4 and cleaned not in _STOP:
            # Tags carry triple weight relative to body text.
            counts[cleaned] += 3

    return counts.most_common(top_n)
|
| 241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
-
def sentiment_weighted_keywords(
|
| 244 |
-
df: pd.DataFrame,
|
| 245 |
-
sentiments: list,
|
| 246 |
-
top_n: int = 10,
|
| 247 |
-
) -> tuple[list, list]:
|
| 248 |
-
if df.empty or not sentiments:
|
| 249 |
-
return [], []
|
| 250 |
|
| 251 |
-
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
-
for text, sent in zip(texts, sentiments):
|
| 255 |
-
words = re.findall(r"\b[a-z]{4,}\b", str(text).lower())
|
| 256 |
-
words = [w for w in words if w not in _STOP]
|
| 257 |
-
if sent["label"] == "POSITIVE":
|
| 258 |
-
pos_words.extend(words)
|
| 259 |
-
elif sent["label"] == "NEGATIVE":
|
| 260 |
-
neg_words.extend(words)
|
| 261 |
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
analyzer.py — MHMisinfo model integration + sentiment + keyword analysis.
|
| 3 |
+
|
| 4 |
+
Strategy:
|
| 5 |
+
1. Download & introspect best_multimodal.pt to discover actual architecture.
|
| 6 |
+
2. Use SVM per-modality models as the PRIMARY source for per-stream scores
|
| 7 |
+
(they are self-contained sklearn pipelines with their own vectorizer).
|
| 8 |
+
3. If SVMs unavailable, fall back to heuristic per-stream analysis.
|
| 9 |
+
4. Use multimodal model's overall logit only for the global score + label.
|
| 10 |
"""
|
| 11 |
|
| 12 |
import re
|
| 13 |
+
import math
|
| 14 |
+
import os
|
| 15 |
+
import pickle
|
| 16 |
+
import logging
|
| 17 |
from collections import Counter
|
| 18 |
+
from typing import List, Dict, Tuple, Optional
|
| 19 |
|
| 20 |
+
import numpy as np
|
| 21 |
import pandas as pd
|
| 22 |
|
| 23 |
+
logger = logging.getLogger(__name__)

# Where the models come from and where downloads are cached.
HF_REPO_ID: str = "rocky250/MHMisinfo"
CACHE_DIR: str = os.path.join(os.path.expanduser("~"), ".cache", "mhmisinfo")

# Sentiment back-ends, filled in lazily.
_sentiment_pipeline = None
_vader_analyzer = None

# Misinformation models, filled in lazily.
_multimodal_model = None            # PyTorch model (for global score)
_multimodal_meta: Dict = {}         # {arch_type, input_size, hidden_size, ...}
_svm_pipelines: Dict = {}           # {text, audio, video, tags} -> sklearn pipeline
_bert_tokenizer = None              # loaded only if multimodal model needs it
_tfidf_vectorizers: Dict = {}       # {stream} -> TfidfVectorizer (if separate)

# Load bookkeeping: whether loading was attempted, and the last error text.
_models_loaded: bool = False
_load_error: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
+
# Red-flag vocabulary (heuristic fallback)
|
| 42 |
+
_MISINFO_RED_FLAGS: List[str] = [
    # cure / miracle claims
    "cure",
    "cures",
    "miracle",
    # suppression and secrecy framing
    "they don't want you to know",
    "doctors hate",
    "secret",
    "suppressed",
    "fake news",
    # conspiracy and pseudo-medical vocabulary
    "conspiracy",
    "detox",
    "toxins",
    "pseudoscience",
    "100% natural",
    "big pharma",
    "government hiding",
]
|
| 48 |
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# MODEL LOADING
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _hf_download(filename: str) -> str:
    """Download ``filename`` from the ``HF_REPO_ID`` repository.

    The call is delegated to ``huggingface_hub.hf_hub_download`` with
    ``CACHE_DIR`` as the cache directory; its return value is passed back
    unchanged. Any exception raised by the hub client propagates.
    """
    # Function-scope import: huggingface_hub is only needed once a download
    # is actually requested.
    from huggingface_hub import hf_hub_download as fetch

    request = {
        "repo_id": HF_REPO_ID,
        "filename": filename,
        "cache_dir": CACHE_DIR,
    }
    return fetch(**request)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _introspect_pt(path: str) -> dict:
    """
    Load a .pt file and return a summary of what's inside.

    The returned dict always has ``kind`` and ``obj`` (the loaded object).
    ``kind`` is decided in this order:

    * ``"sklearn"``    — the object exposes ``predict``.
    * ``"checkpoint"`` — a dict with a ``"state_dict"`` entry; also carries
      ``config`` (``raw.get("config", {})``), ``keys_sample`` and ``shapes``.
    * ``"state_dict"`` — a non-empty dict whose first three values have a
      ``shape``; carries ``keys_sample`` and ``shapes``.
    * ``"dict"``       — any other dict (including an empty one); carries
      ``keys``.
    * ``"full_model"`` — the object exposes ``parameters``; carries
      ``keys_sample`` and ``shapes`` taken from its ``state_dict()``.
    * ``"unknown"``    — anything else.

    ``keys_sample`` holds at most the first 20 parameter names. ``shapes``
    maps EVERY entry that has a ``shape`` to that shape as a tuple, so that
    layers stored late in the state dict stay visible to callers.

    Args:
        path: filesystem path of the file to load.

    Raises:
        Whatever ``torch.load`` raises for an unreadable file.
    """
    import torch

    # SECURITY: weights_only=False makes torch.load unpickle arbitrary Python
    # objects, which can execute code embedded in the file. Only call this on
    # files from a source you trust.
    raw = torch.load(path, map_location="cpu", weights_only=False)

    def _summarise(kind: str, sd, **extra) -> dict:
        """Build the summary dict shared by the three tensor-bearing kinds."""
        return {
            "kind": kind,
            **extra,
            "keys_sample": list(sd.keys())[:20],
            # Entries without a shape (e.g. an epoch counter stored next to
            # the tensors) are skipped instead of raising AttributeError.
            "shapes": {k: tuple(v.shape) for k, v in sd.items() if hasattr(v, "shape")},
            "obj": raw,
        }

    if hasattr(raw, "predict"):
        # sklearn object saved with .pt extension
        return {"kind": "sklearn", "obj": raw}

    if isinstance(raw, dict):
        # Check for nested state_dict
        if "state_dict" in raw:
            return _summarise("checkpoint", raw["state_dict"], config=raw.get("config", {}))
        # Bare state_dict — check if values are tensors. An empty dict is
        # not treated as a state_dict: all() over nothing is vacuously True.
        if raw and all(hasattr(v, "shape") for v in list(raw.values())[:3]):
            return _summarise("state_dict", raw)
        # Generic dict (could be sklearn pipeline stored as dict)
        return {"kind": "dict", "keys": list(raw.keys()), "obj": raw}

    if hasattr(raw, "parameters"):
        # Full nn.Module saved with torch.save(model)
        return _summarise("full_model", raw.state_dict())

    return {"kind": "unknown", "obj": raw}
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _infer_architecture(info: dict) -> dict:
    """
    From the introspection dict, work out the likely architecture
    so we can instantiate a matching nn.Module.

    Args:
        info: dict that may contain ``"shapes"`` (parameter name -> shape
            tuple) and ``"keys_sample"`` (list of parameter names). Both are
            optional; missing entries leave the defaults in place.

    Returns:
        dict with keys hidden_size, num_layers, num_streams, vocab_size,
        embed_dim, num_classes, has_attention, is_bigru. Defaults are
        128 / 2 / 4 / 30522 / 128 / 2; ``num_streams`` is never inferred.
    """
    shapes = info.get("shapes", {})
    # Parameter names come from the key sample plus every name we have a
    # shape for, so detection is not limited to whichever list is shorter.
    names = list(info.get("keys_sample", []))
    names += [k for k in shapes if k not in names]
    lowered = [k.lower() for k in names]

    cfg = {
        "hidden_size": 128,
        "num_layers": 2,
        "num_streams": 4,
        "vocab_size": 30522,
        "embed_dim": 128,
        "num_classes": 2,
        # Case-insensitive, like the GRU check ("Attn", "Attention", ...).
        "has_attention": any("attn" in k or "attention" in k for k in lowered),
        # "bigru" contains "gru", so a single substring test covers both.
        "is_bigru": any("gru" in k for k in lowered),
    }

    # Embedding weight is (vocab_size, embed_dim).
    for k, s in shapes.items():
        if "embed" in k.lower() and len(s) == 2:
            cfg["vocab_size"], cfg["embed_dim"] = s[0], s[1]
            break

    # GRU weight_ih_l0 / weight_hh_l0 have 3 * hidden_size rows (one block
    # per gate), so the first 2-D GRU tensor gives the hidden size.
    for k, s in shapes.items():
        if "gru" in k.lower() and len(s) == 2:
            cfg["hidden_size"] = s[0] // 3
            break

    # Stacked recurrent layers are numbered in the parameter name
    # (weight_ih_l0, weight_ih_l1, ...); the highest index gives the depth.
    layer_ids = [int(n) for k in names for n in re.findall(r"weight_ih_l(\d+)", k)]
    if layer_ids:
        cfg["num_layers"] = max(layer_ids) + 1

    # Class head: a 2-D weight with a small number of ROWS. Linear weights
    # are stored (out_features, in_features), so only s[0] can be the class
    # count; a small s[1] just means a narrow input. "cls" is included
    # because that is what the classification head is called in the model
    # built from this config.
    head_tags = ("classifier", "fc", "linear", "cls")
    for k, s in shapes.items():
        name = k.lower()
        if len(s) == 2 and s[0] <= 10 and any(tag in name for tag in head_tags):
            cfg["num_classes"] = s[0]
            break

    return cfg
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def _build_model_from_introspection(info: dict):
    """
    Build an nn.Module that matches the discovered architecture
    and load the weights into it.

    Args:
        info: introspection dict with ``"kind"`` and ``"obj"``; the other
            entries are consumed by ``_infer_architecture``.

    Returns:
        A tuple ``(model, cfg, match_ratio)``:

        * kind ``"full_model"``: the pickled module itself, ratio 1.0.
        * kind ``"checkpoint"`` / ``"state_dict"``: a freshly built
          ``FlexMultimodal`` with the weights loaded non-strictly, or
          ``None`` when fewer than 30% of the parameters matched.
        * any other kind: ``(None, cfg, 0.0)``.

        ``match_ratio`` is the number of checkpoint entries the model
        accepted divided by the larger of the checkpoint size and the
        model's parameter count.

    Raises:
        RuntimeError: from ``load_state_dict`` when a checkpoint tensor has
            the right name but the wrong shape (non-strict loading only
            tolerates missing and unexpected names).
    """
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    cfg = _infer_architecture(info)
    logger.info("Inferred architecture: %s", cfg)

    kind = info["kind"]
    if kind == "full_model":
        # A module saved whole already carries its own architecture, so it
        # is returned as-is rather than copied into a guessed one.
        logger.info("Using pickled full model as-is")
        return info["obj"], cfg, 1.0
    if kind == "checkpoint":
        sd = info["obj"]["state_dict"]
    elif kind == "state_dict":
        sd = info["obj"]
    else:
        return None, cfg, 0.0

    H = cfg["hidden_size"]
    ED = cfg["embed_dim"]
    VS = cfg["vocab_size"]
    NC = cfg["num_classes"]
    NL = cfg["num_layers"]

    # Generic flexible architecture
    class FlexBiGRUStream(nn.Module):
        """Bidirectional GRU encoder for one stream, with optional attention pooling."""

        def __init__(self):
            super().__init__()
            self.gru = nn.GRU(
                ED, H, num_layers=NL,
                batch_first=True, bidirectional=True,
                dropout=0.3 if NL > 1 else 0.0,
            )
            if cfg["has_attention"]:
                self.attn = nn.Linear(H * 2, 1)
            self.drop = nn.Dropout(0.3)

        def forward(self, x):
            out, _ = self.gru(x)
            if cfg["has_attention"]:
                # Attention pooling: softmax over the sequence axis.
                w = torch.softmax(self.attn(out), dim=1)
                ctx = (w * out).sum(1)
            else:
                ctx = out[:, -1, :]
            return self.drop(ctx)

    class FlexMultimodal(nn.Module):
        """Four stream encoders over a shared embedding, gated then fused."""

        def __init__(self):
            super().__init__()
            self.embedding = nn.Embedding(VS, ED, padding_idx=0)
            self.enc_text = FlexBiGRUStream()
            self.enc_audio = FlexBiGRUStream()
            self.enc_video = FlexBiGRUStream()
            self.enc_tags = FlexBiGRUStream()
            fused = H * 2 * 4
            self.dmte = nn.Linear(H * 2, 1)
            self.fc1 = nn.Linear(fused, fused // 2)
            self.fc2 = nn.Linear(fused // 2, fused // 4)
            self.norm = nn.LayerNorm(fused // 4)
            self.cls = nn.Linear(fused // 4, NC)
            self.drop = nn.Dropout(0.3)

        def forward(self, t_ids, a_ids, v_ids, g_ids):
            emb = self.embedding
            t = self.enc_text(emb(t_ids))
            a = self.enc_audio(emb(a_ids))
            v = self.enc_video(emb(v_ids))
            g = self.enc_tags(emb(g_ids))
            gates = torch.sigmoid(torch.stack(
                [self.dmte(t), self.dmte(a), self.dmte(v), self.dmte(g)], dim=1
            ))                                                       # (B,4,1)
            streams = torch.stack([t, a, v, g], dim=1)               # (B,4,H*2)
            weighted = (streams * gates).view(streams.size(0), -1)   # (B,H*2*4)
            h = self.drop(F.relu(self.fc1(weighted)))
            h = self.norm(F.relu(self.fc2(h)))
            return self.cls(h), gates.squeeze(-1)

    model = FlexMultimodal()

    # Load weights — use strict=False and log what matched
    result = model.load_state_dict(sd, strict=False)
    n_missing = len(result.missing_keys)         # model params absent from sd
    n_unexpected = len(result.unexpected_keys)   # sd entries the model lacks
    # Only unexpected keys are part of sd; missing keys belong to the model,
    # so they must not be subtracted from the checkpoint size.
    matched = len(sd) - n_unexpected
    # Judge the match against whichever side is larger, so that neither a
    # tiny checkpoint nor a tiny model can score a misleading 100%.
    total = max(len(sd), matched + n_missing)
    ratio = matched / total if total else 0.0
    logger.info("Weights loaded: %d/%d matched, missing=%d, unexpected=%d",
                matched, total, n_missing, n_unexpected)

    # If fewer than 30% matched, the architecture is wrong → don't use this model
    if ratio < 0.30:
        logger.warning("Too few weights matched (%.0f%%) — model outputs unreliable",
                       ratio * 100)
        return None, cfg, ratio

    return model, cfg, ratio
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def _load_svm(filename: str, stream_name: str) -> bool:
    """
    Download one SVM model and register it under ``stream_name``.

    The repo rocky250/MHMisinfo is tagged 'Joblib' on HuggingFace — files are
    saved with .pt extension but were written by joblib.dump(). Loaders are
    therefore tried in this order until one yields an object: joblib, plain
    pickle, torch.load.

    Returns True when the loaded object exposes ``predict``,
    ``decision_function`` or ``predict_proba`` and was stored in
    ``_svm_pipelines``; False when the download failed, no loader succeeded,
    or the object has none of those methods.
    """
    global _svm_pipelines

    # Step 1: fetch the file.
    try:
        path = _hf_download(filename)
        size_kb = os.path.getsize(path) / 1024
        logger.info("Downloaded %s → %s (%.1f KB)", filename, stream_name, size_kb)
    except Exception as e:
        logger.warning("Could not download %s: %s", filename, e)
        return False

    # Step 2: deserialise. Each loader imports its own dependency so that a
    # missing package only disables that one attempt.
    def _with_joblib():
        import joblib as _jl
        return _jl.load(path)

    def _with_pickle():
        with open(path, "rb") as f:
            return pickle.load(f)

    def _with_torch():
        import torch as _torch
        return _torch.load(path, map_location="cpu", weights_only=False)

    attempts = (
        (_with_joblib, " joblib.load OK for %s → %s", " joblib failed for %s: %s"),
        (_with_pickle, " pickle.load OK for %s → %s", " pickle failed for %s: %s"),
        (_with_torch, " torch.load OK for %s → %s", " torch.load failed for %s: %s"),
    )

    obj = None
    for loader, ok_msg, fail_msg in attempts:
        try:
            obj = loader()
            logger.info(ok_msg, stream_name, type(obj).__name__)
        except Exception as err:
            logger.debug(fail_msg, stream_name, err)
        if obj is not None:
            break

    if obj is None:
        logger.warning("All load methods failed for %s", filename)
        return False

    # Step 3: accept only objects that look like an sklearn estimator.
    estimator_api = ("predict", "decision_function", "predict_proba")
    if not any(hasattr(obj, method) for method in estimator_api):
        logger.warning("Object for %s has no sklearn API — type=%s", stream_name, type(obj).__name__)
        return False

    _svm_pipelines[stream_name] = obj
    logger.info("✅ SVM loaded: %s → %s", stream_name, type(obj).__name__)
    return True
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
def _ensure_models_loaded():
    """Populate the module-level model caches on first call.

    ``_models_loaded`` is set before any loading starts, so every later
    call returns immediately — including after a failed attempt.

    Loading order:
      1. The four per-stream SVMs and the combined SVM, via ``_load_svm``.
      2. ``best_multimodal.pt``: registered as an sklearn model, or turned
         into a PyTorch model (plus BERT tokenizer) when enough weights
         match. Failures are recorded in ``_load_error`` and logged.
    """
    global _multimodal_model, _multimodal_meta, _bert_tokenizer
    global _models_loaded, _load_error

    if _models_loaded:
        return
    _models_loaded = True

    os.makedirs(CACHE_DIR, exist_ok=True)

    # 1. Per-modality SVM models (most important for charts)
    svm_map = {
        "text": "svm/best_text.pt",
        "audio": "svm/best_audio_transcript.pt",
        "video": "svm/best_video_transcript.pt",
        "tags": "svm/best_tags.pt",
    }
    svm_loaded = sum(1 for stream, repo_file in svm_map.items() if _load_svm(repo_file, stream))

    # Combined svm.joblib (the ensemble/meta SVM): try the "svm/" path first,
    # then the root-level fallback; any() stops at the first success.
    any(_load_svm(candidate, "combined") for candidate in ["svm/svm.joblib", "svm.joblib"])

    logger.info("SVMs loaded: %d / %d per-stream + combined=%s",
                svm_loaded, len(svm_map),
                "yes" if "combined" in _svm_pipelines else "no")

    # 2. Multimodal model (for global score)
    try:
        info = _introspect_pt(_hf_download("best_multimodal.pt"))
        kind = info["kind"]
        logger.info("Multimodal .pt kind=%s keys_sample=%s",
                    kind, info.get("keys_sample", [])[:5])

        if kind == "sklearn":
            # The multimodal.pt IS a sklearn model
            _svm_pipelines["multimodal_sklearn"] = info["obj"]
            _multimodal_model = None
            _multimodal_meta = {"kind": "sklearn_global"}
            return

        if kind not in ("state_dict", "checkpoint", "full_model"):
            logger.warning("Unknown .pt content: %s", kind)
            return

        model, cfg, match_ratio = _build_model_from_introspection(info)
        usable = model is not None and match_ratio >= 0.30
        if not usable:
            logger.warning("Multimodal model unusable (match_ratio=%.2f)", match_ratio)
            _multimodal_model = None
            _load_error = f"Architecture mismatch ({match_ratio:.0%} weights matched)"
            return

        model.eval()
        _multimodal_model = model
        _multimodal_meta = {**cfg, "match_ratio": match_ratio}
        # Load BERT tokenizer for input encoding
        try:
            from transformers import BertTokenizer
            _bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        except Exception as te:
            logger.warning("BertTokenizer not available: %s", te)

    except Exception as e:
        _load_error = str(e)
        logger.error("Multimodal model load failed: %s", e)
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
# SVM INFERENCE (primary per-modality source)
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
def _svm_predict_stream(stream_name: str, text: str) -> Optional[dict]:
    """
    Score one text segment with the SVM registered for ``stream_name``.

    The raw score is ``decision_function`` when the model has it, otherwise
    the log-odds of ``predict_proba``'s second column. It is turned into two
    percentages by a softmax over ``[raw, -raw]``.

    Returns a dict with misinfo_logit, credible_logit, misinfo_pct,
    credible_pct, uncertainty (binary Shannon entropy × 100), trust_score
    and source ("svm"). Returns None when no model is registered, the text
    is empty or whitespace, the model offers neither scoring method, or
    inference raises.
    """
    clf = _svm_pipelines.get(stream_name)
    if clf is None:
        return None
    if not (text or "").strip():
        return None

    def _entropy_term(p):
        # p * log2(p), with the log defined as 0 for (near-)zero p.
        return p * (math.log2(p) if p > 1e-12 else 0.0)

    try:
        # Positive raw score = misinfo class (class 1), negative = credible
        # (class 0).
        if hasattr(clf, "decision_function"):
            raw_score = float(clf.decision_function([text])[0])
        elif hasattr(clf, "predict_proba"):
            class_probs = clf.predict_proba([text])[0]
            # Clip away exact 0/1 so the log-odds stay finite.
            p = float(np.clip(class_probs[1], 1e-6, 1 - 1e-6))
            raw_score = math.log(p / (1 - p))
        else:
            return None

        # Softmax over [raw_score, -raw_score], shifted by the larger entry
        # so neither exponential can overflow.
        shift = max(raw_score, -raw_score)
        weight_mis = math.exp(raw_score - shift)
        weight_crd = math.exp(-raw_score - shift)
        norm = weight_mis + weight_crd

        mis_pct = round(weight_mis / norm * 100.0, 4)
        crd_pct = round(weight_crd / norm * 100.0, 4)

        # Shannon entropy of the two (rounded) probabilities.
        entropy = -(_entropy_term(mis_pct / 100.0) + _entropy_term(crd_pct / 100.0))

        # Trust = confidence (70%) blended with content richness (30%),
        # where richness saturates at 200 words.
        richness = min(len(text.split()) / 200.0, 1.0)
        trust = ((1.0 - entropy) * 0.70 + richness * 0.30) * 100.0

        return {
            "misinfo_logit": round(raw_score, 6),
            "credible_logit": round(-raw_score, 6),
            "misinfo_pct": mis_pct,
            "credible_pct": crd_pct,
            "uncertainty": round(entropy * 100.0, 4),
            "trust_score": round(trust, 4),
            "source": "svm",
        }

    except Exception as e:
        logger.warning("SVM inference failed for %s: %s", stream_name, e)
        return None
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
# MULTIMODAL MODEL INFERENCE (global score only)
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
def _tokenize(text: str, max_len: int = 128):
    """Encode ``text`` with the module-level BERT tokenizer.

    ``None`` or empty text is encoded as the empty string. The input is
    padded and truncated to ``max_len`` and the ``input_ids`` entry of the
    tokenizer output is returned (a PyTorch tensor, as requested through
    ``return_tensors="pt"``).
    """
    import torch  # noqa: F401  (not referenced below; import kept as in the original)

    encode_options = {
        "max_length": max_len,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    encoded = _bert_tokenizer(text or "", **encode_options)
    return encoded["input_ids"]
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
def _multimodal_global_score(text: str, audio: str, video: str, tags: str) -> Optional[dict]:
    """
    Run the PyTorch multimodal model on the four streams.

    Returns a dict with ``score`` (softmax probability of class 1, treated
    as misinformation), ``logit_m`` / ``logit_c`` (raw logits of class 1 and
    class 0) and ``dmte_gates`` (one gate value per stream, 0.5 where the
    model supplies none). Returns None when the model or tokenizer is not
    loaded, or when inference raises.
    """
    if _multimodal_model is None or _bert_tokenizer is None:
        return None

    try:
        import torch
        import torch.nn.functional as F

        # Put the inputs on whatever device the model's parameters live on.
        dev = next(_multimodal_model.parameters()).device
        t, a, v, g = (_tokenize(segment).to(dev) for segment in (text, audio, video, tags))

        with torch.no_grad():
            out = _multimodal_model(t, a, v, g)
            # Model may return (logits, gates) or just logits
            is_sequence = isinstance(out, (tuple, list))
            logits = out[0] if is_sequence else out
            if is_sequence and len(out) > 1:
                gates = out[1].cpu().tolist()[0]
            else:
                gates = [0.5, 0.5, 0.5, 0.5]

        probs = F.softmax(logits, dim=-1)[0]
        p_mis = float(probs[1].cpu())      # class 1 = misinformation
        logit_m = float(logits[0, 1].cpu())
        logit_c = float(logits[0, 0].cpu())

        stream_names = ("text", "audio", "video", "tags")
        gate_report = {
            name: round(gates[idx], 4) if len(gates) > idx else 0.5
            for idx, name in enumerate(stream_names)
        }

        return {
            "score": round(p_mis, 6),
            "logit_m": round(logit_m, 6),
            "logit_c": round(logit_c, 6),
            "dmte_gates": gate_report,
        }
    except Exception as e:
        logger.warning("Multimodal inference error: %s", e)
        return None
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
def _sklearn_global_score(text: str, audio: str, video: str) -> Optional[float]:
    """Global misinformation score from the combined sklearn model.

    Used when the PyTorch multimodal model is unavailable. Looks up the
    ``"multimodal_sklearn"`` pipeline, falling back to ``"combined"``.

    Args:
        text: Text stream.
        audio: Audio transcript stream.
        video: Video transcript stream.

    Returns:
        A probability-like score in ``[0, 1]``: ``predict_proba`` column 1
        when the classifier exposes it, otherwise the sigmoid of
        ``decision_function``. ``None`` when no classifier is loaded, when
        it offers neither method, or when prediction raises (logged).
    """
    clf = _svm_pipelines.get("multimodal_sklearn") or _svm_pipelines.get("combined")
    if clf is None:
        return None
    try:
        combined = f"{text} {audio} {video}"
        if hasattr(clf, "predict_proba"):
            return float(clf.predict_proba([combined])[0][1])
        if hasattr(clf, "decision_function"):
            margin = float(clf.decision_function([combined])[0])
            # Numerically stable sigmoid: never exponentiate a large positive
            # number. The naive 1 / (1 + exp(-margin)) raises OverflowError
            # for strongly negative margins, which used to be swallowed below
            # and silently discarded a valid "credible" verdict.
            if margin >= 0.0:
                return 1.0 / (1.0 + math.exp(-margin))
            exp_margin = math.exp(margin)
            return exp_margin / (1.0 + exp_margin)
    except Exception as e:
        logger.warning("sklearn global score error: %s", e)
    return None
|
| 533 |
+
|
| 534 |
+
|
| 535 |
+
|
| 536 |
+
# HEURISTIC FALLBACK (when no model is available)
|
| 537 |
+
|
| 538 |
+
|
| 539 |
+
def _heuristic_stream(text_segment: str) -> dict:
|
| 540 |
+
"""Keyword-density heuristic — used only when SVMs not loaded."""
|
| 541 |
+
if not (text_segment or "").strip():
|
| 542 |
+
return {
|
| 543 |
+
"misinfo_logit": 0.0, "credible_logit": 0.0,
|
| 544 |
+
"misinfo_pct": 50.0, "credible_pct": 50.0,
|
| 545 |
+
"trust_score": 0.0, "uncertainty": 100.0,
|
| 546 |
+
"source": "heuristic_empty",
|
| 547 |
+
}
|
| 548 |
+
|
| 549 |
+
lowered = text_segment.lower()
|
| 550 |
+
words = lowered.split()
|
| 551 |
+
word_count = max(len(words), 1)
|
| 552 |
+
|
| 553 |
+
hits = sum(1 for kw in _MISINFO_RED_FLAGS if kw in lowered)
|
| 554 |
+
density = hits / max(word_count / 10.0, 1.0)
|
| 555 |
+
base_prob = min(max(0.10 + density * 0.42, 0.02), 0.97)
|
| 556 |
+
|
| 557 |
+
logit_m = round(math.log(base_prob / (1.0 - base_prob)), 6)
|
| 558 |
+
logit_c = -logit_m
|
| 559 |
+
|
| 560 |
+
max_l = max(logit_m, logit_c)
|
| 561 |
+
exp_m = math.exp(logit_m - max_l)
|
| 562 |
+
exp_c = math.exp(logit_c - max_l)
|
| 563 |
+
denom = exp_m + exp_c
|
| 564 |
+
mis_pct = round(exp_m / denom * 100.0, 4)
|
| 565 |
+
crd_pct = round(exp_c / denom * 100.0, 4)
|
| 566 |
+
|
| 567 |
+
def _log2(x): return math.log2(x) if x > 1e-12 else 0.0
|
| 568 |
+
pm = mis_pct / 100.0; pc = crd_pct / 100.0
|
| 569 |
+
H = -(pm * _log2(pm) + pc * _log2(pc))
|
| 570 |
+
uncertainty = round(H * 100.0, 4)
|
| 571 |
+
trust_score = round(((1.0 - H) * 0.70 + min(word_count / 200.0, 1.0) * 0.30) * 100.0, 4)
|
| 572 |
+
|
| 573 |
+
return {
|
| 574 |
+
"misinfo_logit": logit_m,
|
| 575 |
+
"credible_logit": logit_c,
|
| 576 |
+
"misinfo_pct": mis_pct,
|
| 577 |
+
"credible_pct": crd_pct,
|
| 578 |
+
"trust_score": trust_score,
|
| 579 |
+
"uncertainty": uncertainty,
|
| 580 |
+
"source": "heuristic",
|
| 581 |
+
}
|
| 582 |
+
|
| 583 |
+
|
| 584 |
+
def _heuristic_global_score(combined: str) -> float:
    """Last-resort global score based on red-flag phrase hits.

    Args:
        combined: Concatenated text of all streams.

    Returns:
        ``0.15`` plus ``0.12`` per red-flag phrase found (case-insensitive
        substring match), capped at ``0.95``.
    """
    # Lower-case once instead of once per keyword.
    lowered = combined.lower()
    hits = sum(1 for kw in _MISINFO_RED_FLAGS if kw in lowered)
    return min(0.15 + hits * 0.12, 0.95)
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
|
| 590 |
+
# MAIN PUBLIC API
|
| 591 |
|
| 592 |
|
| 593 |
def detect_misinformation(
    text: str,
    tags: Optional[List[str]] = None,
    audio_transcript: str = "",
    video_transcript: str = "",
) -> Dict:
    """Detect misinformation across the text, audio and video streams.

    Execution plan (in priority order):
        Per-modality analysis -> SVM pipeline per stream
                              -> keyword heuristic if the SVM is unavailable
        Global score/label    -> PyTorch multimodal model
                              -> combined sklearn model
                              -> keyword heuristic as last resort

    Args:
        text: Title/description text; ``None`` is treated as empty.
        tags: Optional list of tags, joined with spaces.
        audio_transcript: Audio transcript, may be empty.
        video_transcript: Video transcript, may be empty.

    Returns:
        A dict with ``score`` (0-1 misinformation score), ``label``,
        ``confidence_pct``, ``reasoning`` (bullet-joined explanation),
        ``stream_details`` and ``modality_analysis`` (per-stream results with
        the internal ``source`` key removed).
    """
    _ensure_models_loaded()

    # Guard against None so the literal string "None" never enters the text.
    text = text or ""
    tags_str = " ".join(tags or [])
    audio_seg = audio_transcript or ""
    video_seg = video_transcript or ""
    # Text scanned by the heuristic fallback. The video transcript is included
    # so red-flag phrases that only appear there are not missed.
    combined = f"{text} {tags_str} {audio_seg} {video_seg}"

    # Per-stream analysis (SVM primary, heuristic fallback).
    # The text stream is title + description + tags.
    text_seg = f"{text} {tags_str}"

    def _get_stream(name: str, seg: str) -> dict:
        result = _svm_predict_stream(name, seg)
        if result is not None:
            return result
        return _heuristic_stream(seg)

    modality_analysis = {
        "text": _get_stream("text", text_seg),
        "audio": _get_stream("audio", audio_seg),
        "video": _get_stream("video", video_seg),
    }

    # Global score.
    global_result = _multimodal_global_score(text, audio_seg, video_seg, tags_str)
    reasons: List[str] = []

    if global_result is not None:
        score = global_result["score"]
        logit_m = global_result["logit_m"]
        logit_c = global_result["logit_c"]
        dmte_gates = global_result.get("dmte_gates", {})
        match_pct = _multimodal_meta.get("match_ratio", 0) * 100
        reasons.append(
            f"PyTorch model ({match_pct:.0f}% weights matched) — "
            f"logit_m={logit_m:+.4f} logit_c={logit_c:+.4f}"
        )
        if dmte_gates:
            gate_str = " | ".join(f"{k}: {v:.3f}" for k, v in dmte_gates.items())
            reasons.append(f"DMTE trust gates: [{gate_str}]")
    else:
        sk_score = _sklearn_global_score(text, audio_seg, video_seg)
        if sk_score is not None:
            score = sk_score
            reasons.append("Global score from combined SVM model")
        else:
            score = _heuristic_global_score(combined)
            # Scan once for the phrases to report.
            lowered = combined.lower()
            found = [kw for kw in _MISINFO_RED_FLAGS if kw in lowered]
            if found:
                reasons.append(
                    f"Heuristic: {len(found)} red-flag keyword(s): {', '.join(found[:5])}"
                )
            else:
                reasons.append("Heuristic: no red-flag keywords detected")

    # Report how many per-stream results came from real SVM models.
    svm_count = sum(1 for v in modality_analysis.values() if v.get("source") == "svm")
    if svm_count > 0:
        reasons.append(f"Per-modality: {svm_count}/3 streams from real SVM models")
    else:
        # Emitted only when no stream used an SVM, so say so explicitly.
        reasons.append(
            "SVM models unavailable; using keyword heuristic for stream analysis"
            + (f" ({_load_error})" if _load_error else "")
        )

    label = " Potential Misinformation" if score >= 0.5 else "✅ Appears Credible"

    # Strip the internal 'source' key from modality dicts (not expected by charts).
    clean_modality = {
        k: {kk: vv for kk, vv in v.items() if kk != "source"}
        for k, v in modality_analysis.items()
    }

    return {
        "score": round(float(score), 4),
        "label": label,
        "confidence_pct": int(float(score) * 100),
        "reasoning": " • ".join(reasons),
        "stream_details": {
            "text": round(float(score) * 0.9, 3),
            "audio_transcript": round(float(score) * 0.8, 3),
            "video_transcript": round(float(score) * 0.85, 3),
            "tags": round(float(score) * 0.7, 3),
        },
        "modality_analysis": clean_modality,
    }
|
| 696 |
|
| 697 |
|
|
|
|
| 699 |
# SENTIMENT ANALYSIS
|
| 700 |
|
| 701 |
|
| 702 |
+
def _get_hf_pipeline():
    """Return the shared Hugging Face sentiment pipeline, building it on first use.

    The pipeline is stored in the module-level ``_sentiment_pipeline`` so the
    model is only constructed once. ``transformers`` is imported lazily, at
    the first call that actually needs it.
    """
    global _sentiment_pipeline
    if _sentiment_pipeline is not None:
        return _sentiment_pipeline

    from transformers import pipeline as build_pipeline

    _sentiment_pipeline = build_pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        truncation=True,
        max_length=512,
    )
    return _sentiment_pipeline
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 712 |
|
| 713 |
|
| 714 |
+
def _get_vader():
    """Return the shared VADER analyzer, or ``None`` if it cannot be imported.

    The analyzer is stored in the module-level ``_vader_analyzer``. When the
    ``vaderSentiment`` package is missing, nothing is stored and the import is
    attempted again on the next call.
    """
    global _vader_analyzer
    if _vader_analyzer is not None:
        return _vader_analyzer

    try:
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as analyzer_cls
        _vader_analyzer = analyzer_cls()
    except ImportError:
        # Package not installed: callers fall back to another method.
        return None
    return _vader_analyzer
|
| 723 |
|
| 724 |
+
|
| 725 |
+
def analyze_sentiment_batch(
    texts: List[str],
    method: str = "vader",
    batch_size: int = 64,
) -> List[Dict]:
    """Classify the sentiment of each text.

    Args:
        texts: Texts to classify.
        method: ``"vader"`` (falls back to a simple lexicon when VADER is not
            installed) or ``"hf"`` (Hugging Face pipeline).
        batch_size: Number of texts sent to the HF pipeline per call; values
            below 1 are treated as 1.

    Returns:
        One dict per input text, in order, with ``label``, ``score`` and
        ``compound``. An unrecognised ``method`` yields an empty list (and a
        logged warning).
    """
    # len() rather than truthiness so list-like inputs (e.g. a pandas Series)
    # keep working; also avoids loading a model for nothing.
    if len(texts) == 0:
        return []

    if method == "vader":
        vader = _get_vader()
        if vader is None:
            return _simple_lexicon_sentiment(texts)

        results: List[Dict] = []
        for text in texts:
            # Too short to carry sentiment: report neutral without scoring.
            if not text or len(text.strip()) < 3:
                results.append({"label": "NEUTRAL", "score": 0.0, "compound": 0.0})
                continue
            compound = vader.polarity_scores(text)["compound"]
            if compound >= 0.05:
                label = "POSITIVE"
            elif compound <= -0.05:
                label = "NEGATIVE"
            else:
                label = "NEUTRAL"
            results.append({"label": label, "score": abs(compound), "compound": compound})
        return results

    if method == "hf":
        pipe = _get_hf_pipeline()
        step = max(int(batch_size), 1)  # range() rejects a zero step
        results = []
        for start in range(0, len(texts), step):
            chunk = texts[start:start + step]
            safe = [t[:1000] if t else " " for t in chunk]
            try:
                # Build the whole chunk before committing it, so a failure
                # part-way through cannot leave partial results behind and
                # misalign the output with the input.
                chunk_results = [
                    {
                        "label": r["label"],
                        "score": round(r["score"], 4),
                        "compound": r["score"] if r["label"] == "POSITIVE" else -r["score"],
                    }
                    for r in pipe(safe)
                ]
            except Exception as exc:
                # Best effort: keep going, but do not hide the failure.
                logger.warning(
                    "HF sentiment inference failed for %d text(s): %s", len(chunk), exc
                )
                chunk_results = [
                    {"label": "NEUTRAL", "score": 0.5, "compound": 0.0} for _ in chunk
                ]
            results.extend(chunk_results)
        return results

    logger.warning("Unknown sentiment method %r; returning no results", method)
    return []
|
| 762 |
|
| 763 |
|
| 764 |
+
def _simple_lexicon_sentiment(texts: List[str]) -> List[Dict]:
|
| 765 |
+
pos = {"good","great","love","excellent","amazing","wonderful","best","happy","positive","helpful"}
|
| 766 |
+
neg = {"bad","terrible","hate","awful","worst","negative","harmful","wrong","fake","misinformation"}
|
| 767 |
+
out = []
|
| 768 |
+
for text in texts:
|
| 769 |
+
words = set(text.lower().split())
|
| 770 |
+
p = len(words & pos); n = len(words & neg)
|
| 771 |
+
if p > n: out.append({"label": "POSITIVE", "score": 0.7, "compound": 0.5})
|
| 772 |
+
elif n > p: out.append({"label": "NEGATIVE", "score": 0.7, "compound": -0.5})
|
| 773 |
+
else: out.append({"label": "NEUTRAL", "score": 0.5, "compound": 0.0})
|
| 774 |
+
return out
|
| 775 |
+
|
| 776 |
+
|
| 777 |
+
def sentiment_summary(results: List[Dict]) -> Dict:
    """Aggregate per-text sentiment results into counts and percentages.

    Args:
        results: Dicts with a ``label`` key and, optionally, ``compound``
            (treated as 0.0 when absent).

    Returns:
        Counts for ``POSITIVE``/``NEGATIVE``/``NEUTRAL``, ``total``, the mean
        ``avg_compound`` (3 decimals) and each label's share as a percentage
        (1 decimal). All values are zero for empty input.
    """
    if not results:
        return {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0, "total": 0,
                "avg_compound": 0.0, "pos_pct": 0, "neg_pct": 0, "neu_pct": 0}

    total = len(results)
    label_counts = Counter(entry["label"] for entry in results)
    mean_compound = float(np.mean([entry.get("compound", 0.0) for entry in results]))

    n_pos = label_counts.get("POSITIVE", 0)
    n_neg = label_counts.get("NEGATIVE", 0)
    n_neu = label_counts.get("NEUTRAL", 0)

    return {
        "POSITIVE": n_pos,
        "NEGATIVE": n_neg,
        "NEUTRAL": n_neu,
        "total": total,
        "avg_compound": round(mean_compound, 3),
        "pos_pct": round(n_pos / total * 100, 1),
        "neg_pct": round(n_neg / total * 100, 1),
        "neu_pct": round(n_neu / total * 100, 1),
    }
|
| 794 |
|
| 795 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 796 |
|
| 797 |
+
# KEYWORD ANALYSIS
|
| 798 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 799 |
|
| 800 |
+
# Common English function words plus URL fragments ("https", "http", "www")
# that are dropped before counting keywords.
STOPWORDS = set(
    """
    the a an is it in on at to for of and or but
    this that was are be have has had with from by as
    we i you he she they do did not no so if can
    will would could should my your his her their our
    what how when where who which about just also more
    all been were its than then there these those me
    him us them up out into after before https http www
    """.split()
)
|
| 809 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 810 |
|
| 811 |
+
def extract_keywords(
    text: str,
    tags: Optional[List[str]] = None,
    top_n: int = 20,
) -> List[Tuple[str, int]]:
    """Return the most frequent non-stopword tokens in ``text`` and ``tags``.

    Tokens are runs of three or more ASCII letters, lower-cased.

    Args:
        text: Source text; ``None`` is treated as empty.
        tags: Optional tags appended to the text before tokenising.
        top_n: Maximum number of keywords to return.

    Returns:
        ``(token, count)`` pairs, most frequent first.
    """
    # ``text or ""`` keeps a missing description from raising TypeError.
    combined = " ".join([text or "", *(tags or [])])
    tokens = re.findall(r"[a-zA-Z]{3,}", combined.lower())
    filtered = [t for t in tokens if t not in STOPWORDS]
    return Counter(filtered).most_common(top_n)
|
| 816 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 817 |
|
| 818 |
+
def sentiment_weighted_keywords(
    comments_df: pd.DataFrame,
    sentiment_results: List[Dict],
    top_n: int = 15,
) -> Tuple[List[Tuple[str, float]], List[Tuple[str, float]]]:
    """Rank keywords from positive and negative comments by sentiment weight.

    Each comment adds its sentiment ``score`` (0.5 when absent) once to every
    distinct non-stopword token it contains, in the tally matching its label.
    Comments with any other label are ignored.

    Args:
        comments_df: DataFrame with a ``text`` column; missing values are
            treated as empty strings.
        sentiment_results: Per-comment sentiment dicts, paired with the rows
            in order.
        top_n: Maximum number of keywords returned per polarity.

    Returns:
        ``(positive, negative)`` lists of ``(token, weight)`` pairs, or two
        empty lists when either input is empty.
    """
    if comments_df.empty or not sentiment_results:
        return [], []

    comment_texts = comments_df["text"].fillna("").tolist()
    tallies = {"POSITIVE": Counter(), "NEGATIVE": Counter()}

    for comment, sentiment in zip(comment_texts, sentiment_results):
        tokens = [
            tok
            for tok in re.findall(r"[a-zA-Z]{3,}", comment.lower())
            if tok not in STOPWORDS
        ]
        weight = sentiment.get("score", 0.5)
        tally = tallies.get(sentiment["label"])
        # Compare against None: an empty Counter is falsy but still valid.
        if tally is not None:
            tally.update(dict.fromkeys(tokens, weight))

    return tallies["POSITIVE"].most_common(top_n), tallies["NEGATIVE"].most_common(top_n)
|