MHMisinfo

Sleeping

App Files Files Community

rocky250 committed on Apr 19

Commit

c6c2c6c

verified ·

1 Parent(s): 27c3779

Update analyzer.py

Browse files

Files changed (1) hide show

analyzer.py +209 -205

analyzer.py CHANGED Viewed

@@ -1,258 +1,262 @@
 """
-analyzer.py — Sentiment analysis, keyword extraction, and misinformation placeholder.
-Handles large comment volumes efficiently via batching + caching.
 """
 import re
-import math
 from collections import Counter
-from functools import lru_cache
-from typing import List, Dict, Tuple, Optional
-import numpy as np
 import pandas as pd
-# ── Lazy imports (heavy) ──────────────────────────────────────────────────────
# Module-level caches for the heavy analyzers; both stay None until first use.
_sentiment_pipeline = None
_vader_analyzer = None


def _get_hf_pipeline():
    """Return the shared DistilBERT sentiment pipeline, building it on first call.

    ``transformers`` is imported inside the function, so importing this module
    does not import that library unless the pipeline is actually requested.
    The created pipeline is stored in the module-level ``_sentiment_pipeline``
    and returned unchanged on later calls.
    """
    global _sentiment_pipeline
    if _sentiment_pipeline is not None:
        return _sentiment_pipeline

    from transformers import pipeline as build_pipeline

    _sentiment_pipeline = build_pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        truncation=True,
        max_length=512,
    )
    return _sentiment_pipeline
def _get_vader():
    """Return the shared VADER analyzer, or None when it cannot be imported.

    The analyzer is created on the first successful call and kept in the
    module-level ``_vader_analyzer``. An ``ImportError`` is absorbed on
    purpose: the function then returns ``None`` and leaves the cache empty,
    so a later call tries the import again.
    """
    global _vader_analyzer
    if _vader_analyzer is not None:
        return _vader_analyzer
    try:
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as _Analyzer
        _vader_analyzer = _Analyzer()
    except ImportError:
        # Cache is still None here; the caller decides how to degrade.
        return None
    return _vader_analyzer
-# ── Misinformation Detector (PLACEHOLDER — plug in your model here) ───────────
 def detect_misinformation(
     text: str,
-    tags: List[str] = None,
     audio_transcript: str = "",
     video_transcript: str = "",
-) -> Dict:
     """
-    PLACEHOLDER — replace the body of this function with your MHMisinfo model.
-    Expected return format:
-    {
-        "score": float,           # 0.0–1.0, probability of misinformation
-        "label": str,             # "Misinformation" or "Credible"
-        "confidence_pct": int,    # 0–100
-        "reasoning": str,         # human-readable summary
-        "stream_details": dict,   # per-modality trust/sigma/CCM (optional)
-    }
     """
-    # ── PLUG YOUR MODEL IN HERE ─────────────────────────────────────────────
-    # Example:
-    #   from your_model_module import load_model, run_inference
-    #   model = load_model("path/to/checkpoint")
-    #   result = run_inference(model, text, tags, audio_transcript, video_transcript)
-    #   return result
-    # ────────────────────────────────────────────────────────────────────────
-    # Heuristic placeholder for demo purposes
-    red_flags = [
-        "cure", "cures", "miracle", "they don't want you to know",
-        "doctors hate", "secret", "suppressed", "fake news",
-        "conspiracy", "detox", "toxins", "pseudoscience",
-        "100% natural", "big pharma", "government hiding",
-    ]
-    combined = f"{text} {' '.join(tags or [])} {audio_transcript}".lower()
-    hits = sum(1 for kw in red_flags if kw in combined)
-    score = min(0.15 + hits * 0.12, 0.95)
-    label = "⚠️ Potential Misinformation" if score >= 0.5 else "✅ Appears Credible"
-    reasons = []
-    if hits > 0:
-        found = [kw for kw in red_flags if kw in combined]
-        reasons.append(f"Detected {hits} red-flag keyword(s): {', '.join(found[:5])}")
     else:
-        reasons.append("No common misinformation red-flag keywords detected.")
-    reasons.append("NOTE: This is a placeholder. Connect your MHMisinfo model for real results.")
     return {
-        "score":          round(score, 4),
-        "label":          label,
-        "confidence_pct": int(score * 100),
-        "reasoning":      " • ".join(reasons),
-        "stream_details": {
-            "text":              round(score * 0.9, 3),
-            "audio_transcript":  round(score * 0.8, 3),
-            "video_transcript":  round(score * 0.85, 3),
-            "tags":              round(score * 0.7, 3),
-        },
     }
-# ── Sentiment Analysis ────────────────────────────────────────────────────────
 def analyze_sentiment_batch(
-    texts: List[str],
     method: str = "vader",
     batch_size: int = 64,
-) -> List[Dict]:
-    """
-    Analyze sentiment for a list of texts efficiently.
-    For large comment volumes (200+ comments) we use VADER by default:
-      - O(n) linear pass, ~5k comments/second on CPU
-      - No GPU or model download required
-      - Returns compound score in [-1, 1]
-    Switch method="hf" for DistilBERT (slower but more accurate).
-    Efficiency strategy for HF:
-      - Batching: groups texts into batch_size chunks to avoid OOM
-      - Truncation: texts >512 tokens are truncated at the pipeline level
-      - Short-circuit: texts <3 chars skip inference entirely
-    """
-    results = []
-    if method == "vader":
-        vader = _get_vader()
-        if vader is None:
-            # Fallback: simple lexicon
-            return _simple_lexicon_sentiment(texts)
         for text in texts:
-            if not text or len(text.strip()) < 3:
-                results.append({"label": "NEUTRAL", "score": 0.0, "compound": 0.0})
-                continue
-            vs = vader.polarity_scores(text)
-            compound = vs["compound"]
-            if compound >= 0.05:
-                label = "POSITIVE"
-            elif compound <= -0.05:
-                label = "NEGATIVE"
-            else:
-                label = "NEUTRAL"
-            results.append({"label": label, "score": abs(compound), "compound": compound})
-    elif method == "hf":
-        pipe = _get_hf_pipeline()
         for i in range(0, len(texts), batch_size):
-            chunk = texts[i: i + batch_size]
-            safe = [t[:1000] if t else " " for t in chunk]
-            try:
-                batch_results = pipe(safe)
-                for r in batch_results:
-                    results.append({
-                        "label":    r["label"],
-                        "score":    round(r["score"], 4),
-                        "compound": r["score"] if r["label"] == "POSITIVE" else -r["score"],
-                    })
-            except Exception:
-                for _ in chunk:
-                    results.append({"label": "NEUTRAL", "score": 0.5, "compound": 0.0})
-    return results
def _simple_lexicon_sentiment(texts: List[str]) -> List[Dict]:
    """Classify texts with two tiny word lists; used when VADER is not installed.

    A text is POSITIVE when it contains more distinct positive words than
    distinct negative words, NEGATIVE in the opposite case, else NEUTRAL.

    Args:
        texts: Comment strings; ``None`` is treated as an empty comment.

    Returns:
        One dict per text with keys ``label``, ``score`` and ``compound``.
    """
    pos_words = {"good","great","love","excellent","amazing","wonderful","best","happy","positive","helpful"}
    neg_words = {"bad","terrible","hate","awful","worst","negative","harmful","wrong","fake","misinformation"}
    results = []
    for text in texts:
        # Extract letter runs instead of splitting on whitespace so that
        # attached punctuation ("great!", "fake.") no longer hides a match.
        words = set(re.findall(r"[a-z]+", str(text or "").lower()))
        pos = len(words & pos_words)
        neg = len(words & neg_words)
        if pos > neg:
            results.append({"label": "POSITIVE", "score": 0.7, "compound": 0.5})
        elif neg > pos:
            results.append({"label": "NEGATIVE", "score": 0.7, "compound": -0.5})
        else:
            results.append({"label": "NEUTRAL", "score": 0.5, "compound": 0.0})
    return results
def sentiment_summary(results: List[Dict]) -> Dict:
    """Summarise per-comment sentiment results as counts, shares and a mean.

    Args:
        results: Dicts carrying a ``label`` key and optionally ``compound``
            (a missing ``compound`` counts as 0.0 in the mean).

    Returns:
        A dict with the raw counts under ``POSITIVE`` / ``NEGATIVE`` /
        ``NEUTRAL``, the number of results under ``total``, the mean compound
        score rounded to 3 places under ``avg_compound``, and the percentage
        shares rounded to 1 place under ``pos_pct`` / ``neg_pct`` /
        ``neu_pct``. Empty input yields the same keys, all zero.
    """
    if not results:
        return {
            "POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0, "total": 0,
            "avg_compound": 0.0, "pos_pct": 0, "neg_pct": 0, "neu_pct": 0,
        }

    total = len(results)
    tally = Counter(entry["label"] for entry in results)
    mean_compound = float(np.mean([entry.get("compound", 0.0) for entry in results]))

    def share(label: str) -> float:
        # Percentage of all results carrying this label.
        return round(tally.get(label, 0) / total * 100, 1)

    return {
        "POSITIVE":    tally.get("POSITIVE", 0),
        "NEGATIVE":    tally.get("NEGATIVE", 0),
        "NEUTRAL":     tally.get("NEUTRAL", 0),
        "total":       total,
        "avg_compound": round(mean_compound, 3),
        "pos_pct":     share("POSITIVE"),
        "neg_pct":     share("NEGATIVE"),
        "neu_pct":     share("NEUTRAL"),
    }
-# ── Keyword / Tag Analysis ────────────────────────────────────────────────────
# Common English function words plus URL fragments, excluded from keyword counts.
STOPWORDS = set(
    "the a an is it in on at to for of and or but "
    "this that was are be have has had with from by as "
    "we i you he she they do did not no so if can "
    "will would could should my your his her their our "
    "what how when where who which about just also more "
    "all been were its than then there these those me "
    "him us them up out into after before https http www".split()
)


def extract_keywords(
    text: str,
    tags: List[str] = None,
    top_n: int = 20,
) -> List[Tuple[str, int]]:
    """Return the ``top_n`` most frequent keywords of ``text`` plus ``tags``.

    Text and tags are joined with spaces and lower-cased; every run of three
    or more ASCII letters is a token, and tokens listed in ``STOPWORDS`` are
    dropped. The result is a list of ``(word, count)`` pairs, most frequent
    first.
    """
    corpus = " ".join((text, *(tags or []))).lower()
    counts = Counter(
        token
        for token in re.findall(r"[a-zA-Z]{3,}", corpus)
        if token not in STOPWORDS
    )
    return counts.most_common(top_n)
 def sentiment_weighted_keywords(
-    comments_df: pd.DataFrame,
-    sentiment_results: List[Dict],
-    top_n: int = 15,
-) -> Tuple[List[Tuple[str, float]], List[Tuple[str, float]]]:
-    """
-    Return (positive_keywords, negative_keywords) each as [(word, weight), ...].
-    Weight = TF × avg_sentiment_strength for that word.
-    """
-    if comments_df.empty or not sentiment_results:
         return [], []
-    texts = comments_df["text"].fillna("").tolist()
-    pos_freq: Counter = Counter()
-    neg_freq: Counter = Counter()
-    for text, sent in zip(texts, sentiment_results):
-        tokens = [t for t in re.findall(r"[a-zA-Z]{3,}", text.lower()) if t not in STOPWORDS]
-        weight = sent.get("score", 0.5)
         if sent["label"] == "POSITIVE":
-            pos_freq.update({t: weight for t in tokens})
         elif sent["label"] == "NEGATIVE":
-            neg_freq.update({t: weight for t in tokens})
-    return pos_freq.most_common(top_n), neg_freq.most_common(top_n)

 """
+analyzer.py — Mental-health misinformation detection + sentiment analysis.
+Misinformation: lightweight rule-based 4-stream scorer (no external API needed).
+Sentiment: VADER (fast, CPU) or DistilBERT (accurate, downloads ~500 MB first run).
 """
 import re
 from collections import Counter
 import pandas as pd
+# ═══════════════════════════════════════════════════════════════════════════════
+#  MISINFORMATION DETECTION
+# ═══════════════════════════════════════════════════════════════════════════════
# Signals that raise the misinformation score
_RED_FLAGS = [
    "miracle cure", "they don't want you to know", "big pharma", "doctors hide",
    "secret remedy", "ancient cure", "government censored", "fda lies", "fda lie",
    "conspiracy", "natural cure", "detox your brain", "toxins cause",
    "no medication needed", "stop taking meds", "heal yourself naturally",
    "100% effective", "guaranteed cure", "scientifically proven cure",
    "instant relief", "suppress the truth", "alternative medicine cures",
    "vaccines cause mental", "wifi causes", "5g causes", "chemtrails",
    "big pharma doesn't want", "they suppress", "hidden cure",
    "cure depression", "cure anxiety", "cure schizophrenia", "cure bipolar",
    "cure autism", "cure adhd", "detox cure",
]

# Signals that reduce the misinformation score
_CREDIBILITY = [
    "peer-reviewed", "clinical trial", "randomized controlled", "meta-analysis",
    "published in", "according to research", "study shows", "evidence suggests",
    "licensed therapist", "board-certified", "psychiatrist", "psychologist",
    "cognitive behavioral", "evidence-based", "treatment guidelines",
    "american psychological", "national institute", "who recommends",
    "systematic review", "consult your doctor", "speak to a professional",
    "mental health professional", "contact a therapist",
]

# Clickbait / sensationalist language
_CLICKBAIT = [
    "you won't believe", "shocking truth", "the truth about", "exposed",
    "they lied", "watch before deleted", "banned video", "censored truth",
    "must watch", "share before removed", "real truth", "wake up",
    "open your eyes", "mainstream media won't", "what they hide",
]


def detect_misinformation(
    text: str,
    tags: "list | None" = None,
    audio_transcript: str = "",
    video_transcript: str = "",
) -> dict:
    """Score content for mental-health misinformation with phrase rules.

    Four sub-scores in [0, 1] are combined with fixed weights
    (0.35 / 0.20 / 0.20 / 0.25) into a final score where higher means more
    likely misinformation. Phrase matching is case-insensitive substring
    containment.

    Args:
        text: Title and/or description.
        tags: Tag strings; ``None`` or an empty list means no tags.
            Non-string entries are converted with ``str()``.
        audio_transcript: Audio transcript, if available.
        video_transcript: Video transcript, if available.

    Returns:
        A dict with ``score`` (float in [0, 1]), ``confidence_pct`` (the score
        as a rounded integer percentage), ``reasoning`` (one of three
        human-readable summaries chosen by score band) and ``stream_details``
        (the four sub-scores rounded to 3 places).
    """
    tag_list = [str(tag) for tag in (tags or [])]

    # Typographic apostrophes (U+2019) are folded to ASCII so phrases such as
    # "they don't want you to know" still match text copied from the web.
    combined = (
        f"{text} {' '.join(tag_list)} {audio_transcript} {video_transcript}"
        .lower()
        .replace("\u2019", "'")
    )
    tags_lower = [tag.lower().replace("\u2019", "'") for tag in tag_list]

    # NOTE(review): streams 1 and 3 both scan the *combined* text, not only
    # the title/description or only the transcripts, so the stream names in
    # `stream_details` are nominal. Kept as-is to preserve existing scores.
    red_in_text = sum(1 for r in _RED_FLAGS if r in combined)
    click_in_text = sum(1 for c in _CLICKBAIT if c in combined)

    # Stream 1 — title / description
    s1 = min((red_in_text * 0.18 + click_in_text * 0.12), 1.0)

    # Stream 2 — tags: every (tag, phrase) pair that matches counts once.
    red_in_tags = sum(1 for tag in tags_lower for r in _RED_FLAGS if r in tag)
    s2 = min(red_in_tags * 0.25, 1.0)

    # Stream 3 — red-flag phrases per 100 words of the combined text.
    word_count = max(len(combined.split()), 1)
    red_density = red_in_text / (word_count / 100)
    s3 = min(red_density * 0.15, 1.0)

    # Stream 4 — credibility deficit (absence of credible language = risk)
    cred_count = sum(1 for c in _CREDIBILITY if c in combined)
    s4 = max(0.0, 0.6 - cred_count * 0.12)  # starts at 0.6, falls with credibility

    stream_details = {
        "Title & Description": round(s1, 3),
        "Tags":                round(s2, 3),
        "Transcript":          round(s3, 3),
        "Credibility Gap":     round(s4, 3),
    }

    score = (s1 * 0.35 + s2 * 0.20 + s3 * 0.20 + s4 * 0.25)
    score = max(0.0, min(1.0, score))

    if score < 0.35:
        no_signals = "No major misinformation signals detected in title, tags, or transcript."
        if cred_count:
            reasoning = (
                f"Content uses credible language ({cred_count} credibility markers found). "
                + no_signals
            )
        else:
            # Do not claim credible language when no marker was found.
            reasoning = no_signals
    elif score < 0.65:
        reasoning = (
            f"Mixed signals detected — {red_in_text} red-flag phrase(s) alongside "
            f"{cred_count} credibility indicator(s). Manual review recommended before sharing."
        )
    else:
        reasoning = (
            f"High misinformation risk. {red_in_text} red-flag phrase(s) and "
            f"{click_in_text} clickbait indicator(s) detected with low credibility language. "
            "Exercise significant caution."
        )

    return {
        "score":          score,
        "confidence_pct": int(round(score * 100)),
        "reasoning":      reasoning,
        "stream_details": stream_details,
    }
+# ═══════════════════════════════════════════════════════════════════════════════
+#  SENTIMENT ANALYSIS
+# ═══════════════════════════════════════════════════════════════════════════════
 def analyze_sentiment_batch(
+    texts: list,
     method: str = "vader",
     batch_size: int = 64,
+) -> list[dict]:
+    """Return list of {'label': str, 'compound': float, 'score': float}."""
+    if not texts:
+        return []
+    if method == "hf":
+        return _hf_sentiment(texts, batch_size=batch_size)
+    return _vader_sentiment(texts)
# VADER analyzer, created on first successful use and reused afterwards so the
# analyzer is not rebuilt on every call.
_VADER_ANALYZER = None


def _vader_sentiment(texts: list) -> list[dict]:
    """Score texts with VADER, degrading to ``_simple_sentiment`` on failure.

    Each text is converted with ``str()`` and labelled from its compound
    score: POSITIVE at >= 0.05, NEGATIVE at <= -0.05, otherwise NEUTRAL.

    Returns:
        One dict per text with ``label``, ``compound`` (rounded to 4 places)
        and ``score`` (absolute compound, rounded to 4 places).
    """
    global _VADER_ANALYZER
    try:
        if _VADER_ANALYZER is None:
            from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
            _VADER_ANALYZER = SentimentIntensityAnalyzer()

        results = []
        for text in texts:
            c = _VADER_ANALYZER.polarity_scores(str(text))["compound"]
            if c >= 0.05:
                label = "POSITIVE"
            elif c <= -0.05:
                label = "NEGATIVE"
            else:
                label = "NEUTRAL"
            results.append({"label": label, "compound": round(c, 4), "score": round(abs(c), 4)})
        return results
    except Exception:
        # Deliberate best effort: a missing package (ImportError) or any
        # analyzer failure falls back to the zero-dependency scorer for the
        # whole input, so the output stays aligned with `texts`.
        return _simple_sentiment(texts)
# Hugging Face pipeline, created on first successful use and reused afterwards
# so the model is not rebuilt on every call.
_HF_PIPELINE = None


def _hf_sentiment(texts: list, batch_size: int = 32) -> list[dict]:
    """Score texts with the DistilBERT SST-2 pipeline, falling back to VADER.

    Args:
        texts: Items to score; each is converted with ``str()`` and cut to
            its first 512 characters before being sent to the pipeline.
        batch_size: Texts per pipeline call. Values below 1 are treated as 1.

    Returns:
        One dict per text with ``label`` (as reported by the model),
        ``score`` (model confidence, 4 places) and ``compound`` (the score,
        negated unless the label is POSITIVE).
    """
    global _HF_PIPELINE
    try:
        if _HF_PIPELINE is None:
            from transformers import pipeline as hf_pipeline
            _HF_PIPELINE = hf_pipeline(
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english",
                truncation=True,
                max_length=512,
            )

        # A zero step makes range() raise and a negative step yields no
        # batches at all (silently returning []); clamp to at least one.
        step = max(1, batch_size)

        results = []
        for i in range(0, len(texts), step):
            chunk = [str(t)[:512] for t in texts[i: i + step]]
            for item in _HF_PIPELINE(chunk):
                lbl = item["label"]
                sc = item["score"]
                compound = sc if lbl == "POSITIVE" else -sc
                results.append({"label": lbl, "compound": round(compound, 4), "score": round(sc, 4)})
        return results
    except Exception:
        # Deliberate best effort: missing library, failed model load or an
        # inference error re-scores the whole input with VADER, so the
        # output stays aligned with `texts`.
        return _vader_sentiment(texts)
def _simple_sentiment(texts: list) -> list[dict]:
    """Zero-dependency fallback when VADER isn't installed.

    Each text is converted with ``str()``, lower-cased and reduced to its set
    of distinct letter-only words. It is POSITIVE when more positive than
    negative vocabulary words occur, NEGATIVE in the opposite case, and
    NEUTRAL on a tie.

    Returns:
        One dict per text with ``label``, ``compound`` (0.5, -0.5 or 0.0) and
        ``score`` (absolute compound).
    """
    pos_vocab = {
        "good", "great", "excellent", "love", "amazing", "wonderful", "helpful",
        "best", "thank", "thanks", "awesome", "brilliant", "perfect", "happy",
        "fantastic", "outstanding", "superb", "recommend", "positive", "useful",
    }
    neg_vocab = {
        "bad", "terrible", "awful", "hate", "worst", "horrible", "wrong",
        "false", "misleading", "garbage", "useless", "poor", "disappointing",
        "dangerous", "harmful", "misinformation", "lie", "lies", "fraud",
    }
    results = []
    for text in texts:
        # Extract letter runs instead of splitting on whitespace so that
        # attached punctuation ("great!", "lies.") no longer hides a match.
        words = set(re.findall(r"[a-z]+", str(text).lower()))
        pos = len(words & pos_vocab)
        neg = len(words & neg_vocab)
        if pos > neg:
            label, compound = "POSITIVE",  0.5
        elif neg > pos:
            label, compound = "NEGATIVE", -0.5
        else:
            label, compound = "NEUTRAL",   0.0
        results.append({"label": label, "compound": compound, "score": abs(compound)})
    return results
+# ═══════════════════════════════════════════════════════════════════════════════
+#  SUMMARY + KEYWORDS
+# ═══════════════════════════════════════════════════════════════════════════════
def sentiment_summary(sentiments: list) -> dict:
    """Aggregate per-text sentiment dicts into counts and percentage shares.

    Every entry must carry a ``label`` key. Labels other than POSITIVE and
    NEGATIVE are counted as neutral. Empty input returns an empty dict.

    Returns:
        ``total``, ``pos``, ``neg`` and ``neu`` counts plus ``pos_pct``,
        ``neg_pct`` and ``neu_pct`` rounded to one decimal place.
    """
    if not sentiments:
        return {}

    labels = [entry["label"] for entry in sentiments]
    total = len(labels)
    pos = labels.count("POSITIVE")
    neg = labels.count("NEGATIVE")
    neu = total - pos - neg

    def pct(count: int) -> float:
        # Share of all entries, as a percentage.
        return round(count / total * 100, 1)

    return {
        "total":   total,
        "pos":     pos,
        "neg":     neg,
        "neu":     neu,
        "pos_pct": pct(pos),
        "neg_pct": pct(neg),
        "neu_pct": pct(neu),
    }
# Words excluded from keyword counts: common English function words, filler
# adverbs and URL fragments.
_STOP = frozenset(
    "the a an and or but in on at to for of with by from up is are was were be been "
    "being have has had do does did will would could should may might this that these "
    "those it its they them their we our you your i my he she his her not no so if as "
    "about what how when who which all just more also can get like one there than now "
    "then very much many some any such other very really just even still only well "
    "http https www com".split()
)


def extract_keywords(text: str, tags: "list | None" = None, top_n: int = 15) -> list[tuple]:
    """Return the ``top_n`` most frequent keywords from ``text`` and ``tags``.

    Words are runs of four or more lowercase ASCII letters that are not in
    ``_STOP``. Each tag is reduced to its letters only (so a multi-word tag
    becomes one token) and counted three times, which ranks tag terms above
    words that merely appear once in the text.

    Args:
        text: Free text such as a title or description.
        tags: Tag strings; ``None`` means no tags. Non-string entries are
            converted with ``str()``.
        top_n: Maximum number of ``(word, count)`` pairs to return.

    Returns:
        ``(word, count)`` pairs ordered from most to least frequent.
    """
    words = re.findall(r"\b[a-z]{4,}\b", text.lower())
    filtered = [w for w in words if w not in _STOP]

    # `tags or []` restores tolerance for a missing tag list.
    tag_words = [re.sub(r"[^a-z]", "", str(t).lower()) for t in (tags or [])]
    tag_words = [w for w in tag_words if len(w) >= 4 and w not in _STOP]

    all_words = filtered + tag_words * 3
    return Counter(all_words).most_common(top_n)
 def sentiment_weighted_keywords(
+    df: pd.DataFrame,
+    sentiments: list,
+    top_n: int = 10,
+) -> tuple[list, list]:
+    if df.empty or not sentiments:
         return [], []
+    pos_words, neg_words = [], []
+    texts = df["text"].fillna("").tolist()
+    for text, sent in zip(texts, sentiments):
+        words = re.findall(r"\b[a-z]{4,}\b", str(text).lower())
+        words = [w for w in words if w not in _STOP]
         if sent["label"] == "POSITIVE":
+            pos_words.extend(words)
         elif sent["label"] == "NEGATIVE":
+            neg_words.extend(words)
+    return Counter(pos_words).most_common(top_n), Counter(neg_words).most_common(top_n)