MHMisinfo

Sleeping

App Files Files Community

rocky250 committed on Apr 19

Commit

f52d7fe

verified ·

1 Parent(s): f0f0ba5

Create analyzer.py

Browse files

Files changed (1) hide show

analyzer.py +258 -0

analyzer.py ADDED Viewed

	@@ -0,0 +1,258 @@

+"""
+analyzer.py — Sentiment analysis, keyword extraction, and a misinformation-detection placeholder.
+Handles large comment volumes via batched inference; heavy models are loaded lazily on first use and then reused.
+"""
+import re
+import math
+from collections import Counter
+from functools import lru_cache
+from typing import List, Dict, Tuple, Optional
+import numpy as np
+import pandas as pd
+# ── Lazy imports (heavy) ──────────────────────────────────────────────────────
+_sentiment_pipeline = None
+_vader_analyzer = None
+def _get_hf_pipeline():
+    global _sentiment_pipeline
+    if _sentiment_pipeline is None:
+        from transformers import pipeline
+        _sentiment_pipeline = pipeline(
+            "sentiment-analysis",
+            model="distilbert-base-uncased-finetuned-sst-2-english",
+            truncation=True,
+            max_length=512,
+        )
+    return _sentiment_pipeline
+def _get_vader():
+    global _vader_analyzer
+    if _vader_analyzer is None:
+        try:
+            from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+            _vader_analyzer = SentimentIntensityAnalyzer()
+        except ImportError:
+            pass
+    return _vader_analyzer
+# ── Misinformation Detector (PLACEHOLDER — plug in your model here) ───────────
+def detect_misinformation(
+    text: str,
+    tags: List[str] = None,
+    audio_transcript: str = "",
+    video_transcript: str = "",
+) -> Dict:
+    """
+    PLACEHOLDER — replace the body of this function with your MHMisinfo model.
+    Expected return format:
+    {
+        "score": float,           # 0.0–1.0, probability of misinformation
+        "label": str,             # "Misinformation" or "Credible"
+        "confidence_pct": int,    # 0–100
+        "reasoning": str,         # human-readable summary
+        "stream_details": dict,   # per-modality trust/sigma/CCM (optional)
+    }
+    """
+    # ── PLUG YOUR MODEL IN HERE ─────────────────────────────────────────────
+    # Example:
+    #   from your_model_module import load_model, run_inference
+    #   model = load_model("path/to/checkpoint")
+    #   result = run_inference(model, text, tags, audio_transcript, video_transcript)
+    #   return result
+    # ────────────────────────────────────────────────────────────────────────
+    # Heuristic placeholder for demo purposes
+    red_flags = [
+        "cure", "cures", "miracle", "they don't want you to know",
+        "doctors hate", "secret", "suppressed", "fake news",
+        "conspiracy", "detox", "toxins", "pseudoscience",
+        "100% natural", "big pharma", "government hiding",
+    ]
+    combined = f"{text} {' '.join(tags or [])} {audio_transcript}".lower()
+    hits = sum(1 for kw in red_flags if kw in combined)
+    score = min(0.15 + hits * 0.12, 0.95)
+    label = "⚠️ Potential Misinformation" if score >= 0.5 else "✅ Appears Credible"
+    reasons = []
+    if hits > 0:
+        found = [kw for kw in red_flags if kw in combined]
+        reasons.append(f"Detected {hits} red-flag keyword(s): {', '.join(found[:5])}")
+    else:
+        reasons.append("No common misinformation red-flag keywords detected.")
+    reasons.append("NOTE: This is a placeholder. Connect your MHMisinfo model for real results.")
+    return {
+        "score":          round(score, 4),
+        "label":          label,
+        "confidence_pct": int(score * 100),
+        "reasoning":      " • ".join(reasons),
+        "stream_details": {
+            "text":              round(score * 0.9, 3),
+            "audio_transcript":  round(score * 0.8, 3),
+            "video_transcript":  round(score * 0.85, 3),
+            "tags":              round(score * 0.7, 3),
+        },
+    }
+# ── Sentiment Analysis ────────────────────────────────────────────────────────
+def analyze_sentiment_batch(
+    texts: List[str],
+    method: str = "vader",
+    batch_size: int = 64,
+) -> List[Dict]:
+    """
+    Analyze sentiment for a list of texts efficiently.
+    For large comment volumes (200+ comments) we use VADER by default:
+      - O(n) linear pass, ~5k comments/second on CPU
+      - No GPU or model download required
+      - Returns compound score in [-1, 1]
+    Switch method="hf" for DistilBERT (slower but more accurate).
+    Efficiency strategy for HF:
+      - Batching: groups texts into batch_size chunks to avoid OOM
+      - Truncation: texts >512 tokens are truncated at the pipeline level
+      - Short-circuit: texts <3 chars skip inference entirely
+    """
+    results = []
+    if method == "vader":
+        vader = _get_vader()
+        if vader is None:
+            # Fallback: simple lexicon
+            return _simple_lexicon_sentiment(texts)
+        for text in texts:
+            if not text or len(text.strip()) < 3:
+                results.append({"label": "NEUTRAL", "score": 0.0, "compound": 0.0})
+                continue
+            vs = vader.polarity_scores(text)
+            compound = vs["compound"]
+            if compound >= 0.05:
+                label = "POSITIVE"
+            elif compound <= -0.05:
+                label = "NEGATIVE"
+            else:
+                label = "NEUTRAL"
+            results.append({"label": label, "score": abs(compound), "compound": compound})
+    elif method == "hf":
+        pipe = _get_hf_pipeline()
+        for i in range(0, len(texts), batch_size):
+            chunk = texts[i: i + batch_size]
+            safe = [t[:1000] if t else " " for t in chunk]
+            try:
+                batch_results = pipe(safe)
+                for r in batch_results:
+                    results.append({
+                        "label":    r["label"],
+                        "score":    round(r["score"], 4),
+                        "compound": r["score"] if r["label"] == "POSITIVE" else -r["score"],
+                    })
+            except Exception:
+                for _ in chunk:
+                    results.append({"label": "NEUTRAL", "score": 0.5, "compound": 0.0})
+    return results
+def _simple_lexicon_sentiment(texts: List[str]) -> List[Dict]:
+    """Ultra-fast lexicon fallback if VADER is not installed."""
+    pos_words = {"good","great","love","excellent","amazing","wonderful","best","happy","positive","helpful"}
+    neg_words = {"bad","terrible","hate","awful","worst","negative","harmful","wrong","fake","misinformation"}
+    results = []
+    for text in texts:
+        words = set(text.lower().split())
+        pos = len(words & pos_words)
+        neg = len(words & neg_words)
+        if pos > neg:
+            results.append({"label": "POSITIVE", "score": 0.7, "compound": 0.5})
+        elif neg > pos:
+            results.append({"label": "NEGATIVE", "score": 0.7, "compound": -0.5})
+        else:
+            results.append({"label": "NEUTRAL", "score": 0.5, "compound": 0.0})
+    return results
+def sentiment_summary(results: List[Dict]) -> Dict:
+    """Aggregate sentiment results into percentage counts."""
+    if not results:
+        return {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0, "total": 0,
+                "avg_compound": 0.0, "pos_pct": 0, "neg_pct": 0, "neu_pct": 0}
+    counts = Counter(r["label"] for r in results)
+    total = len(results)
+    avg_compound = np.mean([r.get("compound", 0.0) for r in results])
+    return {
+        "POSITIVE":    counts.get("POSITIVE", 0),
+        "NEGATIVE":    counts.get("NEGATIVE", 0),
+        "NEUTRAL":     counts.get("NEUTRAL", 0),
+        "total":       total,
+        "avg_compound": round(float(avg_compound), 3),
+        "pos_pct":     round(counts.get("POSITIVE", 0) / total * 100, 1),
+        "neg_pct":     round(counts.get("NEGATIVE", 0) / total * 100, 1),
+        "neu_pct":     round(counts.get("NEUTRAL",  0) / total * 100, 1),
+    }
+# ── Keyword / Tag Analysis ────────────────────────────────────────────────────
+STOPWORDS = {
+    "the","a","an","is","it","in","on","at","to","for","of","and","or","but",
+    "this","that","was","are","be","have","has","had","with","from","by","as",
+    "we","i","you","he","she","they","do","did","not","no","so","if","can",
+    "will","would","could","should","my","your","his","her","their","our",
+    "what","how","when","where","who","which","about","just","also","more",
+    "all","been","were","its","than","then","there","these","those","me",
+    "him","us","them","up","out","into","after","before","https","http","www",
+}
+def extract_keywords(
+    text: str,
+    tags: List[str] = None,
+    top_n: int = 20,
+) -> List[Tuple[str, int]]:
+    """Extract top keywords from combined text + tags by TF (frequency)."""
+    combined = text + " " + " ".join(tags or [])
+    tokens = re.findall(r"[a-zA-Z]{3,}", combined.lower())
+    filtered = [t for t in tokens if t not in STOPWORDS]
+    return Counter(filtered).most_common(top_n)
+def sentiment_weighted_keywords(
+    comments_df: pd.DataFrame,
+    sentiment_results: List[Dict],
+    top_n: int = 15,
+) -> Tuple[List[Tuple[str, float]], List[Tuple[str, float]]]:
+    """
+    Return (positive_keywords, negative_keywords) each as [(word, weight), ...].
+    Weight = TF × avg_sentiment_strength for that word.
+    """
+    if comments_df.empty or not sentiment_results:
+        return [], []
+    texts = comments_df["text"].fillna("").tolist()
+    pos_freq: Counter = Counter()
+    neg_freq: Counter = Counter()
+    for text, sent in zip(texts, sentiment_results):
+        tokens = [t for t in re.findall(r"[a-zA-Z]{3,}", text.lower()) if t not in STOPWORDS]
+        weight = sent.get("score", 0.5)
+        if sent["label"] == "POSITIVE":
+            pos_freq.update({t: weight for t in tokens})
+        elif sent["label"] == "NEGATIVE":
+            neg_freq.update({t: weight for t in tokens})
+    return pos_freq.most_common(top_n), neg_freq.most_common(top_n)