"""
ML Classifier for VeriLens AI
Primary: HuggingFace text-classification pipeline (DistilBERT).
Fallback: Heuristic keyword-based scoring when the model is unavailable.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from pathlib import Path
logger = logging.getLogger(__name__)
# ── Lazy-loaded globals ─────────────────────────────────────────────────────
# True only after load_model() has built the pipeline successfully.
_model_ready = False
# Holds the text-classification pipeline object; None until it is loaded.
_pipeline = None
@dataclass
class ClassificationResult:
    """
    Outcome of classifying one piece of text.

    Attributes:
        label: Verdict string. The classifiers in this module produce
            "FAKE", "REAL" or "UNCERTAIN".
        confidence: Score attached to the verdict, nominally in the
            range 0.0 to 1.0.
    """

    label: str
    confidence: float
# ── Heuristic fallback ──────────────────────────────────────────────────────
# Lower-case phrases whose presence counts towards a FAKE verdict in the
# keyword heuristic. Matching is by plain substring on lower-cased text.
_FAKE_SIGNALS = [
    "you won't believe",
    "shocking",
    "exposed",
    "secret",
    "they don't want you to know",
    "mind-blowing",
    "conspiracy",
    "cover-up",
    "banned",
    "censored",
    "wake up",
    "big pharma",
    "doctors hate",
    "one weird trick",
    "must watch",
    "share before it's too late",
    "mainstream media won't tell you",
    "spread this before it's deleted",
    "bombshell",
    "unbelievable",
]

# Lower-case phrases whose presence counts towards a REAL verdict in the
# keyword heuristic.
_REAL_SIGNALS = [
    "according to",
    "officials said",
    "the report states",
    "data shows",
    "peer-reviewed",
    "study published",
    "reuters",
    "associated press",
    "confirmed by",
    "government statement",
    "press release",
    "research findings",
    "published in the journal",
    "the investigation found",
]
def _heuristic_classify(text: str) -> ClassificationResult:
    """
    Score *text* by counting known fake-news and real-news phrases.

    Used whenever the transformer model is unavailable. Each phrase counts at
    most once, matched as a substring of the lower-cased text. The share of
    fake phrases among all matched phrases decides the verdict:

    * no phrase matched     -> "UNCERTAIN" with confidence 0.50
    * share above 0.6       -> "FAKE"
    * share below 0.4       -> "REAL"
    * anything in between   -> "UNCERTAIN" with confidence 0.55

    FAKE/REAL confidences are 0.5 plus 0.4 times the winning share, rounded
    to two decimals.
    """
    haystack = text.lower()
    fake_count = sum(phrase in haystack for phrase in _FAKE_SIGNALS)
    real_count = sum(phrase in haystack for phrase in _REAL_SIGNALS)
    matched = fake_count + real_count

    # Nothing recognisable in the text: no basis for a verdict.
    if not matched:
        return ClassificationResult(label="UNCERTAIN", confidence=0.50)

    fake_share = fake_count / matched
    if fake_share > 0.6:
        verdict, winning_share = "FAKE", fake_share
    elif fake_share < 0.4:
        verdict, winning_share = "REAL", 1 - fake_share
    else:
        # Signals roughly balance out.
        return ClassificationResult(label="UNCERTAIN", confidence=0.55)

    return ClassificationResult(
        label=verdict,
        confidence=round(0.5 + winning_share * 0.4, 2),
    )
# ── Model loading ───────────────────────────────────────────────────────────
# Directory holding the locally fine-tuned model, located next to this file.
_LOCAL_MODEL_DIR = Path(__file__).resolve().parent.joinpath("trained_model_v2")
# Hub id used when no usable local model directory is present.
_REMOTE_MODEL_ID = "hamzab/roberta-fake-news-classification"


def _select_device(torch):
    """Return the torch device to run on, preferring CUDA, then MPS, then CPU."""
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        logger.info(
            "Hardware detection: NVIDIA GPU (%s) found. Routing to CUDA.", gpu_name
        )
        return torch.device("cuda")

    # `torch.backends.mps` does not exist on older PyTorch builds. Looking it
    # up defensively keeps those installs on the CPU path instead of raising
    # AttributeError and disabling the model altogether.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        logger.info("Hardware detection: Apple Silicon found. Routing to MPS.")
        return torch.device("mps")

    logger.info("Hardware detection: No GPU found. Defaulting to CPU.")
    return torch.device("cpu")


def _resolve_model_path() -> str:
    """Return the local model directory when it has a config.json, else the hub id."""
    # config.json can only exist if the directory itself exists, so one check suffices.
    if (_LOCAL_MODEL_DIR / "config.json").exists():
        model_path = str(_LOCAL_MODEL_DIR)
        logger.info("Loading locally trained model from %s …", model_path)
        return model_path

    logger.info("Loading HuggingFace remote model: %s …", _REMOTE_MODEL_ID)
    return _REMOTE_MODEL_ID


def load_model() -> None:
    """
    Load the text-classification pipeline into the module-level cache.

    Prefers the locally fine-tuned model in ``_LOCAL_MODEL_DIR`` (used only
    when that directory contains a ``config.json``); otherwise loads
    ``hamzab/roberta-fake-news-classification`` from the HuggingFace Hub.

    Call once at startup. After a successful load, further calls return
    immediately. After a failed load the next call tries again.

    Never raises for load problems: any ``Exception`` (missing ``torch`` or
    ``transformers``, download failure, bad model files, ...) is logged as a
    warning and the module stays in heuristic-fallback mode.
    """
    global _pipeline, _model_ready
    if _model_ready:
        return

    try:
        # Imported here rather than at module level so this module can still
        # be imported, and the heuristic used, when these are not installed.
        from transformers import pipeline as hf_pipeline
        import torch

        active_device = _select_device(torch)
        model_path = _resolve_model_path()

        # Inputs longer than 512 tokens are truncated by the tokenizer.
        _pipeline = hf_pipeline(
            "text-classification",
            model=model_path,
            truncation=True,
            max_length=512,
            device=active_device,
        )
    except Exception as exc:
        # Deliberate best-effort: fall back to the heuristic instead of crashing.
        _pipeline = None
        _model_ready = False
        logger.warning("Could not load model (%s). Using heuristic fallback.", exc)
    else:
        _model_ready = True
        logger.info("Model loaded successfully.")
def _normalise_label(raw_label: str) -> str:
    """Map an upper-cased model label onto "FAKE", "REAL" or "UNCERTAIN"."""
    if "FAKE" in raw_label or raw_label == "LABEL_0":
        return "FAKE"
    # "TRUE" is compared exactly (not as a substring) so a label such as
    # "UNTRUE" is not mistaken for the genuine class.
    # NOTE(review): the remote checkpoint appears to name its genuine class
    # "TRUE" rather than "REAL" - confirm against its config.json id2label.
    if "REAL" in raw_label or raw_label in ("LABEL_1", "TRUE"):
        return "REAL"
    return "UNCERTAIN"


def classify(text: str) -> ClassificationResult:
    """
    Classify *text* as REAL or FAKE.

    Uses the loaded transformer pipeline when it is available and falls back
    to heuristic keyword scoring when the model is not loaded, when *text* is
    blank, or when inference raises.

    Args:
        text: The text to classify. Only the first 2048 characters are sent
            to the model.

    Returns:
        A ``ClassificationResult`` whose label is "FAKE", "REAL" or
        "UNCERTAIN" (the latter when the model emits a label this function
        does not recognise, or when the heuristic cannot decide). On the
        model path the confidence is the model score rounded to four
        decimals.
    """
    # Blank text gives the model nothing to judge; the heuristic reports it
    # as UNCERTAIN instead of returning an arbitrary model verdict.
    if not _model_ready or _pipeline is None or not text.strip():
        return _heuristic_classify(text)

    try:
        # Truncate very long texts for speed.
        truncated = text[:2048]
        result = _pipeline(truncated)[0]
        raw_label = str(result["label"]).upper()
        score = float(result["score"])
    except Exception as exc:
        logger.error("Model inference failed: %s — falling back to heuristic.", exc)
        return _heuristic_classify(text)

    return ClassificationResult(
        label=_normalise_label(raw_label),
        confidence=round(score, 4),
    )