aviseth commited on
Commit
06e73d2
·
1 Parent(s): 16da8ce

Initial deployment

Browse files
.dockerignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ models/
2
+ data/
3
+ notebooks/
4
+ frontend/
5
+ venv/
6
+ .git/
7
+ .vscode/
8
+ __pycache__/
9
+ *.pyc
10
+ *.pyo
11
+ *.pyd
12
+ .env
13
+ .env.example
14
+ *.log
15
+ .DS_Store
16
+ README.md
17
+ docker-compose.yml
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

WORKDIR /app

# Build tools for packages with native extensions; clear the apt cache to
# keep the layer small.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies first so this layer is cached across code-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY src/ ./src/
COPY scripts/ ./scripts/
# BUGFIX: .env.example is listed in .dockerignore, so it is excluded from the
# build context and "COPY .env.example .env.example" fails the build with
# "no source files were specified". The file is not read at runtime (the app
# uses real environment variables / .env via load_dotenv), so the COPY is
# simply removed.

# Download models from HuggingFace Hub at build time
# (huggingface-cli ships with huggingface_hub, pulled in by transformers)
RUN mkdir -p models && \
    huggingface-cli download aviseth/distilbert-fakenews --local-dir models/distilbert --exclude "checkpoints/*" && \
    huggingface-cli download aviseth/roberta-fakenews --local-dir models/roberta --exclude "checkpoints/*" && \
    huggingface-cli download aviseth/xlnet-fakenews --local-dir models/xlnet --exclude "checkpoints/*"

# HuggingFace Spaces uses port 7860
ENV PORT=7860
EXPOSE 7860

# Shell form so ${PORT} is expanded by the shell at container start.
CMD uvicorn src.api.main:app --host 0.0.0.0 --port ${PORT}
README.md CHANGED
@@ -1,10 +1,53 @@
1
  ---
2
- title: Fake News Api
3
- emoji: 🌍
4
- colorFrom: purple
5
- colorTo: pink
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Fake News Detection API
3
+ emoji: 🔍
4
+ colorFrom: orange
5
+ colorTo: red
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
+ # Fake News Detection API
11
+
12
+ Multi-class fake news detection using fine-tuned DistilBERT, RoBERTa, and XLNet models.
13
+
14
+ Classifies news articles into: **True** · **Fake** · **Satire** · **Bias**
15
+
16
+ ## Features
17
+
18
+ - 3 transformer models (DistilBERT, RoBERTa, XLNet) trained on 80k+ articles
19
+ - Real-time explainability via gradient saliency + SHAP
20
+ - Live news integration via GNews API
21
+ - Prediction statistics and user feedback collection
22
+ - FastAPI backend with Swagger docs at `/docs`
23
+
24
+ ## Endpoints
25
+
26
+ - `POST /predict` — classify text as True / Fake / Satire / Bias
27
+ - `POST /explain` — gradient saliency + SHAP explainability
28
+ - `GET /news` — live news via GNews
29
+ - `GET /news/newspaper` — news grouped by predicted label
30
+ - `POST /feedback` — submit label corrections
31
+ - `GET /stats` — prediction statistics
32
+ - `GET /health` — health check
33
+ - `GET /docs` — Swagger UI
34
+
35
+ ## Environment Variables
36
+
37
+ Set these in your Space settings:
38
+
39
+ ```
40
+ SUPABASE_URL=your_supabase_url
41
+ SUPABASE_KEY=your_supabase_anon_key
42
+ SUPABASE_SERVICE_KEY=your_supabase_service_key
43
+ GNEWS_API_KEY=your_gnews_api_key
44
+ ALLOWED_ORIGINS=https://your-frontend.vercel.app
45
+ ```
46
+
47
+ ## Models
48
+
49
+ Models are automatically downloaded from:
50
+
51
+ - [aviseth/distilbert-fakenews](https://huggingface.co/aviseth/distilbert-fakenews)
52
+ - [aviseth/roberta-fakenews](https://huggingface.co/aviseth/roberta-fakenews)
53
+ - [aviseth/xlnet-fakenews](https://huggingface.co/aviseth/xlnet-fakenews)
requirements.txt ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core ML
2
+ torch>=2.0.0
3
+ transformers>=4.30.0
4
+ datasets>=2.12.0
5
+ scikit-learn>=1.3.0
6
+ accelerate>=0.26.0
7
+
8
+ # Backend API
9
+ fastapi>=0.100.0
10
+ uvicorn[standard]>=0.23.0
11
+ pydantic>=2.0.0
12
+ python-multipart>=0.0.6
13
+
14
+ # Database / Supabase
15
+ supabase>=2.0.0
16
+ postgrest>=0.10.0
17
+ sqlalchemy>=2.0.0
18
+ psycopg2-binary>=2.9.0
19
+
20
+ # Data Processing
21
+ pandas>=2.0.0
22
+ numpy>=1.24.0
23
+ nltk>=3.8.0
24
+ spacy>=3.6.0
25
+
26
+ # Explainability
27
+ shap>=0.42.0
28
+ lime>=0.2.0
29
+
30
+ # News / Web
31
+ requests>=2.31.0
32
+ beautifulsoup4>=4.12.0
33
+ newspaper3k>=0.2.8
34
+
35
+ # MLOps
36
+ wandb>=0.15.0
37
+
38
+ # Utilities
39
+ python-dotenv>=1.0.0
40
+ pyyaml>=6.0
41
+ tqdm>=4.65.0
42
+
43
+ # Testing
44
+ pytest>=7.4.0
45
+ pytest-asyncio>=0.21.0
46
+
47
+ # Visualization
48
+ matplotlib>=3.7.0
49
+ seaborn>=0.12.0
scripts/download_models.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Downloads DistilBERT, RoBERTa, and XLNet base models from Hugging Face
3
+ and saves them to the models/ directory with the correct label configuration.
4
+ """
5
+
6
+ from pathlib import Path
7
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
8
+
9
# Base checkpoints to fetch from the Hugging Face Hub, keyed by the short
# name used for the local models/<key> directory.
MODELS = {
    "distilbert": {"name": "distilbert-base-uncased", "description": "66M parameters"},
    "roberta": {"name": "roberta-base", "description": "125M parameters"},
    "xlnet": {"name": "xlnet-base-cased", "description": "110M parameters"},
}

# id -> label mapping baked into each saved config (4-class head).
LABEL_MAP = {0: "True", 1: "Fake", 2: "Satire", 3: "Bias"}
16
+
17
+
18
def download_model(model_key: str, model_info: dict, base_dir: Path) -> bool:
    """Download one base model (tokenizer, config, weights) into base_dir/model_key.

    Args:
        model_key: short key used as the target folder name (e.g. "distilbert").
        model_info: dict with "name" (HF hub id) and "description" (free text).
        base_dir: parent directory that will hold one subfolder per model.

    Returns:
        True on success, False if any download/save step raised.
    """
    model_name = model_info["name"]
    save_path = base_dir / model_key

    print(f"\n{'='*60}")
    print(
        f"Downloading: {model_key} — {model_name} ({model_info['description']})")
    print(f"{'='*60}\n")

    try:
        save_path.mkdir(parents=True, exist_ok=True)

        print("[1/3] Tokenizer…")
        AutoTokenizer.from_pretrained(model_name).save_pretrained(save_path)

        print("[2/3] Config…")
        # The 4-way label space is written into the config up front so the
        # fine-tuning code can load the checkpoint without re-declaring it.
        config = AutoConfig.from_pretrained(
            model_name,
            num_labels=4,
            id2label=LABEL_MAP,
            label2id={v: k for k, v in LABEL_MAP.items()},
        )
        config.save_pretrained(save_path)

        print("[3/3] Model weights…")
        # Loading a base checkpoint with a new num_labels gives a randomly
        # initialized classification head — hence "Status: pre-trained" below.
        AutoModelForSequenceClassification.from_pretrained(
            model_name, config=config).save_pretrained(save_path)

        with open(save_path / "model_info.txt", "w") as f:
            f.write(
                f"Model: {model_name}\nParameters: {model_info['description']}\nLabels: {LABEL_MAP}\nStatus: pre-trained\n")

        print(f"✅ {model_key} saved to {save_path}\n")
        return True

    except Exception as e:
        # Broad catch is deliberate: one failed model must not stop the others.
        print(f"❌ {model_key} failed: {e}\n")
        return False
56
+
57
+
58
def main():
    """Download every model listed in MODELS and print a success summary."""
    target_dir = Path(__file__).parent.parent / "models"
    target_dir.mkdir(parents=True, exist_ok=True)

    # Download sequentially, remembering the per-model outcome.
    results = {}
    for key, info in MODELS.items():
        results[key] = download_model(key, info, target_dir)

    banner = "=" * 60
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    for key, ok in results.items():
        print(f" {key:15} {'✅' if ok else '❌'}")
    print(f"\n{sum(results.values())}/{len(results)} models downloaded")
    print(banner + "\n")
72
+
73
+
74
# Script entry point: python scripts/download_models.py
if __name__ == "__main__":
    main()
scripts/setup_environment.bat ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
@echo off
REM ============================================================
REM Fake News Detection - Environment Setup
REM Run from project root: scripts\setup_environment.bat
REM ============================================================

REM Move to project root (one level up from scripts/)
cd /d "%~dp0.."

echo.
echo ============================================================
echo FAKE NEWS DETECTION - ENVIRONMENT SETUP
echo ============================================================
echo.

REM Check Python
echo [1/5] Checking Python...
python --version >nul 2>&1
if errorlevel 1 (
    echo [ERROR] Python not found. Install from https://www.python.org/
    pause & exit /b 1
)
python --version
echo.

REM Handle existing venv.
REM BUGFIX: the original read %recreate% inside the same parenthesized
REM "if exist venv ( ... )" block that ran "set /p". cmd expands %vars%
REM when the whole block is parsed, so the comparison always saw the value
REM from BEFORE the prompt and the user's answer was ignored. Restructured
REM with labels so the variable is expanded after it is set (no delayed
REM expansion required).
if not exist venv goto :create_venv
echo [INFO] Virtual environment already exists.
set /p recreate="Recreate it? (y/n): "
if /i not "%recreate%"=="y" goto :activate_venv
echo Removing old venv...
rmdir /s /q venv

:create_venv
REM Create venv
echo [2/5] Creating virtual environment...
python -m venv venv
if errorlevel 1 ( echo [ERROR] Failed to create venv & pause & exit /b 1 )
echo [OK] venv created at %CD%\venv
echo.

:activate_venv
echo [3/5] Activating virtual environment...
call venv\Scripts\activate.bat
if errorlevel 1 ( echo [ERROR] Failed to activate venv & pause & exit /b 1 )
echo [OK] Activated
echo.

REM Upgrade pip
echo [4/5] Upgrading pip...
python -m pip install --upgrade pip --quiet
echo [OK] pip upgraded
echo.

REM Install requirements
echo [5/5] Installing requirements.txt...
echo (This takes a few minutes on first run)
echo.
pip install -r requirements.txt
if errorlevel 1 (
    echo.
    echo [ERROR] Some packages failed. Common fixes:
    echo   - Run as Administrator
    echo   - Install Visual C++ Build Tools: https://visualstudio.microsoft.com/visual-cpp-build-tools/
    echo   - Check internet connection
    pause & exit /b 1
)

echo.
echo ============================================================
echo DONE - Virtual environment ready
echo ============================================================
echo.
echo Location : %CD%\venv
python --version
echo.
echo Key packages installed:
pip list --format=columns | findstr /C:"torch" /C:"transformers" /C:"fastapi" /C:"supabase" /C:"wandb"
echo.
echo ============================================================
echo NEXT STEPS
echo ============================================================
echo.
echo 1. Download base models (run once):
echo      python scripts\download_models.py
echo.
echo 2. Run Supabase SQL schema:
echo      Open Supabase dashboard ^> SQL Editor ^> paste scripts\setup_supabase.sql
echo.
echo 3. Test connections:
echo      python scripts\test_connections.py
echo.
echo 4. Start API:
echo      uvicorn src.api.main:app --reload
echo.
echo To activate venv in future sessions:
echo      venv\Scripts\activate
echo.
pause
scripts/setup_environment.sh ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# ============================================================
# Fake News Detection - Environment Setup Script
# This script creates virtual environment and installs all dependencies
# ============================================================
# NOTE(review): unlike the .bat sibling, this script does NOT cd to the
# project root — it assumes it is run FROM the project root. Confirm, or
# add `cd "$(dirname "$0")/.."` for parity.

set -e  # Exit on error

echo ""
echo "============================================================"
echo "FAKE NEWS DETECTION - ENVIRONMENT SETUP"
echo "============================================================"
echo ""

# Colors
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Check if Python is installed — prefer python3, fall back to python.
echo "[1/5] Checking Python installation..."
if ! command -v python3 &> /dev/null; then
    if ! command -v python &> /dev/null; then
        echo -e "${RED}[ERROR] Python is not installed${NC}"
        echo "Please install Python 3.9 or higher"
        exit 1
    else
        PYTHON_CMD=python
    fi
else
    PYTHON_CMD=python3
fi

$PYTHON_CMD --version
echo -e "${GREEN}[SUCCESS] Python is installed${NC}"
echo ""

# Check if virtual environment already exists; offer to recreate it.
if [ -d "venv" ]; then
    echo -e "${YELLOW}[WARNING] Virtual environment already exists${NC}"
    read -p "Do you want to recreate it? (y/n): " recreate
    if [[ $recreate =~ ^[Yy]$ ]]; then
        echo "[2/5] Removing existing virtual environment..."
        rm -rf venv
        echo -e "${GREEN}[SUCCESS] Removed existing virtual environment${NC}"
    else
        echo "[INFO] Using existing virtual environment"
    fi
fi

# Create virtual environment if it doesn't exist
# (also reached after the user opted to recreate above).
if [ ! -d "venv" ]; then
    echo "[2/5] Creating virtual environment..."
    $PYTHON_CMD -m venv venv
    echo -e "${GREEN}[SUCCESS] Virtual environment created${NC}"
    echo ""
else
    echo "[2/5] Virtual environment already exists"
    echo ""
fi

# Activate virtual environment (affects this shell only; see footer for reuse).
echo "[3/5] Activating virtual environment..."
source venv/bin/activate
echo -e "${GREEN}[SUCCESS] Virtual environment activated${NC}"
echo ""

# Upgrade pip
echo "[4/5] Upgrading pip..."
pip install --upgrade pip --quiet
echo -e "${GREEN}[SUCCESS] Pip upgraded${NC}"
echo ""

# Install requirements
echo "[5/5] Installing dependencies from requirements.txt..."
echo "This may take a few minutes..."
echo ""

if pip install -r requirements.txt; then
    echo ""
    echo "============================================================"
    echo "INSTALLATION COMPLETE"
    echo "============================================================"
    echo ""
    echo -e "${GREEN}[SUCCESS] All dependencies installed successfully!${NC}"
    echo ""
    echo "Virtual environment location: $(pwd)/venv"
    echo "Python version: $($PYTHON_CMD --version)"
    echo ""
    echo "Installed packages:"
    pip list | grep -E "torch|transformers|fastapi|supabase"
    echo ""
else
    echo ""
    echo -e "${RED}[ERROR] Failed to install some dependencies${NC}"
    echo "Please check the error messages above"
    echo ""
    echo "Common solutions:"
    echo "1. Make sure you have internet connection"
    echo "2. Check if requirements.txt exists"
    echo "3. Install build tools if needed"
    echo ""
    exit 1
fi

echo "============================================================"
echo "NEXT STEPS"
echo "============================================================"
echo ""
echo "1. Virtual environment is already activated"
echo ""
echo "2. Download models from Hugging Face:"
echo "   python scripts/download_models.py"
echo ""
echo "3. Setup Supabase database:"
echo "   - Open Supabase dashboard"
echo "   - Run scripts/setup_supabase.sql in SQL Editor"
echo ""
echo "4. Test connections:"
echo "   python scripts/test_connections.py"
echo ""
echo "5. Start the API server:"
echo "   uvicorn src.api.main:app --reload"
echo ""
echo "============================================================"
echo ""
echo "To activate virtual environment in future sessions:"
echo "   source venv/bin/activate"
echo ""
echo "To deactivate:"
echo "   deactivate"
echo ""
echo "============================================================"
echo ""
scripts/setup_supabase.sql ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Supabase schema for the fake-news detection service.
-- Rerunnable: everything is dropped first, then recreated.
DROP TABLE IF EXISTS feedback CASCADE;
DROP TABLE IF EXISTS predictions CASCADE;
DROP TABLE IF EXISTS news_articles CASCADE;
DROP TABLE IF EXISTS model_performance CASCADE;
DROP TABLE IF EXISTS user_sessions CASCADE;

-- NOTE(review): daily_predictions and model_comparison are dropped here but
-- never recreated in this script — confirm whether they are defined elsewhere.
DROP VIEW IF EXISTS prediction_stats CASCADE;
DROP VIEW IF EXISTS daily_predictions CASCADE;
DROP VIEW IF EXISTS feedback_accuracy CASCADE;
DROP VIEW IF EXISTS model_comparison CASCADE;

-- uuid_generate_v4() used as default for every primary key below.
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

-- One row per classification result.
CREATE TABLE predictions (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    article_id VARCHAR NOT NULL UNIQUE,
    text TEXT,
    predicted_label VARCHAR(50) NOT NULL,
    confidence FLOAT NOT NULL,
    model_name VARCHAR(100) NOT NULL,
    explanation JSONB,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX idx_pred_created ON predictions(created_at DESC);
CREATE INDEX idx_pred_label ON predictions(predicted_label);
CREATE INDEX idx_pred_model ON predictions(model_name);

-- User label corrections.
-- NOTE(review): article_id carries no FK to predictions — presumably so
-- feedback can reference predictions that were never persisted; confirm.
CREATE TABLE feedback (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    article_id VARCHAR NOT NULL,
    predicted_label VARCHAR(50) NOT NULL,
    actual_label VARCHAR(50) NOT NULL,
    user_comment TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX idx_fb_created ON feedback(created_at DESC);
CREATE INDEX idx_fb_article ON feedback(article_id);

-- Cache of articles fetched from the news provider; url is the dedup key.
CREATE TABLE news_articles (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    title TEXT NOT NULL,
    description TEXT,
    content TEXT,
    url TEXT NOT NULL UNIQUE,
    image_url TEXT,
    published_at TIMESTAMPTZ,
    source_name VARCHAR(255),
    source_url TEXT,
    fetched_at TIMESTAMPTZ DEFAULT NOW(),
    analyzed BOOLEAN DEFAULT FALSE,
    prediction_id UUID  -- soft link to predictions.id (no FK constraint)
);

CREATE INDEX idx_news_published ON news_articles(published_at DESC);
CREATE INDEX idx_news_analyzed ON news_articles(analyzed);

-- Evaluation snapshots per model.
CREATE TABLE model_performance (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    model_name VARCHAR(100) NOT NULL,
    accuracy FLOAT,
    precision FLOAT,
    recall FLOAT,
    f1_score FLOAT,
    total_predictions INTEGER DEFAULT 0,
    correct_predictions INTEGER DEFAULT 0,
    evaluated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Lightweight session tracking.
CREATE TABLE user_sessions (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    session_id VARCHAR NOT NULL UNIQUE,
    user_agent TEXT,
    ip_address INET,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    last_activity TIMESTAMPTZ DEFAULT NOW()
);

-- NOTE(review): RLS is disabled on every table — presumably the API uses the
-- service key exclusively. Re-enable with policies before granting anon access.
ALTER TABLE predictions DISABLE ROW LEVEL SECURITY;
ALTER TABLE feedback DISABLE ROW LEVEL SECURITY;
ALTER TABLE news_articles DISABLE ROW LEVEL SECURITY;
ALTER TABLE model_performance DISABLE ROW LEVEL SECURITY;
ALTER TABLE user_sessions DISABLE ROW LEVEL SECURITY;

-- Per-label counts and mean confidence.
CREATE VIEW prediction_stats AS
SELECT predicted_label, COUNT(*) AS total_count, AVG(confidence) AS avg_confidence
FROM predictions
GROUP BY predicted_label;

-- Confusion-style counts derived from user feedback.
CREATE VIEW feedback_accuracy AS
SELECT predicted_label, actual_label, COUNT(*) AS count
FROM feedback
GROUP BY predicted_label, actual_label
ORDER BY count DESC;
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (192 Bytes). View file
 
src/api/__init__.py ADDED
File without changes
src/api/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (196 Bytes). View file
 
src/api/__pycache__/main.cpython-313.pyc ADDED
Binary file (17 kB). View file
 
src/api/main.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from typing import Optional, List, Dict
5
+ import os
6
+ import uuid
7
+ from dotenv import load_dotenv
8
+
9
+ from src.utils.supabase_client import get_supabase_client
10
+ from src.utils.gnews_client import get_gnews_client
11
+
12
+ load_dotenv()
13
+
14
# FastAPI application object; Swagger UI at /docs, ReDoc at /redoc.
app = FastAPI(
    title="Fake News Detection API",
    description="Multi-class fake news detection using DistilBERT, RoBERTa, and XLNet",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS origins come from the ALLOWED_ORIGINS env var (comma-separated);
# the default covers local React (3000) and Vite (5173) dev servers.
allowed_origins = [
    o.strip()
    for o in os.getenv(
        "ALLOWED_ORIGINS", "http://localhost:3000,http://localhost:5173"
    ).split(",")
    if o.strip()
]
app.add_middleware(
    CORSMiddleware,
    allow_origins=allowed_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Model keys accepted by /predict and /explain; any other value silently
# falls back to "distilbert" in the handlers.
VALID_MODELS = {"distilbert", "roberta", "xlnet"}
38
+
39
+
40
class PredictionRequest(BaseModel):
    """Body for POST /predict — supply raw text OR a URL to scrape."""
    text: Optional[str] = None  # raw article text; takes precedence over url
    url: Optional[str] = None   # fetched and reduced to <p> text when text is absent
    model: Optional[str] = "distilbert"  # one of VALID_MODELS; unknown keys fall back
44
+
45
+
46
class ExplanationData(BaseModel):
    """One explanation token: the token text and its importance score."""
    token: str
    score: float
49
+
50
+
51
class PredictionResponse(BaseModel):
    """Response for POST /predict."""
    article_id: str  # server-generated UUID identifying this prediction
    label: str       # predicted class name
    confidence: float
    scores: dict     # per-label scores from the inference result
    model_used: str  # model key actually used (after fallback)
    explanation: List[ExplanationData]  # token-level highlights
58
+
59
+
60
class FeedbackRequest(BaseModel):
    """Body for POST /feedback — a user's label correction."""
    article_id: str
    predicted_label: str
    actual_label: str
    user_comment: Optional[str] = None
65
+
66
+
67
class ExplainRequest(BaseModel):
    """Body for POST /explain."""
    text: str
    model: Optional[str] = "distilbert"  # one of VALID_MODELS; unknown keys fall back
    deep: Optional[bool] = False         # when True also run SHAP (much slower)
71
+
72
+
73
@app.on_event("startup")
async def startup_event():
    """Probe external services at startup; failures are logged, never fatal."""
    # NOTE(review): @app.on_event is deprecated in newer FastAPI in favor of
    # lifespan handlers — worth migrating when dependencies are bumped.
    try:
        get_supabase_client()
        print("✅ Supabase connected")
    except Exception as e:
        print(f"⚠️ Supabase: {e}")
    try:
        get_gnews_client()
        print("✅ GNews API connected")
    except Exception as e:
        print(f"⚠️ GNews: {e}")
    print("🚀 API server started")
86
+
87
+
88
+ @app.get("/")
89
+ async def root():
90
+ return {
91
+ "message": "Fake News Detection API",
92
+ "status": "running",
93
+ "version": "1.0.0",
94
+ "models": list(VALID_MODELS),
95
+ }
96
+
97
+
98
+ @app.get("/health")
99
+ async def health_check():
100
+ status = {"api": "healthy", "supabase": "unknown", "gnews": "unknown"}
101
+ try:
102
+ get_supabase_client()
103
+ status["supabase"] = "healthy"
104
+ except Exception as e:
105
+ status["supabase"] = f"unhealthy: {e}"
106
+ try:
107
+ get_gnews_client()
108
+ status["gnews"] = "healthy"
109
+ except Exception as e:
110
+ status["gnews"] = f"unhealthy: {e}"
111
+ return status
112
+
113
+
114
+ @app.post("/predict", response_model=PredictionResponse)
115
+ async def predict(request: PredictionRequest, background_tasks: BackgroundTasks):
116
+ """Classify news as True / Fake / Satire / Bias."""
117
+ if not request.text and not request.url:
118
+ raise HTTPException(status_code=400, detail="Provide text or url")
119
+
120
+ model_key = request.model if request.model in VALID_MODELS else "distilbert"
121
+ article_id = str(uuid.uuid4())
122
+
123
+ text = request.text or ""
124
+ if not text and request.url:
125
+ try:
126
+ import requests as req
127
+ from bs4 import BeautifulSoup
128
+ r = req.get(request.url, timeout=10)
129
+ soup = BeautifulSoup(r.text, "html.parser")
130
+ text = " ".join(p.get_text() for p in soup.find_all("p"))[:4000]
131
+ except Exception as e:
132
+ raise HTTPException(
133
+ status_code=422, detail=f"Could not fetch URL: {e}")
134
+
135
+ if len(text.strip()) < 10:
136
+ raise HTTPException(
137
+ status_code=422, detail="Text too short to classify")
138
+
139
+ try:
140
+ from src.models.inference import predict as run_inference
141
+ result = run_inference(text, model_key)
142
+ except Exception as e:
143
+ raise HTTPException(status_code=500, detail=f"Inference error: {e}")
144
+
145
+ response = PredictionResponse(
146
+ article_id=article_id,
147
+ label=result["label"],
148
+ confidence=result["confidence"],
149
+ scores=result["scores"],
150
+ model_used=model_key,
151
+ explanation=[ExplanationData(**t) for t in result.get("tokens", [])],
152
+ )
153
+
154
+ def _store():
155
+ try:
156
+ supabase = get_supabase_client()
157
+ supabase.store_prediction(
158
+ article_id=article_id,
159
+ text=text,
160
+ predicted_label=result["label"],
161
+ confidence=result["confidence"],
162
+ model_name=model_key,
163
+ explanation=result.get("tokens", []),
164
+ )
165
+ except Exception as e:
166
+ print(f"[bg] store_prediction failed: {e}")
167
+
168
+ background_tasks.add_task(_store)
169
+ return response
170
+
171
+
172
+ @app.post("/feedback")
173
+ async def submit_feedback(feedback: FeedbackRequest):
174
+ """Submit user correction for active learning."""
175
+ try:
176
+ supabase = get_supabase_client()
177
+ result = supabase.store_feedback(
178
+ article_id=feedback.article_id,
179
+ predicted_label=feedback.predicted_label,
180
+ actual_label=feedback.actual_label,
181
+ user_comment=feedback.user_comment,
182
+ )
183
+ return {"status": "success", "message": "Feedback recorded", "data": result}
184
+ except Exception as e:
185
+ import traceback
186
+ print(f"[feedback] ERROR: {e}\n{traceback.format_exc()}")
187
+ raise HTTPException(
188
+ status_code=500, detail=f"Error storing feedback: {str(e)}")
189
+
190
+
191
+ @app.get("/news")
192
+ async def get_recent_news(
193
+ query: str = "breaking news",
194
+ max_results: int = 10,
195
+ category: Optional[str] = None,
196
+ ):
197
+ """Fetch recent articles from GNews."""
198
+ try:
199
+ gnews = get_gnews_client()
200
+ if category:
201
+ articles = gnews.get_top_headlines(
202
+ category=category, max_results=max_results)
203
+ else:
204
+ articles = gnews.search_news(query=query, max_results=max_results)
205
+ return {"status": "success", "count": len(articles), "articles": articles}
206
+ except Exception as e:
207
+ raise HTTPException(
208
+ status_code=500, detail=f"Error fetching news: {e}")
209
+
210
+
211
+ @app.get("/news/analyze")
212
+ async def analyze_recent_news(topic: str = "politics", max_articles: int = 5):
213
+ """Fetch and classify recent news articles."""
214
+ try:
215
+ gnews = get_gnews_client()
216
+ articles = gnews.search_news(query=topic, max_results=max_articles)
217
+
218
+ from src.models.inference import predict as run_inference
219
+ results = []
220
+ for article in articles:
221
+ text = article.get("content") or article.get(
222
+ "description") or article.get("title", "")
223
+ if len(text.strip()) < 10:
224
+ continue
225
+ try:
226
+ pred = run_inference(text, "distilbert")
227
+ results.append({"article": article, "prediction": pred})
228
+ except Exception:
229
+ results.append({"article": article, "prediction": None})
230
+
231
+ return {"status": "success", "topic": topic, "analyzed_count": len(results), "results": results}
232
+ except Exception as e:
233
+ raise HTTPException(
234
+ status_code=500, detail=f"Error analyzing news: {e}")
235
+
236
+
237
+ @app.get("/news/newspaper")
238
+ async def get_newspaper(max_per_topic: int = 6):
239
+ """Fetch and classify news across multiple topics, grouped by predicted label."""
240
+ topics = ["world news", "politics", "technology",
241
+ "science", "health", "business"]
242
+ try:
243
+ gnews = get_gnews_client()
244
+ from src.models.inference import predict as run_inference
245
+
246
+ all_results = []
247
+ seen_urls: set = set()
248
+
249
+ for topic in topics:
250
+ articles = gnews.search_news(
251
+ query=topic, max_results=max_per_topic)
252
+ for article in articles:
253
+ url = article.get("url", "")
254
+ if url in seen_urls:
255
+ continue
256
+ seen_urls.add(url)
257
+ text = article.get("content") or article.get(
258
+ "description") or article.get("title", "")
259
+ if len(text.strip()) < 10:
260
+ continue
261
+ try:
262
+ pred = run_inference(text, "distilbert")
263
+ all_results.append(
264
+ {"article": article, "prediction": pred})
265
+ except Exception:
266
+ all_results.append({"article": article, "prediction": {
267
+ "label": "True", "confidence": 0.5, "scores": {}, "tokens": []
268
+ }})
269
+
270
+ grouped: Dict[str, list] = {"True": [],
271
+ "Fake": [], "Satire": [], "Bias": []}
272
+ for item in all_results:
273
+ lbl = item["prediction"].get(
274
+ "label", "True") if item["prediction"] else "True"
275
+ if lbl in grouped:
276
+ grouped[lbl].append(item)
277
+
278
+ return {"status": "success", "total": len(all_results), "grouped": grouped}
279
+ except Exception as e:
280
+ raise HTTPException(
281
+ status_code=500, detail=f"Error building newspaper: {e}")
282
+
283
+
284
+ @app.post("/explain")
285
+ async def explain_prediction(request: ExplainRequest):
286
+ """
287
+ Return explainability data for a piece of text.
288
+ Always returns gradient saliency highlights. If deep=True, also runs SHAP via RoBERTa.
289
+ """
290
+ if len(request.text.strip()) < 10:
291
+ raise HTTPException(status_code=422, detail="Text too short")
292
+
293
+ model_key = request.model if request.model in VALID_MODELS else "distilbert"
294
+
295
+ try:
296
+ from src.models.inference import get_classifier
297
+ import asyncio
298
+
299
+ clf = get_classifier(model_key)
300
+ loop = asyncio.get_event_loop()
301
+
302
+ attention = await loop.run_in_executor(None, clf.attention_weights, request.text)
303
+
304
+ shap_tokens = []
305
+ explanation_text = ""
306
+ if request.deep:
307
+ shap_tokens = await loop.run_in_executor(None, clf.shap_explain, request.text)
308
+ if shap_tokens:
309
+ from src.models.inference import generate_explanation_text, predict as run_predict
310
+ pred = run_predict(request.text, model_key)
311
+ explanation_text = generate_explanation_text(
312
+ shap_tokens, pred["label"], pred["confidence"], model_key
313
+ )
314
+
315
+ return {"attention": attention, "shap": shap_tokens, "explanation_text": explanation_text, "model_used": model_key}
316
+ except Exception as e:
317
+ import traceback
318
+ print(f"[explain] ERROR: {e}\n{traceback.format_exc()}")
319
+ raise HTTPException(status_code=500, detail=f"Explain error: {e}")
320
+
321
+
322
+ @app.get("/stats")
323
+ async def get_statistics():
324
+ """Prediction statistics from Supabase."""
325
+ try:
326
+ supabase = get_supabase_client()
327
+ stats = supabase.get_prediction_stats()
328
+ return {"status": "success", "statistics": stats}
329
+ except Exception as e:
330
+ raise HTTPException(
331
+ status_code=500, detail=f"Error fetching stats: {e}")
332
+
333
+
334
+ @app.get("/models")
335
async def list_models():
    """List available models and their training status."""
    from pathlib import Path

    # A model counts as trained when <repo-root>/models/<key>/config.json exists.
    root = Path(__file__).parents[2] / "models"

    def entry(key):
        model_dir = root / key
        ready = (model_dir / "config.json").exists()
        return {"name": key, "trained": ready,
                "path": str(model_dir) if ready else None}

    return {"models": [entry(k) for k in ("distilbert", "roberta", "xlnet")],
            "default": "distilbert"}
346
+
347
+
348
# Dev entry point — in deployment the container runs uvicorn directly.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "src.api.main:app",
        host=os.getenv("API_HOST", "0.0.0.0"),
        port=int(os.getenv("API_PORT", 8000)),
        # Auto-reload defaults ON for local development; set API_RELOAD=false to disable.
        reload=os.getenv("API_RELOAD", "true").lower() == "true",
    )
src/data/__init__.py ADDED
File without changes
src/data/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (197 Bytes). View file
 
src/data/__pycache__/dataset.cpython-313.pyc ADDED
Binary file (5.85 kB). View file
 
src/data/__pycache__/preprocessing.cpython-313.pyc ADDED
Binary file (1.28 kB). View file
 
src/data/dataset.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dataset loader — reads Dataset_Clean.csv and returns tokenized HuggingFace DatasetDict splits.
3
+ """
4
+
5
+ import pandas as pd
6
+ from pathlib import Path
7
+ from datasets import Dataset, DatasetDict
8
+ from transformers import AutoTokenizer
9
+ from sklearn.model_selection import train_test_split
10
+ from src.data.preprocessing import clean_text
11
+
12
+ LABEL2ID = {"True": 0, "Fake": 1, "Satire": 2, "Bias": 3}
13
+ ID2LABEL = {v: k for k, v in LABEL2ID.items()}
14
+
15
+ DEFAULT_CSV = Path(__file__).parents[2] / \
16
+ "data" / "processed" / "Dataset_Clean.csv"
17
+ MAX_LENGTH = 256
18
+ VAL_SPLIT = 0.10
19
+ TEST_SPLIT = 0.10
20
+ RANDOM_SEED = 42
21
+
22
+
23
def load_dataframe(csv_path: str | Path = DEFAULT_CSV) -> pd.DataFrame:
    """Load and clean Dataset_Clean.csv. Returns a DataFrame with columns: text, label (int)."""
    df = pd.read_csv(csv_path, low_memory=False)

    # Normalize label spelling ("fake" -> "Fake") and drop rows with unknown labels.
    df["label_text"] = df["label_text"].astype(str).str.strip().str.capitalize()
    df = df[df["label_text"].isin(LABEL2ID)].copy()

    df["content"] = df["content"].fillna("").astype(str)
    df["title"] = df["title"].fillna("").astype(str)
    # Prefer the article body; fall back to the headline for very short bodies.
    df["text"] = df.apply(
        lambda row: row["content"] if len(row["content"]) > 30 else row["title"],
        axis=1,
    )
    df["text"] = df["text"].apply(clean_text)
    df = df[df["text"].str.len() > 10].copy()
    df["label"] = df["label_text"].map(LABEL2ID).astype(int)

    print(f"[dataset] Loaded {len(df):,} rows")
    print(
        f"[dataset] Label distribution:\n{df['label_text'].value_counts().to_string()}\n")
    return df[["text", "label"]].reset_index(drop=True)
42
+
43
+
44
def make_splits(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Stratified train / val / test split."""
    holdout = VAL_SPLIT + TEST_SPLIT
    # First carve off the combined val+test pool, stratified on the label.
    train_df, holdout_df = train_test_split(
        df, test_size=holdout, stratify=df["label"], random_state=RANDOM_SEED
    )
    # Then split that pool so val/test land at their configured fractions.
    val_df, test_df = train_test_split(
        holdout_df,
        test_size=TEST_SPLIT / holdout,
        stratify=holdout_df["label"],
        random_state=RANDOM_SEED,
    )
    print(
        f"[dataset] Train: {len(train_df):,} Val: {len(val_df):,} Test: {len(test_df):,}")
    return train_df, val_df, test_df
56
+
57
+
58
def tokenize_dataset(dataset_dict: DatasetDict, tokenizer_name: str, max_length: int = MAX_LENGTH) -> DatasetDict:
    """Tokenize all splits."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def _encode(batch):
        # Fixed-length padding so all examples share one tensor shape.
        return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=max_length)

    encoded = dataset_dict.map(
        _encode,
        batched=True,
        batch_size=512,
        remove_columns=["text"],
        desc="Tokenizing",
    )
    encoded.set_format("torch")
    return encoded
69
+
70
+
71
def build_dataset(
    csv_path: str | Path = DEFAULT_CSV,
    tokenizer_name: str = "distilbert-base-uncased",
    max_length: int = MAX_LENGTH,
) -> DatasetDict:
    """Full pipeline: CSV → cleaned DataFrame → HuggingFace DatasetDict → tokenized splits."""
    frame = load_dataframe(csv_path)
    split_frames = dict(zip(("train", "validation", "test"), make_splits(frame)))
    raw = DatasetDict({
        name: Dataset.from_pandas(part, preserve_index=False)
        for name, part in split_frames.items()
    })
    return tokenize_dataset(raw, tokenizer_name, max_length)
86
+
87
+
88
+ if __name__ == "__main__":
89
+ ds = build_dataset()
90
+ print(ds)
91
+ print("Sample:", ds["train"][0])
src/data/gnews_collector.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fetches live GNews articles and appends them to the training dataset.
3
+
4
+ Usage:
5
+ python -m src.data.gnews_collector # fetch and save
6
+ python -m src.data.gnews_collector --preview # print without saving
7
+ python -m src.data.gnews_collector --label --model-path models/distilbert --merge
8
+ """
9
+
10
+ from src.data.preprocessing import clean_text
11
+ from src.utils.gnews_client import GNewsClient
12
+ import os
13
+ import sys
14
+ import uuid
15
+ import argparse
16
+ import pandas as pd
17
+ from pathlib import Path
18
+ from datetime import datetime
19
+ from dotenv import load_dotenv
20
+
21
+ sys.path.insert(0, str(Path(__file__).parents[2]))
22
+ load_dotenv()
23
+
24
+
25
+ PROJECT_ROOT = Path(__file__).parents[2]
26
+ AUGMENTED_DIR = PROJECT_ROOT / "data" / "augmented"
27
+ CLEAN_CSV = PROJECT_ROOT / "data" / "processed" / "Dataset_Clean.csv"
28
+
29
+ FETCH_TOPICS = [
30
+ "scientific research breakthrough",
31
+ "official government announcement",
32
+ "verified breaking news",
33
+ "conspiracy theory debunked",
34
+ "fact check false claim",
35
+ "misinformation viral",
36
+ "satire news comedy",
37
+ "parody news article",
38
+ "political opinion editorial",
39
+ "partisan news analysis",
40
+ ]
41
+
42
+ MAX_PER_TOPIC = 5
43
+
44
+
45
def fetch_articles(max_per_topic: int = MAX_PER_TOPIC) -> list[dict]:
    """Query GNews once per topic and return the URL-deduplicated union of results."""
    client = GNewsClient()
    collected: list[dict] = []
    seen: set[str] = set()

    for topic in FETCH_TOPICS:
        try:
            batch = client.search_news(query=topic, max_results=max_per_topic)
            for article in batch:
                url = article.get("url", "")
                # Skip duplicates and articles with no URL to key on.
                if url and url not in seen:
                    seen.add(url)
                    collected.append(article)
            print(f"  ✓ '{topic}' → {len(batch)} articles")
        except Exception as e:
            # One failing topic should not abort the whole collection run.
            print(f"  ✗ '{topic}' → error: {e}")

    print(f"\n[collector] Fetched {len(collected)} unique articles\n")
    return collected
65
+
66
+
67
def articles_to_dataframe(articles: list[dict]) -> pd.DataFrame:
    """Convert raw GNews articles to Dataset_Clean.csv schema. Labels are set to -1 (unlabelled)."""
    rows = []
    for article in articles:
        headline = clean_text(article.get("title", ""))
        body = clean_text(article.get("content", "") or article.get("description", ""))
        # Classify on the body unless it is too short to be meaningful.
        text = body if len(body) > 30 else headline
        if len(text) < 10:
            # Too short to carry any training signal at all.
            continue
        rows.append({
            "id": f"GNEWS_{uuid.uuid4().hex[:8].upper()}",
            "title": headline,
            "content": body,
            "label": -1,
            "label_text": "UNLABELLED",
            "label_original": "gnews_live",
            "source_dataset": "GNews_Live",
            "topic": "",
            "url": article.get("url", ""),
            "speaker": article.get("source", ""),
            "fetched_at": datetime.utcnow().isoformat(),
        })
    return pd.DataFrame(rows)
90
+
91
+
92
def pseudo_label(df: pd.DataFrame, model_path: str) -> pd.DataFrame:
    """Assign pseudo-labels to unlabelled articles using a trained model.

    Returns a copy of *df* with ``label`` (int id), ``label_text`` and
    ``confidence`` (max softmax probability, rounded to 4 dp) filled in.
    """
    import torch
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    ID2LABEL = {0: "True", 1: "Fake", 2: "Satire", 3: "Bias"}
    print(f"[pseudo_label] Loading model from {model_path}…")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Classify the article body, falling back to the headline when the body is
    # missing OR too short. The previous `fillna(df["title"])` only replaced
    # NaN, so rows whose content was the empty string (the collector stores ""
    # for missing content) were classified on empty text. Mirror the >30-char
    # rule used by articles_to_dataframe().
    contents = df["content"].fillna("").astype(str)
    titles = df["title"].fillna("").astype(str)
    texts = [c if len(c) > 30 else t for c, t in zip(contents, titles)]

    labels = []
    confidences = []

    # Batched, gradient-free inference to bound memory use.
    for i in range(0, len(texts), 16):
        batch = texts[i: i + 16]
        enc = tokenizer(batch, padding=True, truncation=True,
                        max_length=256, return_tensors="pt").to(device)
        with torch.no_grad():
            probs = torch.softmax(model(**enc).logits, dim=-1)
        labels.extend(probs.argmax(dim=-1).cpu().tolist())
        confidences.extend(probs.max(dim=-1).values.cpu().tolist())

    df = df.copy()
    df["label"] = labels
    df["label_text"] = [ID2LABEL[l] for l in labels]
    df["confidence"] = [round(c, 4) for c in confidences]
    print(
        f"[pseudo_label] Label distribution:\n{df['label_text'].value_counts().to_string()}")
    return df
125
+
126
+
127
def save_augmented(df: pd.DataFrame, tag: str = "") -> Path:
    """Write an augmented batch to data/augmented/ with a UTC-timestamped name."""
    AUGMENTED_DIR.mkdir(parents=True, exist_ok=True)
    stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    suffix = f"_{tag}" if tag else ""
    path = AUGMENTED_DIR / f"gnews_{stamp}{suffix}.csv"
    df.to_csv(path, index=False, encoding="utf-8")
    print(f"[collector] Saved {len(df)} rows → {path}")
    return path
135
+
136
+
137
def merge_into_training(augmented_path: Path, min_confidence: float = 0.80) -> int:
    """Merge pseudo-labelled articles into Dataset_Clean.csv, filtered by confidence threshold."""
    batch = pd.read_csv(augmented_path)
    if "confidence" in batch.columns:
        batch = batch[batch["confidence"] >= min_confidence]
    # Drop anything that is still unlabelled.
    batch = batch[batch["label"] != -1]

    if batch.empty:
        print("[merge] No rows met the confidence threshold.")
        return 0

    keep_cols = ["id", "title", "content", "label", "label_text",
                 "label_original", "source_dataset", "topic", "url", "speaker"]
    batch = batch[[c for c in keep_cols if c in batch.columns]]
    # NOTE(review): appended without a header — assumes the column order above
    # matches the existing Dataset_Clean.csv layout; verify if the schema changes.
    batch.to_csv(CLEAN_CSV, mode="a", header=False,
                 index=False, encoding="utf-8")
    print(f"[merge] Added {len(batch)} rows to {CLEAN_CSV}")
    return len(batch)
155
+
156
+
157
def main():
    """CLI entry point: fetch → (optionally) pseudo-label → (optionally) merge."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--preview", action="store_true")
    parser.add_argument("--label", action="store_true")
    parser.add_argument("--model-path", type=str,
                        default="models/distilbert")
    parser.add_argument("--merge", action="store_true")
    parser.add_argument("--min-conf", type=float, default=0.80)
    parser.add_argument("--max-per-topic", type=int, default=MAX_PER_TOPIC)
    args = parser.parse_args()

    df = articles_to_dataframe(fetch_articles(max_per_topic=args.max_per_topic))

    if args.preview:
        # Dry run: show what was fetched, save nothing.
        print(df[["title", "source_dataset", "url"]].to_string())
        return

    save_augmented(df, tag="raw")

    if not args.label:
        return
    model_path = str(PROJECT_ROOT / args.model_path)
    if not Path(model_path).exists():
        print(f"[error] Model not found at {model_path}")
        return
    df = pseudo_label(df, model_path)
    labelled_path = save_augmented(df, tag="labelled")
    if args.merge:
        merge_into_training(labelled_path, min_confidence=args.min_conf)
186
+
187
+
188
+ if __name__ == "__main__":
189
+ main()
src/data/preprocessing.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import html
3
+ from typing import List
4
+
5
+
6
def clean_text(text: str) -> str:
    """Clean and normalize raw text — decodes HTML, strips URLs, normalizes whitespace.

    Defensive against non-string input: NaN/None (common for missing CSV or
    API fields upstream) is treated as empty text instead of raising.
    """
    if not isinstance(text, str):
        # html.unescape / re.sub would raise TypeError on float('nan') or None.
        return ""
    text = html.unescape(text)
    # Bare URLs carry no linguistic signal for classification.
    text = re.sub(r'http\S+', '', text)
    # Normalize curly quotes and en-dash to ASCII equivalents.
    text = text.replace('\u201c', '"').replace(
        '\u201d', '"').replace('\u2013', '-')
    # Collapse all whitespace runs (newlines, tabs, multiple spaces) to one space.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
14
+
15
+
16
def preprocess_batch(texts: List[str]) -> List[str]:
    """Apply clean_text to a list of strings."""
    return list(map(clean_text, texts))
src/models/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.models.inference import predict, get_classifier, FakeNewsClassifier
2
+ from src.models.train import train_model
3
+ from src.models.evaluate import compute_metrics, full_report
4
+
5
+ __all__ = [
6
+ "predict",
7
+ "get_classifier",
8
+ "FakeNewsClassifier",
9
+ "train_model",
10
+ "compute_metrics",
11
+ "full_report",
12
+ ]
src/models/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (502 Bytes). View file
 
src/models/__pycache__/evaluate.cpython-313.pyc ADDED
Binary file (2.53 kB). View file
 
src/models/__pycache__/inference.cpython-313.pyc ADDED
Binary file (18.9 kB). View file
 
src/models/__pycache__/train.cpython-313.pyc ADDED
Binary file (8.63 kB). View file
 
src/models/evaluate.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Evaluation utilities — metrics computed during and after training.
3
+ """
4
+
5
+ import numpy as np
6
+ from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
7
+ from transformers import EvalPrediction
8
+
9
+ LABEL_NAMES = ["True", "Fake", "Satire", "Bias"]
10
+
11
+
12
def compute_metrics(eval_pred: EvalPrediction) -> dict:
    """Called by HuggingFace Trainer after every eval step. Returns accuracy and macro/weighted F1."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 keeps F1 defined even if a class is never predicted.
    metrics = {
        "accuracy": accuracy_score(labels, predictions),
        "f1_macro": f1_score(labels, predictions, average="macro", zero_division=0),
        "f1_weighted": f1_score(labels, predictions, average="weighted", zero_division=0),
    }
    return {name: round(value, 4) for name, value in metrics.items()}
21
+
22
+
23
def full_report(model, tokenized_test, label_names=LABEL_NAMES) -> dict:
    """Run full evaluation on the test split. Returns per-class metrics and confusion matrix."""
    from transformers import Trainer

    evaluator = Trainer(model=model, compute_metrics=compute_metrics)
    output = evaluator.predict(tokenized_test)

    y_pred = np.argmax(output.predictions, axis=-1)
    y_true = output.label_ids

    report = classification_report(
        y_true, y_pred, target_names=label_names, output_dict=True, zero_division=0)
    cm = confusion_matrix(y_true, y_pred)

    divider = "=" * 60
    print("\n" + divider)
    print("CLASSIFICATION REPORT")
    print(divider)
    print(classification_report(y_true, y_pred,
                                target_names=label_names, zero_division=0))
    print("Confusion Matrix:")
    print(cm)
    print(divider + "\n")

    return {"report": report, "confusion_matrix": cm.tolist()}
src/models/inference.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model inference — lazy-loads fine-tuned models and runs predictions with explainability.
3
+ """
4
+
5
+ import os
6
+ import torch
7
+ import numpy as np
8
+ from pathlib import Path
9
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
10
+ from dotenv import load_dotenv
11
+
12
+ load_dotenv()
13
+
14
+ ID2LABEL = {0: "True", 1: "Fake", 2: "Satire", 3: "Bias"}
15
+ LABEL2ID = {v: k for k, v in ID2LABEL.items()}
16
+
17
+ PROJECT_ROOT = Path(__file__).parents[2]
18
+ MODELS_DIR = PROJECT_ROOT / "models"
19
+
20
+ # Override with HF Hub repo IDs via env vars, e.g. HF_REPO_DISTILBERT=your-username/distilbert-fakenews
21
+ MODEL_NAMES = {
22
+ "distilbert": os.getenv("HF_REPO_DISTILBERT", "distilbert-base-uncased"),
23
+ "roberta": os.getenv("HF_REPO_ROBERTA", "roberta-base"),
24
+ "xlnet": os.getenv("HF_REPO_XLNET", "xlnet-base-cased"),
25
+ }
26
+
27
+
28
class FakeNewsClassifier:
    """Wraps a fine-tuned HuggingFace model. Lazy-loads on first call and caches in memory."""

    def __init__(self, model_key: str = "distilbert", max_length: int = 256):
        # model_key selects a sub-directory of MODELS_DIR / an entry in MODEL_NAMES.
        self.model_key = model_key
        self.max_length = max_length
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Backing fields for the lazy `model` / `tokenizer` properties.
        self._model = None
        self._tokenizer = None

    def _load(self):
        # Prefer a local fine-tuned checkpoint; a saved config.json marks a
        # completed fine-tune. Otherwise fall back to the MODEL_NAMES source
        # (an HF Hub repo id or base model name).
        local_path = MODELS_DIR / self.model_key
        source = str(local_path) if (
            local_path / "config.json").exists() else MODEL_NAMES[self.model_key]
        print(f"[inference] Loading {self.model_key} from: {source}")
        self._tokenizer = AutoTokenizer.from_pretrained(source)
        self._model = AutoModelForSequenceClassification.from_pretrained(
            source,
            num_labels=4,
            id2label=ID2LABEL,
            label2id=LABEL2ID,
            # Allows loading a base checkpoint whose classification head does
            # not match the 4-label setup (head is re-initialized).
            ignore_mismatched_sizes=True,
        )
        self._model.to(self.device)
        self._model.eval()
        print(f"[inference] Model ready on {self.device}")

    @property
    def model(self):
        # Lazy-load on first access.
        if self._model is None:
            self._load()
        return self._model

    @property
    def tokenizer(self):
        # Lazy-load on first access (_load brings in the model as well).
        if self._tokenizer is None:
            self._load()
        return self._tokenizer

    def predict(self, text: str) -> dict:
        """
        Run inference on a single text.
        Returns label, confidence (0-1), per-class scores, and top token importance scores.
        """
        enc = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length,
            padding=True,
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**enc)
            probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()

        pred_id = int(np.argmax(probs))
        label = ID2LABEL[pred_id]
        confidence = float(probs[pred_id])
        scores = {ID2LABEL[i]: round(float(p), 4) for i, p in enumerate(probs)}
        # Best-effort gradient-saliency highlights for the predicted class.
        tokens = self._token_importance(enc, pred_id)

        return {
            "label": label,
            "confidence": round(confidence, 4),
            "scores": scores,
            "tokens": tokens,
        }

    def _token_importance(self, enc, pred_id: int, top_k: int = 8) -> list[dict]:
        """Gradient saliency — returns top-k tokens sorted by importance."""
        try:
            self.model.zero_grad()
            input_ids = enc["input_ids"]
            # Differentiate w.r.t. input embeddings, not the (integer) token ids.
            embeds = self.model.get_input_embeddings()(
                input_ids).detach().requires_grad_(True)
            outputs = self.model(inputs_embeds=embeds,
                                 attention_mask=enc.get("attention_mask"))
            outputs.logits[0, pred_id].backward()
            # Per-token saliency = L2 norm of the embedding gradient.
            importance = embeds.grad[0].norm(dim=-1).cpu().numpy()
            tokens = self.tokenizer.convert_ids_to_tokens(
                input_ids[0].cpu().tolist())
            # Special tokens across the BERT/RoBERTa/XLNet vocabularies.
            special = {"[CLS]", "[SEP]", "[PAD]", "<s>",
                       "</s>", "<pad>", "<cls>", "<sep>", "▁", "Ġ"}
            # Strip subword markers and drop 1-char fragments.
            pairs = [
                (t.replace("##", "").replace("▁", "").replace("Ġ", ""), float(s))
                for t, s in zip(tokens, importance)
                if t not in special and len(t.strip()) > 1
            ]
            if pairs:
                # Normalize scores to [0, 1]; `or 1.0` guards an all-zero gradient.
                max_s = max(s for _, s in pairs) or 1.0
                pairs = [(t, round(s / max_s, 4)) for t, s in pairs]
            pairs.sort(key=lambda x: x[1], reverse=True)
            return [{"token": t, "score": s} for t, s in pairs[:top_k]]
        except Exception:
            # Explainability is best-effort; never fail the prediction itself.
            return []

    def attention_weights(self, text: str) -> list[dict]:
        """
        Gradient saliency mapped to original words in reading order.
        Merges subword tokens (BERT ## and RoBERTa Ġ) back into full words.
        """
        try:
            enc = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=self.max_length,
                padding=False,
            ).to(self.device)

            input_ids = enc["input_ids"]
            self.model.zero_grad()
            embeds = self.model.get_input_embeddings()(
                input_ids).detach().requires_grad_(True)
            outputs = self.model(inputs_embeds=embeds,
                                 attention_mask=enc.get("attention_mask"))
            # Explain the model's own argmax class for this text.
            pred_id = int(torch.argmax(outputs.logits, dim=-1)[0])
            outputs.logits[0, pred_id].backward()
            importance = embeds.grad[0].norm(dim=-1).cpu().numpy()

            tokens = self.tokenizer.convert_ids_to_tokens(
                input_ids[0].cpu().tolist())
            SPECIAL = {"[CLS]", "[SEP]", "[PAD]", "<s>",
                       "</s>", "<pad>", "<cls>", "<sep>", "<unk>"}

            # Re-assemble subword pieces into words; a word's score is the max
            # saliency over its pieces.
            words = []
            current_word = ""
            current_score = 0.0
            for tok, score in zip(tokens, importance):
                if tok in SPECIAL:
                    # Special token terminates any in-progress word.
                    if current_word:
                        words.append((current_word, current_score))
                    current_word = ""
                    current_score = 0.0
                    continue

                # "##" marks a BERT continuation piece; "Ġ"/"▁" mark a
                # word-initial piece in RoBERTa/XLNet vocabularies.
                is_continuation = tok.startswith("##")
                is_new_word = tok.startswith("Ġ") or tok.startswith("▁")
                clean = tok.replace("##", "").replace("Ġ", "").replace("▁", "")

                if is_continuation:
                    current_word += clean
                    current_score = max(current_score, float(score))
                elif is_new_word:
                    if current_word:
                        words.append((current_word, current_score))
                    current_word = clean
                    current_score = float(score)
                else:
                    # NOTE(review): unmarked tokens are treated as new words;
                    # for sentencepiece vocabularies they can be continuations —
                    # acceptable for display purposes, confirm if precision matters.
                    if current_word:
                        words.append((current_word, current_score))
                    current_word = clean
                    current_score = float(score)

            if current_word:
                words.append((current_word, current_score))

            if not words:
                return []

            # Normalize to [0, 1]; `or 1.0` guards an all-zero gradient.
            max_s = max(s for _, s in words) or 1.0
            return [{"word": w, "attention": round(s / max_s, 4)} for w, s in words if w.strip()]

        except Exception as e:
            print(f"[attention_weights] failed: {e}")
            import traceback
            traceback.print_exc()
            return []

    def shap_explain(self, text: str) -> list[dict]:
        """
        Word-level SHAP explanation (always computed with the RoBERTa classifier).
        Returns words in original sentence order with signed SHAP values
        normalized to [-1, 1] for the predicted class.
        """
        try:
            import shap

            # Always explain with RoBERTa, regardless of which model predicted.
            clf = get_classifier("roberta")

            def predict_proba(texts):
                # SHAP probes the model with many masked variants of the text.
                results = []
                for t in texts:
                    enc = clf.tokenizer(
                        t, return_tensors="pt", truncation=True,
                        max_length=clf.max_length, padding=True,
                    ).to(clf.device)
                    with torch.no_grad():
                        logits = clf.model(**enc).logits
                    probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()
                    results.append(probs)
                return np.array(results)

            # Split on non-word characters so attributions land on whole words.
            masker = shap.maskers.Text(r"\W+")
            explainer = shap.Explainer(
                predict_proba, masker, output_names=list(ID2LABEL.values()))
            # max_evals bounds the number of model calls per explanation.
            shap_values = explainer([text], max_evals=200, batch_size=8)

            # Attribute toward the class the explainer model itself predicts.
            enc = clf.tokenizer(text, return_tensors="pt", truncation=True,
                                max_length=clf.max_length).to(clf.device)
            with torch.no_grad():
                pred_id = int(torch.argmax(clf.model(**enc).logits, dim=-1)[0])

            words = shap_values.data[0]
            values = shap_values.values[0, :, pred_id]

            # Normalize by the largest |value|; guard empty / all-zero cases.
            max_abs = float(np.max(np.abs(values))) if len(values) else 1.0
            if max_abs == 0:
                max_abs = 1.0

            result = []
            for word, val in zip(words, values):
                w = word.strip()
                if not w:
                    continue
                result.append(
                    {"word": w, "shap_value": round(float(val) / max_abs, 4)})

            # Keep original sentence order so inline text rendering makes sense
            return result

        except Exception as e:
            print(f"[shap_explain] failed: {e}")
            import traceback
            traceback.print_exc()
            return []
254
+
255
+
256
+ _classifiers: dict[str, FakeNewsClassifier] = {}
257
+
258
+
259
def get_classifier(model_key: str = "distilbert") -> FakeNewsClassifier:
    """Return the cached classifier for *model_key*, creating it on first use."""
    try:
        return _classifiers[model_key]
    except KeyError:
        # First request for this model: build and memoize the wrapper
        # (the underlying weights still load lazily on first predict).
        return _classifiers.setdefault(model_key, FakeNewsClassifier(model_key))
264
+
265
+
266
def predict(text: str, model_key: str = "distilbert") -> dict:
    """Convenience wrapper for single prediction."""
    clf = get_classifier(model_key)
    return clf.predict(text)
269
+
270
+
271
def generate_explanation_text(
    shap_tokens: list[dict],
    label: str,
    confidence: float,
    model_key: str,
) -> str:
    """
    Build a natural-language paragraph explaining the prediction from SHAP data.
    No LLM required — derived entirely from token scores and prediction metadata.
    """
    pct = round(confidence * 100)

    # Without SHAP data we can only report the bare prediction.
    if not shap_tokens:
        return (
            f"The {model_key} model classified this article as {label} "
            f"with {pct}% confidence, but no word-level "
            f"explanation data was available for this prediction."
        )

    # Top 5 words pushing toward the label, top 3 pushing away (|value| > 0.05).
    supporting = sorted(
        (t for t in shap_tokens if t["shap_value"] > 0.05),
        key=lambda t: t["shap_value"],
        reverse=True,
    )[:5]
    opposing = sorted(
        (t for t in shap_tokens if t["shap_value"] < -0.05),
        key=lambda t: t["shap_value"],
    )[:3]

    model_display = {"distilbert": "DistilBERT", "roberta": "RoBERTa",
                     "xlnet": "XLNet"}.get(model_key, model_key)

    if pct >= 90:
        conf_phrase = "with very high confidence"
    elif pct >= 75:
        conf_phrase = "with high confidence"
    elif pct >= 55:
        conf_phrase = "with moderate confidence"
    else:
        conf_phrase = "with low confidence"

    label_desc = {
        "True": "factual and credible reporting",
        "Fake": "fabricated or misleading content",
        "Satire": "satirical or parody content",
        "Bias": "politically or ideologically biased reporting",
    }.get(label, label)

    sentences = [
        f"{model_display} classified this article as {label} ({label_desc}) "
        f"{conf_phrase} ({pct}%)."
    ]

    if supporting:
        word_list = ", ".join('"' + t["word"] + '"' for t in supporting)
        sentences.append(
            f"The words most strongly associated with this classification were {word_list}, "
            f"which the model weighted heavily toward a {label} prediction."
        )

    if opposing:
        word_list = ", ".join('"' + t["word"] + '"' for t in opposing)
        sentences.append(
            f"On the other hand, terms like {word_list} pulled against this classification, "
            f"suggesting some linguistic signals that are inconsistent with {label} content."
        )
    else:
        sentences.append(
            "The model found little linguistic evidence contradicting this classification."
        )

    if pct < 65:
        sentences.append(
            "The relatively lower confidence suggests the article contains mixed signals "
            "and the prediction should be interpreted with caution."
        )

    return " ".join(sentences)
src/models/train.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training script for fake news detection.
3
+
4
+ Usage:
5
+ python -m src.models.train --model distilbert
6
+ python -m src.models.train --model roberta --epochs 5
7
+ python -m src.models.train --all
8
+ """
9
+
10
+ from src.data.dataset import build_dataset, LABEL2ID, ID2LABEL
11
+ from src.models.evaluate import compute_metrics, full_report
12
+ import os
13
+ import sys
14
+ import json
15
+ import argparse
16
+ from pathlib import Path
17
+ from datetime import datetime
18
+
19
+ import torch
20
+ from transformers import (
21
+ AutoTokenizer,
22
+ AutoModelForSequenceClassification,
23
+ TrainingArguments,
24
+ Trainer,
25
+ EarlyStoppingCallback,
26
+ )
27
+ from dotenv import load_dotenv
28
+
29
+ sys.path.insert(0, str(Path(__file__).parents[2]))
30
+ load_dotenv()
31
+
32
+
33
+ MODELS = {
34
+ "distilbert": "distilbert-base-uncased",
35
+ "roberta": "roberta-base",
36
+ "xlnet": "xlnet-base-cased",
37
+ }
38
+
39
+ PROJECT_ROOT = Path(__file__).parents[2]
40
+ MODELS_DIR = PROJECT_ROOT / "models"
41
+ DATA_CSV = PROJECT_ROOT / "data" / "processed" / "Dataset_Clean.csv"
42
+
43
+
44
def get_training_args(model_key, output_dir, epochs, batch_size, learning_rate, use_wandb) -> TrainingArguments:
    """Build the HuggingFace TrainingArguments shared by all three models."""
    return TrainingArguments(
        output_dir=str(output_dir / "checkpoints"),
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        # Eval runs without gradients, so it can afford double the batch size.
        per_device_eval_batch_size=batch_size * 2,
        learning_rate=learning_rate,
        weight_decay=0.01,
        warmup_ratio=0.06,
        # Evaluate and checkpoint once per epoch.
        eval_strategy="epoch",
        save_strategy="epoch",
        # At the end, restore the checkpoint with the best validation macro-F1.
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        # Keep disk usage bounded: only the 2 most recent checkpoints.
        save_total_limit=2,
        logging_dir=str(output_dir / "logs"),
        logging_steps=50,
        report_to="wandb" if use_wandb else "none",
        run_name=f"{model_key}-{datetime.now().strftime('%Y%m%d-%H%M')}",
        # Mixed precision only when a GPU is present.
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=0,
        push_to_hub=False,
    )
67
+
68
+
69
def train_model(model_key, epochs=3, batch_size=16, learning_rate=2e-5, max_length=256, use_wandb=False) -> dict:
    """Full training run for one model. Returns evaluation metrics.

    Pipeline: build + tokenize dataset → fine-tune with early stopping →
    save model + tokenizer to models/<model_key> → evaluate on the test split
    → write metrics.json. `model_key` must be a key of MODELS.
    """
    model_name = MODELS[model_key]
    output_dir = MODELS_DIR / model_key

    print("\n" + "=" * 60)
    print(f"TRAINING: {model_key} ({model_name})")
    print(f"Epochs: {epochs} | Batch: {batch_size} | LR: {learning_rate}")
    print(
        f"Device: {'GPU (' + torch.cuda.get_device_name(0) + ')' if torch.cuda.is_available() else 'CPU'}")
    print("=" * 60 + "\n")

    print("[1/4] Building dataset…")
    # Tokenizer must match the model being fine-tuned.
    tokenized = build_dataset(
        csv_path=DATA_CSV, tokenizer_name=model_name, max_length=max_length)

    print("[2/4] Loading model…")
    # ignore_mismatched_sizes lets the base checkpoint's head be replaced
    # by a freshly initialized 4-label classification head.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=4, id2label=ID2LABEL, label2id=LABEL2ID, ignore_mismatched_sizes=True,
    )

    print("[3/4] Setting up trainer…")
    output_dir.mkdir(parents=True, exist_ok=True)

    if use_wandb:
        import wandb
        wandb.init(
            project=os.getenv("WANDB_PROJECT", "fake-news-detection"),
            name=f"{model_key}-{datetime.now().strftime('%Y%m%d-%H%M')}",
            config={"model": model_name, "epochs": epochs, "batch_size": batch_size,
                    "learning_rate": learning_rate, "max_length": max_length},
        )

    trainer = Trainer(
        model=model,
        args=get_training_args(model_key, output_dir,
                               epochs, batch_size, learning_rate, use_wandb),
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        compute_metrics=compute_metrics,
        # Stop if validation macro-F1 fails to improve for 2 consecutive epochs.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    print("[4/4] Training…\n")
    trainer.train()

    print(f"\n[✓] Saving model to {output_dir}")
    trainer.save_model(str(output_dir))
    # Save the tokenizer alongside so the directory is self-contained for inference.
    AutoTokenizer.from_pretrained(model_name).save_pretrained(str(output_dir))

    print("[✓] Evaluating on test set…")
    metrics = full_report(model, tokenized["test"])

    metrics_path = output_dir / "metrics.json"
    with open(metrics_path, "w") as f:
        json.dump(metrics["report"], f, indent=2)
    print(f"[✓] Metrics saved to {metrics_path}")

    if use_wandb:
        import wandb
        wandb.log(metrics["report"])
        wandb.finish()

    return metrics
133
+
134
+
135
def main():
    """CLI entry point: train one model (default) or all three with --all."""
    parser = argparse.ArgumentParser(
        description="Train fake news detection models")
    parser.add_argument(
        "--model", choices=list(MODELS.keys()), default="distilbert")
    parser.add_argument("--all", action="store_true",
                        help="Train all three models sequentially")
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--batch-size", type=int, default=16)
    parser.add_argument("--lr", type=float, default=2e-5)
    parser.add_argument("--max-length", type=int, default=256)
    parser.add_argument("--wandb", action="store_true")
    args = parser.parse_args()

    targets = list(MODELS.keys()) if args.all else [args.model]
    all_metrics = {}
    for model_key in targets:
        all_metrics[model_key] = train_model(
            model_key=model_key, epochs=args.epochs, batch_size=args.batch_size,
            learning_rate=args.lr, max_length=args.max_length, use_wandb=args.wandb,
        )

    def _fmt(value):
        # Guard against missing metrics: the previous code formatted the 'N/A'
        # fallback with ':.4f', which raises ValueError on a string.
        return f"{value:.4f}" if isinstance(value, (int, float)) else "N/A"

    print("\n" + "=" * 60)
    print("TRAINING SUMMARY")
    print("=" * 60)
    for key, m in all_metrics.items():
        r = m["report"]
        print(f"\n{key.upper()}")
        print(f"  Accuracy:    {_fmt(r.get('accuracy'))}")
        print(f"  Macro F1:    {_fmt(r.get('macro avg', {}).get('f1-score'))}")
        print(f"  Weighted F1: {_fmt(r.get('weighted avg', {}).get('f1-score'))}")
    print("\n" + "=" * 60 + "\n")
169
+
170
+
171
+ if __name__ == "__main__":
172
+ main()
src/utils/__init__.py ADDED
File without changes
src/utils/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (198 Bytes). View file
 
src/utils/__pycache__/gnews_client.cpython-313.pyc ADDED
Binary file (6.04 kB). View file
 
src/utils/__pycache__/supabase_client.cpython-313.pyc ADDED
Binary file (4.75 kB). View file
 
src/utils/gnews_client.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional

import requests
from dotenv import load_dotenv
6
+
7
+ load_dotenv()
8
+
9
+
10
class GNewsClient:
    """Thin client for the GNews v4 REST API (search and top headlines).

    Reads GNEWS_API_KEY (required) and GNEWS_API_URL (optional override of
    the default endpoint) from the environment; a .env file is honoured via
    load_dotenv() at module import.
    """

    def __init__(self):
        self.api_key = os.getenv("GNEWS_API_KEY")
        self.base_url = os.getenv("GNEWS_API_URL", "https://gnews.io/api/v4")
        if not self.api_key:
            raise ValueError(
                "GNEWS_API_KEY must be set in environment variables")

    def search_news(
        self,
        query: str = "politics",
        lang: str = "en",
        country: Optional[str] = None,
        max_results: int = 10,
        from_date: Optional[datetime] = None,
        to_date: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Search for news articles by query.

        Returns a list of normalised article dicts (see _format_articles).
        Network/HTTP errors are logged and yield an empty list rather than
        raising, so callers can treat the result as best-effort.
        """
        params = {
            "q": query,
            "lang": lang,
            "max": min(max_results, 100),  # GNews caps page size at 100
            "apikey": self.api_key,
        }
        if country:
            params["country"] = country
        # The 'Z' suffix declares UTC to the API, so callers should pass
        # timezone-aware UTC datetimes here.
        if from_date:
            params["from"] = from_date.strftime("%Y-%m-%dT%H:%M:%SZ")
        if to_date:
            params["to"] = to_date.strftime("%Y-%m-%dT%H:%M:%SZ")

        try:
            response = requests.get(
                f"{self.base_url}/search", params=params, timeout=10)
            response.raise_for_status()
            return self._format_articles(response.json().get("articles", []))
        except requests.exceptions.RequestException as e:
            print(f"Error fetching news: {e}")
            return []

    def get_top_headlines(
        self,
        category: Optional[str] = None,
        lang: str = "en",
        country: Optional[str] = None,
        max_results: int = 10,
    ) -> List[Dict[str, Any]]:
        """Get top headlines, optionally filtered by category.

        Same best-effort error handling as search_news: failures return [].
        """
        params = {
            "lang": lang,
            "max": min(max_results, 100),  # GNews caps page size at 100
            "apikey": self.api_key,
        }
        if category:
            params["category"] = category
        if country:
            params["country"] = country

        try:
            response = requests.get(
                f"{self.base_url}/top-headlines", params=params, timeout=10)
            response.raise_for_status()
            return self._format_articles(response.json().get("articles", []))
        except requests.exceptions.RequestException as e:
            print(f"Error fetching headlines: {e}")
            return []

    def _format_articles(self, articles: List[Dict]) -> List[Dict[str, Any]]:
        """Normalise raw GNews payload entries into a flat dict schema.

        Missing fields default to "" so downstream code never sees None.
        """
        return [
            {
                "title": article.get("title", ""),
                "description": article.get("description", ""),
                "content": article.get("content", ""),
                "url": article.get("url", ""),
                "image": article.get("image", ""),
                "published_at": article.get("publishedAt", ""),
                "source": article.get("source", {}).get("name", ""),
                "source_url": article.get("source", {}).get("url", ""),
            }
            for article in articles
        ]

    def get_recent_news_for_analysis(
        self,
        topics: Optional[List[str]] = None,
        max_per_topic: int = 5,
    ) -> List[Dict[str, Any]]:
        """Fetch last-24h articles across topics, deduplicated by URL.

        ``topics`` defaults to ["politics", "breaking news", "world news"].
        The default is None (not a list literal) to avoid the shared
        mutable-default-argument pitfall.
        """
        if topics is None:
            topics = ["politics", "breaking news", "world news"]
        all_articles = []
        seen_urls: set = set()
        for topic in topics:
            articles = self.search_news(
                query=topic,
                max_results=max_per_topic,
                # Use aware UTC: the API request is stamped with a 'Z'
                # (Zulu/UTC) suffix, but datetime.now() is local time, so
                # the window sent to the API was previously skewed by the
                # host's UTC offset.
                from_date=datetime.now(timezone.utc) - timedelta(days=1),
            )
            for article in articles:
                url = article.get("url", "")
                if url and url not in seen_urls:
                    seen_urls.add(url)
                    all_articles.append(article)
        return all_articles
112
+
113
+
114
# Process-wide singleton so the client (and its env validation) is built once.
_gnews_client: Optional[GNewsClient] = None


def get_gnews_client() -> GNewsClient:
    """Return the shared GNewsClient, constructing it lazily on first call."""
    global _gnews_client
    if _gnews_client is not None:
        return _gnews_client
    _gnews_client = GNewsClient()
    return _gnews_client
src/utils/supabase_client.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

from dotenv import load_dotenv
from supabase import create_client, Client
6
+
7
+ load_dotenv()
8
+
9
+
10
class SupabaseClient:
    """Persistence layer for model predictions and user feedback in Supabase.

    Credentials come from the environment: SUPABASE_URL plus
    SUPABASE_SERVICE_KEY (preferred) or SUPABASE_KEY as a fallback.
    """

    def __init__(self):
        self.url = os.getenv("SUPABASE_URL")
        # Prefer the privileged service key; fall back to the public key.
        self.key = os.getenv(
            "SUPABASE_SERVICE_KEY") or os.getenv("SUPABASE_KEY")
        if not self.url or not self.key:
            raise ValueError(
                "SUPABASE_URL and SUPABASE_SERVICE_KEY must be set")
        self.client: Client = create_client(self.url, self.key)

    def store_prediction(
        self,
        article_id: str,
        text: str,
        predicted_label: str,
        confidence: float,
        model_name: str,
        explanation=None,
    ) -> Dict[str, Any]:
        """Insert one prediction row; returns the inserted row data."""
        data = {
            "article_id": article_id,
            # Truncate to keep the stored row bounded in size.
            "text": text[:1000],
            "predicted_label": predicted_label,
            "confidence": confidence,
            "model_name": model_name,
            "explanation": explanation,
            # Aware UTC timestamp. datetime.utcnow() is deprecated (3.12+)
            # and returns a naive datetime, which serialises without an
            # offset and is ambiguous to the database.
            "created_at": datetime.now(timezone.utc).isoformat(),
        }
        response = self.client.table("predictions").insert(data).execute()
        return response.data

    def store_feedback(
        self,
        article_id: str,
        predicted_label: str,
        actual_label: str,
        user_comment: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Insert one user-feedback row; returns the inserted row data."""
        data = {
            "article_id": article_id,
            "predicted_label": predicted_label,
            "actual_label": actual_label,
            "user_comment": user_comment,
            # Aware UTC timestamp (see store_prediction).
            "created_at": datetime.now(timezone.utc).isoformat(),
        }
        response = self.client.table("feedback").insert(data).execute()
        return response.data

    def get_prediction_stats(self) -> Dict[str, Any]:
        """Return the total prediction count and a per-label breakdown.

        NOTE(review): label counting is done client-side over all rows;
        fine at small scale, but a SQL GROUP BY would be cheaper for
        large tables.
        """
        total = self.client.table("predictions").select(
            "*", count="exact").execute()
        by_label_rows = self.client.table(
            "predictions").select("predicted_label").execute()
        label_counts: Dict[str, int] = {}
        for row in by_label_rows.data:
            lbl = row["predicted_label"]
            label_counts[lbl] = label_counts.get(lbl, 0) + 1
        return {
            "total_predictions": total.count,
            "by_label": label_counts,
        }

    def get_feedback_for_training(self, limit: int = 1000) -> List[Dict[str, Any]]:
        """Fetch up to ``limit`` feedback rows for retraining pipelines."""
        response = self.client.table("feedback").select(
            "*").limit(limit).execute()
        return response.data
76
+
77
+
78
# Process-wide singleton; reset_client() clears it so the next access rebuilds.
_supabase_client: Optional[SupabaseClient] = None


def get_supabase_client() -> SupabaseClient:
    """Return the shared SupabaseClient, creating it lazily on first access."""
    global _supabase_client
    if _supabase_client is not None:
        return _supabase_client
    _supabase_client = SupabaseClient()
    return _supabase_client


def reset_client():
    """Force re-initialisation."""
    global _supabase_client
    _supabase_client = None