Spaces:

ar07xd
/

deepshield

Running

App Files Files Community

Spyderzz committed 29 days ago

Commit

0853b44

1 Parent(s): a648128

Initial deployment of DeepShield backend

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env +39 -0
Colab_ViT_Training.ipynb +233 -0
Dockerfile +31 -0
README.md +11 -10
__init__.py +0 -0
analyze.py +177 -0
artifact_detector.py +229 -0
auth.py +30 -0
auth_service.py +67 -0
common.py +88 -0
config.py +53 -0
database.py +28 -0
datasets/__init__.py +0 -0
datasets/build_manifest.py +93 -0
datasets/download_dfdc_sample.py +44 -0
datasets/download_ffhq.py +49 -0
datasets/extract_frames.py +90 -0
datasets/procure_all.ps1 +40 -0
datasets/procure_all.sh +37 -0
deepshield_13_5bcf1328.pdf +148 -0
deps.py +46 -0
download_ffpp.py +261 -0
ela_service.py +88 -0
exif_service.py +129 -0
file_handler.py +96 -0
generate_colab_nb.py +213 -0
heatmap_generator.py +164 -0
image_service.py +58 -0
llm_explainer.py +182 -0
main.py +59 -0
model_loader.py +156 -0
models.py +45 -0
news_lookup.py +242 -0
report.html +367 -0
report_service.py +152 -0
requirements.txt +50 -0
router.py +10 -0
scoring.py +46 -0
screenshot_service.py +126 -0
test_image_classify.py +58 -0
test_news_api.py +43 -0
test_phase5.py +70 -0
test_text_analysis.py +34 -0
text_service.py +285 -0
v1/__init__.py +0 -0
v1/__pycache__/__init__.cpython-311.pyc +0 -0
v1/__pycache__/analyze.cpython-311.pyc +0 -0
v1/__pycache__/auth.cpython-311.pyc +0 -0
v1/__pycache__/health.cpython-311.pyc +0 -0
v1/__pycache__/history.cpython-311.pyc +0 -0

.env ADDED Viewed

	@@ -0,0 +1,39 @@

+# ═══════════════════════════════════════
+# DeepShield — Environment Configuration
+# ═══════════════════════════════════════
+# Copy this file to backend/.env and customize
+# Server
+APP_HOST=0.0.0.0
+APP_PORT=8000
+DEBUG=true
+CORS_ORIGINS=["http://localhost:5173"]
+# Database
+# For local dev: sqlite:///./deepshield.db
+# For production (Neon/Supabase): postgresql://username:password@ep-cool...
+# NOTE(review): the DATABASE_URL value below embeds a database user and password in a
+# committed file. Rotate that credential and supply DATABASE_URL from the deployment's
+# secret store instead of version control.
+DATABASE_URL=postgresql://neondb_owner:npg_YUdXqlrDP3H2@ep-divine-tooth-ame27uf3-pooler.c-5.us-east-1.aws.neon.tech/neondb?sslmode=require&channel_binding=require
+# File Upload
+MAX_UPLOAD_SIZE_MB=100
+UPLOAD_DIR=./temp_uploads
+FILE_RETENTION_SECONDS=300
+# AI Models
+IMAGE_MODEL_ID=prithivMLmods/Deep-Fake-Detector-v2-Model
+TEXT_MODEL_ID=jy46604790/Fake-News-Bert-Detect
+DEVICE=cpu
+PRELOAD_MODELS=true
+# News API (optional — sign up at https://newsdata.io)
+# NOTE(review): the NEWS_API_KEY value below looks like a real key checked into version
+# control — confirm, revoke it if so, and inject the key at deploy time.
+NEWS_API_KEY=pub_83c8fca805124a4fb074256825decd4c
+NEWS_API_BASE_URL=https://newsdata.io/api/1/news
+# PDF Reports
+REPORT_DIR=./temp_reports
+REPORT_TTL_SECONDS=3600
+# Auth — CHANGE JWT_SECRET_KEY IN PRODUCTION!
+JWT_SECRET_KEY=change-me-in-production
+JWT_ALGORITHM=HS256
+JWT_EXPIRATION_MINUTES=1440

Colab_ViT_Training.ipynb ADDED Viewed

	@@ -0,0 +1,233 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1e0e7b4a",
+   "metadata": {},
+   "source": [
+    "# DeepShield: FaceForensics++ ViT Training \n",
+    "Run this entirely in Google Colab.\n",
+    "**Before running**:\n",
+    "1. Go to `Runtime` -> `Change runtime type` -> select **T4 GPU**.\n",
+    "2. Run the cells below sequentially.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4fe293e7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install timm transformers datasets accelerate evaluate opencv-python\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9387c0f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We create the download script inside the Colab environment\n",
+    "download_script = '''#!/usr/bin/env python\n",
+    "import argparse\n",
+    "import os\n",
+    "import urllib.request\n",
+    "import tempfile\n",
+    "import time\n",
+    "import sys\n",
+    "import json\n",
+    "from tqdm import tqdm\n",
+    "from os.path import join\n",
+    "\n",
+    "FILELIST_URL = 'misc/filelist.json'\n",
+    "DEEPFEAKES_DETECTION_URL = 'misc/deepfake_detection_filenames.json'\n",
+    "DEEPFAKES_MODEL_NAMES = ['decoder_A.h5', 'decoder_B.h5', 'encoder.h5',]\n",
+    "DATASETS = {\n",
+    "    'original': 'original_sequences/youtube',\n",
+    "    'Deepfakes': 'manipulated_sequences/Deepfakes',\n",
+    "    'Face2Face': 'manipulated_sequences/Face2Face',\n",
+    "    'FaceShifter': 'manipulated_sequences/FaceShifter',\n",
+    "    'FaceSwap': 'manipulated_sequences/FaceSwap',\n",
+    "    'NeuralTextures': 'manipulated_sequences/NeuralTextures'\n",
+    "}\n",
+    "ALL_DATASETS = ['original', 'Deepfakes', 'Face2Face', 'FaceShifter', 'FaceSwap', 'NeuralTextures']\n",
+    "COMPRESSION = ['raw', 'c23', 'c40']\n",
+    "TYPE = ['videos']\n",
+    "\n",
+    "def download_file(url, out_file):\n",
+    "    os.makedirs(os.path.dirname(out_file), exist_ok=True)\n",
+    "    if not os.path.isfile(out_file):\n",
+    "        urllib.request.urlretrieve(url, out_file)\n",
+    "\n",
+    "def main():\n",
+    "    parser = argparse.ArgumentParser()\n",
+    "    parser.add_argument('output_path', type=str)\n",
+    "    parser.add_argument('-d', '--dataset', type=str, default='all')\n",
+    "    parser.add_argument('-c', '--compression', type=str, default='c40')\n",
+    "    parser.add_argument('-t', '--type', type=str, default='videos')\n",
+    "    parser.add_argument('-n', '--num_videos', type=int, default=50) # Small amount for tutorial\n",
+    "    args = parser.parse_args()\n",
+    "    \n",
+    "    base_url = 'http://kaldir.vc.in.tum.de/faceforensics/v3/'\n",
+    "    \n",
+    "    datasets = [args.dataset] if args.dataset != 'all' else ALL_DATASETS\n",
+    "    for dataset in datasets:\n",
+    "        dataset_path = DATASETS[dataset]\n",
+    "        print(f'Downloading {args.compression} of {dataset}')\n",
+    "        \n",
+    "        file_pairs = json.loads(urllib.request.urlopen(base_url + FILELIST_URL).read().decode(\"utf-8\"))\n",
+    "        filelist = []\n",
+    "        if 'original' in dataset_path:\n",
+    "            for pair in file_pairs:\n",
+    "                filelist += pair\n",
+    "        else:\n",
+    "            for pair in file_pairs:\n",
+    "                filelist.append('_'.join(pair))\n",
+    "                filelist.append('_'.join(pair[::-1]))\n",
+    "            \n",
+    "        filelist = filelist[:args.num_videos]\n",
+    "        dataset_videos_url = base_url + f'{dataset_path}/{args.compression}/{args.type}/'\n",
+    "        dataset_output_path = join(args.output_path, dataset_path, args.compression, args.type)\n",
+    "        \n",
+    "        for filename in tqdm(filelist):\n",
+    "            download_file(dataset_videos_url + filename + \".mp4\", join(dataset_output_path, filename + \".mp4\"))\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    main()\n",
+    "'''\n",
+    "\n",
+    "with open(\"download_ffpp.py\", \"w\") as f:\n",
+    "    f.write(download_script)\n",
+    "\n",
+    "!python download_ffpp.py ./data -d all -c c40 -t videos -n 50\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f33716f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import cv2\n",
+    "import os\n",
+    "import glob\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "def extract_frames(video_folder, output_folder, label, max_frames=4):\n",
+    "    os.makedirs(output_folder, exist_ok=True)\n",
+    "    videos = glob.glob(os.path.join(video_folder, \"*.mp4\"))\n",
+    "    \n",
+    "    for vid_path in tqdm(videos, desc=f\"Extracting {label}\"):\n",
+    "        vid_name = os.path.basename(vid_path).replace('.mp4','')\n",
+    "        cap = cv2.VideoCapture(vid_path)\n",
+    "        count = 0\n",
+    "        while cap.isOpened() and count < max_frames:\n",
+    "            ret, frame = cap.read()\n",
+    "            if not ret: break\n",
+    "            frame = cv2.resize(frame, (224, 224))\n",
+    "            out_path = os.path.join(output_folder, f\"{vid_name}_f{count}.jpg\")\n",
+    "            cv2.imwrite(out_path, frame)\n",
+    "            count += 1\n",
+    "        cap.release()\n",
+    "\n",
+    "# Extract Real\n",
+    "extract_frames('./data/original_sequences/youtube/c40/videos', './dataset/real', 'real')\n",
+    "\n",
+    "# Extract Fakes\n",
+    "fakes = ['Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures']\n",
+    "for f in fakes:\n",
+    "    extract_frames(f'./data/manipulated_sequences/{f}/c40/videos', './dataset/fake', 'fake')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b79cdd85",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from datasets import load_dataset\n",
+    "from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer\n",
+    "import torch\n",
+    "\n",
+    "# 1. Load Dataset\n",
+    "dataset = load_dataset('imagefolder', data_dir='./dataset')\n",
+    "# Split into train/validation\n",
+    "dataset = dataset['train'].train_test_split(test_size=0.1)\n",
+    "\n",
+    "# 2. Preprocessor\n",
+    "model_name_or_path = 'google/vit-base-patch16-224-in21k'\n",
+    "processor = ViTImageProcessor.from_pretrained(model_name_or_path)\n",
+    "\n",
+    "def transform(example_batch):\n",
+    "    # Take a list of PIL images and turn them to pixel values\n",
+    "    inputs = processor([x.convert(\"RGB\") for x in example_batch['image']], return_tensors='pt')\n",
+    "    inputs['labels'] = example_batch['label']\n",
+    "    return inputs\n",
+    "\n",
+    "prepared_ds = dataset.with_transform(transform)\n",
+    "\n",
+    "def collate_fn(batch):\n",
+    "    return {\n",
+    "        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),\n",
+    "        'labels': torch.tensor([x['labels'] for x in batch])\n",
+    "    }\n",
+    "\n",
+    "# 3. Load Model\n",
+    "labels = dataset['train'].features['label'].names\n",
+    "model = ViTForImageClassification.from_pretrained(\n",
+    "    model_name_or_path,\n",
+    "    num_labels=len(labels),\n",
+    "    id2label={str(i): c for i, c in enumerate(labels)},\n",
+    "    label2id={c: str(i) for i, c in enumerate(labels)}\n",
+    ")\n",
+    "\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"./vit-deepshield\",\n",
+    "    per_device_train_batch_size=16,\n",
+    "    eval_strategy=\"steps\",\n",
+    "    num_train_epochs=3,\n",
+    "    fp16=True, # Mixed precision for speed\n",
+    "    save_steps=100,\n",
+    "    eval_steps=100,\n",
+    "    logging_steps=10,\n",
+    "    learning_rate=2e-4,\n",
+    "    save_total_limit=2,\n",
+    "    remove_unused_columns=False,\n",
+    "    push_to_hub=False,\n",
+    "    load_best_model_at_end=True,\n",
+    ")\n",
+    "\n",
+    "import evaluate\n",
+    "metric = evaluate.load(\"accuracy\")\n",
+    "def compute_metrics(p):\n",
+    "    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    data_collator=collate_fn,\n",
+    "    compute_metrics=compute_metrics,\n",
+    "    train_dataset=prepared_ds[\"train\"],\n",
+    "    eval_dataset=prepared_ds[\"test\"],\n",
+    ")\n",
+    "\n",
+    "# 4. Train\n",
+    "train_results = trainer.train()\n",
+    "trainer.save_model(\"deepshield_vit_model\")\n",
+    "processor.save_pretrained(\"deepshield_vit_model\")\n",
+    "trainer.log_metrics(\"train\", train_results.metrics)\n",
+    "trainer.save_metrics(\"train\", train_results.metrics)\n",
+    "trainer.save_state()\n",
+    "print(\"Training Complete! The model is saved to ./deepshield_vit_model\")\n"
+   ]
+  }
+ ],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

Dockerfile ADDED Viewed

	@@ -0,0 +1,31 @@

+# Base image with Python 3.10
+FROM python:3.10-slim
+# Set the working directory
+WORKDIR /app
+# Install system dependencies required for OpenCV, PyTorch, etc.
+# libgl1 is used instead of libgl1-mesa-glx: the latter was only a transitional package
+# and is no longer available on the newer Debian releases that python:3.10-slim is built on.
+RUN apt-get update && apt-get install -y \
+    libgl1 \
+    libglib2.0-0 \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Copy the requirements file into the container
+COPY requirements.txt .
+# Install Python dependencies
+# Using --no-cache-dir keeps the Docker image smaller
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the backend code
+COPY . .
+# Create directories for models and temporary uploads if they don't exist
+RUN mkdir -p /app/temp_uploads /app/models
+# Expose port 7860 (This is the default port required by Hugging Face Spaces)
+EXPOSE 7860
+# Run the FastAPI server on port 7860
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,11 +1,12 @@
----
-title: Deepshield
-emoji: 🏆
-colorFrom: yellow
-colorTo: purple
-sdk: docker
-pinned: false
-license: mit
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# backend/training
+Training pipeline for the DeepShield image detector (BUILD_PLAN2 Phase 11).
+| Phase | Module |
+|---|---|
+| 11.1 Dataset procurement | [`datasets/`](./datasets/) — see [../../docs/datasets.md](../../docs/datasets.md) |
+| 11.2 Training | `dataset.py`, `train_convnext.py` (pending) |
+| 11.2 Calibration | `calibrate.py` (pending) |
+| 11.2 Evaluation | `eval.py` (pending) |
+Run `bash datasets/procure_all.sh` to build `./data/manifest.csv`.

__init__.py ADDED Viewed

File without changes

analyze.py ADDED Viewed

	@@ -0,0 +1,177 @@

+from __future__ import annotations
+from typing import List
+from pydantic import BaseModel
+from schemas.common import (
+    ArtifactIndicator,
+    ContradictingEvidence,
+    ExifSummary,
+    LLMExplainabilitySummary,
+    ProcessingSummary,
+    TrustedSource,
+    TruthOverride,
+    Verdict,
+    VLMBreakdown,
+)
+class SensationalismBreakdown(BaseModel):
+    # Overall sensationalism score and level label plus the individual counters
+    # reported alongside them. Every field has a default (zeros / "Low"), so a
+    # bare SensationalismBreakdown() is valid.
+    score: int = 0
+    level: str = "Low"
+    exclamation_count: int = 0
+    caps_word_count: int = 0
+    clickbait_matches: int = 0
+    emotional_word_count: int = 0
+    superlative_count: int = 0
+class ManipulationIndicatorOut(BaseModel):
+    # One matched manipulation pattern. All fields are required.
+    # NOTE(review): start_pos/end_pos presumably are character offsets of
+    # matched_text in the analyzed text — confirm against the producer.
+    pattern_type: str
+    matched_text: str
+    start_pos: int
+    end_pos: int
+    severity: str
+    description: str
+class TextExplainability(BaseModel):
+    # Explainability payload attached to a text analysis response.
+    # fake_probability and top_label are required; everything else defaults to
+    # an empty / neutral value.
+    fake_probability: float
+    top_label: str
+    # Untyped dict; the key/value schema is not defined in this module.
+    all_scores: dict = {}
+    keywords: List[str] = []
+    sensationalism: SensationalismBreakdown = SensationalismBreakdown()
+    manipulation_indicators: List[ManipulationIndicatorOut] = []
+    detected_language: str = "en"       # ISO 639-1 code, e.g. "en", "hi"
+    # None unless a TruthOverride record is supplied.
+    truth_override: TruthOverride | None = None
+class TextAnalysisResponse(BaseModel):
+    # Response body for a text analysis; media_type defaults to "text".
+    # Required: analysis_id, timestamp, verdict, explainability, processing_summary.
+    analysis_id: str
+    # NOTE(review): 0 presumably means "no stored record" — confirm with the caller.
+    record_id: int = 0
+    media_type: str = "text"
+    timestamp: str
+    verdict: Verdict
+    explainability: TextExplainability
+    llm_summary: LLMExplainabilitySummary | None = None
+    trusted_sources: List[TrustedSource] = []
+    contradicting_evidence: List[ContradictingEvidence] = []
+    processing_summary: ProcessingSummary
+    # Fixed disclaimer included in every response unless explicitly overridden.
+    responsible_ai_notice: str = (
+        "AI-based analysis may not be 100% accurate. Cross-check with trusted sources before sharing."
+    )
+class OCRBoxOut(BaseModel):
+    # One OCR text region with its confidence.
+    # NOTE(review): bbox is a list of integer lists — presumably [x, y] corner
+    # points; the exact point count/order is not visible here, confirm with the OCR code.
+    text: str
+    bbox: List[List[int]]
+    confidence: float
+class SuspiciousPhraseOut(BaseModel):
+    # A flagged phrase together with its bounding box and the pattern it matched.
+    # bbox uses the same List[List[int]] shape as OCRBoxOut.bbox. All fields required.
+    text: str
+    bbox: List[List[int]]
+    pattern_type: str
+    severity: str
+    description: str
+class LayoutAnomalyOut(BaseModel):
+    # One layout anomaly finding: a type tag, a severity label, a human-readable
+    # description and a confidence value. All fields required; no range is
+    # enforced on confidence here.
+    type: str
+    severity: str
+    description: str
+    confidence: float
+class ScreenshotExplainability(BaseModel):
+    # Explainability payload for a screenshot analysis. Every field has a
+    # default, so an empty instance is valid.
+    extracted_text: str = ""
+    ocr_boxes: List[OCRBoxOut] = []
+    fake_probability: float = 0.0
+    sensationalism: SensationalismBreakdown = SensationalismBreakdown()
+    suspicious_phrases: List[SuspiciousPhraseOut] = []
+    layout_anomalies: List[LayoutAnomalyOut] = []
+    keywords: List[str] = []
+    detected_language: str = "en"
+    # None unless a TruthOverride record is supplied.
+    truth_override: TruthOverride | None = None
+class ScreenshotAnalysisResponse(BaseModel):
+    # Response body for a screenshot analysis; media_type defaults to "screenshot".
+    # Required: analysis_id, timestamp, verdict, explainability, processing_summary.
+    analysis_id: str
+    # NOTE(review): 0 presumably means "no stored record" — confirm with the caller.
+    record_id: int = 0
+    media_type: str = "screenshot"
+    timestamp: str
+    verdict: Verdict
+    explainability: ScreenshotExplainability
+    llm_summary: LLMExplainabilitySummary | None = None
+    trusted_sources: List[TrustedSource] = []
+    contradicting_evidence: List[ContradictingEvidence] = []
+    processing_summary: ProcessingSummary
+    # Fixed disclaimer included in every response unless explicitly overridden.
+    responsible_ai_notice: str = (
+        "AI-based analysis may not be 100% accurate. Cross-check with trusted sources before sharing."
+    )
+class ImageExplainability(BaseModel):
+    # Explainability payload for an image analysis. The three *_base64 fields
+    # default to empty strings; optional sub-reports default to None.
+    heatmap_base64: str = ""
+    ela_base64: str = ""
+    boxes_base64: str = ""
+    heatmap_status: str = "success"  # success | failed | degraded
+    artifact_indicators: List[ArtifactIndicator] = []
+    exif: ExifSummary | None = None
+    # For images the LLM summary is carried here rather than on the response model.
+    llm_summary: LLMExplainabilitySummary | None = None
+    vlm_breakdown: VLMBreakdown | None = None
+class FrameAnalysisOut(BaseModel):
+    # Per-frame result inside a video analysis. The first six fields are
+    # required; has_face and scored default to False.
+    index: int
+    timestamp_s: float
+    label: str
+    confidence: float
+    suspicious_prob: float
+    is_suspicious: bool
+    has_face: bool = False
+    # NOTE(review): presumably marks whether this frame counted toward the
+    # aggregate statistics — confirm against the video service.
+    scored: bool = False
+class VideoExplainability(BaseModel):
+    # Aggregate statistics over the sampled frames of a video plus the
+    # per-frame details. The frame counts, mean/max suspicious probability and
+    # suspicious_ratio are required; the remaining fields have defaults.
+    num_frames_sampled: int
+    num_face_frames: int = 0
+    num_suspicious_frames: int
+    mean_suspicious_prob: float
+    max_suspicious_prob: float
+    suspicious_ratio: float
+    insufficient_faces: bool = False
+    suspicious_timestamps: List[float] = []
+    frames: List[FrameAnalysisOut] = []
+class VideoAnalysisResponse(BaseModel):
+    # Response body for a video analysis; media_type defaults to "video".
+    # Required: analysis_id, timestamp, verdict, explainability, processing_summary.
+    analysis_id: str
+    # NOTE(review): 0 presumably means "no stored record" — confirm with the caller.
+    record_id: int = 0
+    media_type: str = "video"
+    timestamp: str
+    verdict: Verdict
+    explainability: VideoExplainability
+    llm_summary: LLMExplainabilitySummary | None = None
+    trusted_sources: List[TrustedSource] = []
+    contradicting_evidence: List[ContradictingEvidence] = []
+    processing_summary: ProcessingSummary
+    # Fixed disclaimer included in every response unless explicitly overridden.
+    responsible_ai_notice: str = (
+        "AI-based analysis may not be 100% accurate. Cross-check with trusted sources before sharing."
+    )
+class ImageAnalysisResponse(BaseModel):
+    # Response body for an image analysis; media_type defaults to "image".
+    # Unlike the text, screenshot and video responses this model has no
+    # top-level llm_summary field; ImageExplainability declares one instead.
+    analysis_id: str
+    # NOTE(review): 0 presumably means "no stored record" — confirm with the caller.
+    record_id: int = 0
+    media_type: str = "image"
+    timestamp: str
+    verdict: Verdict
+    explainability: ImageExplainability
+    trusted_sources: List[TrustedSource] = []
+    contradicting_evidence: List[ContradictingEvidence] = []
+    processing_summary: ProcessingSummary
+    # Fixed disclaimer included in every response unless explicitly overridden.
+    responsible_ai_notice: str = (
+        "AI-based analysis may not be 100% accurate. Cross-check with trusted sources before sharing."
+    )

artifact_detector.py ADDED Viewed

	@@ -0,0 +1,229 @@

+from __future__ import annotations
+import io
+from typing import List
+import numpy as np
+from loguru import logger
+from PIL import Image
+from schemas.common import ArtifactIndicator
+def _severity_from_score(score: float) -> str:
+    if score >= 0.7:
+        return "high"
+    if score >= 0.4:
+        return "medium"
+    return "low"
+# ---------- 1. GAN high-frequency signature (FFT) ----------
+def detect_gan_hf_artifact(pil_img: Image.Image) -> ArtifactIndicator | None:
+    """Compute high-frequency energy ratio on the luminance channel.
+    Real photos typically follow a ~1/f spectrum; many GAN outputs show
+    elevated HF energy or spectral peaks.
+    """
+    try:
+        gray = np.asarray(pil_img.convert("L"), dtype=np.float32)
+        # downsample for speed
+        if max(gray.shape) > 512:
+            import cv2
+            scale = 512 / max(gray.shape)
+            gray = cv2.resize(gray, (int(gray.shape[1] * scale), int(gray.shape[0] * scale)))
+        fft = np.fft.fftshift(np.fft.fft2(gray))
+        mag = np.abs(fft)
+        h, w = mag.shape
+        cy, cx = h // 2, w // 2
+        y, x = np.ogrid[:h, :w]
+        r = np.sqrt((x - cx) ** 2 + (y - cy) ** 2)
+        r_max = np.sqrt(cx * cx + cy * cy)
+        hf_mask = r > (0.5 * r_max)
+        total = float(mag.sum() + 1e-9)
+        hf = float(mag[hf_mask].sum())
+        ratio = hf / total  # typically 0.05–0.20 for natural photos
+        # normalize to [0,1] suspiciousness
+        score = max(0.0, min(1.0, (ratio - 0.10) / 0.20))
+        sev = _severity_from_score(score)
+        return ArtifactIndicator(
+            type="gan_artifact",
+            severity=sev,
+            description=(
+                f"High-frequency energy ratio {ratio:.3f} — "
+                + ("elevated HF energy consistent with GAN/diffusion outputs" if score > 0.4
+                   else "natural frequency falloff")
+            ),
+            confidence=float(score),
+        )
+    except Exception as e:  # noqa: BLE001
+        logger.warning(f"GAN HF detection failed: {e}")
+        return None
+# ---------- 2. JPEG quantization table anomaly ----------
+# Maps a JPEG quality level to a (low, high) range of quantization-table sums.
+# NOTE(review): not referenced by any code visible in this module's diff —
+# confirm whether it is still needed or was meant to be used by
+# detect_compression_anomaly.
+_STANDARD_Q_SUMS = {  # rough heuristic: camera JPEGs fall in these ranges
+    50: (1500, 4500),
+    75: (600, 2500),
+    90: (200, 1000),
+    95: (100, 600),
+}
+def detect_compression_anomaly(raw_bytes: bytes) -> ArtifactIndicator | None:
+    """Inspect JPEG quantization tables. Missing tables, non-standard layouts,
+    or re-saved tables often indicate manipulation or re-encoding.
+    """
+    try:
+        img = Image.open(io.BytesIO(raw_bytes))
+        if img.format != "JPEG":
+            return ArtifactIndicator(
+                type="compression",
+                severity="low",
+                description=f"Non-JPEG format ({img.format}); compression signature not available",
+                confidence=0.1,
+            )
+        q = getattr(img, "quantization", None)
+        if not q:
+            return ArtifactIndicator(
+                type="compression",
+                severity="low",
+                description="No JPEG quantization tables readable",
+                confidence=0.2,
+            )
+        tables = list(q.values())
+        sums = [int(sum(t)) for t in tables]
+        num_tables = len(tables)
+        # Heuristics: very low sum → very high quality (possibly re-saved);
+        # non-standard number of tables; extreme values.
+        suspicious = 0.0
+        reasons: list[str] = []
+        if num_tables not in (1, 2):
+            suspicious += 0.4
+            reasons.append(f"unusual table count ({num_tables})")
+        if any(s < 60 for s in sums):
+            suspicious += 0.3
+            reasons.append("very low quantization sums (possible re-encoding)")
+        if any(s > 8000 for s in sums):
+            suspicious += 0.2
+            reasons.append("very high quantization sums")
+        score = max(0.0, min(1.0, suspicious))
+        sev = _severity_from_score(score)
+        desc = (
+            f"JPEG Q-table sums {sums}"
+            + (f"; {', '.join(reasons)}" if reasons else "; within typical camera range")
+        )
+        return ArtifactIndicator(
+            type="compression",
+            severity=sev,
+            description=desc,
+            confidence=float(score),
+        )
+    except Exception as e:  # noqa: BLE001
+        logger.warning(f"Compression anomaly detection failed: {e}")
+        return None
+# ---------- 3. Facial boundary + 4. Lighting (MediaPipe) ----------
+def detect_face_based_artifacts(pil_img: Image.Image) -> List[ArtifactIndicator]:
+    """If a face is detected, analyze jaw boundary variance and per-quadrant
+    luminance balance. Returns 0, 1, or 2 indicators.
+    Only the first face returned by the detector is analyzed. Any exception is
+    logged as a warning and whatever indicators were collected before it are
+    still returned.
+    """
+    results: List[ArtifactIndicator] = []
+    try:
+        # `mp` is not referenced below; presumably the import only verifies that
+        # mediapipe is installed before the detector is loaded — TODO confirm.
+        import mediapipe as mp  # type: ignore
+        from models.model_loader import get_model_loader
+        detector = get_model_loader().load_face_detector()
+        rgb = np.asarray(pil_img.convert("RGB"))
+        h, w = rgb.shape[:2]
+        mp_result = detector.process(rgb)
+        if not mp_result.multi_face_landmarks:
+            return results
+        landmarks = mp_result.multi_face_landmarks[0].landmark
+        # ----- Jaw boundary jitter -----
+        # FaceMesh jaw/oval landmark indices (approximate face contour)
+        JAW_IDX = [
+            10, 338, 297, 332, 284, 251, 389, 356, 454, 323, 361,
+            288, 397, 365, 379, 378, 400, 377, 152, 148, 176, 149,
+            150, 136, 172, 58, 132, 93, 234, 127, 162, 21, 54, 103, 67, 109,
+        ]
+        # Landmark x/y are scaled by image width/height to get pixel coordinates
+        # (assumes the detector reports normalized coordinates — TODO confirm).
+        pts = np.array([(landmarks[i].x * w, landmarks[i].y * h) for i in JAW_IDX])
+        # Second-difference magnitude = local curvature jitter
+        diffs = np.diff(pts, axis=0)
+        seconds = np.diff(diffs, axis=0)
+        # Mean second-difference length, normalized by the longer image side.
+        jitter = float(np.linalg.norm(seconds, axis=1).mean()) / max(w, h)
+        # Linear map of jitter from [0.003, 0.013] onto [0, 1], clamped.
+        jitter_score = max(0.0, min(1.0, (jitter - 0.003) / 0.010))
+        results.append(
+            ArtifactIndicator(
+                type="facial_boundary",
+                severity=_severity_from_score(jitter_score),
+                description=(
+                    f"Jaw-contour jitter {jitter:.4f} (normalized) — "
+                    + ("inconsistent boundary blending detected" if jitter_score > 0.4
+                       else "face boundary appears smooth")
+                ),
+                confidence=float(jitter_score),
+            )
+        )
+        # ----- Lighting inconsistency (per-quadrant luminance) -----
+        # Bounding box of all landmarks, clipped to the image.
+        xs = np.array([lm.x * w for lm in landmarks])
+        ys = np.array([lm.y * h for lm in landmarks])
+        x0, x1 = int(max(0, xs.min())), int(min(w, xs.max()))
+        y0, y1 = int(max(0, ys.min())), int(min(h, ys.max()))
+        # Skip the lighting check when the crop is 4 px or less in either dimension.
+        if x1 > x0 + 4 and y1 > y0 + 4:
+            face_crop = rgb[y0:y1, x0:x1]
+            # Weighted RGB sum (0.299 / 0.587 / 0.114) as the luminance estimate.
+            gray = 0.299 * face_crop[..., 0] + 0.587 * face_crop[..., 1] + 0.114 * face_crop[..., 2]
+            hh, ww = gray.shape
+            # Top-left, top-right, bottom-left, bottom-right quadrants of the crop.
+            quads = [
+                gray[: hh // 2, : ww // 2],
+                gray[: hh // 2, ww // 2 :],
+                gray[hh // 2 :, : ww // 2],
+                gray[hh // 2 :, ww // 2 :],
+            ]
+            means = np.array([q.mean() for q in quads if q.size > 0])
+            # Require all four quadrants and a non-black crop before dividing by the mean.
+            if means.size == 4 and means.mean() > 1e-3:
+                # Coefficient of variation of the quadrant means.
+                imbalance = float(means.std() / means.mean())
+                # Linear map of imbalance from [0.08, 0.28] onto [0, 1], clamped.
+                lighting_score = max(0.0, min(1.0, (imbalance - 0.08) / 0.20))
+                results.append(
+                    ArtifactIndicator(
+                        type="lighting",
+                        severity=_severity_from_score(lighting_score),
+                        description=(
+                            f"Luminance imbalance across face quadrants {imbalance:.3f} — "
+                            + ("inconsistent lighting direction" if lighting_score > 0.4
+                               else "lighting appears uniform")
+                        ),
+                        confidence=float(lighting_score),
+                    )
+                )
+    except Exception as e:  # noqa: BLE001
+        # Best-effort: keep any indicators gathered before the failure.
+        logger.warning(f"Face-based artifact detection failed: {e}")
+    return results
+# ---------- Orchestrator ----------
+def scan_artifacts(pil_img: Image.Image, raw_bytes: bytes) -> List[ArtifactIndicator]:
+    indicators: List[ArtifactIndicator] = []
+    for fn in (
+        lambda: detect_gan_hf_artifact(pil_img),
+        lambda: detect_compression_anomaly(raw_bytes),
+    ):
+        ind = fn()
+        if ind is not None:
+            indicators.append(ind)
+    indicators.extend(detect_face_based_artifacts(pil_img))
+    return indicators

auth.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from __future__ import annotations
+from datetime import datetime
+from pydantic import BaseModel, EmailStr, Field
+class RegisterBody(BaseModel):
+    # Request body for registration. email must validate as EmailStr; password
+    # must be 6-128 characters; name is optional and capped at 255 characters.
+    # NOTE(review): auth_service._encode_pw in this commit truncates the password
+    # to 72 UTF-8 bytes before hashing, so anything beyond that is ignored.
+    email: EmailStr
+    password: str = Field(min_length=6, max_length=128)
+    name: str | None = Field(default=None, max_length=255)
+class LoginBody(BaseModel):
+    # Request body for login. Unlike RegisterBody, no length limits are
+    # applied to password here.
+    email: EmailStr
+    password: str
+class UserOut(BaseModel):
+    # User fields exposed in responses: id, email, optional name and creation
+    # time. No password-related field is declared.
+    id: int
+    email: str
+    name: str | None = None
+    created_at: datetime
+class TokenResponse(BaseModel):
+    # Token payload: the access token, its type (defaults to "bearer"), its
+    # lifetime in minutes and the user it belongs to.
+    access_token: str
+    token_type: str = "bearer"
+    expires_in_minutes: int
+    user: UserOut

auth_service.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from __future__ import annotations
+from datetime import datetime, timedelta, timezone
+from typing import Any
+import bcrypt
+from jose import JWTError, jwt
+from sqlalchemy.orm import Session
+from config import settings
+from db.models import User
+def _encode_pw(plain: str) -> bytes:
+    # bcrypt truncates to 72 bytes silently in some builds and hard-errors in others.
+    # Truncate explicitly so behavior is deterministic across versions.
+    return plain.encode("utf-8")[:72]
+def hash_password(plain: str) -> str:
+    return bcrypt.hashpw(_encode_pw(plain), bcrypt.gensalt()).decode("utf-8")
+def verify_password(plain: str, hashed: str) -> bool:
+    try:
+        return bcrypt.checkpw(_encode_pw(plain), hashed.encode("utf-8"))
+    except Exception:
+        return False
+def create_access_token(user_id: int, email: str) -> str:
+    now = datetime.now(timezone.utc)
+    payload = {
+        "sub": str(user_id),
+        "email": email,
+        "iat": int(now.timestamp()),
+        "exp": int((now + timedelta(minutes=settings.JWT_EXPIRATION_MINUTES)).timestamp()),
+    }
+    return jwt.encode(payload, settings.JWT_SECRET_KEY, algorithm=settings.JWT_ALGORITHM)
+def decode_token(token: str) -> dict[str, Any] | None:
+    try:
+        return jwt.decode(token, settings.JWT_SECRET_KEY, algorithms=[settings.JWT_ALGORITHM])
+    except JWTError:
+        return None
+def register_user(db: Session, email: str, password: str, name: str | None) -> User:
+    email = email.strip().lower()
+    user = User(email=email, password_hash=hash_password(password), name=(name or None))
+    db.add(user)
+    db.commit()
+    db.refresh(user)
+    return user
+def authenticate(db: Session, email: str, password: str) -> User | None:
+    email = email.strip().lower()
+    user = db.query(User).filter(User.email == email).first()
+    if not user or not verify_password(password, user.password_hash):
+        return None
+    return user
+def get_user(db: Session, user_id: int) -> User | None:
+    return db.query(User).filter(User.id == user_id).first()

common.py ADDED Viewed

	@@ -0,0 +1,88 @@

+from __future__ import annotations
+from typing import List, Optional
+from pydantic import BaseModel, ConfigDict, Field
+class Verdict(BaseModel):
+    # Final verdict of an analysis. authenticity_score is validated to 0-100
+    # and model_confidence to 0.0-1.0.
+    # protected_namespaces is cleared because the model_confidence / model_label
+    # field names start with pydantic's protected "model_" prefix.
+    model_config = ConfigDict(protected_namespaces=())
+    label: str
+    severity: str
+    authenticity_score: int = Field(ge=0, le=100)
+    model_confidence: float = Field(ge=0.0, le=1.0)
+    model_label: str
+class ArtifactIndicator(BaseModel):
+    # One artifact finding: a type tag, a severity label, a human-readable
+    # description and a confidence validated to lie in [0.0, 1.0].
+    type: str
+    severity: str  # low | medium | high
+    description: str
+    confidence: float = Field(ge=0.0, le=1.0)
+class TrustedSource(BaseModel):
+    # A source reference with a relevance score validated to [0.0, 1.0].
+    # published_at is an optional string; its format is not constrained here.
+    source_name: str
+    title: str
+    url: str
+    published_at: Optional[str] = None
+    relevance_score: float = Field(ge=0.0, le=1.0)
+class ContradictingEvidence(BaseModel):
+    # A source reference for contradicting evidence; type defaults to "fact_check".
+    source_name: str
+    title: str
+    url: str
+    type: str = "fact_check"
+class TruthOverride(BaseModel):
+    # Record of a truth override: whether it was applied, the source involved,
+    # a similarity value, and the fake probability before and after.
+    # All fields default to "not applied" / empty / 0.0.
+    applied: bool = False
+    source_url: str = ""
+    source_name: str = ""
+    similarity: float = 0.0
+    fake_prob_before: float = 0.0
+    fake_prob_after: float = 0.0
+class ExifSummary(BaseModel):
+    # Selected EXIF fields (all optional strings) plus a trust adjustment and
+    # the textual reason for it.
+    make: Optional[str] = None
+    model: Optional[str] = None
+    datetime_original: Optional[str] = None
+    gps_info: Optional[str] = None
+    software: Optional[str] = None
+    lens_model: Optional[str] = None
+    trust_adjustment: int = 0  # negative = more real, positive = more fake
+    trust_reason: str = ""
+class LLMExplainabilitySummary(BaseModel):
+    # LLM-written explanation: a paragraph, a bullet list, the name of the model
+    # used and a cached flag. All fields default to empty / False.
+    paragraph: str = ""
+    bullets: List[str] = []
+    model_used: str = ""
+    cached: bool = False
+class VLMComponentScore(BaseModel):  # One component score with optional notes.
+    score: int = Field(ge=0, le=100, default=75)  # validated to 0-100; defaults to 75 when omitted
+    notes: str = ""
+class VLMBreakdown(BaseModel):  # Six component scores; each defaults to VLMComponentScore() (score 75, empty notes).
+    facial_symmetry: VLMComponentScore = VLMComponentScore()
+    skin_texture: VLMComponentScore = VLMComponentScore()
+    lighting_consistency: VLMComponentScore = VLMComponentScore()
+    background_coherence: VLMComponentScore = VLMComponentScore()
+    anatomy_hands_eyes: VLMComponentScore = VLMComponentScore()
+    context_objects: VLMComponentScore = VLMComponentScore()
+    model_used: str = ""
+    cached: bool = False
+class ProcessingSummary(BaseModel):  # Run summary: completed stage names, total duration in milliseconds and the model name; all three are required.
+    model_config = ConfigDict(protected_namespaces=())  # empty tuple disables pydantic's protected "model_" prefix check so model_used does not warn
+    stages_completed: List[str]
+    total_duration_ms: int
+    model_used: str

config.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from pydantic_settings import BaseSettings, SettingsConfigDict
+class Settings(BaseSettings):  # Application settings; each field can be overridden from the environment or the .env file, otherwise the default below applies.
+    # Server
+    APP_HOST: str = "0.0.0.0"
+    APP_PORT: int = 8000
+    DEBUG: bool = False
+    CORS_ORIGINS: list[str] = ["http://localhost:5173"]
+    # Database
+    DATABASE_URL: str = "sqlite:///./deepshield.db"
+    # File Upload
+    MAX_UPLOAD_SIZE_MB: int = 100
+    UPLOAD_DIR: str = "./temp_uploads"
+    ALLOWED_IMAGE_TYPES: list[str] = ["image/jpeg", "image/png", "image/webp"]
+    ALLOWED_VIDEO_TYPES: list[str] = ["video/mp4", "video/avi", "video/mov", "video/webm"]
+    FILE_RETENTION_SECONDS: int = 300  # 5 minutes
+    # AI Models
+    IMAGE_MODEL_ID: str = "prithivMLmods/Deep-Fake-Detector-v2-Model"
+    TEXT_MODEL_ID: str = "jy46604790/Fake-News-Bert-Detect"
+    # Multilingual text model for non-English (Hindi etc.). Leave empty to fall back to TEXT_MODEL_ID.
+    TEXT_MULTILANG_MODEL_ID: str = ""
+    DEVICE: str = "cpu"
+    PRELOAD_MODELS: bool = True  # preload models at startup
+    # Phase 13: OCR language list (comma-separated ISO codes, e.g. "en,hi")
+    OCR_LANGS: str = "en,hi"
+    # News API
+    NEWS_API_KEY: str = ""
+    NEWS_API_BASE_URL: str = "https://newsdata.io/api/1/news"
+    # Reports
+    REPORT_DIR: str = "./temp_reports"
+    REPORT_TTL_SECONDS: int = 3600  # 1h expiry
+    # LLM Explainability (Phase 12)
+    LLM_PROVIDER: str = "gemini"  # "gemini" | "openai"
+    LLM_API_KEY: str = ""
+    LLM_MODEL: str = "gemini-1.5-flash"  # or "gpt-4o-mini"
+    # Auth
+    JWT_SECRET_KEY: str = "change-me-in-production"  # NOTE(review): placeholder default; anyone who knows it can sign tokens, so it must be overridden via the environment outside local dev
+    JWT_ALGORITHM: str = "HS256"
+    JWT_EXPIRATION_MINUTES: int = 1440  # 24 hours
+    model_config = SettingsConfigDict(env_file=".env", extra="ignore")  # read ".env" (relative to the working directory); unknown keys are ignored rather than rejected
+settings = Settings()  # module-level instance, built once at import time

database.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from sqlalchemy import create_engine
+from sqlalchemy.orm import DeclarativeBase, sessionmaker
+from config import settings
+engine = create_engine(
+    settings.DATABASE_URL,
+    connect_args={"check_same_thread": False} if settings.DATABASE_URL.startswith("sqlite") else {},  # SQLite only: lift sqlite3's same-thread restriction on connections; other back ends get no extra connect args
+)
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)  # session factory bound to the engine above; autoflush is off, so pending changes are sent on explicit flush/commit
+class Base(DeclarativeBase):  # Declarative base class; init_db() creates the tables registered on Base.metadata.
+    pass
+def get_db():  # Generator: yields one SessionLocal session, then closes it.
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()  # runs when the generator finishes or is closed, including after an exception in the consumer
+def init_db():  # Create all tables known to Base.metadata; the local import is unused by name (presumably it registers the ORM models first -- confirm).
+    from db import models  # noqa: F401
+    Base.metadata.create_all(bind=engine)  # create_all only creates missing tables; it does not alter existing ones

datasets/__init__.py ADDED Viewed

File without changes

datasets/build_manifest.py ADDED Viewed

	@@ -0,0 +1,93 @@

+"""Build a unified train/val/test manifest (70/15/15) across all dataset buckets.
+Expected input layout (produced by the other scripts in this package):
+    data_root/
+      real/
+        ffpp_youtube/*.jpg          # frames from FFPP original_sequences
+        ffhq/*.jpg                  # FFHQ thumbnails
+      fake/
+        ffpp_deepfakes/*.jpg
+        ffpp_face2face/*.jpg
+        ffpp_faceswap/*.jpg
+        ffpp_neuraltextures/*.jpg
+        ffpp_faceshifter/*.jpg
+        dfdc/*.jpg
+The manifest is stratified by (label, source) so FFHQ stays represented
+in val/test.
+Usage:
+    python -m backend.training.datasets.build_manifest \
+        --data ./data --out ./data/manifest.csv --seed 42
+"""
+from __future__ import annotations
+import argparse
+import csv
+import random
+from collections import defaultdict
+from pathlib import Path
+IMG_EXTS = {".jpg", ".jpeg", ".png"}
+def collect(data_root: Path) -> list[tuple[str, str, str]]:
+    rows: list[tuple[str, str, str]] = []
+    for label in ("real", "fake"):
+        label_root = data_root / label
+        if not label_root.exists():
+            continue
+        for source_dir in sorted(p for p in label_root.iterdir() if p.is_dir()):
+            for img in source_dir.rglob("*"):
+                if img.suffix.lower() in IMG_EXTS and img.is_file():
+                    rows.append((str(img.resolve()), label, source_dir.name))
+    return rows
+def split(rows: list[tuple[str, str, str]], seed: int) -> dict[str, list[tuple[str, str, str]]]:
+    buckets: dict[tuple[str, str], list[tuple[str, str, str]]] = defaultdict(list)
+    for r in rows:
+        buckets[(r[1], r[2])].append(r)
+    rng = random.Random(seed)
+    out = {"train": [], "val": [], "test": []}
+    for key, items in buckets.items():
+        rng.shuffle(items)
+        n = len(items)
+        n_train = int(0.70 * n)
+        n_val = int(0.15 * n)
+        out["train"].extend(items[:n_train])
+        out["val"].extend(items[n_train : n_train + n_val])
+        out["test"].extend(items[n_train + n_val :])
+    return out
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--data", required=True, type=Path)
+    ap.add_argument("--out", required=True, type=Path)
+    ap.add_argument("--seed", type=int, default=42)
+    args = ap.parse_args()
+    rows = collect(args.data)
+    if not rows:
+        raise SystemExit(f"No images found under {args.data}")
+    splits = split(rows, args.seed)
+    args.out.parent.mkdir(parents=True, exist_ok=True)
+    with args.out.open("w", newline="", encoding="utf-8") as f:
+        w = csv.writer(f)
+        w.writerow(["path", "label", "source", "split"])
+        for name, items in splits.items():
+            for path, label, source in items:
+                w.writerow([path, label, source, name])
+    summary = {k: len(v) for k, v in splits.items()}
+    print(f"Manifest: {args.out}")
+    print(f"Totals: {summary} (overall {sum(summary.values())})")
+if __name__ == "__main__":
+    main()

datasets/download_dfdc_sample.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""Download a sample of the DFDC (Deepfake Detection Challenge) Preview dataset.
+The full DFDC is ~470GB; the *preview* release (~5GB, Kaggle) is enough for
+diversity augmentation alongside FFPP.
+Requires the Kaggle CLI (`pip install kaggle`) and ~/.kaggle/kaggle.json.
+Usage:
+    python -m backend.training.datasets.download_dfdc_sample --output ./data/dfdc_preview
+"""
+from __future__ import annotations
+import argparse
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--output", required=True, type=Path)
+    ap.add_argument(
+        "--competition",
+        default="deepfake-detection-challenge",
+        help="Kaggle competition slug (default: deepfake-detection-challenge preview).",
+    )
+    args = ap.parse_args()
+    kaggle = shutil.which("kaggle")
+    if kaggle is None:
+        print("Kaggle CLI not found. Install with: pip install kaggle", file=sys.stderr)
+        print("Then place kaggle.json in ~/.kaggle/ (chmod 600).", file=sys.stderr)
+        sys.exit(2)
+    args.output.mkdir(parents=True, exist_ok=True)
+    cmd = [kaggle, "competitions", "download", "-c", args.competition, "-p", str(args.output)]
+    print("Running:", " ".join(cmd))
+    subprocess.run(cmd, check=True)
+    print(f"Downloaded to {args.output}. Unzip with: unzip *.zip")
+if __name__ == "__main__":
+    main()

datasets/download_ffhq.py ADDED Viewed

	@@ -0,0 +1,49 @@

+"""Download the FFHQ 128x128 thumbnail subset from the official Google Drive mirror.
+Pulls up to N images (default 10k) into the `real` bucket of the training set.
+Falls back to the NVlabs 'ffhq-dataset' helper if available; otherwise expects
+user to run the manual download once.
+Usage:
+    python -m backend.training.datasets.download_ffhq --output ./data/real/ffhq -n 10000
+"""
+from __future__ import annotations
+import argparse
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+def try_nvlabs_helper(output: Path, num: int) -> bool:
+    """Prefer the official ffhq-dataset downloader if installed."""
+    helper = shutil.which("ffhq-dataset")
+    if helper is None:
+        return False
+    cmd = [helper, "--json", "ffhq-dataset-v2.json", "--thumbs", "--num_threads", "4"]
+    print("Running:", " ".join(cmd))
+    subprocess.run(cmd, cwd=output, check=False)
+    return True
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--output", required=True, type=Path)
+    ap.add_argument("-n", "--num", type=int, default=10000)
+    args = ap.parse_args()
+    args.output.mkdir(parents=True, exist_ok=True)
+    if try_nvlabs_helper(args.output, args.num):
+        return
+    print("[!] `ffhq-dataset` helper not installed.")
+    print("    Install via: pip install ffhq-dataset  (requires gdown)")
+    print("    Or download thumbnails128x128.zip manually from:")
+    print("      https://github.com/NVlabs/ffhq-dataset")
+    print(f"    Extract into: {args.output}")
+    sys.exit(1)
+if __name__ == "__main__":
+    main()

datasets/extract_frames.py ADDED Viewed

	@@ -0,0 +1,90 @@

+"""Convert FFPP / DFDC videos -> 16 sampled frames at 224x224 RGB.
+Usage:
+    python -m backend.training.datasets.extract_frames \
+        --input ./ffpp_data/original_sequences/youtube/raw/videos \
+        --output ./ffpp_data/frames/real \
+        --label real --frames 16 --size 224
+"""
+from __future__ import annotations
+import argparse
+import csv
+from pathlib import Path
+import cv2
+import numpy as np
+from tqdm import tqdm
+def sample_frame_indices(total: int, n: int) -> list[int]:
+    if total <= 0:
+        return []
+    if total <= n:
+        return list(range(total))
+    step = total / float(n)
+    return [min(total - 1, int(step * i + step / 2)) for i in range(n)]
+def extract_from_video(path: Path, out_dir: Path, n: int, size: int) -> int:
+    cap = cv2.VideoCapture(str(path))
+    if not cap.isOpened():
+        return 0
+    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    indices = set(sample_frame_indices(total, n))
+    out_dir.mkdir(parents=True, exist_ok=True)
+    saved = 0
+    i = 0
+    while True:
+        ok, frame = cap.read()
+        if not ok:
+            break
+        if i in indices:
+            frame = cv2.resize(frame, (size, size), interpolation=cv2.INTER_AREA)
+            cv2.imwrite(str(out_dir / f"{path.stem}_f{i:06d}.jpg"), frame, [cv2.IMWRITE_JPEG_QUALITY, 95])
+            saved += 1
+        i += 1
+    cap.release()
+    return saved
+def main() -> None:
+    ap = argparse.ArgumentParser(description="Sample N frames per video and resize.")
+    ap.add_argument("--input", required=True, type=Path, help="Directory of .mp4 videos (recursive).")
+    ap.add_argument("--output", required=True, type=Path, help="Directory to write .jpg frames.")
+    ap.add_argument("--label", required=True, choices=["real", "fake"], help="Label tag for manifest.")
+    ap.add_argument("--frames", type=int, default=16)
+    ap.add_argument("--size", type=int, default=224)
+    ap.add_argument("--manifest", type=Path, default=None, help="Optional CSV manifest append path.")
+    args = ap.parse_args()
+    videos = [p for p in args.input.rglob("*.mp4")]
+    if not videos:
+        print(f"No .mp4 found under {args.input}")
+        return
+    rows: list[tuple[str, str, str]] = []
+    total_frames = 0
+    for vid in tqdm(videos, desc=f"extract[{args.label}]"):
+        rel_out = args.output / vid.stem
+        saved = extract_from_video(vid, rel_out, args.frames, args.size)
+        total_frames += saved
+        if args.manifest is not None:
+            for jpg in rel_out.glob("*.jpg"):
+                rows.append((str(jpg), args.label, vid.stem))
+    if args.manifest is not None and rows:
+        args.manifest.parent.mkdir(parents=True, exist_ok=True)
+        new_file = not args.manifest.exists()
+        with args.manifest.open("a", newline="", encoding="utf-8") as f:
+            w = csv.writer(f)
+            if new_file:
+                w.writerow(["path", "label", "source_video"])
+            w.writerows(rows)
+    print(f"Done. Videos: {len(videos)}, frames written: {total_frames}")
+if __name__ == "__main__":
+    main()

datasets/procure_all.ps1 ADDED Viewed

	@@ -0,0 +1,40 @@

+# Phase 11.1 orchestrator for Windows (PowerShell)
+$ErrorActionPreference = "Stop"
+# "Stop" only governs PowerShell errors. A failing native command (python) just sets
+# $LASTEXITCODE, so every step is followed by an explicit exit-code check; without it
+# the script would carry on and build a manifest from partial data.
+function Assert-StepSucceeded {
+    param([string]$Step)
+    if ($LASTEXITCODE -ne 0) {
+        throw "$Step failed (exit code $LASTEXITCODE)"
+    }
+}
+# ROOT / FFPP can be overridden through environment variables of the same name.
+$ROOT = if ($env:ROOT) { $env:ROOT } else { ".\data" }
+$FFPP = if ($env:FFPP) { $env:FFPP } else { ".\ffpp_data" }
+New-Item -ItemType Directory -Force -Path "$ROOT\real" | Out-Null
+New-Item -ItemType Directory -Force -Path "$ROOT\fake" | Out-Null
+New-Item -ItemType Directory -Force -Path $FFPP | Out-Null
+Write-Host "1. FaceForensics++ (highly compressed c40, 10 videos only) -- requires TOS keypress"
+python backend\scripts\download_ffpp.py $FFPP -d all -c c40 -t videos -n 10
+Assert-StepSucceeded "FaceForensics++ download"
+Write-Host "2. Frame extraction: real (original youtube)"
+python -m backend.training.datasets.extract_frames `
+    --input "$FFPP\original_sequences\youtube\c40\videos" `
+    --output "$ROOT\real\ffpp_youtube" --label real --frames 4 --size 224
+Assert-StepSucceeded "Frame extraction (real)"
+Write-Host "3. Frame extraction: fakes (each manipulation family)"
+$Families = @("Deepfakes", "Face2Face", "FaceSwap", "NeuralTextures", "FaceShifter")
+foreach ($fam in $Families) {
+    # Output buckets use the lower-cased family name, e.g. fake\ffpp_deepfakes.
+    $famLower = $fam.ToLower()
+    python -m backend.training.datasets.extract_frames `
+        --input "$FFPP\manipulated_sequences\$fam\c40\videos" `
+        --output "$ROOT\fake\ffpp_$famLower" --label fake --frames 4 --size 224
+    Assert-StepSucceeded "Frame extraction ($fam)"
+}
+Write-Host "4. FFHQ thumbnails (real - limited to 100 items)"
+python -m backend.training.datasets.download_ffhq --output "$ROOT\real\ffhq" -n 100
+Assert-StepSucceeded "FFHQ download"
+Write-Host "6. DFDC preview sample (fake+real)"
+python -m backend.training.datasets.download_dfdc_sample --output "$ROOT\_dfdc_raw"
+Assert-StepSucceeded "DFDC download"
+Write-Host "NOTE: You will need to manually unzip + sort DFDC into $ROOT\fake\dfdc and $ROOT\real\dfdc"
+Write-Host "7. Build manifest"
+python -m backend.training.datasets.build_manifest `
+    --data $ROOT --out "$ROOT\manifest.csv" --seed 42
+Assert-StepSucceeded "Manifest build"
+Write-Host "Phase 11.1 complete. See $ROOT\manifest.csv"

datasets/procure_all.sh ADDED Viewed

	@@ -0,0 +1,37 @@

+#!/usr/bin/env bash
+# Phase 11.1 orchestrator: download + frame-extract + manifest.
+# Total disk target: ~120k labeled images. Expect 60-80GB intermediate, ~30GB frames.
+# ROOT and FFPP may be overridden from the environment.
+set -euo pipefail
+ROOT="${ROOT:-./data}"
+FFPP="${FFPP:-./ffpp_data}"
+mkdir -p "$ROOT/real" "$ROOT/fake" "$FFPP"
+# 1. FaceForensics++ (raw, videos) -- requires TOS keypress
+python backend/scripts/download_ffpp.py "$FFPP" -d all -c raw -t videos
+# 2. Frame extraction: real (original youtube)
+python -m backend.training.datasets.extract_frames \
+    --input  "$FFPP/original_sequences/youtube/raw/videos" \
+    --output "$ROOT/real/ffpp_youtube" --label real --frames 16 --size 224
+# 3. Frame extraction: fakes (each manipulation family)
+for fam in Deepfakes Face2Face FaceSwap NeuralTextures FaceShifter; do
+    # Lower-case with tr instead of ${fam,,}: that expansion needs bash >= 4 and
+    # fails with "bad substitution" on the bash 3.2 shipped with macOS.
+    fam_lower="$(printf '%s' "$fam" | tr '[:upper:]' '[:lower:]')"
+    python -m backend.training.datasets.extract_frames \
+        --input  "$FFPP/manipulated_sequences/$fam/raw/videos" \
+        --output "$ROOT/fake/ffpp_${fam_lower}" --label fake --frames 16 --size 224
+done
+# 4. FFHQ thumbnails (real)
+python -m backend.training.datasets.download_ffhq --output "$ROOT/real/ffhq" -n 10000
+# 5. DFDC preview sample (fake+real) -- needs Kaggle creds
+python -m backend.training.datasets.download_dfdc_sample --output "$ROOT/_dfdc_raw"
+# NOTE: unzip + sort into $ROOT/fake/dfdc  and  $ROOT/real/dfdc  per DFDC metadata.json
+# 6. Build manifest
+python -m backend.training.datasets.build_manifest \
+    --data "$ROOT" --out "$ROOT/manifest.csv" --seed 42
+echo "Phase 11.1 complete. See $ROOT/manifest.csv"

deepshield_13_5bcf1328.pdf ADDED Viewed

	@@ -0,0 +1,148 @@

+%PDF-1.4
+%���� ReportLab Generated PDF document (opensource)
+1 0 obj
+<<
+/F1 2 0 R /F2 3 0 R /F3 5 0 R
+>>
+endobj
+2 0 obj
+<<
+/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
+>>
+endobj
+3 0 obj
+<<
+/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
+>>
+endobj
+4 0 obj
+<<
+/Contents 18 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 17 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+>>
+  /Type /Page
+>>
+endobj
+5 0 obj
+<<
+/BaseFont /Symbol /Name /F3 /Subtype /Type1 /Type /Font
+>>
+endobj
+6 0 obj
+<<
+/Contents 19 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 17 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+>>
+  /Type /Page
+>>
+endobj
+7 0 obj
+<<
+/Outlines 9 0 R /PageMode /UseNone /Pages 17 0 R /Type /Catalog
+>>
+endobj
+8 0 obj
+<<
+/Author () /CreationDate (D:20260415181653+05'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20260415181653+05'00') /Producer (xhtml2pdf <https://github.com/xhtml2pdf/xhtml2pdf/>)
+  /Subject () /Title (DeepShield Analysis Report \204 7771f496-45b1-4c97-8a1a-d9d2492ca67d) /Trapped /False
+>>
+endobj
+9 0 obj
+<<
+/Count 3 /First 10 0 R /Last 10 0 R /Type /Outlines
+>>
+endobj
+10 0 obj
+<<
+/Count -4 /Dest [ 4 0 R /Fit ] /First 11 0 R /Last 16 0 R /Parent 9 0 R /Title (DeepShield Analysis Report)
+>>
+endobj
+11 0 obj
+<<
+/Dest [ 4 0 R /Fit ] /Next 12 0 R /Parent 10 0 R /Title (Verdict)
+>>
+endobj
+12 0 obj
+<<
+/Count -2 /Dest [ 4 0 R /Fit ] /First 13 0 R /Last 14 0 R /Next 15 0 R /Parent 10 0 R
+  /Prev 11 0 R /Title (Text Classification)
+>>
+endobj
+13 0 obj
+<<
+/Dest [ 4 0 R /Fit ] /Next 14 0 R /Parent 12 0 R /Title (Sensationalism Signals)
+>>
+endobj
+14 0 obj
+<<
+/Dest [ 4 0 R /Fit ] /Parent 12 0 R /Prev 13 0 R /Title (Extracted Keywords)
+>>
+endobj
+15 0 obj
+<<
+/Dest [ 4 0 R /Fit ] /Next 16 0 R /Parent 10 0 R /Prev 12 0 R /Title (Trusted Source Cross-Reference \(1\))
+>>
+endobj
+16 0 obj
+<<
+/Dest [ 6 0 R /Fit ] /Parent 10 0 R /Prev 15 0 R /Title (Processing Summary)
+>>
+endobj
+17 0 obj
+<<
+/Count 2 /Kids [ 4 0 R 6 0 R ] /Type /Pages
+>>
+endobj
+18 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 1750
+>>
+stream
+Gb"/(9lo&I&A@sBlm4G[Acr2Y4p^$ca2t\gAsuiHo\c,I9gURE8lSA3M>qu?,XkR;()9nE&%0G$"Ts\%gUFdJ0E[3iXSb#I!k]Slq-+&^_fu5V&-:f'>`[5155TjpXI_!]U"iQd1qrcX0jNK021sk.K_S`f[kfkaR[pr2$LLU)UX&`3>7R17rJ3t':B_<4Kk*Grr8\a:5/Z<<[I]mbfHq28c@Y+3O)t)0k@mu0K^fiq^N*(u.%T.'jl<s/Nh4He2l7^V7l^6+r/e]g]la.!>S?L^o+>>SgBV8H:sX>5A0-l`)&\h4Lk6L5I=)ArV#_bh%^>M_c,"jSErfH[2A&CfKtLn_&K3h)!u;:i'6.H*(apE@/QWkIgF*OaTZ"ZT=me'_?iN-hL[(uHeb"'/B!\/7d068ieW>Y3P8NcsU#;"%eOe_!^-"Xsc?9a'H,u4"nMEm$3F[>c1S8J!`Sh;Ye8pG>de>ac3KpI*&j-(`*[@OB&i#OgJSl=(I-'<c@@S(D;k%W_$;Jl?$^4Y-G*rH-Rk_h_*=&9o`q/eu[3o$--Zc#XoX(sA&CI7RqS'cWBhG2:+ODa!):O6`^NT((K7(:%BVJ3=F%emKe-WmK3EIie5ZAbGXt^Hf,[uurZtImn"m<3AaU$p)@,./&T/aMg@_t-oU(Al5HTNb;0J4E-fqZg*4Y/o@,5%"0ObY@,kKsQdk#2'pZOD8tZrghVcMH[#FI&3f.,FmGKKKNo9?B[@`=FkP`:=oo>;4Vs.^rc%L+kt99^Gd]mfUsWoLD02jLH*WUl.Pb(oF^j?7RUN!m&Us22M!@A<RB<?,"#orPd]<&>ld**8+J._-f-FEVm$t<`HO6GNqd_[bhJ&8qK0d-ZKt;EB60u<VCgOQ;8F:jeCp]E2HpO&5==e.Z2c5.#%nBkfCHsrt>d0-2Z<CdP%-(PZ=R(ET3u6<D1@I(u[6LMn;M%:K3fl4ls;SX'd>:*Z]IT(dG)'7QU\#<V$$AmO6;HncG;?UO[<qf,QJem^o.f$D3^V'_h3dF.f82/[@>u^ecY/FgdnO#RWf_=Js*t;iiO?'fQ:g&@nC/Xhu.;&o1b+?_6-Z%i4;1H5GAUag0*4LfL'2;Sl`["O/H6p>jU\SO4%Ffq^-']m<b(Mo1Vg;h"E$f8Z?_AL@bH31kAKY%KEP\PmsdK2MJ^Dfb%0.sgc_9*[9&'t*;+>uUp/PKbuj>J71&Mh5t,WF_k&]O@P+do^;.WV"r6Kkb#5`,aF$-adPdc+'072](pse[q;.^?I#Q#kci1Qr9Z_U:Q_lQ53n!nIBHrchNfMeP-HF*=<22XdSrZ8j>sP4CR1SEP\Ge.aCh(VEW.)F'<]`"gVnaq<<]K,.uCIMlUqSgV3U</GlN`:3?Ft9S-uHH\_0/'rV&dUBe&=8^c)"F#b/Te`H6Yn1DnZc?T$IiaKe%'S][\*'W-]E<4.cnD8?.XB5)khib.oe$NkDa0D^I+$2a=[rbp"D3eQQqq@TO]aNHTMcGM3B3cn9,9'giRF__Y[<^:+bB3]sACEq,A$s%=n\8Vk/OM\c,W"mZ11,MaZ61]7"M`X1/qmcr-hH,#8+udNN9@p:IAM="9:b-RnD&FAVj^G'kW4tPgO+M25'hLH])Ped#fB*fOs>Te;V8("S^2/7e`3>4E]],alEY#@T-dG.(=/^7(s[bh3%omN/'WKl<"q_K`T7$VrMt.GfckX6]1EfAB]1F6o6g>\:2Etf)rD.XNrRc2pgl"Hr<(1MCd%~>endstream
+endobj
+19 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 1251
+>>
+stream
+Gau`R;01GN&:Vs/fU'm&SZsB\Z>@pd[^l$Ne'"!6Hco+&(^1n<bt7%'s8H%#$m^MQApR0<`)taLn([eaAHiiuRK&mT!C!?!I`[+[8FM*9+s?gk^Sb`ESFuBheu'`^-k@VZQnjgqaj:g4M2J-c)%`([:iWt%O9mV9ZO6(4"\bX`WWWGJ,s27(iVrdq]@Q&`bX7t`KV@dkk1#U3_]/$nF6>.H%;Q95P;kU[/"Vgs.N%@'=M6kAJN1afF&?E_+rA+1KE+S:4],1QpOr^qg01e<#d,;@\e=!\1-*,1T[41J&^DSg86dC5.#&+tMiZhie$%p]f=sWJ!9ni#^ZR?Gp5lVJY,M<YHnZf[nt2A3ZtRV6dLh4C-*^gI%O$[,o&o;u7[Nu/XEmkj&m4-UHNFF#I0VCUiaS-$S2Gs[@(=.(Fg-V>W+]dGA*V*5[2WS\gs>9t%t32b/^W)[_+r7&3kOLD>8WTI508QU_ZkVRb*l"j_,ie@Wk/$,J'=rjAsRr^aIAp,g4N\@rcW@_7fV)G7.f:C\2aDCnK2"(-Yh-fNKV4ogPJ_Bbno/AG^W)=l`02mHESBSd,2MW2Q,8S^O,7f_^Pj+'$c\[n!'TZ'8A[[6$M/6Vlo9egXU318J0Zl;rXSYgM=-\-3TecfRc]m]FKNI.=E4amT3\PSaWQi;TtrPVN"#t`E;<R<T0FHF)>bkNM&M.:/OC)MK2$$?Jp$`SY/%t"jbj6*+.%6.71qjEsp)j@\0#RIF/1!&^q"O7Ou;8DL^2(?$>18.AWa`<qQ;FS*8d605U,LRjPYl%CQZ"EZ)d6ggmR/\emf.%.#K=ZXlPbU\40kfi-URgEX``iXe1pOV?N=StFNQ>H$Fi,Ak&SQPl+Y^;rG>nArp/_q%9B[r]_;\_^p'[__7OH7)iuf]c[rld?RB/M<r(<QsU%pNedj)1NmPM-_fL1VD1tNQL&@c-=<:"`[Vpojg6J[HJ4:,T\L_]InN3jJke4J(kV<hYN(d]b#E=":iOW#=k#-U%PKO/p'+,)f951AW&jRK9')Q>rP3T8Xk7<ZOVAq$3lpK6YL6tc'D2V%1G(jM8"TncWs=[!hW2(D30g$5(Q/MN1htIgRt\ADhN@$l202Af7(c#1P6?P("GPEU+>VY%=qG1""FA,mioCp,lF3^-AZtKRg/NFX>&kA^rZpnFA<r!,IA42rZQ6YFrrrLL)tME=&"E=g6gSrChSiOfRe!l*<?[tTYGRI@6&N"%Fn3=3;X6Dm0TH~>endstream
+endobj
+xref
+0 20
+0000000000 65535 f
+0000000061 00000 n
+0000000112 00000 n
+0000000219 00000 n
+0000000331 00000 n
+0000000536 00000 n
+0000000613 00000 n
+0000000818 00000 n
+0000000903 00000 n
+0000001223 00000 n
+0000001296 00000 n
+0000001426 00000 n
+0000001514 00000 n
+0000001667 00000 n
+0000001770 00000 n
+0000001869 00000 n
+0000001999 00000 n
+0000002098 00000 n
+0000002164 00000 n
+0000004006 00000 n
+trailer
+<<
+/ID
+[<8e273c2672d813e3cd44109eb1edd604><8e273c2672d813e3cd44109eb1edd604>]
+% ReportLab generated PDF document -- digest (opensource)
+/Info 8 0 R
+/Root 7 0 R
+/Size 20
+>>
+startxref
+5349
+%%EOF

deps.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from __future__ import annotations
+from fastapi import Depends, Header, HTTPException, status
+from sqlalchemy.orm import Session
+from db.database import get_db
+from db.models import User
+from services.auth_service import decode_token, get_user
+def _extract_bearer(authorization: str | None) -> str | None:
+    if not authorization:
+        return None
+    parts = authorization.split()
+    if len(parts) != 2 or parts[0].lower() != "bearer":
+        return None
+    return parts[1]
+def get_current_user(
+    authorization: str | None = Header(default=None),
+    db: Session = Depends(get_db),
+) -> User:
+    token = _extract_bearer(authorization)
+    if not token:
+        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Missing bearer token")
+    payload = decode_token(token)
+    if not payload or "sub" not in payload:
+        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid or expired token")
+    user = get_user(db, int(payload["sub"]))
+    if not user:
+        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "User not found")
+    return user
+def optional_current_user(
+    authorization: str | None = Header(default=None),
+    db: Session = Depends(get_db),
+) -> User | None:
+    token = _extract_bearer(authorization)
+    if not token:
+        return None
+    payload = decode_token(token)
+    if not payload or "sub" not in payload:
+        return None
+    return get_user(db, int(payload["sub"]))

download_ffpp.py ADDED Viewed

	@@ -0,0 +1,261 @@

+#!/usr/bin/env python
+""" Downloads FaceForensics++ and Deep Fake Detection public data release
+Example usage:
+    see -h or https://github.com/ondyari/FaceForensics
+"""
+# -*- coding: utf-8 -*-
+import argparse
+import os
+import urllib
+import urllib.request
+import tempfile
+import time
+import sys
+import json
+import random
+from tqdm import tqdm
+from os.path import join
+# URLs and filenames
+FILELIST_URL = 'misc/filelist.json'
+DEEPFEAKES_DETECTION_URL = 'misc/deepfake_detection_filenames.json'
+DEEPFAKES_MODEL_NAMES = ['decoder_A.h5', 'decoder_B.h5', 'encoder.h5',]
+# Parameters
+DATASETS = {
+    'original_youtube_videos': 'misc/downloaded_youtube_videos.zip',
+    'original_youtube_videos_info': 'misc/downloaded_youtube_videos_info.zip',
+    'original': 'original_sequences/youtube',
+    'DeepFakeDetection_original': 'original_sequences/actors',
+    'Deepfakes': 'manipulated_sequences/Deepfakes',
+    'DeepFakeDetection': 'manipulated_sequences/DeepFakeDetection',
+    'Face2Face': 'manipulated_sequences/Face2Face',
+    'FaceShifter': 'manipulated_sequences/FaceShifter',
+    'FaceSwap': 'manipulated_sequences/FaceSwap',
+    'NeuralTextures': 'manipulated_sequences/NeuralTextures'
+    }
+ALL_DATASETS = ['original', 'DeepFakeDetection_original', 'Deepfakes',
+                'DeepFakeDetection', 'Face2Face', 'FaceShifter', 'FaceSwap',
+                'NeuralTextures']
+COMPRESSION = ['raw', 'c23', 'c40']
+TYPE = ['videos', 'masks', 'models']
+SERVERS = ['EU', 'EU2', 'CA']
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Downloads FaceForensics v2 public data release.',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument('output_path', type=str, help='Output directory.')
+    parser.add_argument('-d', '--dataset', type=str, default='all',
+                        help='Which dataset to download, either pristine or '
+                             'manipulated data or the downloaded youtube '
+                             'videos.',
+                        choices=list(DATASETS.keys()) + ['all']
+                        )
+    parser.add_argument('-c', '--compression', type=str, default='raw',
+                        help='Which compression degree. All videos '
+                             'have been generated with h264 with a varying '
+                             'codec. Raw (c0) videos are lossless compressed.',
+                        choices=COMPRESSION
+                        )
+    parser.add_argument('-t', '--type', type=str, default='videos',
+                        help='Which file type, i.e. videos, masks, for our '
+                             'manipulation methods, models, for Deepfakes.',
+                        choices=TYPE
+                        )
+    parser.add_argument('-n', '--num_videos', type=int, default=None,
+                        help='Select a number of videos number to '
+                             "download if you don't want to download the full"
+                             ' dataset.')
+    parser.add_argument('--server', type=str, default='EU',
+                        help='Server to download the data from. If you '
+                             'encounter a slow download speed, consider '
+                             'changing the server.',
+                        choices=SERVERS
+                        )
+    args = parser.parse_args()
+    # URLs
+    server = args.server
+    if server == 'EU':
+        server_url = 'http://canis.vc.in.tum.de:8100/'
+    elif server == 'EU2':
+        server_url = 'http://kaldir.vc.in.tum.de/faceforensics/'
+    elif server == 'CA':
+        server_url = 'http://falas.cmpt.sfu.ca:8100/'
+    else:
+        raise Exception('Wrong server name. Choices: {}'.format(str(SERVERS)))
+    args.tos_url = server_url + 'webpage/FaceForensics_TOS.pdf'
+    args.base_url = server_url + 'v3/'
+    args.deepfakes_model_url = server_url + 'v3/manipulated_sequences/' + \
+                               'Deepfakes/models/'
+    return args
+def download_files(filenames, base_url, output_path, report_progress=True):
+    os.makedirs(output_path, exist_ok=True)
+    if report_progress:
+        filenames = tqdm(filenames)
+    for filename in filenames:
+        download_file(base_url + filename, join(output_path, filename))
+def reporthook(count, block_size, total_size):
+    global start_time
+    if count == 0:
+        start_time = time.time()
+        return
+    duration = time.time() - start_time
+    progress_size = int(count * block_size)
+    speed = int(progress_size / (1024 * duration))
+    percent = int(count * block_size * 100 / total_size)
+    sys.stdout.write("\rProgress: %d%%, %d MB, %d KB/s, %d seconds passed" %
+                     (percent, progress_size / (1024 * 1024), speed, duration))
+    sys.stdout.flush()
+def download_file(url, out_file, report_progress=False):
+    out_dir = os.path.dirname(out_file)
+    if not os.path.isfile(out_file):
+        fh, out_file_tmp = tempfile.mkstemp(dir=out_dir)
+        f = os.fdopen(fh, 'w')
+        f.close()
+        if report_progress:
+            urllib.request.urlretrieve(url, out_file_tmp,
+                                       reporthook=reporthook)
+        else:
+            urllib.request.urlretrieve(url, out_file_tmp)
+        os.rename(out_file_tmp, out_file)
+    else:
+        tqdm.write('WARNING: skipping download of existing file ' + out_file)
+def main(args):
+    # TOS
+    print('By pressing any key to continue you confirm that you have agreed '\
+          'to the FaceForensics terms of use as described at:')
+    print(args.tos_url)
+    print('***')
+    print('Press any key to continue, or CTRL-C to exit.')
+    _ = input('')
+    # Extract arguments
+    c_datasets = [args.dataset] if args.dataset != 'all' else ALL_DATASETS
+    c_type = args.type
+    c_compression = args.compression
+    num_videos = args.num_videos
+    output_path = args.output_path
+    os.makedirs(output_path, exist_ok=True)
+    # Check for special dataset cases
+    for dataset in c_datasets:
+        dataset_path = DATASETS[dataset]
+        # Special cases
+        if 'original_youtube_videos' in dataset:
+            # Here we download the original youtube videos zip file
+            print('Downloading original youtube videos.')
+            if not 'info' in dataset_path:
+                print('Please be patient, this may take a while (~40gb)')
+                suffix = ''
+            else:
+            	suffix = 'info'
+            download_file(args.base_url + '/' + dataset_path,
+                          out_file=join(output_path,
+                                        'downloaded_videos{}.zip'.format(
+                                            suffix)),
+                          report_progress=True)
+            return
+        # Else: regular datasets
+        print('Downloading {} of dataset "{}"'.format(
+            c_type, dataset_path
+        ))
+        # Get filelists and video lenghts list from server
+        if 'DeepFakeDetection' in dataset_path or 'actors' in dataset_path:
+        	filepaths = json.loads(urllib.request.urlopen(args.base_url + '/' +
+                DEEPFEAKES_DETECTION_URL).read().decode("utf-8"))
+        	if 'actors' in dataset_path:
+        		filelist = filepaths['actors']
+        	else:
+        		filelist = filepaths['DeepFakesDetection']
+        elif 'original' in dataset_path:
+            # Load filelist from server
+            file_pairs = json.loads(urllib.request.urlopen(args.base_url + '/' +
+                FILELIST_URL).read().decode("utf-8"))
+            filelist = []
+            for pair in file_pairs:
+            	filelist += pair
+        else:
+            # Load filelist from server
+            file_pairs = json.loads(urllib.request.urlopen(args.base_url + '/' +
+                FILELIST_URL).read().decode("utf-8"))
+            # Get filelist
+            filelist = []
+            for pair in file_pairs:
+                filelist.append('_'.join(pair))
+                if c_type != 'models':
+                    filelist.append('_'.join(pair[::-1]))
+        # Maybe limit number of videos for download
+        if num_videos is not None and num_videos > 0:
+        	print('Downloading the first {} videos'.format(num_videos))
+        	filelist = filelist[:num_videos]
+        # Server and local paths
+        dataset_videos_url = args.base_url + '{}/{}/{}/'.format(
+            dataset_path, c_compression, c_type)
+        dataset_mask_url = args.base_url + '{}/{}/videos/'.format(
+            dataset_path, 'masks', c_type)
+        if c_type == 'videos':
+            dataset_output_path = join(output_path, dataset_path, c_compression,
+                                       c_type)
+            print('Output path: {}'.format(dataset_output_path))
+            filelist = [filename + '.mp4' for filename in filelist]
+            download_files(filelist, dataset_videos_url, dataset_output_path)
+        elif c_type == 'masks':
+            dataset_output_path = join(output_path, dataset_path, c_type,
+                                       'videos')
+            print('Output path: {}'.format(dataset_output_path))
+            if 'original' in dataset:
+                if args.dataset != 'all':
+                    print('Only videos available for original data. Aborting.')
+                    return
+                else:
+                    print('Only videos available for original data. '
+                          'Skipping original.\n')
+                    continue
+            if 'FaceShifter' in dataset:
+                print('Masks not available for FaceShifter. Aborting.')
+                return
+            filelist = [filename + '.mp4' for filename in filelist]
+            download_files(filelist, dataset_mask_url, dataset_output_path)
+        # Else: models for deepfakes
+        else:
+            if dataset != 'Deepfakes' and c_type == 'models':
+                print('Models only available for Deepfakes. Aborting')
+                return
+            dataset_output_path = join(output_path, dataset_path, c_type)
+            print('Output path: {}'.format(dataset_output_path))
+            # Get Deepfakes models
+            for folder in tqdm(filelist):
+                folder_filelist = DEEPFAKES_MODEL_NAMES
+                # Folder paths
+                folder_base_url = args.deepfakes_model_url + folder + '/'
+                folder_dataset_output_path = join(dataset_output_path,
+                                                  folder)
+                download_files(folder_filelist, folder_base_url,
+                               folder_dataset_output_path,
+                               report_progress=False)   # already done
+# Script entry point: parse the command-line options and pass them to main().
+if __name__ == "__main__":
+    # NOTE(review): `args` is bound at module level here; code outside this view
+    # may read it as a global, so the binding is kept as-is.
+    args = parse_args()
+    main(args)

ela_service.py ADDED Viewed

	@@ -0,0 +1,88 @@

+"""Error Level Analysis (ELA) — Phase 12.1
+Re-saves an image at a fixed JPEG quality and diffs against the original to reveal
+per-pixel manipulation artifacts. Regions that were recently edited will show
+higher error levels than untouched areas.
+"""
+from __future__ import annotations
+import base64
+import io
+import cv2
+import numpy as np
+from loguru import logger
+from PIL import Image
+def _compute_ela(pil_img: Image.Image, quality: int = 90, scale: float = 15.0) -> np.ndarray:
+    """Return an ELA difference map as a uint8 (H,W,3) RGB array.
+    Args:
+        pil_img: Input image (any format — converted to RGB internally).
+        quality: JPEG re-save quality level (lower = more aggressive compression).
+        scale: Amplification factor for the difference (higher = more contrast).
+    Returns:
+        Difference image as uint8 (H,W,3) array.
+    """
+    rgb = pil_img.convert("RGB")
+    # Re-save at specified JPEG quality into an in-memory buffer
+    buf = io.BytesIO()
+    rgb.save(buf, format="JPEG", quality=quality)
+    buf.seek(0)
+    resaved = Image.open(buf).convert("RGB")
+    original_arr = np.array(rgb, dtype=np.float32)
+    resaved_arr = np.array(resaved, dtype=np.float32)
+    # Per-pixel absolute difference, amplified
+    diff = np.abs(original_arr - resaved_arr) * scale
+    diff = np.clip(diff, 0, 255).astype(np.uint8)
+    return diff
+def generate_ela_base64(pil_img: Image.Image, quality: int = 90, scale: float = 15.0) -> str:
+    """Produce a base64 data-URL PNG of the ELA difference map.
+    Regions with higher error levels (brighter in the output) are more likely
+    to have been digitally manipulated.
+    """
+    diff = _compute_ela(pil_img, quality=quality, scale=scale)
+    buf = io.BytesIO()
+    Image.fromarray(diff).save(buf, format="PNG")
+    b64 = base64.b64encode(buf.getvalue()).decode("ascii")
+    logger.info(f"ELA map generated ({diff.shape[1]}x{diff.shape[0]})")
+    return f"data:image/png;base64,{b64}"
+def generate_blended_ela_base64(
+    pil_img: Image.Image,
+    gradcam_weight: float = 0.6,
+    ela_weight: float = 0.4,
+    quality: int = 90,
+    scale: float = 15.0,
+) -> str:
+    """Blend Grad-CAM heatmap overlay with ELA at specified weights.
+    This is a utility for the 'blended' mode — it composites the ELA
+    difference map on top of the original image for visual clarity.
+    """
+    rgb = pil_img.convert("RGB")
+    original_arr = np.array(rgb, dtype=np.float32)
+    ela_arr = _compute_ela(pil_img, quality=quality, scale=scale).astype(np.float32)
+    # Blend: overlay ELA on the original for visual context
+    blended = np.clip(original_arr * 0.5 + ela_arr * 0.5, 0, 255).astype(np.uint8)
+    buf = io.BytesIO()
+    Image.fromarray(blended).save(buf, format="PNG")
+    b64 = base64.b64encode(buf.getvalue()).decode("ascii")
+    logger.info(f"Blended ELA generated ({blended.shape[1]}x{blended.shape[0]})")
+    return f"data:image/png;base64,{b64}"

exif_service.py ADDED Viewed

	@@ -0,0 +1,129 @@

+"""EXIF Metadata Extraction — Phase 12.2
+Extracts camera metadata from uploaded images and computes a trust adjustment
+score: presence of authentic camera metadata lowers fake probability, while
+evidence of editing software raises it.
+"""
+from __future__ import annotations
+from typing import Optional
+from loguru import logger
+from PIL import Image
+from PIL.ExifTags import TAGS, GPSTAGS
+from schemas.common import ExifSummary
+# Lower-case substrings that, when found in the lower-cased EXIF Software field,
+# mark the image as edited or AI-generated (extract_exif adds +10 for a match).
+_SUSPICIOUS_SOFTWARE = {
+    "adobe photoshop", "photoshop", "gimp", "affinity photo",
+    "stable diffusion", "midjourney", "dall-e", "comfyui",
+    "automatic1111", "invokeai",
+}
+# Lower-case substrings treated as ordinary camera firmware in the Software field.
+# extract_exif only consults this set when no suspicious substring matched (-2).
+_CAMERA_SOFTWARE = {
+    "ver.", "firmware", "camera", "dji", "gopro",
+}
+def _decode_gps(gps_info: dict) -> Optional[str]:
+    """Decode EXIF GPSInfo dict into a human-readable lat/lon string."""
+    try:
+        def _to_decimal(values, ref):
+            d, m, s = [float(v) for v in values]
+            decimal = d + m / 60.0 + s / 3600.0
+            if ref in ("S", "W"):
+                decimal = -decimal
+            return decimal
+        lat = _to_decimal(gps_info.get(2, (0, 0, 0)), gps_info.get(1, "N"))
+        lon = _to_decimal(gps_info.get(4, (0, 0, 0)), gps_info.get(3, "E"))
+        return f"{lat:.6f}, {lon:.6f}"
+    except Exception:
+        return None
+def extract_exif(pil_img: Image.Image, raw_bytes: bytes) -> ExifSummary:
+    """Extract EXIF metadata and compute a trust adjustment score.
+    Trust adjustment logic:
+    - Valid Make + Model + DateTimeOriginal → -15 (more likely real camera photo)
+    - GPS info present → -5 additional (real photos often have GPS)
+    - Suspicious editing software detected → +10 (more likely manipulated)
+    - No EXIF at all → 0 (inconclusive — many platforms strip EXIF)
+    """
+    summary = ExifSummary()
+    try:
+        exif_data = pil_img._getexif()
+    except Exception:
+        exif_data = None
+    if not exif_data:
+        # Try exifread as fallback for formats Pillow doesn't handle well
+        try:
+            import exifread
+            from io import BytesIO
+            tags = exifread.process_file(BytesIO(raw_bytes), details=False)
+            if tags:
+                summary.make = str(tags.get("Image Make", "")).strip() or None
+                summary.model = str(tags.get("Image Model", "")).strip() or None
+                summary.datetime_original = str(tags.get("EXIF DateTimeOriginal", "")).strip() or None
+                summary.software = str(tags.get("Image Software", "")).strip() or None
+                summary.lens_model = str(tags.get("EXIF LensModel", "")).strip() or None
+        except ImportError:
+            logger.debug("exifread not installed, skipping fallback EXIF extraction")
+        except Exception as e:
+            logger.debug(f"exifread fallback failed: {e}")
+    else:
+        # Decode Pillow EXIF
+        decoded = {}
+        for tag_id, value in exif_data.items():
+            tag_name = TAGS.get(tag_id, tag_id)
+            decoded[tag_name] = value
+        summary.make = str(decoded.get("Make", "")).strip() or None
+        summary.model = str(decoded.get("Model", "")).strip() or None
+        summary.datetime_original = str(decoded.get("DateTimeOriginal", "")).strip() or None
+        summary.software = str(decoded.get("Software", "")).strip() or None
+        summary.lens_model = str(decoded.get("LensModel", "")).strip() or None
+        # GPS
+        gps_raw = decoded.get("GPSInfo")
+        if gps_raw and isinstance(gps_raw, dict):
+            gps_decoded = {}
+            for k, v in gps_raw.items():
+                gps_decoded[GPSTAGS.get(k, k)] = v
+            summary.gps_info = _decode_gps(gps_decoded)
+    # ── Trust adjustment scoring ──
+    adjustment = 0
+    reasons = []
+    has_camera_meta = summary.make and summary.model and summary.datetime_original
+    if has_camera_meta:
+        adjustment -= 15
+        reasons.append("valid camera metadata (Make/Model/DateTime)")
+    if summary.gps_info:
+        adjustment -= 5
+        reasons.append("GPS coordinates present")
+    if summary.software:
+        sw_lower = summary.software.lower()
+        if any(s in sw_lower for s in _SUSPICIOUS_SOFTWARE):
+            adjustment += 10
+            reasons.append(f"editing software detected: {summary.software}")
+        elif any(s in sw_lower for s in _CAMERA_SOFTWARE):
+            adjustment -= 2
+            reasons.append("camera firmware in Software field")
+    summary.trust_adjustment = adjustment
+    summary.trust_reason = "; ".join(reasons) if reasons else "no EXIF metadata found"
+    logger.info(f"EXIF extracted: make={summary.make}, model={summary.model}, "
+                f"adjustment={adjustment} ({summary.trust_reason})")
+    return summary

file_handler.py ADDED Viewed

	@@ -0,0 +1,96 @@

+from __future__ import annotations
+import io
+import os
+import tempfile
+from typing import Iterable
+from fastapi import HTTPException, UploadFile, status
+from config import settings
+IMAGE_MAGIC_BYTES: dict[bytes, str] = {
+    b"\xff\xd8\xff": "image/jpeg",
+    b"\x89PNG\r\n\x1a\n": "image/png",
+    b"RIFF": "image/webp",  # partial; WEBP has 'RIFF....WEBP'
+}
+def _detect_mime_by_magic(head: bytes) -> str | None:
+    for sig, mime in IMAGE_MAGIC_BYTES.items():
+        if head.startswith(sig):
+            if mime == "image/webp" and b"WEBP" not in head[:16]:
+                continue
+            return mime
+    return None
+async def read_upload_bytes(
+    file: UploadFile,
+    allowed_mimes: Iterable[str],
+    max_size_mb: int,
+) -> tuple[bytes, str]:
+    """Read an UploadFile into memory after validating type and size.
+    Returns (raw_bytes, detected_mime). Raises HTTPException on failure.
+    """
+    data = await file.read()
+    size_mb = len(data) / (1024 * 1024)
+    if size_mb > max_size_mb:
+        raise HTTPException(
+            status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
+            detail=f"File too large ({size_mb:.1f} MB > {max_size_mb} MB)",
+        )
+    mime = _detect_mime_by_magic(data[:16]) or (file.content_type or "")
+    if mime not in allowed_mimes:
+        raise HTTPException(
+            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
+            detail=f"Unsupported type '{mime}'. Allowed: {list(allowed_mimes)}",
+        )
+    return data, mime
+def bytes_to_buffer(data: bytes) -> io.BytesIO:
+    return io.BytesIO(data)
+async def save_upload_to_tempfile(
+    file: UploadFile,
+    allowed_mimes: Iterable[str],
+    max_size_mb: int,
+    suffix: str = ".mp4",
+) -> tuple[str, str]:
+    """Stream an UploadFile to a temp file on disk. Returns (path, mime).
+    MIME is taken from the client's content_type (no magic-byte check for videos).
+    Caller is responsible for deleting the temp file.
+    """
+    mime = (file.content_type or "").lower()
+    if mime not in allowed_mimes:
+        raise HTTPException(
+            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
+            detail=f"Unsupported type '{mime}'. Allowed: {list(allowed_mimes)}",
+        )
+    max_bytes = max_size_mb * 1024 * 1024
+    fd, path = tempfile.mkstemp(suffix=suffix, prefix="ds_vid_")
+    written = 0
+    try:
+        with os.fdopen(fd, "wb") as out:
+            while True:
+                chunk = await file.read(1024 * 1024)
+                if not chunk:
+                    break
+                written += len(chunk)
+                if written > max_bytes:
+                    raise HTTPException(
+                        status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
+                        detail=f"File too large (> {max_size_mb} MB)",
+                    )
+                out.write(chunk)
+    except Exception:
+        try:
+            os.unlink(path)
+        except OSError:
+            pass
+        raise
+    return path, mime

generate_colab_nb.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import nbformat as nbf
+import os
+# Empty v4 notebook; its cells are assigned at the bottom of this script.
+nb = nbf.v4.new_notebook()
+# Markdown source of the introductory cell (title and runtime instructions).
+text = """\
+# DeepShield: FaceForensics++ ViT Training
+Run this entirely in Google Colab.
+**Before running**:
+1. Go to `Runtime` -> `Change runtime type` -> select **T4 GPU**.
+2. Run the cells below sequentially.
+"""
+# Source of the first code cell: installs the Python packages via the `!pip` shell magic.
+code_install = """\
+!pip install timm transformers datasets accelerate evaluate opencv-python
+"""
+# Source of the download cell. The cell writes an embedded, trimmed-down
+# FaceForensics++ downloader to download_ffpp.py inside the Colab session and
+# then runs it for the first 50 c40 videos of every dataset.
+# The text between the triple quotes is notebook content, not code of this
+# script; it is emitted verbatim.
+code_ffpp = """\
+# We create the download script inside the Colab environment
+download_script = '''#!/usr/bin/env python
+import argparse
+import os
+import urllib.request
+import tempfile
+import time
+import sys
+import json
+from tqdm import tqdm
+from os.path import join
+FILELIST_URL = 'misc/filelist.json'
+DEEPFEAKES_DETECTION_URL = 'misc/deepfake_detection_filenames.json'
+DEEPFAKES_MODEL_NAMES = ['decoder_A.h5', 'decoder_B.h5', 'encoder.h5',]
+DATASETS = {
+    'original': 'original_sequences/youtube',
+    'Deepfakes': 'manipulated_sequences/Deepfakes',
+    'Face2Face': 'manipulated_sequences/Face2Face',
+    'FaceShifter': 'manipulated_sequences/FaceShifter',
+    'FaceSwap': 'manipulated_sequences/FaceSwap',
+    'NeuralTextures': 'manipulated_sequences/NeuralTextures'
+}
+ALL_DATASETS = ['original', 'Deepfakes', 'Face2Face', 'FaceShifter', 'FaceSwap', 'NeuralTextures']
+COMPRESSION = ['raw', 'c23', 'c40']
+TYPE = ['videos']
+def download_file(url, out_file):
+    os.makedirs(os.path.dirname(out_file), exist_ok=True)
+    if not os.path.isfile(out_file):
+        urllib.request.urlretrieve(url, out_file)
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('output_path', type=str)
+    parser.add_argument('-d', '--dataset', type=str, default='all')
+    parser.add_argument('-c', '--compression', type=str, default='c40')
+    parser.add_argument('-t', '--type', type=str, default='videos')
+    parser.add_argument('-n', '--num_videos', type=int, default=50) # Small amount for tutorial
+    args = parser.parse_args()
+    base_url = 'http://kaldir.vc.in.tum.de/faceforensics/v3/'
+    datasets = [args.dataset] if args.dataset != 'all' else ALL_DATASETS
+    for dataset in datasets:
+        dataset_path = DATASETS[dataset]
+        print(f'Downloading {args.compression} of {dataset}')
+        file_pairs = json.loads(urllib.request.urlopen(base_url + FILELIST_URL).read().decode("utf-8"))
+        filelist = []
+        if 'original' in dataset_path:
+            for pair in file_pairs:
+                filelist += pair
+        else:
+            for pair in file_pairs:
+                filelist.append('_'.join(pair))
+                filelist.append('_'.join(pair[::-1]))
+        filelist = filelist[:args.num_videos]
+        dataset_videos_url = base_url + f'{dataset_path}/{args.compression}/{args.type}/'
+        dataset_output_path = join(args.output_path, dataset_path, args.compression, args.type)
+        for filename in tqdm(filelist):
+            download_file(dataset_videos_url + filename + ".mp4", join(dataset_output_path, filename + ".mp4"))
+if __name__ == "__main__":
+    main()
+'''
+with open("download_ffpp.py", "w") as f:
+    f.write(download_script)
+!python download_ffpp.py ./data -d all -c c40 -t videos -n 50
+"""
+code_extract = """\
+import cv2
+import os
+import glob
+from tqdm import tqdm
+def extract_frames(video_folder, output_folder, label, max_frames=4):
+    os.makedirs(output_folder, exist_ok=True)
+    videos = glob.glob(os.path.join(video_folder, "*.mp4"))
+    for vid_path in tqdm(videos, desc=f"Extracting {label}"):
+        vid_name = os.path.basename(vid_path).replace('.mp4','')
+        cap = cv2.VideoCapture(vid_path)
+        count = 0
+        while cap.isOpened() and count < max_frames:
+            ret, frame = cap.read()
+            if not ret: break
+            frame = cv2.resize(frame, (224, 224))
+            out_path = os.path.join(output_folder, f"{vid_name}_f{count}.jpg")
+            cv2.imwrite(out_path, frame)
+            count += 1
+        cap.release()
+# Extract Real
+extract_frames('./data/original_sequences/youtube/c40/videos', './dataset/real', 'real')
+# Extract Fakes
+fakes = ['Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures']
+for f in fakes:
+    extract_frames(f'./data/manipulated_sequences/{f}/c40/videos', './dataset/fake', 'fake')
+"""
+# Source of the training cell. It loads ./dataset as an `imagefolder` dataset,
+# holds out 10% for evaluation, fine-tunes google/vit-base-patch16-224-in21k
+# with the HuggingFace Trainer and saves model + processor to
+# deepshield_vit_model. The text is notebook content and is emitted verbatim.
+code_train = """\
+import numpy as np
+from datasets import load_dataset
+from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer
+import torch
+# 1. Load Dataset
+dataset = load_dataset('imagefolder', data_dir='./dataset')
+# Split into train/validation
+dataset = dataset['train'].train_test_split(test_size=0.1)
+# 2. Preprocessor
+model_name_or_path = 'google/vit-base-patch16-224-in21k'
+processor = ViTImageProcessor.from_pretrained(model_name_or_path)
+def transform(example_batch):
+    # Take a list of PIL images and turn them to pixel values
+    inputs = processor([x.convert("RGB") for x in example_batch['image']], return_tensors='pt')
+    inputs['labels'] = example_batch['label']
+    return inputs
+prepared_ds = dataset.with_transform(transform)
+def collate_fn(batch):
+    return {
+        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
+        'labels': torch.tensor([x['labels'] for x in batch])
+    }
+# 3. Load Model
+labels = dataset['train'].features['label'].names
+model = ViTForImageClassification.from_pretrained(
+    model_name_or_path,
+    num_labels=len(labels),
+    id2label={str(i): c for i, c in enumerate(labels)},
+    label2id={c: str(i) for i, c in enumerate(labels)}
+)
+training_args = TrainingArguments(
+    output_dir="./vit-deepshield",
+    per_device_train_batch_size=16,
+    eval_strategy="steps",
+    num_train_epochs=3,
+    fp16=True, # Mixed precision for speed
+    save_steps=100,
+    eval_steps=100,
+    logging_steps=10,
+    learning_rate=2e-4,
+    save_total_limit=2,
+    remove_unused_columns=False,
+    push_to_hub=False,
+    load_best_model_at_end=True,
+)
+import evaluate
+metric = evaluate.load("accuracy")
+def compute_metrics(p):
+    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    data_collator=collate_fn,
+    compute_metrics=compute_metrics,
+    train_dataset=prepared_ds["train"],
+    eval_dataset=prepared_ds["test"],
+)
+# 4. Train
+train_results = trainer.train()
+trainer.save_model("deepshield_vit_model")
+processor.save_pretrained("deepshield_vit_model")
+trainer.log_metrics("train", train_results.metrics)
+trainer.save_metrics("train", train_results.metrics)
+trainer.save_state()
+print("Training Complete! The model is saved to ./deepshield_vit_model")
+"""
+nb['cells'] = [
+    nbf.v4.new_markdown_cell(text),
+    nbf.v4.new_code_cell(code_install),
+    nbf.v4.new_code_cell(code_ffpp),
+    nbf.v4.new_code_cell(code_extract),
+    nbf.v4.new_code_cell(code_train)
+]
+with open(r'c:\Users\athar\Desktop\minor2\backend\training\Colab_ViT_Training.ipynb', 'w', encoding='utf-8') as f:
+    nbf.write(nb, f)

heatmap_generator.py ADDED Viewed

	@@ -0,0 +1,164 @@

+from __future__ import annotations
+import base64
+import io
+from typing import Optional
+import cv2
+import numpy as np
+import torch
+from loguru import logger
+from PIL import Image
+from pytorch_grad_cam import GradCAMPlusPlus
+from pytorch_grad_cam.utils.image import show_cam_on_image
+from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
+from config import settings
+from models.model_loader import get_model_loader
+class _HFLogitsWrapper(torch.nn.Module):
+    """Wrap a HuggingFace image classification model so forward() returns logits
+    as a plain tensor (pytorch_grad_cam expects tensor outputs, not dicts/dataclasses).
+    """
+    def __init__(self, model: torch.nn.Module) -> None:
+        super().__init__()
+        self.model = model
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:  # type: ignore[override]
+        return self.model(pixel_values=pixel_values).logits
+def _vit_reshape_transform(tensor: torch.Tensor, height: int = 14, width: int = 14) -> torch.Tensor:
+    """Grad-CAM expects (B, C, H, W); ViT hidden states are (B, 1+H*W, C).
+    Drop the CLS token and reshape tokens into a spatial grid.
+    """
+    result = tensor[:, 1:, :]
+    b, n, c = result.shape
+    result = result.reshape(b, height, width, c)
+    result = result.permute(0, 3, 1, 2)  # (B, C, H, W)
+    return result
+def _preprocess_for_cam(pil_img: Image.Image, processor) -> tuple[torch.Tensor, np.ndarray]:
+    """Return (input_tensor, rgb_float_224) where rgb_float_224 is a (H,W,3) float
+    array in [0,1] matching the model input geometry — needed for overlaying.
+    """
+    inputs = processor(images=pil_img, return_tensors="pt")
+    input_tensor = inputs["pixel_values"].to(settings.DEVICE)
+    size = getattr(processor, "size", {"height": 224, "width": 224})
+    h = size.get("height", 224) if isinstance(size, dict) else 224
+    w = size.get("width", 224) if isinstance(size, dict) else 224
+    resized = pil_img.resize((w, h), Image.BILINEAR)
+    rgb = np.array(resized).astype(np.float32) / 255.0  # (H,W,3) in [0,1]
+    return input_tensor, rgb
+def _encode_overlay_to_base64(overlay: np.ndarray) -> str:
+    """Encode a uint8 (H,W,3) RGB overlay to a base64 data-URL PNG."""
+    buf = io.BytesIO()
+    Image.fromarray(overlay).save(buf, format="PNG")
+    b64 = base64.b64encode(buf.getvalue()).decode("ascii")
+    return f"data:image/png;base64,{b64}"
+def _compute_gradcam_pp(
+    pil_img: Image.Image,
+    target_class_idx: Optional[int] = None,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Compute Grad-CAM++ averaged across the last 3 ViT encoder layers.
+    Returns (grayscale_cam, rgb_float) where grayscale_cam is (H,W) in [0,1].
+    """
+    loader = get_model_loader()
+    model, processor = loader.load_image_model()
+    model.eval()
+    for p in model.parameters():
+        p.requires_grad_(True)
+    input_tensor, rgb_float = _preprocess_for_cam(pil_img, processor)
+    grid = int(model.config.image_size / model.config.patch_size)
+    # Average across last 3 ViT encoder layers for smoother heatmaps
+    num_layers = len(model.vit.encoder.layer)
+    last_n = min(3, num_layers)
+    target_layers = [
+        model.vit.encoder.layer[-(i + 1)].layernorm_before
+        for i in range(last_n)
+    ]
+    wrapped = _HFLogitsWrapper(model)
+    targets = None
+    if target_class_idx is not None:
+        targets = [ClassifierOutputTarget(int(target_class_idx))]
+    with GradCAMPlusPlus(
+        model=wrapped,
+        target_layers=target_layers,
+        reshape_transform=lambda t: _vit_reshape_transform(t, grid, grid),
+    ) as cam:
+        grayscale_cam = cam(input_tensor=input_tensor, targets=targets)[0]  # (H,W) in [0,1]
+    return grayscale_cam, rgb_float
+def generate_heatmap_base64(
+    pil_img: Image.Image,
+    target_class_idx: Optional[int] = None,
+) -> str:
+    """Produce a base64 data-URL PNG of the Grad-CAM++ overlay for the given image."""
+    grayscale_cam, rgb_float = _compute_gradcam_pp(pil_img, target_class_idx)
+    overlay = show_cam_on_image(rgb_float, grayscale_cam, use_rgb=True)
+    logger.info(f"Heatmap generated ({overlay.shape[0]}x{overlay.shape[1]})")
+    return _encode_overlay_to_base64(overlay)
+def generate_boxes_base64(
+    pil_img: Image.Image,
+    target_class_idx: Optional[int] = None,
+    top_k: int = 5,
+    threshold: float = 0.4,
+) -> str:
+    """Produce bounding boxes around top-K connected components from Grad-CAM++ activation.
+    Renders colored boxes (red/yellow/orange by intensity) on the original image.
+    """
+    grayscale_cam, rgb_float = _compute_gradcam_pp(pil_img, target_class_idx)
+    h, w = rgb_float.shape[:2]
+    base_img = (rgb_float * 255).astype(np.uint8).copy()
+    # Threshold the heatmap to find activated regions
+    binary = (grayscale_cam >= threshold).astype(np.uint8) * 255
+    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    if not contours:
+        logger.info("No significant activation regions found for bounding boxes")
+        return _encode_overlay_to_base64(base_img)
+    # Sort by area descending, take top_k
+    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:top_k]
+    # Color by mean activation intensity within each box
+    for cnt in contours:
+        x, y, bw, bh = cv2.boundingRect(cnt)
+        region_activation = grayscale_cam[y:y + bh, x:x + bw].mean()
+        if region_activation >= 0.7:
+            color = (220, 40, 40)    # red — high suspicion
+        elif region_activation >= 0.5:
+            color = (240, 140, 20)   # orange — medium
+        else:
+            color = (230, 200, 40)   # yellow — lower
+        cv2.rectangle(base_img, (x, y), (x + bw, y + bh), color, 2)
+        label = f"{region_activation * 100:.0f}%"
+        cv2.putText(base_img, label, (x, max(y - 6, 12)),
+                     cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1, cv2.LINE_AA)
+    logger.info(f"Bounding boxes generated: {len(contours)} regions")
+    return _encode_overlay_to_base64(base_img)

image_service.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from __future__ import annotations
+import io
+from dataclasses import dataclass
+from typing import Tuple
+import torch
+from loguru import logger
+from PIL import Image
+from config import settings
+from models.model_loader import get_model_loader
+@dataclass
+class ImageClassification:
+    """Result of one classifier pass over a single image."""
+    # Label of the highest-probability class.
+    label: str
+    # Softmax probability of `label`.
+    confidence: float
+    # Softmax probability of every class, keyed by label name.
+    all_scores: dict[str, float]
+def load_image_from_bytes(data: bytes) -> Image.Image:
+    img = Image.open(io.BytesIO(data))
+    if img.mode != "RGB":
+        img = img.convert("RGB")
+    return img
+def classify_image(pil_img: Image.Image) -> ImageClassification:
+    """Run the ViT deepfake classifier on a PIL image."""
+    loader = get_model_loader()
+    model, processor = loader.load_image_model()
+    inputs = processor(images=pil_img, return_tensors="pt")
+    inputs = {k: v.to(settings.DEVICE) for k, v in inputs.items()}
+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits  # (1, num_labels)
+        probs = torch.softmax(logits, dim=-1)[0]
+    id2label: dict[int, str] = getattr(model.config, "id2label", {})
+    all_scores = {id2label.get(i, str(i)): float(p.item()) for i, p in enumerate(probs)}
+    top_idx = int(torch.argmax(probs).item())
+    top_label = id2label.get(top_idx, str(top_idx))
+    top_conf = float(probs[top_idx].item())
+    logger.info(f"Image classify → {top_label} @ {top_conf:.3f}")
+    return ImageClassification(label=top_label, confidence=top_conf, all_scores=all_scores)
+def preprocess_and_classify(raw_bytes: bytes) -> Tuple[Image.Image, ImageClassification]:
+    """Convenience: decode bytes → PIL → classify. Returns the PIL image too so
+    downstream steps (heatmap, artifact scan) can reuse it.
+    """
+    pil = load_image_from_bytes(raw_bytes)
+    result = classify_image(pil)
+    return pil, result

llm_explainer.py ADDED Viewed

	@@ -0,0 +1,182 @@

+"""LLM Explainability Card — Phase 12.3
+Generates a plain-English summary paragraph + 3 key-signal bullets from the
+full analysis payload.  Supports Gemini (default) and OpenAI providers.
+Results are cached per record_id to avoid re-spending tokens.
+"""
+from __future__ import annotations
+import json
+from abc import ABC, abstractmethod
+from functools import lru_cache
+from typing import Any
+from loguru import logger
+from config import settings
+from schemas.common import LLMExplainabilitySummary
+# ── In-memory cache keyed by record_id ──
+# Process-local and never evicted in the code visible here.
+_cache: dict[str, LLMExplainabilitySummary] = {}
+# Prompt sent to the LLM. `{payload_json}` is the only placeholder; the literal
+# braces of the JSON example are doubled, presumably because the template is
+# rendered with str.format (TODO confirm at the call site).
+_PROMPT_TEMPLATE = """\
+You are DeepShield's explainability engine. Given the JSON analysis payload below,
+write a concise, accessible summary for a non-technical user.
+**Output format (strict JSON only — no markdown fences):**
+{{
+  "paragraph": "<2-3 sentence plain-English summary of the verdict and key signals>",
+  "bullets": [
+    "<key signal 1>",
+    "<key signal 2>",
+    "<key signal 3>"
+  ]
+}}
+Rules:
+- Be factual. State what the analysis found, not what you speculate.
+- Reference specific indicators (e.g. "GAN artifact score", "EXIF metadata", "sensationalism level").
+- If the verdict is "Likely Authentic", reassure the user and explain why.
+- If the verdict is "Likely Manipulated" or "Suspicious", highlight the strongest evidence.
+- Keep the paragraph under 60 words. Each bullet under 20 words.
+**Analysis payload:**
+{payload_json}
+"""
+# Common interface of the LLM back ends; concrete providers implement generate().
+class _LLMProvider(ABC):
+    @abstractmethod
+    def generate(self, prompt: str) -> str:
+        """Send prompt to LLM and return raw text response."""
+class _GeminiProvider(_LLMProvider):
+    def __init__(self) -> None:
+        import google.generativeai as genai
+        genai.configure(api_key=settings.LLM_API_KEY)
+        self._model = genai.GenerativeModel(settings.LLM_MODEL)
+    def generate(self, prompt: str) -> str:
+        response = self._model.generate_content(prompt)
+        return response.text
+class _OpenAIProvider(_LLMProvider):
+    def __init__(self) -> None:
+        from openai import OpenAI
+        self._client = OpenAI(api_key=settings.LLM_API_KEY)
+    def generate(self, prompt: str) -> str:
+        response = self._client.chat.completions.create(
+            model=settings.LLM_MODEL,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.3,
+            max_tokens=300,
+        )
+        return response.choices[0].message.content
+@lru_cache(maxsize=1)
+def _get_provider() -> _LLMProvider:
+    """Lazy-init the configured LLM provider (singleton)."""
+    provider_name = settings.LLM_PROVIDER.lower()
+    if provider_name == "openai":
+        return _OpenAIProvider()
+    return _GeminiProvider()
+def _parse_llm_response(raw: str) -> tuple[str, list[str]]:
+    """Parse the LLM's JSON response into (paragraph, bullets).
+    Handles cases where the LLM wraps output in markdown fences.
+    """
+    text = raw.strip()
+    # Strip markdown code fences if present
+    if text.startswith("```"):
+        lines = text.split("\n")
+        # Remove first and last fence lines
+        lines = [l for l in lines if not l.strip().startswith("```")]
+        text = "\n".join(lines).strip()
+    parsed = json.loads(text)
+    paragraph = parsed.get("paragraph", "")
+    bullets = parsed.get("bullets", [])
+    if not isinstance(bullets, list):
+        bullets = [str(bullets)]
+    return paragraph, bullets[:3]
+def generate_llm_summary(
+    payload: dict[str, Any],
+    record_id: str | None = None,
+) -> LLMExplainabilitySummary:
+    """Generate an LLM-powered plain-English explanation for an analysis result.
+    Args:
+        payload: The full analysis response dict (verdict, scores, indicators, etc.).
+        record_id: Optional cache key. If provided and cached, returns cached result.
+    Returns:
+        LLMExplainabilitySummary with paragraph, bullets, and model info.
+    """
+    # Check cache
+    if record_id and record_id in _cache:
+        logger.debug(f"LLM summary cache hit for record_id={record_id}")
+        cached = _cache[record_id]
+        cached.cached = True
+        return cached
+    # Guard: no API key configured
+    if not settings.LLM_API_KEY:
+        logger.warning("LLM_API_KEY not set — skipping LLM explainability card")
+        return LLMExplainabilitySummary(
+            paragraph="LLM explanation unavailable (no API key configured).",
+            bullets=[],
+            model_used="none",
+        )
+    # Strip heavy base64 fields to reduce token usage
+    slim_payload = {k: v for k, v in payload.items()
+                    if k not in ("explainability",)}
+    # Include explainability but strip base64 images
+    if "explainability" in payload and isinstance(payload["explainability"], dict):
+        expl = {k: v for k, v in payload["explainability"].items()
+                if not k.endswith("_base64")}
+        slim_payload["explainability"] = expl
+    prompt = _PROMPT_TEMPLATE.format(payload_json=json.dumps(slim_payload, indent=2, default=str))
+    try:
+        provider = _get_provider()
+        raw_response = provider.generate(prompt)
+        paragraph, bullets = _parse_llm_response(raw_response)
+        summary = LLMExplainabilitySummary(
+            paragraph=paragraph,
+            bullets=bullets,
+            model_used=f"{settings.LLM_PROVIDER}/{settings.LLM_MODEL}",
+        )
+        # Cache result
+        if record_id:
+            _cache[record_id] = summary
+        logger.info(f"LLM summary generated via {settings.LLM_PROVIDER}/{settings.LLM_MODEL}")
+        return summary
+    except json.JSONDecodeError as e:
+        logger.error(f"LLM returned unparseable JSON: {e}")
+        return LLMExplainabilitySummary(
+            paragraph="Analysis complete. See the detailed indicators below for specifics.",
+            bullets=["LLM explanation could not be parsed"],
+            model_used=f"{settings.LLM_PROVIDER}/{settings.LLM_MODEL}",
+        )
+    except Exception as e:
+        logger.error(f"LLM explainer failed: {e}")
+        return LLMExplainabilitySummary(
+            paragraph="Analysis complete. See the detailed indicators below for specifics.",
+            bullets=["LLM explanation temporarily unavailable"],
+            model_used="error",
+        )

main.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import asyncio
+from contextlib import asynccontextmanager
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from loguru import logger
+from api.router import api_router
+from config import settings
+from db.database import init_db
+from models.model_loader import get_model_loader
+from services.report_service import cleanup_expired
+async def _report_cleanup_loop():
+    while True:
+        try:
+            cleanup_expired()
+        except Exception as e:  # noqa: BLE001
+            logger.warning(f"Report cleanup error: {e}")
+        await asyncio.sleep(600)  # every 10 min
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    logger.info("Starting DeepShield backend")
+    init_db()
+    logger.info("Database initialized")
+    if settings.PRELOAD_MODELS:
+        get_model_loader().preload_phase1()
+    else:
+        logger.info("PRELOAD_MODELS=false — models will load on first use")
+    task = asyncio.create_task(_report_cleanup_loop())
+    yield
+    task.cancel()
+    logger.info("Shutting down DeepShield backend")
+# ASGI application object; `lifespan` supplies the startup/shutdown logic.
+app = FastAPI(
+    title="DeepShield API",
+    description="Explainable AI-based multimodal misinformation detection",
+    version="0.1.0",
+    lifespan=lifespan,
+)
+# CORS: allowed origins come from settings.CORS_ORIGINS; credentials are
+# allowed, and every method and header is accepted.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=settings.CORS_ORIGINS,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Mount the routes collected in api.router.
+app.include_router(api_router)
+@app.get("/")
+def root():
+    return {"service": "DeepShield", "docs": "/docs", "health": "/api/v1/health"}

model_loader.py ADDED Viewed

	@@ -0,0 +1,156 @@

+from __future__ import annotations
+from threading import Lock
+from typing import Optional, Tuple
+from loguru import logger
+from config import settings
+class ModelLoader:
+    """Singleton holder for preloaded AI models. Thread-safe lazy init."""
+    _instance: Optional["ModelLoader"] = None
+    _lock: Lock = Lock()
+    def __new__(cls) -> "ModelLoader":
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = super().__new__(cls)
+                    cls._instance._image_model = None
+                    cls._instance._image_processor = None
+                    cls._instance._text_pipeline = None
+                    cls._instance._multilang_text_pipeline = None
+                    cls._instance._ocr_reader = None
+                    cls._instance._face_detector = None
+                    cls._instance._spacy_nlp = None
+                    cls._instance._sentence_transformer = None
+        return cls._instance
+    @classmethod
+    def get_instance(cls) -> "ModelLoader":
+        return cls()
+    # ---------- Image (ViT deepfake classifier) ----------
+    def load_image_model(self) -> Tuple[object, object]:
+        if self._image_model is None:
+            logger.info(f"Loading image model: {settings.IMAGE_MODEL_ID}")
+            from transformers import AutoImageProcessor, AutoModelForImageClassification
+            self._image_processor = AutoImageProcessor.from_pretrained(settings.IMAGE_MODEL_ID)
+            model = AutoModelForImageClassification.from_pretrained(settings.IMAGE_MODEL_ID)
+            model.to(settings.DEVICE)
+            model.eval()
+            self._image_model = model
+            logger.info("Image model loaded")
+        return self._image_model, self._image_processor
+    # ---------- Text (BERT fake-news classifier — English) ----------
+    def load_text_model(self):
+        if self._text_pipeline is None:
+            logger.info(f"Loading text model: {settings.TEXT_MODEL_ID}")
+            from transformers import pipeline
+            self._text_pipeline = pipeline(
+                "text-classification",
+                model=settings.TEXT_MODEL_ID,
+                device=0 if settings.DEVICE == "cuda" else -1,
+            )
+            logger.info("Text model loaded")
+        return self._text_pipeline
+    # ---------- Multilingual text model (Phase 13) ----------
+    def load_multilang_text_model(self):
+        """Load multilingual fake-news classifier. Falls back to English model if not configured."""
+        model_id = settings.TEXT_MULTILANG_MODEL_ID
+        if not model_id:
+            logger.debug("TEXT_MULTILANG_MODEL_ID not set — falling back to English text model")
+            return self.load_text_model()
+        if self._multilang_text_pipeline is None:
+            logger.info(f"Loading multilingual text model: {model_id}")
+            from transformers import pipeline
+            self._multilang_text_pipeline = pipeline(
+                "text-classification",
+                model=model_id,
+                device=0 if settings.DEVICE == "cuda" else -1,
+            )
+            logger.info("Multilingual text model loaded")
+        return self._multilang_text_pipeline
+    # ---------- spaCy NLP (Phase 13 NER) ----------
+    def load_spacy_nlp(self):
+        """Lazy-load spaCy English NLP model. Returns None if spaCy is not installed."""
+        if self._spacy_nlp is None:
+            try:
+                import spacy  # type: ignore
+                try:
+                    self._spacy_nlp = spacy.load("en_core_web_sm")
+                    logger.info("spaCy en_core_web_sm loaded")
+                except OSError:
+                    logger.warning(
+                        "spaCy model 'en_core_web_sm' not found. "
+                        "Run: python -m spacy download en_core_web_sm"
+                    )
+                    return None
+            except ImportError:
+                logger.warning("spaCy not installed — NER keyword extraction disabled")
+                return None
+        return self._spacy_nlp
+    # ---------- Sentence-Transformer (Phase 13 truth-override) ----------
+    def load_sentence_transformer(self):
+        """Lazy-load sentence-transformers/all-MiniLM-L6-v2. Returns None if not installed."""
+        if self._sentence_transformer is None:
+            try:
+                from sentence_transformers import SentenceTransformer  # type: ignore
+                self._sentence_transformer = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+                logger.info("Sentence-transformer (all-MiniLM-L6-v2) loaded")
+            except ImportError:
+                logger.warning("sentence-transformers not installed — truth-override disabled")
+                return None
+            except Exception as e:
+                logger.warning(f"Sentence-transformer load failed: {e}")
+                return None
+        return self._sentence_transformer
+    # ---------- OCR (EasyOCR) — Phase 13: use OCR_LANGS from config ----------
+    def load_ocr_engine(self):
+        if self._ocr_reader is None:
+            langs = [l.strip() for l in settings.OCR_LANGS.split(",") if l.strip()]
+            if not langs:
+                langs = ["en"]
+            logger.info(f"Loading EasyOCR reader (langs: {langs})")
+            import easyocr  # type: ignore
+            self._ocr_reader = easyocr.Reader(
+                langs, gpu=(settings.DEVICE == "cuda"), verbose=False, download_enabled=True,
+            )
+            logger.info("EasyOCR loaded")
+        return self._ocr_reader
+    # ---------- Face detector (MediaPipe) ----------
+    def load_face_detector(self):
+        if self._face_detector is None:
+            logger.info("Loading MediaPipe FaceMesh")
+            import mediapipe as mp  # type: ignore
+            self._face_detector = mp.solutions.face_mesh.FaceMesh(
+                static_image_mode=True,
+                max_num_faces=5,
+                min_detection_confidence=0.5,
+            )
+            logger.info("MediaPipe FaceMesh loaded")
+        return self._face_detector
+    # ---------- Preload ----------
+    def preload_phase1(self) -> None:
+        """Preload only what Phase 1 needs (image model)."""
+        self.load_image_model()
+def get_model_loader() -> ModelLoader:
+    return ModelLoader.get_instance()

models.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from datetime import datetime
+from sqlalchemy import DateTime, ForeignKey, Integer, String, Text
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+from db.database import Base
+class User(Base):
+    """ORM model for a row of the ``users`` table."""
+    __tablename__ = "users"
+    id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
+    # Unique, indexed and required.
+    email: Mapped[str] = mapped_column(String(255), unique=True, index=True, nullable=False)
+    # NOTE(review): presumably a password hash, never plaintext — confirm in the auth service.
+    password_hash: Mapped[str] = mapped_column(String(255), nullable=False)
+    name: Mapped[str | None] = mapped_column(String(255), nullable=True)
+    # Column default is datetime.utcnow, i.e. a naive (tz-less) UTC timestamp.
+    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
+    # Reverse side of AnalysisRecord.user.
+    analyses: Mapped[list["AnalysisRecord"]] = relationship(back_populates="user")
+class AnalysisRecord(Base):
+    """ORM model for a row of the ``analyses`` table."""
+    __tablename__ = "analyses"
+    id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
+    # Nullable foreign key: a record does not have to belong to a user.
+    user_id: Mapped[int | None] = mapped_column(ForeignKey("users.id"), nullable=True)
+    media_type: Mapped[str] = mapped_column(String(32), nullable=False)  # image|video|text|screenshot
+    verdict: Mapped[str] = mapped_column(String(32), nullable=False)
+    authenticity_score: Mapped[float] = mapped_column(nullable=False)
+    # NOTE(review): presumably the serialized analysis response — verify against the writer.
+    result_json: Mapped[str] = mapped_column(Text, nullable=False)
+    # Column default is datetime.utcnow, i.e. a naive (tz-less) UTC timestamp.
+    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
+    user: Mapped["User | None"] = relationship(back_populates="analyses")
+    # uselist=False: at most one Report is exposed per analysis.
+    report: Mapped["Report | None"] = relationship(back_populates="analysis", uselist=False)
+class Report(Base):
+    """ORM model for a row of the ``reports`` table; each row points at one analysis."""
+    __tablename__ = "reports"
+    id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
+    analysis_id: Mapped[int] = mapped_column(ForeignKey("analyses.id"), nullable=False)
+    # NOTE(review): presumably the path of the generated report file — confirm in report_service.
+    file_path: Mapped[str] = mapped_column(String(512), nullable=False)
+    # Column default is datetime.utcnow, i.e. a naive (tz-less) UTC timestamp.
+    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
+    # Nullable. NOTE(review): presumably consumed by the expiry cleanup job — confirm.
+    expires_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
+    analysis: Mapped["AnalysisRecord"] = relationship(back_populates="report")

news_lookup.py ADDED Viewed

	@@ -0,0 +1,242 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+from urllib.parse import urlparse
+import httpx
+from loguru import logger
+from config import settings
+from schemas.common import ContradictingEvidence, TrustedSource, TruthOverride
+# Trusted news domains mapped to the relevance score assigned to their articles
+# (articles from any other domain get 0.5 in _relevance).
+TRUSTED_DOMAINS = {
+    "reuters.com": 1.0, "apnews.com": 1.0, "bbc.com": 1.0, "bbc.co.uk": 1.0,
+    "theguardian.com": 0.95, "nytimes.com": 0.95, "washingtonpost.com": 0.95,
+    "cnn.com": 0.9, "npr.org": 0.95, "aljazeera.com": 0.9,
+    "thehindu.com": 0.9, "indianexpress.com": 0.9, "ndtv.com": 0.85,
+    "hindustantimes.com": 0.85, "pti.news": 0.95,
+}
+# Fact-check / contradiction sources.
+# NOTE(review): two entries include a URL path ("reuters.com/fact-check",
+# "apnews.com/hub/ap-fact-check"); they only take effect if the matching code
+# compares against host + path rather than the host alone.
+FACTCHECK_DOMAINS = {
+    "factcheck.org", "snopes.com", "politifact.com", "fullfact.org",
+    "reuters.com/fact-check", "apnews.com/hub/ap-fact-check",
+    "factly.in", "altnews.in", "boomlive.in", "vishvasnews.com",
+}
+# Domains eligible for truth-override (weight >= 0.9 per BUILD_PLAN spec)
+_HIGH_TRUST_DOMAINS = {d for d, w in TRUSTED_DOMAINS.items() if w >= 0.9}
+# Thresholds per BUILD_PLAN §13.2
+# Minimum cosine similarity for a high-trust headline to trigger the override.
+_OVERRIDE_SIMILARITY_THRESHOLD = 0.6
+# Upper bound on the fake probability once the override is applied.
+_OVERRIDE_FAKE_PROB_CAP = 0.15
+# Factor the fake probability is multiplied by when the override is applied.
+_OVERRIDE_FAKE_PROB_MULTIPLIER = 0.3
+@dataclass
+class NewsLookupResult:
+    """Outcome of a news lookup, as built by ``search_news_full``."""
+    # Non-fact-check articles, sorted by relevance and truncated to the limit.
+    trusted_sources: List[TrustedSource]
+    # Articles classified as fact-checks by URL or title.
+    contradicting_evidence: List[ContradictingEvidence]
+    # Number of articles returned by the news API before filtering.
+    total_articles: int
+    # Result of the truth-override check; None when it was not run or could not be computed.
+    truth_override: Optional[TruthOverride] = None
+def _domain_of(url: str) -> str:
+    try:
+        return urlparse(url).netloc.lower().replace("www.", "")
+    except Exception:
+        return ""
+def _is_factcheck(url: str, title: str) -> bool:
+    dom = _domain_of(url)
+    if any(fc in dom for fc in FACTCHECK_DOMAINS):
+        return True
+    tl = (title or "").lower()
+    return any(kw in tl for kw in ("fact check", "fact-check", "debunked", "false claim", "misleading", "hoax"))
+def _relevance(url: str) -> float:
+    dom = _domain_of(url)
+    for td, score in TRUSTED_DOMAINS.items():
+        if td in dom:
+            return score
+    return 0.5
+def _is_high_trust(url: str) -> bool:
+    dom = _domain_of(url)
+    return any(ht in dom for ht in _HIGH_TRUST_DOMAINS)
+def _compute_truth_override(
+    input_text: str,
+    trusted_sources: List[TrustedSource],
+    current_fake_prob: float,
+) -> Optional[TruthOverride]:
+    """Check if any high-trust source corroborates the input text at >= 0.6 cosine similarity.
+    Per BUILD_PLAN §13.2:
+    - Compute cosine similarity between input_text and each trusted-source headline+description
+    - If ≥ 1 high-trust source (weight ≥ 0.9) has similarity ≥ 0.6 → apply fake_prob *= 0.3, cap at 0.15
+    """
+    if not input_text or not trusted_sources:
+        return None
+    # Filter to high-trust sources only
+    high_trust = [s for s in trusted_sources if _is_high_trust(s.url)]
+    if not high_trust:
+        return None
+    # Lazy-load sentence-transformer
+    from models.model_loader import get_model_loader
+    st_model = get_model_loader().load_sentence_transformer()
+    if st_model is None:
+        return None
+    try:
+        import numpy as np
+        # Encode input text and all high-trust headlines
+        source_texts = [
+            f"{s.title}" for s in high_trust
+        ]
+        all_texts = [input_text[:512]] + source_texts
+        embeddings = st_model.encode(all_texts, convert_to_numpy=True, normalize_embeddings=True)
+        query_vec = embeddings[0]       # (D,)
+        source_vecs = embeddings[1:]    # (N, D)
+        # Cosine similarity — already normalized, so dot product = cosine similarity
+        similarities = np.dot(source_vecs, query_vec)
+        best_idx = int(np.argmax(similarities))
+        best_sim = float(similarities[best_idx])
+        best_source = high_trust[best_idx]
+        logger.info(
+            f"Truth-override: best similarity={best_sim:.3f} "
+            f"source={best_source.source_name} url={best_source.url}"
+        )
+        if best_sim >= _OVERRIDE_SIMILARITY_THRESHOLD:
+            new_fake_prob = min(
+                current_fake_prob * _OVERRIDE_FAKE_PROB_MULTIPLIER,
+                _OVERRIDE_FAKE_PROB_CAP,
+            )
+            logger.info(
+                f"Truth-override APPLIED: fake_prob {current_fake_prob:.3f} → {new_fake_prob:.3f}"
+            )
+            return TruthOverride(
+                applied=True,
+                source_url=best_source.url,
+                source_name=best_source.source_name,
+                similarity=round(best_sim, 4),
+                fake_prob_before=round(current_fake_prob, 4),
+                fake_prob_after=round(new_fake_prob, 4),
+            )
+        return TruthOverride(
+            applied=False,
+            source_url=best_source.url,
+            source_name=best_source.source_name,
+            similarity=round(best_sim, 4),
+            fake_prob_before=round(current_fake_prob, 4),
+            fake_prob_after=round(current_fake_prob, 4),
+        )
+    except Exception as e:
+        logger.warning(f"Truth-override computation failed: {e}")
+        return None
+async def _fetch(q: str, country: Optional[str]) -> list[dict]:
+    target_country = country or "in"
+    params = {"apikey": settings.NEWS_API_KEY, "q": q, "language": "en", "size": 10, "country": "in"}
+    try:
+        async with httpx.AsyncClient(timeout=8.0) as c:
+            r = await c.get(settings.NEWS_API_BASE_URL, params=params)
+            r.raise_for_status()
+            return (r.json() or {}).get("results") or []
+    except Exception as e:
+        logger.warning(f"News lookup failed: {e}")
+        return []
+async def search_news(
+    keywords: List[str],
+    limit: int = 6,
+    country: Optional[str] = None,
+) -> List[TrustedSource]:
+    """Back-compat simple form — returns trusted sources only."""
+    result = await search_news_full(keywords, limit=limit, country=country)
+    return result.trusted_sources
+async def search_news_full(
+    keywords: List[str],
+    limit: int = 6,
+    country: Optional[str] = None,
+    original_text: Optional[str] = None,
+    current_fake_prob: float = 0.5,
+) -> NewsLookupResult:
+    """Full news lookup with truth-override support.
+    Args:
+        keywords: NER-extracted or frequency-extracted keywords to search.
+        limit: Max sources to return.
+        country: Country code for newsdata.io.
+        original_text: Input text to compare against headlines for truth-override.
+        current_fake_prob: Current fake probability — may be adjusted by truth-override.
+    """
+    if not settings.NEWS_API_KEY or not keywords:
+        return NewsLookupResult([], [], 0)
+    q = " ".join(keywords[:4])
+    articles = await _fetch(q, country)
+    seen: set[str] = set()
+    trusted: List[TrustedSource] = []
+    contradictions: List[ContradictingEvidence] = []
+    for art in articles:
+        url = art.get("link") or ""
+        if not url or url in seen:
+            continue
+        seen.add(url)
+        title = art.get("title") or ""
+        dom = _domain_of(url)
+        src_name = art.get("source_id") or dom or "news"
+        if _is_factcheck(url, title):
+            contradictions.append(ContradictingEvidence(
+                source_name=src_name, title=title, url=url, type="fact_check",
+            ))
+            continue
+        trusted.append(TrustedSource(
+            source_name=src_name,
+            title=title,
+            url=url,
+            published_at=art.get("pubDate"),
+            relevance_score=_relevance(url),
+        ))
+    trusted.sort(key=lambda s: -s.relevance_score)
+    trusted = trusted[:limit]
+    # ── Phase 13.2: Truth-override ──
+    truth_override = None
+    if original_text and trusted:
+        truth_override = _compute_truth_override(original_text, trusted, current_fake_prob)
+    return NewsLookupResult(
+        trusted_sources=trusted,
+        contradicting_evidence=contradictions[:limit],
+        total_articles=len(articles),
+        truth_override=truth_override,
+    )

report.html ADDED Viewed

	@@ -0,0 +1,367 @@

+<!DOCTYPE html>
+<html>
+<head>
+  <meta charset="utf-8" />
+  <title>DeepShield Analysis Report — {{ analysis_id }}</title>
+  <style>
+    @page { size: A4; margin: 16mm 18mm; }
+    body { font-family: Helvetica, Arial, sans-serif; color: #1A202C; font-size: 10pt; line-height: 1.45; }
+    /* ── Typography ── */
+    h1 { color: #4F46E5; margin: 0 0 2pt 0; font-size: 18pt; letter-spacing: -0.3pt; }
+    h2 { color: #4F46E5; margin: 14pt 0 5pt 0; font-size: 12pt; border-bottom: 1pt solid #E5E7EB; padding-bottom: 2pt; }
+    h3 { margin: 10pt 0 4pt 0; font-size: 10.5pt; color: #374151; }
+    .muted { color: #6B7280; font-size: 8.5pt; }
+    /* ── Header / logo row ── */
+    .header-table { width: 100%; border-collapse: collapse; border-bottom: 2pt solid #4F46E5; padding-bottom: 6pt; margin-bottom: 10pt; }
+    .logo-cell { font-size: 22pt; font-weight: bold; color: #4F46E5; width: 1%; white-space: nowrap; padding-right: 8pt; }
+    .logo-shield { color: #6366F1; }
+    .meta-cell { font-size: 8.5pt; color: #6B7280; vertical-align: bottom; }
+    /* ── Verdict row ── */
+    .verdict-table { width: 100%; border-collapse: collapse; margin: 6pt 0 10pt 0; background: #F9FAFB; }
+    .verdict-score-cell { width: 90pt; text-align: center; vertical-align: middle; padding: 8pt; }
+    .score-num { font-size: 26pt; font-weight: bold; }
+    .score-denom { font-size: 9pt; color: #6B7280; }
+    .score.real { color: #43A047; }
+    .score.warn { color: #FB8C00; }
+    .score.fake { color: #E53935; }
+    .verdict-detail-cell { padding: 8pt 10pt; vertical-align: middle; }
+    .verdict-label { font-size: 13pt; font-weight: bold; color: #1A202C; }
+    .verdict-sub { font-size: 8.5pt; color: #6B7280; margin-top: 2pt; }
+    .donut-cell { width: 75pt; text-align: center; vertical-align: middle; padding: 4pt; }
+    .donut-cell img { width: 72pt; }
+    /* ── LLM card ── */
+    .llm-box { background: #EEF2FF; border-left: 3pt solid #4F46E5; padding: 7pt 9pt; margin: 6pt 0; border-radius: 2pt; }
+    .llm-para { font-size: 9.5pt; color: #1A202C; margin: 0 0 5pt 0; }
+    .llm-bullets { margin: 0; padding-left: 14pt; }
+    .llm-bullets li { font-size: 9pt; color: #374151; margin-bottom: 2pt; }
+    /* ── Tables ── */
+    table.data { width: 100%; border-collapse: collapse; margin: 5pt 0; }
+    table.data th { background: #F3F4F6; color: #374151; font-size: 8.5pt; text-align: left; padding: 3pt 6pt; border-bottom: 1pt solid #E5E7EB; }
+    table.data td { font-size: 9pt; padding: 3pt 6pt; border-bottom: 1pt solid #F3F4F6; vertical-align: top; }
+    table.data tr:last-child td { border-bottom: none; }
+    /* ── VLM breakdown ── */
+    .vlm-score-bar-wrap { background: #E5E7EB; border-radius: 3pt; height: 5pt; width: 70pt; display: inline-block; vertical-align: middle; overflow: hidden; }
+    .vlm-score-bar { height: 5pt; border-radius: 3pt; }
+    .vlm-real { background: #43A047; }
+    .vlm-warn { background: #FB8C00; }
+    .vlm-fake { background: #E53935; }
+    /* ── Badges ── */
+    .badge { display: inline-block; padding: 1pt 5pt; border-radius: 3pt; font-size: 8pt; font-weight: bold; }
+    .sev-high { background: #FEE2E2; color: #B91C1C; }
+    .sev-medium { background: #FEF3C7; color: #92400E; }
+    .sev-low { background: #DBEAFE; color: #1E40AF; }
+    .badge-green { background: #DCFCE7; color: #166534; }
+    .badge-red   { background: #FEE2E2; color: #991B1B; }
+    /* ── Keywords ── */
+    .keyword { display: inline-block; background: #EEF2FF; color: #4F46E5; padding: 1pt 6pt; border-radius: 3pt; margin: 1pt; font-size: 8.5pt; }
+    /* ── Truth-override ── */
+    .truth-box { background: #DCFCE7; border-left: 3pt solid #16A34A; padding: 5pt 8pt; margin: 5pt 0; font-size: 9pt; border-radius: 2pt; }
+    /* ── Footer ── */
+    .footer { margin-top: 16pt; padding-top: 5pt; border-top: 1pt solid #E5E7EB; color: #9CA3AF; font-size: 8pt; }
+  </style>
+</head>
+<body>
+  {# ── Header ── #}
+  <table class="header-table">
+    <tr>
+      <td class="logo-cell"><span class="logo-shield">&#9646;</span> DeepShield</td>
+      <td class="meta-cell">
+        Analysis Report &nbsp;·&nbsp; ID: {{ analysis_id }}<br />
+        Media: <b>{{ media_type | upper }}</b> &nbsp;·&nbsp; Generated: {{ generated_at }}
+      </td>
+    </tr>
+  </table>
+  {# ── Verdict ── #}
+  <h2>Verdict</h2>
+  <table class="verdict-table">
+    <tr>
+      <td class="verdict-score-cell">
+        <div class="score-num score {{ score_class }}">{{ verdict.authenticity_score }}</div>
+        <div class="score-denom">/ 100</div>
+      </td>
+      <td class="verdict-detail-cell">
+        <div class="verdict-label">{{ verdict.label }}</div>
+        <div class="verdict-sub">Severity: {{ verdict.severity }}</div>
+        <div class="verdict-sub">Model: {{ verdict.model_label }} &nbsp;({{ '%.1f' | format(verdict.model_confidence * 100) }}% confidence)</div>
+      </td>
+      {% if donut_b64 %}
+      <td class="donut-cell">
+        <img src="data:image/png;base64,{{ donut_b64 }}" alt="score donut" />
+      </td>
+      {% endif %}
+    </tr>
+  </table>
+  {# ── LLM Explanation ── #}
+  {% if llm_summary and llm_summary.paragraph %}
+  <h2>AI Explanation</h2>
+  <div class="llm-box">
+    <p class="llm-para">{{ llm_summary.paragraph }}</p>
+    {% if llm_summary.bullets %}
+    <ul class="llm-bullets">
+      {% for b in llm_summary.bullets %}<li>{{ b }}</li>{% endfor %}
+    </ul>
+    {% endif %}
+    {% if llm_summary.model_used %}
+    <div class="muted" style="margin-top:4pt;">via {{ llm_summary.model_used }}</div>
+    {% endif %}
+  </div>
+  {% endif %}
+  {# ══════════ IMAGE ══════════ #}
+  {% if media_type == 'image' %}
+    {# EXIF #}
+    {% if explainability.exif %}
+    <h2>EXIF Metadata</h2>
+    <table class="data">
+      <tr><th>Field</th><th>Value</th><th>Trust Signal</th></tr>
+      {% if explainability.exif.make %}
+      <tr><td>Camera Make</td><td>{{ explainability.exif.make }}</td><td><span class="badge badge-green">+real</span></td></tr>
+      {% endif %}
+      {% if explainability.exif.model %}
+      <tr><td>Camera Model</td><td>{{ explainability.exif.model }}</td><td></td></tr>
+      {% endif %}
+      {% if explainability.exif.datetime_original %}
+      <tr><td>Date Taken</td><td>{{ explainability.exif.datetime_original }}</td><td><span class="badge badge-green">+real</span></td></tr>
+      {% endif %}
+      {% if explainability.exif.software %}
+      <tr><td>Software</td><td>{{ explainability.exif.software }}</td>
+        <td>{% if 'photoshop' in explainability.exif.software | lower %}<span class="badge badge-red">+fake</span>{% endif %}</td></tr>
+      {% endif %}
+      {% if explainability.exif.lens_model %}
+      <tr><td>Lens Model</td><td>{{ explainability.exif.lens_model }}</td><td></td></tr>
+      {% endif %}
+      {% if explainability.exif.gps_info %}
+      <tr><td>GPS</td><td>{{ explainability.exif.gps_info }}</td><td></td></tr>
+      {% endif %}
+      <tr>
+        <td colspan="2"><b>Trust adjustment</b></td>
+        <td>
+          {% if explainability.exif.trust_adjustment > 0 %}
+            <span class="badge badge-red">+{{ explainability.exif.trust_adjustment }} (fake signal)</span>
+          {% elif explainability.exif.trust_adjustment < 0 %}
+            <span class="badge badge-green">{{ explainability.exif.trust_adjustment }} (real signal)</span>
+          {% else %}
+            neutral
+          {% endif %}
+        </td>
+      </tr>
+    </table>
+    {% endif %}
+    {# Artifact indicators #}
+    {% if explainability.artifact_indicators %}
+    <h2>Artifact Indicators</h2>
+    <table class="data">
+      <tr><th>Type</th><th>Severity</th><th>Confidence</th><th>Description</th></tr>
+      {% for ind in explainability.artifact_indicators %}
+      <tr>
+        <td>{{ ind.type }}</td>
+        <td><span class="badge sev-{{ ind.severity }}">{{ ind.severity }}</span></td>
+        <td>{{ '%.0f' | format(ind.confidence * 100) }}%</td>
+        <td>{{ ind.description }}</td>
+      </tr>
+      {% endfor %}
+    </table>
+    {% else %}
+    <h2>Artifact Indicators</h2>
+    <div class="muted">No artifacts detected.</div>
+    {% endif %}
+    {# VLM Detailed Breakdown #}
+    {% if explainability.vlm_breakdown %}
+    <h2>Detailed Breakdown</h2>
+    {% if explainability.vlm_breakdown.model_used %}
+    <div class="muted" style="margin-bottom:5pt;">Scored by {{ explainability.vlm_breakdown.model_used }}</div>
+    {% endif %}
+    <table class="data">
+      <tr><th>Component</th><th>Score</th><th>Bar</th><th>Notes</th></tr>
+      {% set bd = explainability.vlm_breakdown %}
+      {% for comp_key, comp_label in [
+          ('facial_symmetry',      'Facial Symmetry'),
+          ('skin_texture',         'Skin Texture'),
+          ('lighting_consistency', 'Lighting Consistency'),
+          ('background_coherence', 'Background Coherence'),
+          ('anatomy_hands_eyes',   'Anatomy / Hands & Eyes'),
+          ('context_objects',      'Context & Objects')
+      ] %}
+        {% set comp = bd[comp_key] %}
+        {% set sc2 = comp.score if comp else 75 %}
+        {% set bar_cls = 'vlm-real' if sc2 >= 70 else ('vlm-warn' if sc2 >= 40 else 'vlm-fake') %}
+      <tr>
+        <td>{{ comp_label }}</td>
+        <td><b>{{ sc2 }}</b>/100</td>
+        <td>
+          <span class="vlm-score-bar-wrap">
+            <span class="vlm-score-bar {{ bar_cls }}" style="width:{{ sc2 }}%;display:block;"></span>
+          </span>
+        </td>
+        <td class="muted">{{ comp.notes if comp else '' }}</td>
+      </tr>
+      {% endfor %}
+    </table>
+    {% endif %}
+  {% endif %}{# end image #}
+  {# ══════════ VIDEO ══════════ #}
+  {% if media_type == 'video' %}
+  <h2>Frame-Level Analysis</h2>
+  <table class="data">
+    <tr><th>Metric</th><th>Value</th></tr>
+    <tr><td>Frames sampled</td><td>{{ explainability.num_frames_sampled }}</td></tr>
+    <tr><td>Frames with face</td><td>{{ explainability.num_face_frames }}</td></tr>
+    <tr><td>Suspicious frames</td><td>{{ explainability.num_suspicious_frames }}</td></tr>
+    <tr><td>Mean suspicious prob</td><td>{{ '%.1f' | format(explainability.mean_suspicious_prob * 100) }}%</td></tr>
+    <tr><td>Max suspicious prob</td><td>{{ '%.1f' | format(explainability.max_suspicious_prob * 100) }}%</td></tr>
+    <tr><td>Insufficient faces</td><td>{{ explainability.insufficient_faces }}</td></tr>
+  </table>
+  {% endif %}
+  {# ══════════ TEXT ══════════ #}
+  {% if media_type == 'text' %}
+    {# Language + truth-override #}
+    {% if explainability.detected_language and explainability.detected_language != 'en' %}
+    <h2>Language</h2>
+    <div class="muted">Detected: <b>{{ explainability.detected_language | upper }}</b> — analysed via multilingual model</div>
+    {% endif %}
+    {% if explainability.truth_override and explainability.truth_override.applied %}
+    <div class="truth-box">
+      <b>Truth-override applied.</b>
+      Corroborated by {{ explainability.truth_override.source_name }}
+      ({{ '%.0f' | format(explainability.truth_override.similarity * 100) }}% similarity).
+      Fake probability reduced from {{ '%.1f' | format(explainability.truth_override.fake_prob_before * 100) }}%
+      to {{ '%.1f' | format(explainability.truth_override.fake_prob_after * 100) }}%.
+    </div>
+    {% endif %}
+    <h2>Text Classification</h2>
+    <table class="data">
+      <tr><th>Metric</th><th>Value</th></tr>
+      <tr><td>Fake probability</td><td>{{ '%.1f' | format(explainability.fake_probability * 100) }}%</td></tr>
+      <tr><td>Top label</td><td>{{ explainability.top_label }}</td></tr>
+      <tr><td>Sensationalism score</td><td>{{ explainability.sensationalism.score }}/100 ({{ explainability.sensationalism.level }})</td></tr>
+      <tr><td>Exclamations</td><td>{{ explainability.sensationalism.exclamation_count }}</td></tr>
+      <tr><td>ALL CAPS words</td><td>{{ explainability.sensationalism.caps_word_count }}</td></tr>
+      <tr><td>Clickbait matches</td><td>{{ explainability.sensationalism.clickbait_matches }}</td></tr>
+      <tr><td>Emotional words</td><td>{{ explainability.sensationalism.emotional_word_count }}</td></tr>
+    </table>
+    {% if explainability.manipulation_indicators %}
+    <h3>Manipulation Indicators ({{ explainability.manipulation_indicators | length }})</h3>
+    <table class="data">
+      <tr><th>Pattern</th><th>Severity</th><th>Matched text</th></tr>
+      {% for m in explainability.manipulation_indicators %}
+      <tr>
+        <td>{{ m.pattern_type }}</td>
+        <td><span class="badge sev-{{ m.severity }}">{{ m.severity }}</span></td>
+        <td>{{ m.matched_text }}</td>
+      </tr>
+      {% endfor %}
+    </table>
+    {% endif %}
+    {% if explainability.keywords %}
+    <h3>Extracted Keywords</h3>
+    <div>{% for kw in explainability.keywords %}<span class="keyword">{{ kw }}</span>{% endfor %}</div>
+    {% endif %}
+  {% endif %}{# end text #}
+  {# ══════════ SCREENSHOT ══════════ #}
+  {% if media_type == 'screenshot' %}
+    {% if explainability.detected_language and explainability.detected_language != 'en' %}
+    <div class="muted" style="margin-bottom:4pt;">Detected language: <b>{{ explainability.detected_language | upper }}</b></div>
+    {% endif %}
+    {% if explainability.truth_override and explainability.truth_override.applied %}
+    <div class="truth-box">
+      <b>Truth-override applied.</b> {{ explainability.truth_override.source_name }}
+      ({{ '%.0f' | format(explainability.truth_override.similarity * 100) }}% similarity)
+    </div>
+    {% endif %}
+    <h2>Extracted Text</h2>
+    <div class="muted">{{ explainability.ocr_boxes | length }} OCR regions detected</div>
+    <table class="data">
+      <tr><td style="white-space:pre-wrap; font-size:8.5pt; padding:6pt;">{{ explainability.extracted_text }}</td></tr>
+    </table>
+    <h3>Analysis Summary</h3>
+    <table class="data">
+      <tr><th>Metric</th><th>Value</th></tr>
+      <tr><td>Fake probability</td><td>{{ '%.1f' | format(explainability.fake_probability * 100) }}%</td></tr>
+      <tr><td>Sensationalism</td><td>{{ explainability.sensationalism.score }}/100 ({{ explainability.sensationalism.level }})</td></tr>
+      <tr><td>Suspicious phrases</td><td>{{ explainability.suspicious_phrases | length }}</td></tr>
+      <tr><td>Layout anomalies</td><td>{{ explainability.layout_anomalies | length }}</td></tr>
+    </table>
+    {% if explainability.suspicious_phrases %}
+    <h3>Suspicious Phrases</h3>
+    <table class="data">
+      <tr><th>Text</th><th>Pattern</th><th>Severity</th></tr>
+      {% for p in explainability.suspicious_phrases %}
+      <tr>
+        <td>{{ p.text }}</td>
+        <td>{{ p.pattern_type }}</td>
+        <td><span class="badge sev-{{ p.severity }}">{{ p.severity }}</span></td>
+      </tr>
+      {% endfor %}
+    </table>
+    {% endif %}
+  {% endif %}{# end screenshot #}
+  {# ══════════ SOURCES (all types) ══════════ #}
+  {% if trusted_sources %}
+  <h2>Trusted Source Cross-Reference ({{ trusted_sources | length }})</h2>
+  <table class="data">
+    <tr><th>Source</th><th>Title</th><th>Relevance</th></tr>
+    {% for s in trusted_sources %}
+    <tr>
+      <td>{{ s.source_name }}</td>
+      <td>{{ s.title }}</td>
+      <td>{{ '%.0f' | format(s.relevance_score * 100) }}%</td>
+    </tr>
+    {% endfor %}
+  </table>
+  {% endif %}
+  {% if contradicting_evidence %}
+  <h2 style="color:#B91C1C;">Contradicting Evidence ({{ contradicting_evidence | length }})</h2>
+  <table class="data">
+    <tr><th>Source</th><th>Title</th><th>Type</th></tr>
+    {% for c in contradicting_evidence %}
+    <tr><td>{{ c.source_name }}</td><td>{{ c.title }}</td><td>{{ c.type }}</td></tr>
+    {% endfor %}
+  </table>
+  {% endif %}
+  {# ══════════ PROCESSING ══════════ #}
+  <h2>Processing Summary</h2>
+  <div class="muted">Model: {{ processing_summary.model_used }} &nbsp;·&nbsp; Duration: {{ processing_summary.total_duration_ms }} ms</div>
+  <div style="font-size:8.5pt; margin-top:3pt;">{{ processing_summary.stages_completed | join(' → ') }}</div>
+  {# ══════════ FOOTER ══════════ #}
+  <div class="footer">
+    <b>DeepShield Responsible-AI Notice.</b> {{ responsible_ai_notice }}<br />
+    Generated {{ generated_at }}. Report expires in 1 hour.
+    AI-assisted analysis — cross-check with trusted sources before sharing.
+  </div>
+</body>
+</html>

report_service.py ADDED Viewed

	@@ -0,0 +1,152 @@

+from __future__ import annotations
+import base64
+import json
+import os
+import time
+import uuid
+from datetime import datetime, timedelta, timezone
+from io import BytesIO
+from pathlib import Path
+from typing import Any, Optional
+from jinja2 import Environment, FileSystemLoader, select_autoescape
+from loguru import logger
+from xhtml2pdf import pisa  # type: ignore
+from config import settings
+from db.models import AnalysisRecord, Report
# Report templates live in a "templates" directory next to this module's
# parent directory (i.e. <module dir>/../templates).
TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"
# Shared Jinja environment used by render_html(); autoescaping is enabled for
# templates with .html / .xml extensions.
_env = Environment(
    loader=FileSystemLoader(str(TEMPLATES_DIR)),
    autoescape=select_autoescape(["html", "xml"]),
)
+def _score_class(score: int) -> str:
+    if score >= 70:
+        return "real"
+    if score >= 40:
+        return "warn"
+    return "fake"
def _ensure_dir() -> Path:
    """Return ``settings.REPORT_DIR`` as a Path, creating it (and parents) if missing."""
    report_dir = Path(settings.REPORT_DIR)
    # exist_ok makes repeated calls harmless once the directory exists.
    report_dir.mkdir(exist_ok=True, parents=True)
    return report_dir
def _make_donut_chart(score: int, score_cls: str) -> str:
    """Render the authenticity score as a donut-chart PNG.

    Args:
        score: Authenticity score; drawn as the filled wedge out of 100 and
            printed in the centre of the ring.
        score_cls: One of ``"real"``, ``"warn"`` or ``"fake"``; selects the
            wedge/text colour. Unknown values fall back to grey.

    Returns:
        The PNG encoded as a base64 string, or ``""`` if rendering fails for
        any reason (including matplotlib not being importable).
    """
    fig = None
    plt = None
    try:
        import matplotlib  # type: ignore
        # Select the non-interactive backend before pyplot is imported.
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt  # type: ignore

        color_map = {"real": "#43A047", "warn": "#FB8C00", "fake": "#E53935"}
        color = color_map.get(score_cls, "#6B7280")
        fig, ax = plt.subplots(figsize=(2.2, 2.2), dpi=96)
        sizes = [score, 100 - score]
        wedge_colors = [color, "#F3F4F6"]
        ax.pie(sizes, colors=wedge_colors, startangle=90,
               wedgeprops=dict(width=0.42, edgecolor="white", linewidth=1))
        ax.text(0, 0, str(score), ha="center", va="center",
                fontsize=20, fontweight="bold", color=color)
        ax.set_aspect("equal")
        plt.tight_layout(pad=0.05)
        buf = BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight", transparent=True)
        return base64.b64encode(buf.getvalue()).decode()
    except Exception as e:
        # The chart is optional decoration; the report is still generated without it.
        logger.debug(f"Donut chart skipped: {e}")
        return ""
    finally:
        # Always release the figure, even when drawing/saving raised, so pyplot
        # does not accumulate open figures across requests.
        if fig is not None and plt is not None:
            plt.close(fig)
+def _extract_llm_summary(analysis_json: dict) -> dict | None:
+    """Extract llm_summary from either top-level or inside explainability (images)."""
+    top = analysis_json.get("llm_summary")
+    if top:
+        return top
+    return (analysis_json.get("explainability") or {}).get("llm_summary")
def render_html(analysis_json: dict) -> str:
    """Render the ``report.html`` template for one analysis result.

    Args:
        analysis_json: Decoded analysis payload. Missing or ``null`` sections
            are replaced by empty containers so the template can still render.

    Returns:
        The rendered HTML document as a string.
    """
    # `or {}` (rather than a .get default) also covers keys that are present
    # but explicitly null, matching how "explainability" is handled.
    verdict = analysis_json.get("verdict") or {}
    score = verdict.get("authenticity_score")
    if score is None:
        # Neutral midpoint when the payload carries no score.
        score = 50
    sc = _score_class(score)
    donut_b64 = _make_donut_chart(score, sc)
    llm_summary = _extract_llm_summary(analysis_json)
    expl: dict[str, Any] = analysis_json.get("explainability") or {}
    tmpl = _env.get_template("report.html")
    return tmpl.render(
        analysis_id=analysis_json.get("analysis_id", ""),
        media_type=analysis_json.get("media_type", "unknown"),
        verdict=verdict,
        explainability=expl,
        trusted_sources=analysis_json.get("trusted_sources") or [],
        contradicting_evidence=analysis_json.get("contradicting_evidence") or [],
        processing_summary=analysis_json.get("processing_summary") or {},
        responsible_ai_notice=analysis_json.get(
            "responsible_ai_notice",
            "AI-based analysis may not be 100% accurate.",
        ),
        score_class=sc,
        generated_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC"),
        donut_b64=donut_b64,
        llm_summary=llm_summary,
    )
def html_to_pdf(html: str, out_path: Path) -> None:
    """Convert an HTML string to a PDF file using xhtml2pdf.

    Args:
        html: Complete HTML document to convert.
        out_path: Destination path of the PDF; overwritten if it exists.

    Raises:
        RuntimeError: If xhtml2pdf reports conversion errors.
    """
    try:
        with open(out_path, "wb") as f:
            result = pisa.CreatePDF(html, dest=f)
        if result.err:
            raise RuntimeError(f"xhtml2pdf failed with {result.err} errors")
    except Exception:
        # Do not leave a truncated/corrupt PDF behind when conversion fails.
        try:
            Path(out_path).unlink()
        except OSError:
            pass  # nothing was written, or the file is already gone
        raise
def generate_report(record: AnalysisRecord) -> Path:
    """Render the PDF report for an analysis record and return its path.

    The file is written into the report directory under a name built from the
    record id plus a random 8-hex-character suffix.
    """
    report_dir = _ensure_dir()
    unique_suffix = uuid.uuid4().hex[:8]
    out_path = report_dir / f"deepshield_{record.id}_{unique_suffix}.pdf"
    # result_json holds the serialized analysis payload consumed by the template.
    html_to_pdf(render_html(json.loads(record.result_json)), out_path)
    logger.info(f"Report generated id={record.id} path={out_path} size={out_path.stat().st_size}B")
    return out_path
def create_report_row(analysis_id: int, path: Path) -> Report:
    """Build (but do not persist) a ``Report`` row for a generated PDF.

    ``expires_at`` is a naive UTC timestamp ``settings.REPORT_TTL_SECONDS``
    seconds from now.
    """
    # datetime.utcnow() is deprecated; this yields the same naive-UTC value.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    return Report(
        analysis_id=analysis_id,
        file_path=str(path),
        expires_at=now_utc + timedelta(seconds=settings.REPORT_TTL_SECONDS),
    )
def cleanup_expired(now: Optional[datetime] = None) -> int:
    """Delete expired report PDFs from ``settings.REPORT_DIR``.

    A file is expired when its modification time is more than
    ``settings.REPORT_TTL_SECONDS`` older than ``now``.

    Args:
        now: Reference time. Naive values are interpreted as UTC; aware values
            are converted to UTC. Defaults to the current UTC time.

    Returns:
        Number of files deleted. Files that cannot be inspected or removed are
        logged and skipped.
    """
    if now is None:
        now = datetime.now(timezone.utc)
    if now.tzinfo is not None:
        # Normalize to naive UTC so it can be subtracted from the naive mtime
        # below; mixing aware and naive datetimes would raise TypeError.
        now = now.astimezone(timezone.utc).replace(tzinfo=None)
    d = Path(settings.REPORT_DIR)
    if not d.exists():
        return 0
    deleted = 0
    ttl = timedelta(seconds=settings.REPORT_TTL_SECONDS)
    for f in d.glob("*.pdf"):
        try:
            # Non-deprecated equivalent of datetime.utcfromtimestamp().
            mtime = datetime.fromtimestamp(
                f.stat().st_mtime, tz=timezone.utc
            ).replace(tzinfo=None)
            if now - mtime > ttl:
                f.unlink()
                deleted += 1
        except OSError as e:
            logger.warning(f"Cleanup failed for {f}: {e}")
    if deleted:
        logger.info(f"Cleaned up {deleted} expired reports")
    return deleted

requirements.txt ADDED Viewed

	@@ -0,0 +1,50 @@

+fastapi==0.115.0
+uvicorn[standard]==0.32.0
+pydantic==2.9.2
+pydantic-settings==2.6.0
+python-multipart==0.0.12
+python-dotenv==1.0.1
+loguru==0.7.2
+SQLAlchemy==2.0.35
+psycopg2-binary==2.9.9
+alembic==1.13.3
+python-jose[cryptography]==3.3.0
+bcrypt==4.2.0
+# === Phase 1: Image Detection ===
+# Install torch separately with CPU index first (see README): pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
+torch==2.4.1
+torchvision==0.19.1
+transformers==4.44.2
+Pillow>=10.4.0
+numpy>=1.26,<3
+opencv-python==4.10.0.84
+grad-cam==1.5.4
+mediapipe==0.10.14
+# === Phase 12: Explainability v2 ===
+exifread==3.0.0
+google-generativeai>=0.3.0    # Gemini provider for LLM explainability
+openai>=1.0.0                 # OpenAI provider (alternative to Gemini)
+# === Phase 14: PDF v2 donut chart ===
+matplotlib>=3.9.0
+# === Phase 13: Text Pipeline Hardening ===
+# After installing, run: python -m spacy download en_core_web_sm
+spacy>=3.7.0,<4.0.0
+sentence-transformers>=2.7.0  # for truth-override cosine similarity (all-MiniLM-L6-v2)
+langdetect==1.0.9             # lightweight language detection
+# === Phase 3: Text / News ===
+httpx==0.27.2
+# === Phase 4: Screenshot / OCR ===
+easyocr==1.7.2
+# === Phase 7: PDF Reports ===
+Jinja2==3.1.4
+xhtml2pdf==0.2.16
+# === Phase 8: Auth ===
+email-validator==2.2.0

router.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from fastapi import APIRouter
+from api.v1 import analyze, auth, health, history, report
# Aggregate every v1 endpoint module under the shared /api/v1 prefix.
api_router = APIRouter(prefix="/api/v1")
# Registration order is kept: health, analyze, report, auth, history.
for _endpoint_module in (health, analyze, report, auth, history):
    api_router.include_router(_endpoint_module.router)
del _endpoint_module

scoring.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from __future__ import annotations
+from typing import Tuple
# Verdict bands as (low, high, label, severity) tuples. Bounds are inclusive
# integers and together cover 0-100; get_verdict_label() returns the label and
# severity of the first band containing the score.
TRUST_SCALE = [
    (0, 20, "Very Likely Fake", "critical"),
    (21, 40, "Likely Fake", "danger"),
    (41, 60, "Possibly Manipulated", "warning"),
    (61, 80, "Likely Real", "positive"),
    (81, 100, "Very Likely Real", "safe"),
]
def compute_authenticity_score(model_confidence: float, label: str) -> int:
    """Map (confidence, label) to a 0-100 authenticity score.

    Fake-ish labels yield ``(1 - confidence) * 100``; every other label yields
    ``confidence * 100``. The result is clamped to 0-100 and rounded.

    Args:
        model_confidence: Classifier confidence for ``label``.
        label: Predicted class label; matched case-insensitively.

    Returns:
        Integer authenticity score in the range 0-100.
    """
    import re

    label_l = label.lower()
    # Distinctive tokens are safe to match as substrings ("deepfake_v2", ...).
    substring_tokens = ("fake", "deepfake", "manipulated", "generated", "synthetic")
    # "ai" must be a standalone word: as a substring it also matches unrelated
    # labels such as "portrait" or "plain" and would flip their score.
    label_words = re.split(r"[^a-z0-9]+", label_l)
    is_fake = any(tok in label_l for tok in substring_tokens) or "ai" in label_words
    if is_fake:
        score = (1.0 - float(model_confidence)) * 100.0
    else:
        score = float(model_confidence) * 100.0
    return int(round(max(0.0, min(100.0, score))))
def get_verdict_label(score: int) -> Tuple[str, str]:
    """Return the ``(label, severity)`` of the TRUST_SCALE band containing ``score``.

    Falls back to ``("Unknown", "warning")`` when no band matches.
    """
    matching_bands = (
        (band_label, band_severity)
        for low, high, band_label, band_severity in TRUST_SCALE
        if low <= score <= high
    )
    return next(matching_bands, ("Unknown", "warning"))
def get_score_color(score: int) -> str:
    """Return a hex colour interpolated Red (#E53935) → Amber (#FFA726) → Green (#43A047).

    The score is clamped to 0-100; 0-50 blends red to amber and 50-100 blends
    amber to green, channel by channel.
    """
    red = (0xE5, 0x39, 0x35)
    amber = (0xFF, 0xA7, 0x26)
    green = (0x43, 0xA0, 0x47)

    clamped = max(0, min(100, score))
    if clamped <= 50:
        start, end, t = red, amber, clamped / 50.0
    else:
        start, end, t = amber, green, (clamped - 50) / 50.0

    channels = [int(round(lo + (hi - lo) * t)) for lo, hi in zip(start, end)]
    r, g, b = channels
    return f"#{r:02X}{g:02X}{b:02X}"

screenshot_service.py ADDED Viewed

	@@ -0,0 +1,126 @@

+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import List, Tuple
+import numpy as np
+from loguru import logger
+from PIL import Image
+from models.model_loader import get_model_loader
@dataclass
class OCRBox:
    """One text region returned by the OCR engine (built in run_ocr)."""
    text: str  # recognized text, coerced to str
    bbox: List[List[int]]  # 4 points [[x,y],...]
    confidence: float  # engine confidence for this region, coerced to float
@dataclass
class SuspiciousPhrase:
    """A manipulation indicator located in a screenshot (see map_phrases_to_boxes)."""
    text: str  # the indicator's matched text
    bbox: List[List[int]]  # bbox of the first OCR box whose text contains it
    pattern_type: str  # copied from the manipulation indicator
    severity: str  # copied from the manipulation indicator
    description: str  # copied from the manipulation indicator
@dataclass
class LayoutAnomaly:
    """A heuristic layout finding produced by detect_layout_anomalies."""
    type: str  # misalignment / font_mismatch / uneven_spacing
    severity: str  # "low", "medium" or "high" as emitted by the detector
    description: str  # human-readable explanation including the measured value
    confidence: float  # heuristic confidence, at most 1.0
def run_ocr(pil_img: Image.Image) -> List[OCRBox]:
    """Run the shared OCR engine on an image and return its text regions.

    The image is converted to RGB before recognition. Each detection becomes an
    ``OCRBox`` with integer corner coordinates and a float confidence.
    """
    ocr_engine = get_model_loader().load_ocr_engine()
    rgb_pixels = np.array(pil_img.convert("RGB"))
    detections = ocr_engine.readtext(rgb_pixels, detail=1, paragraph=False)
    regions: List[OCRBox] = [
        OCRBox(
            text=str(raw_text),
            bbox=[[int(corner[0]), int(corner[1])] for corner in corners],
            confidence=float(score),
        )
        for corners, raw_text, score in detections
    ]
    logger.info(f"OCR extracted {len(regions)} text regions")
    return regions
def extract_full_text(boxes: List[OCRBox]) -> str:
    """Join the text of all non-blank OCR boxes with single spaces, in order."""
    fragments = []
    for box in boxes:
        # Whitespace-only regions are skipped; kept text is not trimmed.
        if box.text.strip():
            fragments.append(box.text)
    return " ".join(fragments)
def map_phrases_to_boxes(boxes: List[OCRBox], manipulation_indicators) -> List[SuspiciousPhrase]:
    """Attach each manipulation indicator to the first OCR box containing its text.

    Matching is a case-insensitive substring test. Indicators whose text does
    not appear in any box are dropped.
    """
    located: List[SuspiciousPhrase] = []
    for indicator in manipulation_indicators:
        needle = indicator.matched_text.lower()
        first_hit = next((box for box in boxes if needle in box.text.lower()), None)
        if first_hit is None:
            continue
        located.append(SuspiciousPhrase(
            text=indicator.matched_text,
            bbox=first_hit.bbox,
            pattern_type=indicator.pattern_type,
            severity=indicator.severity,
            description=indicator.description,
        ))
    return located
def detect_layout_anomalies(boxes: List[OCRBox]) -> List[LayoutAnomaly]:
    """Heuristic layout checks on OCR bounding boxes.

    Three independent checks are run (fewer than 3 boxes yields no findings):

    * ``font_mismatch`` - coefficient of variation of box heights above 0.7.
    * ``misalignment`` - with more than 4 boxes, fewer than 40% of left edges
      lie within 20 px of the median left edge.
    * ``uneven_spacing`` - with at least 4 boxes, coefficient of variation of
      the positive gaps between sorted top edges above 1.5.

    Args:
        boxes: OCR regions; each ``bbox`` is a list of [x, y] points.

    Returns:
        Detected anomalies, possibly empty.
    """
    out: List[LayoutAnomaly] = []
    if len(boxes) < 3:
        return out
    heights = []
    x_lefts = []
    for b in boxes:
        pts = b.bbox
        ys = [p[1] for p in pts]
        xs = [p[0] for p in pts]
        heights.append(max(ys) - min(ys))
        x_lefts.append(min(xs))
    h_arr = np.array(heights, dtype=float)
    if h_arr.mean() > 0:
        cv_h = float(h_arr.std() / h_arr.mean())
        if cv_h > 0.7:
            out.append(LayoutAnomaly(
                type="font_mismatch",
                severity="medium" if cv_h < 1.2 else "high",
                description=f"High variance in text heights (cv={cv_h:.2f}) — mixed fonts/sizes possible",
                confidence=min(cv_h / 1.5, 1.0),
            ))
    x_arr = np.array(x_lefts, dtype=float)
    if x_arr.std() > 0 and len(x_arr) > 4:
        # Compute the median once instead of once per element.
        median_left = np.median(x_arr)
        clustered = int(np.count_nonzero(np.abs(x_arr - median_left) < 20))
        align_ratio = clustered / len(x_arr)
        if align_ratio < 0.4:
            out.append(LayoutAnomaly(
                type="misalignment",
                severity="low",
                description=f"Only {align_ratio*100:.0f}% of text blocks share left-alignment — unusual layout",
                confidence=1.0 - align_ratio,
            ))
    if len(boxes) >= 4:
        tops = sorted([min(p[1] for p in b.bbox) for b in boxes])
        gaps = np.diff(tops)
        # Boxes sharing a top edge (same line) produce zero gaps; ignore them.
        gaps = gaps[gaps > 0]
        if len(gaps) >= 3 and gaps.mean() > 0:
            cv_g = float(gaps.std() / gaps.mean())
            if cv_g > 1.5:
                out.append(LayoutAnomaly(
                    type="uneven_spacing",
                    severity="low",
                    description=f"Irregular vertical spacing between text blocks (cv={cv_g:.2f})",
                    confidence=min(cv_g / 2.5, 1.0),
                ))
    return out

test_image_classify.py ADDED Viewed

	@@ -0,0 +1,58 @@

+"""Phase 1.2 smoke test: download a sample image and run the ViT classifier.
+Run from backend/:
+    .venv/Scripts/python.exe scripts/test_image_classify.py
+"""
+from __future__ import annotations
+import sys
+import urllib.request
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+import base64
+from models.heatmap_generator import generate_heatmap_base64
+from services.artifact_detector import scan_artifacts
+from services.image_service import preprocess_and_classify
+from utils.scoring import compute_authenticity_score, get_verdict_label
+SAMPLE_URL = "https://picsum.photos/seed/deepshield/512/512"
def main() -> int:
    """Fetch the sample image, classify it, and print every pipeline stage.

    Also scans artifact indicators and writes the Grad-CAM heatmap to
    ``heatmap_smoketest.png`` in the parent of this script's directory.
    Returns 0 on completion.
    """
    print(f"Fetching sample image: {SAMPLE_URL}")
    request = urllib.request.Request(SAMPLE_URL, headers={"User-Agent": "DeepShield/0.1"})
    with urllib.request.urlopen(request, timeout=30) as response:
        image_bytes = response.read()
    print(f"  got {len(image_bytes)} bytes")

    print("Running classifier (first run will download model ~350MB)…")
    pil_image, classification = preprocess_and_classify(image_bytes)
    print(f"  image size: {pil_image.size}")
    print(f"  label: {classification.label}")
    print(f"  confidence: {classification.confidence:.4f}")
    print(f"  all scores: {classification.all_scores}")

    authenticity = compute_authenticity_score(classification.confidence, classification.label)
    verdict_label, severity = get_verdict_label(authenticity)
    print(f"\n  authenticity_score: {authenticity}")
    print(f"  verdict: {verdict_label} ({severity})")

    print("\nScanning artifact indicators\u2026")
    for indicator in scan_artifacts(pil_image, image_bytes):
        print(f"  [{indicator.severity.upper():6s}] {indicator.type}: {indicator.description} (conf {indicator.confidence:.2f})")

    print("\nGenerating Grad-CAM heatmap\u2026")
    heatmap_url = generate_heatmap_base64(pil_image)
    # The heatmap is a data URL: "<header>,<base64 payload>".
    _header, payload_b64 = heatmap_url.split(",", 1)
    png_path = Path(__file__).resolve().parent.parent / "heatmap_smoketest.png"
    png_path.write_bytes(base64.b64decode(payload_b64))
    print(f"  saved: {png_path}")
    print(f"  data URL length: {len(heatmap_url)} chars")
    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

test_news_api.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""Test script for the NewsData API integration."""
+import asyncio
+import sys
+import os
+# Add backend directory to sys.path so we can import modules
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from config import settings
+from services.news_lookup import search_news_full
async def test_news():
    """Manually exercise the news lookup against the live API and print results.

    Requires ``settings.NEWS_API_KEY``; prints an error and returns early when
    it is not configured. Lookup failures are caught and printed.
    """
    api_key = settings.NEWS_API_KEY
    # Validate before slicing: an unset (None) key would otherwise raise, and
    # an empty key would print a misleading "masked" line.
    if not api_key:
        print("ERROR: NEWS_API_KEY is empty in .env")
        return
    print(f"Testing News API Integration with key: {api_key[:6]}... (masked)")
    keywords = ["modi", "election", "bjp", "congress"]
    print(f"Searching for keywords: {keywords}")
    try:
        result = await search_news_full(keywords, limit=5)
        print("\n=== RAW RESULT ===")
        print(f"Total articles found: {result.total_articles}")
        print("\n=== TRUSTED SOURCES ===")
        for i, source in enumerate(result.trusted_sources, 1):
            date_str = str(source.published_at)[:10] if source.published_at else "Unknown date"
            print(f"{i}. [{source.relevance_score}] {source.source_name}: {source.title[:60]}... ({date_str})")
        print("\n=== CONTRADICTING EVIDENCE / FACT CHECKS ===")
        if not result.contradicting_evidence:
            print("No fact-check articles found for these keywords.")
        for i, ev in enumerate(result.contradicting_evidence, 1):
            print(f"{i}. {ev.source_name}: {ev.title[:60]}...")
    except Exception as e:
        # Manual smoke script: report the failure instead of a traceback.
        print(f"\nERROR running test: {e}")
+if __name__ == "__main__":
+    asyncio.run(test_news())

test_phase5.py ADDED Viewed

	@@ -0,0 +1,70 @@

+"""Phase 5 smoke: unit-test news_lookup classification + endpoint wiring."""
+from __future__ import annotations
+import asyncio
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from services.news_lookup import (
+    _domain_of, _is_factcheck, _relevance, search_news_full,
+)
def test_domain():
    """Check that _domain_of strips the scheme, a leading ``www.`` and the path."""
    expected_domains = {
        "https://www.reuters.com/article/x": "reuters.com",
        "https://snopes.com/fact-check/abc": "snopes.com",
    }
    for url, domain in expected_domains.items():
        assert _domain_of(url) == domain
    print("[OK] _domain_of")
def test_factcheck_detection():
    """Check _is_factcheck on fact-check domains, a fact-check title and a plain news URL."""
    should_match = [
        ("https://snopes.com/x", "Claim about moon"),
        ("https://factly.in/x", ""),
        ("https://example.com/x", "FACT CHECK: viral video debunked"),
    ]
    for url, title in should_match:
        assert _is_factcheck(url, title)
    assert not _is_factcheck("https://bbc.com/news/world-123", "Election results")
    print("[OK] _is_factcheck")
def test_relevance():
    """Check the relevance weight _relevance assigns to three sample domains."""
    expected_weights = [
        ("https://reuters.com/x", 1.0),
        ("https://ndtv.com/x", 0.85),
        ("https://random-blog.xyz/x", 0.5),
    ]
    for url, weight in expected_weights:
        assert _relevance(url) == weight
    print("[OK] _relevance weights")
async def test_empty_key_returns_empty():
    """Check that search_news_full yields an empty result set.

    NOTE(review): presumably relies on NEWS_API_KEY being unset in the
    environment running this script — confirm.
    """
    lookup = await search_news_full(["modi", "election"])
    assert lookup.trusted_sources == []
    assert lookup.contradicting_evidence == []
    assert lookup.total_articles == 0
    print(f"[OK] empty-key path -> {lookup}")
async def test_endpoint_wiring():
    """POST a sensational text to a locally running server and check the response shape.

    Expects the API at http://127.0.0.1:8000; raises on a non-2xx response.
    """
    import httpx

    payload = {"text": "BREAKING!!! You won't BELIEVE this SHOCKING miracle cure doctors don't want you to know!!! Click now!"}
    async with httpx.AsyncClient(timeout=180.0) as client:
        response = await client.post("http://127.0.0.1:8000/api/v1/analyze/text", json=payload)
    response.raise_for_status()
    body = response.json()
    assert body["media_type"] == "text"
    for required_key in ("trusted_sources", "contradicting_evidence"):
        assert required_key in body
    assert "news_lookup" in body["processing_summary"]["stages_completed"]
    print(f"[OK] /analyze/text -> verdict={body['verdict']['label']} "
          f"score={body['verdict']['authenticity_score']} "
          f"trusted={len(body['trusted_sources'])} contradictions={len(body['contradicting_evidence'])}")
async def main():
    """Run the synchronous unit checks, then the async checks, in order."""
    for sync_check in (test_domain, test_factcheck_detection, test_relevance):
        sync_check()
    # The endpoint check runs last and needs a live local server.
    await test_empty_key_returns_empty()
    await test_endpoint_wiring()
    print("\n=== Phase 5 smoke PASS ===")
+if __name__ == "__main__":
+    asyncio.run(main())

test_text_analysis.py ADDED Viewed

	@@ -0,0 +1,34 @@

+"""Quick smoke test for sensationalism + manipulation detection."""
+import sys
+sys.path.insert(0, ".")
+from services.text_service import score_sensationalism, detect_manipulation_indicators
# --- Sensationalism: a clickbait-heavy headline must score above 50 ---
sensational_text = "BREAKING: You wont believe this SHOCKING truth! Experts confirm the most DEVASTATING scandal exposed!!!"
sens = score_sensationalism(sensational_text)
print(f"Sensationalism: score={sens.score} level={sens.level}")
print(f"  excl={sens.exclamation_count} caps={sens.caps_word_count} clickbait={sens.clickbait_matches} emotional={sens.emotional_word_count} superlative={sens.superlative_count}")
assert sens.score > 50, f"Expected high sensationalism, got {sens.score}"
assert sens.level in ("Medium", "High"), f"Expected Medium/High, got {sens.level}"
print("  PASS")
# --- Manipulation: vague attributions must yield at least 3 indicators ---
manipulative_text = "Sources say that experts confirm the shocking truth. Allegedly, everyone knows this is a proven fact."
indicators = detect_manipulation_indicators(manipulative_text)
print(f"\nManipulation indicators: {len(indicators)} found")
for indicator in indicators:
    print(f"  [{indicator.severity}] {indicator.pattern_type}: \"{indicator.matched_text}\"")
assert len(indicators) >= 3, f"Expected >=3 indicators, got {len(indicators)}"
print("  PASS")
# --- Clean text: a neutral sentence must trigger neither detector ---
neutral_text = "The weather today is sunny with clear skies in New Delhi."
neutral_sens = score_sensationalism(neutral_text)
neutral_indicators = detect_manipulation_indicators(neutral_text)
print(f"\nClean text: sensationalism={neutral_sens.score} ({neutral_sens.level}), manipulation={len(neutral_indicators)}")
assert neutral_sens.score < 20, f"Expected low sensationalism for clean text, got {neutral_sens.score}"
assert len(neutral_indicators) == 0, f"Expected 0 manipulation indicators for clean text, got {len(neutral_indicators)}"
print("  PASS")
print("\nAll tests passed!")

text_service.py ADDED Viewed

	@@ -0,0 +1,285 @@

+from __future__ import annotations
+import re
+from dataclasses import dataclass, field
+from typing import List, Optional
+from loguru import logger
+from models.model_loader import get_model_loader
# Substrings that mark a classifier label as belonging to the fake class;
# _scores_to_classification matches them against the lower-cased label.
FAKE_TOKENS = ("fake", "false", "unreliable", "misinformation")
# --- Sensationalism patterns ---
# (regex, category) pairs; score_sensationalism counts how many of them match
# the text, case-insensitively.
CLICKBAIT_PATTERNS = [
    (r"\byou won'?t believe\b", "clickbait"),
    (r"\bbreaking\s*:", "clickbait"),
    (r"\bshocking\s*:", "clickbait"),
    (r"\bexclusive\s*:", "clickbait"),
    (r"\bjust\s+in\s*:", "clickbait"),
    (r"\burgent\s*:", "clickbait"),
    (r"\bwhat\s+happens\s+next\b", "clickbait"),
    (r"\bthis\s+will\s+change\b", "clickbait"),
    (r"\b(?:everyone|nobody)\s+(?:is|was)\s+talking\b", "clickbait"),
]
# Lower-case words counted as emotional; compared against each whitespace-split
# word after lower-casing and stripping surrounding ".,!?;:".
EMOTIONAL_WORDS = {
    "outrage", "shocking", "horrifying", "disgusting", "amazing", "incredible",
    "unbelievable", "devastating", "terrifying", "explosive", "bombshell",
    "jaw-dropping", "heartbreaking", "furious", "scandal", "crisis",
    "chaos", "destroyed", "slammed", "blasted", "exposed", "revealed",
}
# Lower-case superlatives, matched the same way as EMOTIONAL_WORDS.
SUPERLATIVES = {
    "best", "worst", "greatest", "biggest", "most", "least",
    "fastest", "deadliest", "largest", "smallest", "ultimate",
}
# --- Manipulation indicator patterns ---
# Each entry is (regex, pattern_type, severity, description). pattern_type is
# one of unverified_claim / emotional_manipulation / false_authority and
# severity is one of low / medium / high.
MANIPULATION_PATTERNS = [
    # Unverified claims
    (r"\bsources?\s+(?:say|said|claim|report)\b", "unverified_claim", "medium",
     "Unverified source attribution without specific citation"),
    (r"\ballegedly\b", "unverified_claim", "low",
     "Hedging language suggests unverified information"),
    (r"\breports?\s+suggest\b", "unverified_claim", "medium",
     "Vague report attribution"),
    (r"\baccording\s+to\s+(?:some|many|several)\b", "unverified_claim", "medium",
     "Non-specific source attribution"),
    # Fixed typo: the pattern used to read "\brunconfirmed\b", which can never
    # match the word "unconfirmed".
    (r"\bunconfirmed\b", "unverified_claim", "medium",
     "Explicitly unconfirmed information"),
    # Emotional manipulation
    (r"\boutrage\b", "emotional_manipulation", "medium",
     "Emotional trigger word designed to provoke reaction"),
    (r"\bshocking\s+truth\b", "emotional_manipulation", "high",
     "Sensationalist phrase designed to manipulate reader emotion"),
    (r"\bwake\s+up\b", "emotional_manipulation", "medium",
     "Call-to-action implying hidden knowledge"),
    (r"\bthey\s+don'?t\s+want\s+you\s+to\s+know\b", "emotional_manipulation", "high",
     "Conspiracy framing language"),
    (r"\bopen\s+your\s+eyes\b", "emotional_manipulation", "medium",
     "Implies audience ignorance"),
    # False authority
    (r"\bexperts?\s+(?:confirm|say|agree|warn)\b", "false_authority", "medium",
     "Unnamed expert citation without specific attribution"),
    (r"\bscientists?\s+(?:confirm|prove|say)\b", "false_authority", "medium",
     "Unnamed scientist citation"),
    (r"\bstudies?\s+(?:show|prove|confirm)\b", "false_authority", "low",
     "Vague study reference without citation"),
    (r"\beveryone\s+knows\b", "false_authority", "medium",
     "Appeal to common knowledge fallacy"),
    (r"\bit'?s\s+(?:a\s+)?(?:well-?known|proven)\s+fact\b", "false_authority", "medium",
     "Assertion of fact without evidence"),
]
# NER entity labels to prefer for keyword extraction.
# NOTE(review): presumably spaCy label names; the consumer is outside this view.
_NER_PREFERRED = {"PERSON", "ORG", "GPE", "EVENT", "PRODUCT", "NORP"}
@dataclass
class TextClassification:
    """Result of the fake/real text classifier (see _scores_to_classification)."""
    label: str  # highest-scoring label, or "unknown" for empty input
    confidence: float  # score of that label
    fake_prob: float  # score attributed to the fake class (0.0 if none found)
    all_scores: dict[str, float]  # every label -> score returned by the model
@dataclass
class SensationalismResult:
    """Sensationalism score plus the raw counts behind it (see score_sensationalism)."""
    score: int  # 0-100
    level: str  # Low / Medium / High
    exclamation_count: int  # number of "!" characters
    caps_word_count: int  # all-uppercase words longer than 2 characters
    clickbait_matches: int  # CLICKBAIT_PATTERNS entries that matched
    emotional_word_count: int  # words found in EMOTIONAL_WORDS
    superlative_count: int  # words found in SUPERLATIVES
@dataclass
class ManipulationIndicator:
    """One manipulation-pattern hit found in a piece of text."""
    pattern_type: str       # unverified_claim / emotional_manipulation / false_authority
    matched_text: str       # the text that matched the pattern
    start_pos: int          # presumably the match start offset in the text — confirm
    end_pos: int            # presumably the match end offset in the text — confirm
    severity: str           # low / medium / high
    description: str        # human-readable explanation of the pattern
def detect_language(text: str) -> str:
    """Detect the primary language of text using langdetect.

    Args:
        text: Input text. Empty input, or input with fewer than 10
            non-surrounding-whitespace characters, is not analysed.

    Returns:
        The language code reported by langdetect (e.g. 'en', 'hi'). Falls back
        to 'en' for short/empty input, when langdetect is not installed, or
        when detection fails.
    """
    stripped = text.strip() if text else ""
    if len(stripped) < 10:
        return "en"
    try:
        from langdetect import DetectorFactory, detect  # type: ignore
        # langdetect is randomized by default; a fixed seed makes the same text
        # always yield the same language.
        DetectorFactory.seed = 0
        lang = detect(stripped)
        logger.info(f"Language detected: {lang}")
        return lang
    except ImportError:
        logger.debug("langdetect not installed — defaulting to 'en'")
        return "en"
    except Exception as e:
        # Best-effort: any detection failure falls back to English.
        logger.debug(f"Language detection failed: {e} — defaulting to 'en'")
        return "en"
def _scores_to_classification(items) -> TextClassification:
    """Convert pipeline output to TextClassification.

    Args:
        items: Iterable of mappings with ``"label"`` and ``"score"`` keys.

    Returns:
        The top label with its score, the fake probability and all scores.
        Empty ``items`` yields the same "unknown" result classify_text uses
        for empty text.
    """
    scores = {i["label"]: float(i["score"]) for i in items}
    if not scores:
        # max() on an empty mapping would raise ValueError.
        return TextClassification("unknown", 0.0, 0.0, {})
    top_label, top_conf = max(scores.items(), key=lambda kv: kv[1])
    # Extract fake probability: a generic "LABEL_0" is treated as the fake
    # class; otherwise take the best score among labels containing a fake token.
    if "LABEL_0" in scores:
        fake_prob = scores["LABEL_0"]
    else:
        fake_prob = max(
            (p for lbl, p in scores.items() if any(t in lbl.lower() for t in FAKE_TOKENS)),
            default=0.0,
        )
    return TextClassification(top_label, top_conf, fake_prob, scores)
+def classify_text(text: str, language: Optional[str] = None) -> TextClassification:
+    """Classify text as fake/real.
+    Routes to multilingual model when language is non-English and the model is configured.
+    """
+    text = (text or "").strip()
+    if not text:
+        return TextClassification("unknown", 0.0, 0.0, {})
+    loader = get_model_loader()
+    if language and language != "en":
+        pipe = loader.load_multilang_text_model()
+    else:
+        pipe = loader.load_text_model()
+    out = pipe(text[:2000], truncation=True, top_k=None)
+    items = out[0] if isinstance(out[0], list) else out
+    clf = _scores_to_classification(items)
+    logger.info(
+        f"Text classify [{language or 'en'}] → {clf.label} @ {clf.confidence:.3f} "
+        f"fake_p={clf.fake_prob:.3f}"
+    )
+    return clf
+def score_sensationalism(text: str) -> SensationalismResult:
+    """Compute a 0-100 sensationalism score from structural/linguistic signals."""
+    if not text:
+        return SensationalismResult(0, "Low", 0, 0, 0, 0, 0)
+    words = text.split()
+    total_words = max(len(words), 1)
+    excl = text.count("!")
+    caps = sum(1 for w in words if w.isupper() and len(w) > 2)
+    clickbait = sum(
+        1 for pat, _ in CLICKBAIT_PATTERNS
+        if re.search(pat, text, re.IGNORECASE)
+    )
+    emotional = sum(1 for w in words if w.lower().strip(".,!?;:") in EMOTIONAL_WORDS)
+    superlative = sum(1 for w in words if w.lower().strip(".,!?;:") in SUPERLATIVES)
+    raw = (
+        min(excl * 8, 25)
+        + min(caps / total_words * 200, 25)
+        + min(clickbait * 12, 25)
+        + min(emotional * 6, 15)
+        + min(superlative * 5, 10)
+    )
+    score = int(min(100, max(0, raw)))
+    level = "Low" if score < 30 else ("Medium" if score < 60 else "High")
+    logger.info(f"Sensationalism → {score} ({level}) excl={excl} caps={caps} cb={clickbait} emo={emotional}")
+    return SensationalismResult(score, level, excl, caps, clickbait, emotional, superlative)
+def detect_manipulation_indicators(text: str) -> List[ManipulationIndicator]:
+    """Scan text for manipulation linguistic patterns with positions."""
+    if not text:
+        return []
+    indicators: List[ManipulationIndicator] = []
+    for pattern, ptype, severity, description in MANIPULATION_PATTERNS:
+        for m in re.finditer(pattern, text, re.IGNORECASE):
+            indicators.append(ManipulationIndicator(
+                pattern_type=ptype,
+                matched_text=m.group(),
+                start_pos=m.start(),
+                end_pos=m.end(),
+                severity=severity,
+                description=description,
+            ))
+    indicators.sort(key=lambda i: i.start_pos)
+    logger.info(f"Manipulation indicators → {len(indicators)} found")
+    return indicators
+def extract_entities(text: str, max_k: int = 6) -> List[str]:
+    """Extract keywords via spaCy NER (PERSON, ORG, GPE, EVENT preferred).
+    Falls back to frequency-based extraction when spaCy is unavailable or text is too short.
+    """
+    if not text or len(text.strip()) < 20:
+        return _extract_keywords_freq(text, max_k)
+    loader = get_model_loader()
+    nlp = loader.load_spacy_nlp()
+    if nlp is None:
+        # spaCy not available — use frequency fallback
+        return _extract_keywords_freq(text, max_k)
+    try:
+        doc = nlp(text[:5000])  # cap for performance
+        # Collect named entities, preferring high-value types
+        preferred: List[str] = []
+        other: List[str] = []
+        seen: set[str] = set()
+        for ent in doc.ents:
+            norm = ent.text.strip()
+            norm_lower = norm.lower()
+            if not norm or norm_lower in seen or len(norm) < 2:
+                continue
+            seen.add(norm_lower)
+            if ent.label_ in _NER_PREFERRED:
+                preferred.append(norm)
+            else:
+                other.append(norm)
+        entities = preferred + other
+        if len(entities) >= 2:
+            logger.info(f"NER extracted {len(entities)} entities: {entities[:max_k]}")
+            return entities[:max_k]
+        # Not enough entities — supplement with frequency keywords
+        freq_kws = _extract_keywords_freq(text, max_k)
+        combined = entities + [k for k in freq_kws if k.lower() not in seen]
+        return combined[:max_k]
+    except Exception as e:
+        logger.warning(f"spaCy NER failed: {e} — falling back to frequency extraction")
+        return _extract_keywords_freq(text, max_k)
+def _extract_keywords_freq(text: str, max_k: int = 6) -> List[str]:
+    """Frequency-based keyword extraction (original implementation, kept as fallback)."""
+    stop = {
+        "the","a","an","is","are","was","were","be","been","being","to","of","and","or","but",
+        "in","on","at","for","with","by","from","as","that","this","it","its","has","have","had",
+        "will","would","can","could","should","may","might","do","does","did","not","no","so",
+        "than","then","there","their","they","them","we","our","you","your","he","she","his","her",
+    }
+    words = re.findall(r"[A-Za-z][A-Za-z\-']{2,}", text or "")
+    freq: dict[str, int] = {}
+    for w in words:
+        wl = w.lower()
+        if wl in stop:
+            continue
+        freq[wl] = freq.get(wl, 0) + 1
+    return [w for w, _ in sorted(freq.items(), key=lambda kv: (-kv[1], kv[0]))[:max_k]]
+# Back-compat alias: callers that still use the name extract_keywords are served by
+# extract_entities (spaCy NER first, frequency-based keywords as the fallback)
+extract_keywords = extract_entities

v1/__init__.py ADDED Viewed

File without changes

v1/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (165 Bytes). View file

v1/__pycache__/analyze.cpython-311.pyc ADDED Viewed

Binary file (21.6 kB). View file

v1/__pycache__/auth.cpython-311.pyc ADDED Viewed

Binary file (3.82 kB). View file

v1/__pycache__/health.cpython-311.pyc ADDED Viewed

Binary file (556 Bytes). View file

v1/__pycache__/history.cpython-311.pyc ADDED Viewed

Binary file (5.19 kB). View file