Spyderzz committed on
Commit
0853b44
·
1 Parent(s): a648128

Initial deployment of DeepShield backend

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
.env ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ═══════════════════════════════════════
# DeepShield — Environment Configuration
# ═══════════════════════════════════════
# Copy this file to backend/.env and customize.
# This file is a template: never commit real credentials. Keep the populated
# .env out of version control and supply secrets through the deployment
# platform's secret store instead.

# Server
APP_HOST=0.0.0.0
APP_PORT=8000
DEBUG=true
CORS_ORIGINS=["http://localhost:5173"]

# Database
# For local dev: sqlite:///./deepshield.db
# For production (Neon/Supabase): postgresql://username:password@ep-cool...
DATABASE_URL=sqlite:///./deepshield.db

# File Upload
MAX_UPLOAD_SIZE_MB=100
UPLOAD_DIR=./temp_uploads
FILE_RETENTION_SECONDS=300

# AI Models
IMAGE_MODEL_ID=prithivMLmods/Deep-Fake-Detector-v2-Model
TEXT_MODEL_ID=jy46604790/Fake-News-Bert-Detect
DEVICE=cpu
PRELOAD_MODELS=true

# News API (optional — sign up at https://newsdata.io)
# Leave empty to run without a key; set the real key outside version control.
NEWS_API_KEY=
NEWS_API_BASE_URL=https://newsdata.io/api/1/news

# PDF Reports
REPORT_DIR=./temp_reports
REPORT_TTL_SECONDS=3600

# Auth — CHANGE JWT_SECRET_KEY IN PRODUCTION!
JWT_SECRET_KEY=change-me-in-production
JWT_ALGORITHM=HS256
JWT_EXPIRATION_MINUTES=1440
Colab_ViT_Training.ipynb ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "1e0e7b4a",
6
+ "metadata": {},
7
+ "source": [
8
+ "# DeepShield: FaceForensics++ ViT Training \n",
9
+ "Run this entirely in Google Colab.\n",
10
+ "**Before running**:\n",
11
+ "1. Go to `Runtime` -> `Change runtime type` -> select **T4 GPU**.\n",
12
+ "2. Run the cells below sequentially.\n"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": null,
18
+ "id": "4fe293e7",
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "!pip install timm transformers datasets accelerate evaluate opencv-python\n"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "id": "c9387c0f",
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "# We create the download script inside the Colab environment\n",
33
+ "download_script = '''#!/usr/bin/env python\n",
34
+ "import argparse\n",
35
+ "import os\n",
36
+ "import urllib.request\n",
37
+ "import tempfile\n",
38
+ "import time\n",
39
+ "import sys\n",
40
+ "import json\n",
41
+ "from tqdm import tqdm\n",
42
+ "from os.path import join\n",
43
+ "\n",
44
+ "FILELIST_URL = 'misc/filelist.json'\n",
45
+ "DEEPFEAKES_DETECTION_URL = 'misc/deepfake_detection_filenames.json'\n",
46
+ "DEEPFAKES_MODEL_NAMES = ['decoder_A.h5', 'decoder_B.h5', 'encoder.h5',]\n",
47
+ "DATASETS = {\n",
48
+ " 'original': 'original_sequences/youtube',\n",
49
+ " 'Deepfakes': 'manipulated_sequences/Deepfakes',\n",
50
+ " 'Face2Face': 'manipulated_sequences/Face2Face',\n",
51
+ " 'FaceShifter': 'manipulated_sequences/FaceShifter',\n",
52
+ " 'FaceSwap': 'manipulated_sequences/FaceSwap',\n",
53
+ " 'NeuralTextures': 'manipulated_sequences/NeuralTextures'\n",
54
+ "}\n",
55
+ "ALL_DATASETS = ['original', 'Deepfakes', 'Face2Face', 'FaceShifter', 'FaceSwap', 'NeuralTextures']\n",
56
+ "COMPRESSION = ['raw', 'c23', 'c40']\n",
57
+ "TYPE = ['videos']\n",
58
+ "\n",
59
+ "def download_file(url, out_file):\n",
60
+ " os.makedirs(os.path.dirname(out_file), exist_ok=True)\n",
61
+ " if not os.path.isfile(out_file):\n",
62
+ " urllib.request.urlretrieve(url, out_file)\n",
63
+ "\n",
64
+ "def main():\n",
65
+ " parser = argparse.ArgumentParser()\n",
66
+ " parser.add_argument('output_path', type=str)\n",
67
+ " parser.add_argument('-d', '--dataset', type=str, default='all')\n",
68
+ " parser.add_argument('-c', '--compression', type=str, default='c40')\n",
69
+ " parser.add_argument('-t', '--type', type=str, default='videos')\n",
70
+ " parser.add_argument('-n', '--num_videos', type=int, default=50) # Small amount for tutorial\n",
71
+ " args = parser.parse_args()\n",
72
+ " \n",
73
+ " base_url = 'http://kaldir.vc.in.tum.de/faceforensics/v3/'\n",
74
+ " \n",
75
+ " datasets = [args.dataset] if args.dataset != 'all' else ALL_DATASETS\n",
76
+ " for dataset in datasets:\n",
77
+ " dataset_path = DATASETS[dataset]\n",
78
+ " print(f'Downloading {args.compression} of {dataset}')\n",
79
+ " \n",
80
+ " file_pairs = json.loads(urllib.request.urlopen(base_url + FILELIST_URL).read().decode(\"utf-8\"))\n",
81
+ " filelist = []\n",
82
+ " if 'original' in dataset_path:\n",
83
+ " for pair in file_pairs:\n",
84
+ " filelist += pair\n",
85
+ " else:\n",
86
+ " for pair in file_pairs:\n",
87
+ " filelist.append('_'.join(pair))\n",
88
+ " filelist.append('_'.join(pair[::-1]))\n",
89
+ " \n",
90
+ " filelist = filelist[:args.num_videos]\n",
91
+ " dataset_videos_url = base_url + f'{dataset_path}/{args.compression}/{args.type}/'\n",
92
+ " dataset_output_path = join(args.output_path, dataset_path, args.compression, args.type)\n",
93
+ " \n",
94
+ " for filename in tqdm(filelist):\n",
95
+ " download_file(dataset_videos_url + filename + \".mp4\", join(dataset_output_path, filename + \".mp4\"))\n",
96
+ "\n",
97
+ "if __name__ == \"__main__\":\n",
98
+ " main()\n",
99
+ "'''\n",
100
+ "\n",
101
+ "with open(\"download_ffpp.py\", \"w\") as f:\n",
102
+ " f.write(download_script)\n",
103
+ "\n",
104
+ "!python download_ffpp.py ./data -d all -c c40 -t videos -n 50\n"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "id": "f33716f6",
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "import cv2\n",
115
+ "import os\n",
116
+ "import glob\n",
117
+ "from tqdm import tqdm\n",
118
+ "\n",
119
+ "def extract_frames(video_folder, output_folder, label, max_frames=4):\n",
120
+ " os.makedirs(output_folder, exist_ok=True)\n",
121
+ " videos = glob.glob(os.path.join(video_folder, \"*.mp4\"))\n",
122
+ " \n",
123
+ " for vid_path in tqdm(videos, desc=f\"Extracting {label}\"):\n",
124
+ " vid_name = os.path.basename(vid_path).replace('.mp4','')\n",
125
+ " cap = cv2.VideoCapture(vid_path)\n",
126
+ " count = 0\n",
127
+ " while cap.isOpened() and count < max_frames:\n",
128
+ " ret, frame = cap.read()\n",
129
+ " if not ret: break\n",
130
+ " frame = cv2.resize(frame, (224, 224))\n",
131
+ " out_path = os.path.join(output_folder, f\"{vid_name}_f{count}.jpg\")\n",
132
+ " cv2.imwrite(out_path, frame)\n",
133
+ " count += 1\n",
134
+ " cap.release()\n",
135
+ "\n",
136
+ "# Extract Real\n",
137
+ "extract_frames('./data/original_sequences/youtube/c40/videos', './dataset/real', 'real')\n",
138
+ "\n",
139
+ "# Extract Fakes\n",
140
+ "fakes = ['Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures']\n",
141
+ "for f in fakes:\n",
142
+ " extract_frames(f'./data/manipulated_sequences/{f}/c40/videos', './dataset/fake', 'fake')\n"
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": null,
148
+ "id": "b79cdd85",
149
+ "metadata": {},
150
+ "outputs": [],
151
+ "source": [
152
+ "import numpy as np\n",
153
+ "from datasets import load_dataset\n",
154
+ "from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer\n",
155
+ "import torch\n",
156
+ "\n",
157
+ "# 1. Load Dataset\n",
158
+ "dataset = load_dataset('imagefolder', data_dir='./dataset')\n",
159
+ "# Split into train/validation\n",
160
+ "dataset = dataset['train'].train_test_split(test_size=0.1)\n",
161
+ "\n",
162
+ "# 2. Preprocessor\n",
163
+ "model_name_or_path = 'google/vit-base-patch16-224-in21k'\n",
164
+ "processor = ViTImageProcessor.from_pretrained(model_name_or_path)\n",
165
+ "\n",
166
+ "def transform(example_batch):\n",
167
+ " # Take a list of PIL images and turn them to pixel values\n",
168
+ " inputs = processor([x.convert(\"RGB\") for x in example_batch['image']], return_tensors='pt')\n",
169
+ " inputs['labels'] = example_batch['label']\n",
170
+ " return inputs\n",
171
+ "\n",
172
+ "prepared_ds = dataset.with_transform(transform)\n",
173
+ "\n",
174
+ "def collate_fn(batch):\n",
175
+ " return {\n",
176
+ " 'pixel_values': torch.stack([x['pixel_values'] for x in batch]),\n",
177
+ " 'labels': torch.tensor([x['labels'] for x in batch])\n",
178
+ " }\n",
179
+ "\n",
180
+ "# 3. Load Model\n",
181
+ "labels = dataset['train'].features['label'].names\n",
182
+ "model = ViTForImageClassification.from_pretrained(\n",
183
+ " model_name_or_path,\n",
184
+ " num_labels=len(labels),\n",
185
+ " id2label={str(i): c for i, c in enumerate(labels)},\n",
186
+ " label2id={c: str(i) for i, c in enumerate(labels)}\n",
187
+ ")\n",
188
+ "\n",
189
+ "training_args = TrainingArguments(\n",
190
+ " output_dir=\"./vit-deepshield\",\n",
191
+ " per_device_train_batch_size=16,\n",
192
+ " eval_strategy=\"steps\",\n",
193
+ " num_train_epochs=3,\n",
194
+ " fp16=True, # Mixed precision for speed\n",
195
+ " save_steps=100,\n",
196
+ " eval_steps=100,\n",
197
+ " logging_steps=10,\n",
198
+ " learning_rate=2e-4,\n",
199
+ " save_total_limit=2,\n",
200
+ " remove_unused_columns=False,\n",
201
+ " push_to_hub=False,\n",
202
+ " load_best_model_at_end=True,\n",
203
+ ")\n",
204
+ "\n",
205
+ "import evaluate\n",
206
+ "metric = evaluate.load(\"accuracy\")\n",
207
+ "def compute_metrics(p):\n",
208
+ " return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)\n",
209
+ "\n",
210
+ "trainer = Trainer(\n",
211
+ " model=model,\n",
212
+ " args=training_args,\n",
213
+ " data_collator=collate_fn,\n",
214
+ " compute_metrics=compute_metrics,\n",
215
+ " train_dataset=prepared_ds[\"train\"],\n",
216
+ " eval_dataset=prepared_ds[\"test\"],\n",
217
+ ")\n",
218
+ "\n",
219
+ "# 4. Train\n",
220
+ "train_results = trainer.train()\n",
221
+ "trainer.save_model(\"deepshield_vit_model\")\n",
222
+ "processor.save_pretrained(\"deepshield_vit_model\")\n",
223
+ "trainer.log_metrics(\"train\", train_results.metrics)\n",
224
+ "trainer.save_metrics(\"train\", train_results.metrics)\n",
225
+ "trainer.save_state()\n",
226
+ "print(\"Training Complete! The model is saved to ./deepshield_vit_model\")\n"
227
+ ]
228
+ }
229
+ ],
230
+ "metadata": {},
231
+ "nbformat": 4,
232
+ "nbformat_minor": 5
233
+ }
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image with Python 3.10
FROM python:3.10-slim

# Set the working directory
WORKDIR /app

# Install system dependencies required for OpenCV, PyTorch, etc.
# libgl1 is used instead of libgl1-mesa-glx: the latter is a transitional
# package that newer Debian releases (the base of python:*-slim) no longer
# ship, which makes the build fail with "no installation candidate".
# --no-install-recommends keeps optional packages out of the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy the requirements file into the container
COPY requirements.txt .

# Install Python dependencies
# Using --no-cache-dir keeps the Docker image smaller
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the backend code
# NOTE: this copies everything in the build context, including any .env file;
# list secrets and local artifacts in .dockerignore so they stay out of the image.
COPY . .

# Create directories for models and temporary uploads if they don't exist
RUN mkdir -p /app/temp_uploads /app/models

# Expose port 7860 (This is the default port required by Hugging Face Spaces)
EXPOSE 7860

# Run the FastAPI server on port 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,12 @@
1
- ---
2
- title: Deepshield
3
- emoji: 🏆
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
1
+ # backend/training
 
 
 
 
 
 
 
 
2
 
3
+ Training pipeline for the DeepShield image detector (BUILD_PLAN2 Phase 11).
4
+
5
+ | Phase | Module |
6
+ |---|---|
7
+ | 11.1 Dataset procurement | [`datasets/`](./datasets/) — see [../../docs/datasets.md](../../docs/datasets.md) |
8
+ | 11.2 Training | `dataset.py`, `train_convnext.py` (pending) |
9
+ | 11.2 Calibration | `calibrate.py` (pending) |
10
+ | 11.2 Evaluation | `eval.py` (pending) |
11
+
12
+ Run `bash datasets/procure_all.sh` to build `./data/manifest.csv`.
__init__.py ADDED
File without changes
analyze.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List
4
+
5
+ from pydantic import BaseModel
6
+
7
+ from schemas.common import (
8
+ ArtifactIndicator,
9
+ ContradictingEvidence,
10
+ ExifSummary,
11
+ LLMExplainabilitySummary,
12
+ ProcessingSummary,
13
+ TrustedSource,
14
+ TruthOverride,
15
+ Verdict,
16
+ VLMBreakdown,
17
+ )
18
+
19
+
20
class SensationalismBreakdown(BaseModel):
    """Sensationalism score for a text plus the per-signal counts behind it.

    Every field has a default, so the model can be created with no arguments.
    """

    # Aggregate result.
    score: int = 0
    level: str = "Low"

    # Individual signal counters.
    exclamation_count: int = 0
    caps_word_count: int = 0
    clickbait_matches: int = 0
    emotional_word_count: int = 0
    superlative_count: int = 0
28
+
29
+
30
class ManipulationIndicatorOut(BaseModel):
    """One matched manipulation pattern; all fields are required."""

    pattern_type: str
    matched_text: str
    # Presumably offsets of the match within the analysed text — confirm
    # against the code that produces these values.
    start_pos: int
    end_pos: int
    severity: str
    description: str
37
+
38
+
39
class TextExplainability(BaseModel):
    """Explainability payload carried by a text analysis response.

    Only ``fake_probability`` and ``top_label`` are required; the remaining
    fields fall back to empty/neutral defaults.
    """

    fake_probability: float
    top_label: str
    all_scores: dict = {}
    keywords: list[str] = []
    sensationalism: SensationalismBreakdown = SensationalismBreakdown()
    manipulation_indicators: list[ManipulationIndicatorOut] = []
    detected_language: str = "en"  # ISO 639-1 code, e.g. "en", "hi"
    truth_override: TruthOverride | None = None
48
+
49
+
50
class TextAnalysisResponse(BaseModel):
    """Response schema for a text analysis (``media_type`` defaults to "text")."""

    # Identification
    analysis_id: str
    record_id: int = 0
    media_type: str = "text"
    timestamp: str

    # Result and supporting evidence
    verdict: Verdict
    explainability: TextExplainability
    llm_summary: LLMExplainabilitySummary | None = None
    trusted_sources: list[TrustedSource] = []
    contradicting_evidence: list[ContradictingEvidence] = []

    # Bookkeeping and disclaimer
    processing_summary: ProcessingSummary
    responsible_ai_notice: str = (
        "AI-based analysis may not be 100% accurate. Cross-check with trusted sources before sharing."
    )
64
+
65
+
66
class OCRBoxOut(BaseModel):
    """A single OCR text box: recognised text, its box, and a confidence."""

    text: str
    # Nested integer lists; presumably one [x, y] pair per box corner —
    # confirm against the OCR stage that fills this in.
    bbox: list[list[int]]
    confidence: float
70
+
71
+
72
class SuspiciousPhraseOut(BaseModel):
    """A flagged phrase together with its box and pattern classification."""

    text: str
    bbox: list[list[int]]  # same nested-int layout as OCRBoxOut.bbox
    pattern_type: str
    severity: str
    description: str
78
+
79
+
80
class LayoutAnomalyOut(BaseModel):
    """One layout anomaly with its severity, description and confidence."""

    type: str
    severity: str
    description: str
    confidence: float
85
+
86
+
87
class ScreenshotExplainability(BaseModel):
    """Explainability payload carried by a screenshot analysis response.

    Every field has a default, so an empty instance is valid.
    """

    # OCR output
    extracted_text: str = ""
    ocr_boxes: list[OCRBoxOut] = []

    # Signals computed for the extracted text and the layout
    fake_probability: float = 0.0
    sensationalism: SensationalismBreakdown = SensationalismBreakdown()
    suspicious_phrases: list[SuspiciousPhraseOut] = []
    layout_anomalies: list[LayoutAnomalyOut] = []
    keywords: list[str] = []
    detected_language: str = "en"
    truth_override: TruthOverride | None = None
97
+
98
+
99
class ScreenshotAnalysisResponse(BaseModel):
    """Response schema for a screenshot analysis (``media_type`` defaults to "screenshot")."""

    # Identification
    analysis_id: str
    record_id: int = 0
    media_type: str = "screenshot"
    timestamp: str

    # Result and supporting evidence
    verdict: Verdict
    explainability: ScreenshotExplainability
    llm_summary: LLMExplainabilitySummary | None = None
    trusted_sources: list[TrustedSource] = []
    contradicting_evidence: list[ContradictingEvidence] = []

    # Bookkeeping and disclaimer
    processing_summary: ProcessingSummary
    responsible_ai_notice: str = (
        "AI-based analysis may not be 100% accurate. Cross-check with trusted sources before sharing."
    )
113
+
114
+
115
class ImageExplainability(BaseModel):
    """Explainability payload carried by an image analysis response.

    Every field has a default, so an empty instance is valid.
    """

    # Base64-encoded visualisations; empty string when not available.
    heatmap_base64: str = ""
    ela_base64: str = ""
    boxes_base64: str = ""
    heatmap_status: str = "success"  # success | failed | degraded

    artifact_indicators: list[ArtifactIndicator] = []
    exif: ExifSummary | None = None
    llm_summary: LLMExplainabilitySummary | None = None
    vlm_breakdown: VLMBreakdown | None = None
124
+
125
+
126
class FrameAnalysisOut(BaseModel):
    """Per-frame result inside a video analysis."""

    index: int
    timestamp_s: float
    label: str
    confidence: float
    suspicious_prob: float
    is_suspicious: bool
    # Both flags default to False when the producer does not set them.
    has_face: bool = False
    scored: bool = False
135
+
136
+
137
class VideoExplainability(BaseModel):
    """Aggregate frame statistics carried by a video analysis response."""

    # Frame counts
    num_frames_sampled: int
    num_face_frames: int = 0
    num_suspicious_frames: int

    # Summary statistics over the frames' suspicious probabilities
    mean_suspicious_prob: float
    max_suspicious_prob: float
    suspicious_ratio: float

    insufficient_faces: bool = False
    suspicious_timestamps: list[float] = []
    frames: list[FrameAnalysisOut] = []
147
+
148
+
149
class VideoAnalysisResponse(BaseModel):
    """Response schema for a video analysis (``media_type`` defaults to "video")."""

    # Identification
    analysis_id: str
    record_id: int = 0
    media_type: str = "video"
    timestamp: str

    # Result and supporting evidence
    verdict: Verdict
    explainability: VideoExplainability
    llm_summary: LLMExplainabilitySummary | None = None
    trusted_sources: list[TrustedSource] = []
    contradicting_evidence: list[ContradictingEvidence] = []

    # Bookkeeping and disclaimer
    processing_summary: ProcessingSummary
    responsible_ai_notice: str = (
        "AI-based analysis may not be 100% accurate. Cross-check with trusted sources before sharing."
    )
163
+
164
+
165
class ImageAnalysisResponse(BaseModel):
    """Response schema for an image analysis (``media_type`` defaults to "image").

    Unlike the text/screenshot/video responses there is no top-level
    ``llm_summary``; ``ImageExplainability`` carries its own.
    """

    # Identification
    analysis_id: str
    record_id: int = 0
    media_type: str = "image"
    timestamp: str

    # Result and supporting evidence
    verdict: Verdict
    explainability: ImageExplainability
    trusted_sources: list[TrustedSource] = []
    contradicting_evidence: list[ContradictingEvidence] = []

    # Bookkeeping and disclaimer
    processing_summary: ProcessingSummary
    responsible_ai_notice: str = (
        "AI-based analysis may not be 100% accurate. Cross-check with trusted sources before sharing."
    )
artifact_detector.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ from typing import List
5
+
6
+ import numpy as np
7
+ from loguru import logger
8
+ from PIL import Image
9
+
10
+ from schemas.common import ArtifactIndicator
11
+
12
+
13
+ def _severity_from_score(score: float) -> str:
14
+ if score >= 0.7:
15
+ return "high"
16
+ if score >= 0.4:
17
+ return "medium"
18
+ return "low"
19
+
20
+
21
+ # ---------- 1. GAN high-frequency signature (FFT) ----------
22
def detect_gan_hf_artifact(pil_img: Image.Image) -> ArtifactIndicator | None:
    """Compute the high-frequency energy ratio on the luminance channel.

    Real photos typically follow a ~1/f spectrum; many GAN outputs show
    elevated HF energy or spectral peaks.

    Args:
        pil_img: Image to analyse; it is converted to 8-bit luminance ("L").

    Returns:
        A "gan_artifact" indicator whose confidence is the normalized
        suspiciousness score, or None if any step raised (the failure is
        logged as a warning).
    """
    try:
        gray = np.asarray(pil_img.convert("L"), dtype=np.float32)

        # Downsample for speed: cap the longer side at 512 px.
        longest_side = max(gray.shape)
        if longest_side > 512:
            import cv2

            scale = 512 / longest_side
            # Clamp each side to at least 1 px. For extreme aspect ratios the
            # short side would otherwise truncate to 0, cv2.resize would
            # raise, and the detector would silently return None.
            new_w = max(1, int(gray.shape[1] * scale))
            new_h = max(1, int(gray.shape[0] * scale))
            gray = cv2.resize(gray, (new_w, new_h))

        # Centered magnitude spectrum.
        fft = np.fft.fftshift(np.fft.fft2(gray))
        mag = np.abs(fft)
        h, w = mag.shape
        cy, cx = h // 2, w // 2

        # Radial distance of every frequency bin from the spectrum centre;
        # "high frequency" means the outer half of the maximum radius.
        y, x = np.ogrid[:h, :w]
        r = np.sqrt((x - cx) ** 2 + (y - cy) ** 2)
        r_max = np.sqrt(cx * cx + cy * cy)
        hf_mask = r > (0.5 * r_max)

        total = float(mag.sum() + 1e-9)  # epsilon avoids division by zero
        hf = float(mag[hf_mask].sum())
        ratio = hf / total  # typically 0.05–0.20 for natural photos

        # Normalize to [0, 1] suspiciousness: 0.10 maps to 0, 0.30 to 1.
        score = max(0.0, min(1.0, (ratio - 0.10) / 0.20))
        sev = _severity_from_score(score)
        return ArtifactIndicator(
            type="gan_artifact",
            severity=sev,
            description=(
                f"High-frequency energy ratio {ratio:.3f} — "
                + ("elevated HF energy consistent with GAN/diffusion outputs" if score > 0.4
                   else "natural frequency falloff")
            ),
            confidence=float(score),
        )
    except Exception as e:  # noqa: BLE001
        # Best-effort detector: a failure must not abort the whole scan.
        logger.warning(f"GAN HF detection failed: {e}")
        return None
65
+
66
+
67
+ # ---------- 2. JPEG quantization table anomaly ----------
68
+ _STANDARD_Q_SUMS = { # rough heuristic: camera JPEGs fall in these ranges
69
+ 50: (1500, 4500),
70
+ 75: (600, 2500),
71
+ 90: (200, 1000),
72
+ 95: (100, 600),
73
+ }
74
+
75
+
76
def detect_compression_anomaly(raw_bytes: bytes) -> ArtifactIndicator | None:
    """Score JPEG quantization tables for signs of manipulation or re-encoding.

    Args:
        raw_bytes: Encoded image bytes as uploaded.

    Returns:
        A "compression" indicator. Non-JPEG input and JPEGs without readable
        tables yield fixed low-severity indicators (confidence 0.1 and 0.2).
        Otherwise the confidence is the clamped sum of the heuristic weights
        below. Returns None if any step raised (logged as a warning).
    """
    try:
        image = Image.open(io.BytesIO(raw_bytes))
        fmt = image.format
        if fmt != "JPEG":
            return ArtifactIndicator(
                type="compression",
                severity="low",
                description=f"Non-JPEG format ({fmt}); compression signature not available",
                confidence=0.1,
            )

        qtables = getattr(image, "quantization", None)
        if not qtables:
            return ArtifactIndicator(
                type="compression",
                severity="low",
                description="No JPEG quantization tables readable",
                confidence=0.2,
            )

        # One coefficient sum per quantization table.
        sums = [int(sum(table)) for table in qtables.values()]
        table_count = len(sums)

        # Heuristics as (triggered, weight, reason): very low sum → very high
        # quality (possibly re-saved); non-standard number of tables; extreme
        # values. Evaluated in this order so the reason list stays stable.
        checks = (
            (table_count not in (1, 2), 0.4, f"unusual table count ({table_count})"),
            (any(s < 60 for s in sums), 0.3, "very low quantization sums (possible re-encoding)"),
            (any(s > 8000 for s in sums), 0.2, "very high quantization sums"),
        )
        suspicious = 0.0
        reasons: list[str] = []
        for triggered, weight, reason in checks:
            if triggered:
                suspicious += weight
                reasons.append(reason)

        score = max(0.0, min(1.0, suspicious))
        detail = f"; {', '.join(reasons)}" if reasons else "; within typical camera range"
        return ArtifactIndicator(
            type="compression",
            severity=_severity_from_score(score),
            description=f"JPEG Q-table sums {sums}" + detail,
            confidence=float(score),
        )
    except Exception as e:  # noqa: BLE001
        # Best-effort detector: a failure must not abort the whole scan.
        logger.warning(f"Compression anomaly detection failed: {e}")
        return None
132
+
133
+
134
+ # ---------- 3. Facial boundary + 4. Lighting (MediaPipe) ----------
135
def detect_face_based_artifacts(pil_img: Image.Image) -> List[ArtifactIndicator]:
    """Derive face-contour and face-lighting indicators from face landmarks.

    Only the first detected face is analysed.

    Args:
        pil_img: Image to analyse; it is converted to RGB.

    Returns:
        0, 1, or 2 indicators: nothing when no face is found, otherwise a
        "facial_boundary" indicator, followed by a "lighting" indicator when
        the face bounding box is large enough to split into quadrants.
        Exceptions are logged as warnings and whatever was collected up to
        that point is returned.
    """
    found: List[ArtifactIndicator] = []
    try:
        # `mp` is not referenced below; presumably imported so that a missing
        # mediapipe install fails here and gets logged — TODO confirm.
        import mediapipe as mp  # type: ignore

        from models.model_loader import get_model_loader

        face_mesh = get_model_loader().load_face_detector()
        rgb = np.asarray(pil_img.convert("RGB"))
        img_h, img_w = rgb.shape[:2]
        detection = face_mesh.process(rgb)

        faces = detection.multi_face_landmarks
        if not faces:
            return found

        landmarks = faces[0].landmark

        # ----- Jaw boundary jitter -----
        # FaceMesh jaw/oval landmark indices (approximate face contour)
        contour_idx = [
            10, 338, 297, 332, 284, 251, 389, 356, 454, 323, 361,
            288, 397, 365, 379, 378, 400, 377, 152, 148, 176, 149,
            150, 136, 172, 58, 132, 93, 234, 127, 162, 21, 54, 103, 67, 109,
        ]
        # Landmark coordinates are scaled by the image size to get pixels.
        contour = np.array(
            [(landmarks[i].x * img_w, landmarks[i].y * img_h) for i in contour_idx]
        )
        # Second-difference magnitude = local curvature jitter
        second_diff = np.diff(contour, n=2, axis=0)
        jitter = float(np.linalg.norm(second_diff, axis=1).mean()) / max(img_w, img_h)
        jitter_score = max(0.0, min(1.0, (jitter - 0.003) / 0.010))
        if jitter_score > 0.4:
            boundary_note = "inconsistent boundary blending detected"
        else:
            boundary_note = "face boundary appears smooth"
        found.append(
            ArtifactIndicator(
                type="facial_boundary",
                severity=_severity_from_score(jitter_score),
                description=f"Jaw-contour jitter {jitter:.4f} (normalized) — " + boundary_note,
                confidence=float(jitter_score),
            )
        )

        # ----- Lighting inconsistency (per-quadrant luminance) -----
        # Bounding box of all landmarks, clamped to the image.
        xs = np.array([lm.x * img_w for lm in landmarks])
        ys = np.array([lm.y * img_h for lm in landmarks])
        left, right = int(max(0, xs.min())), int(min(img_w, xs.max()))
        top, bottom = int(max(0, ys.min())), int(min(img_h, ys.max()))
        if right > left + 4 and bottom > top + 4:
            face = rgb[top:bottom, left:right]
            luma = 0.299 * face[..., 0] + 0.587 * face[..., 1] + 0.114 * face[..., 2]
            crop_h, crop_w = luma.shape
            mid_y, mid_x = crop_h // 2, crop_w // 2
            quadrants = [
                luma[:mid_y, :mid_x],
                luma[:mid_y, mid_x:],
                luma[mid_y:, :mid_x],
                luma[mid_y:, mid_x:],
            ]
            means = np.array([quad.mean() for quad in quadrants if quad.size > 0])
            # Need all four quadrants and a non-black face to form the ratio.
            if means.size == 4 and means.mean() > 1e-3:
                # Coefficient of variation of the four quadrant means.
                imbalance = float(means.std() / means.mean())
                lighting_score = max(0.0, min(1.0, (imbalance - 0.08) / 0.20))
                if lighting_score > 0.4:
                    lighting_note = "inconsistent lighting direction"
                else:
                    lighting_note = "lighting appears uniform"
                found.append(
                    ArtifactIndicator(
                        type="lighting",
                        severity=_severity_from_score(lighting_score),
                        description=(
                            f"Luminance imbalance across face quadrants {imbalance:.3f} — "
                            + lighting_note
                        ),
                        confidence=float(lighting_score),
                    )
                )
    except Exception as e:  # noqa: BLE001
        # Best-effort detector: a failure must not abort the whole scan.
        logger.warning(f"Face-based artifact detection failed: {e}")

    return found
216
+
217
+
218
+ # ---------- Orchestrator ----------
219
def scan_artifacts(pil_img: Image.Image, raw_bytes: bytes) -> List[ArtifactIndicator]:
    """Run every artifact detector and collect the indicators they produce.

    Args:
        pil_img: Decoded image, passed to the GAN and face-based detectors.
        raw_bytes: Encoded image bytes, passed to the compression detector.

    Returns:
        Indicators in a fixed order: GAN high-frequency, compression, then
        the face-based ones. Detectors that returned None are left out.
    """
    single_results = (
        detect_gan_hf_artifact(pil_img),
        detect_compression_anomaly(raw_bytes),
    )
    indicators: List[ArtifactIndicator] = [ind for ind in single_results if ind is not None]
    indicators.extend(detect_face_based_artifacts(pil_img))
    return indicators
auth.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+
5
+ from pydantic import BaseModel, EmailStr, Field
6
+
7
+
8
class RegisterBody(BaseModel):
    """Request body for account registration."""

    email: EmailStr
    # Length limits are enforced by pydantic at validation time.
    password: str = Field(min_length=6, max_length=128)
    name: str | None = Field(default=None, max_length=255)
12
+
13
+
14
class LoginBody(BaseModel):
    """Request body for login; the password has no length constraints here."""

    email: EmailStr
    password: str
17
+
18
+
19
class UserOut(BaseModel):
    """User fields exposed in responses; there is no password field."""

    id: int
    email: str
    name: str | None = None
    created_at: datetime
24
+
25
+
26
class TokenResponse(BaseModel):
    """Response carrying an access token together with the user it belongs to."""

    access_token: str
    token_type: str = "bearer"
    expires_in_minutes: int
    user: UserOut
auth_service.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timedelta, timezone
4
+ from typing import Any
5
+
6
+ import bcrypt
7
+ from jose import JWTError, jwt
8
+ from sqlalchemy.orm import Session
9
+
10
+ from config import settings
11
+ from db.models import User
12
+
13
+
14
+ def _encode_pw(plain: str) -> bytes:
15
+ # bcrypt truncates to 72 bytes silently in some builds and hard-errors in others.
16
+ # Truncate explicitly so behavior is deterministic across versions.
17
+ return plain.encode("utf-8")[:72]
18
+
19
+
20
def hash_password(plain: str) -> str:
    """Hash ``plain`` with bcrypt using a fresh salt and return it as text."""
    secret = _encode_pw(plain)
    salt = bcrypt.gensalt()
    digest = bcrypt.hashpw(secret, salt)
    return digest.decode("utf-8")
22
+
23
+
24
def verify_password(plain: str, hashed: str) -> bool:
    """Check ``plain`` against a stored bcrypt hash.

    Returns:
        True when the password matches. Any exception raised while checking
        is swallowed and reported as a non-match (False).
    """
    try:
        matches = bcrypt.checkpw(_encode_pw(plain), hashed.encode("utf-8"))
    except Exception:
        # Deliberate best-effort: an unusable hash counts as "wrong password".
        return False
    return matches
29
+
30
+
31
def create_access_token(user_id: int, email: str) -> str:
    """Build a signed JWT for the given user.

    The token carries ``sub`` (user id as a string), ``email``, ``iat`` and
    ``exp`` as integer UTC timestamps. Lifetime, signing key and algorithm
    come from ``settings``.
    """
    issued_at = datetime.now(timezone.utc)
    expires_at = issued_at + timedelta(minutes=settings.JWT_EXPIRATION_MINUTES)
    claims = {
        "sub": str(user_id),
        "email": email,
        "iat": int(issued_at.timestamp()),
        "exp": int(expires_at.timestamp()),
    }
    return jwt.encode(claims, settings.JWT_SECRET_KEY, algorithm=settings.JWT_ALGORITHM)
40
+
41
+
42
def decode_token(token: str) -> dict[str, Any] | None:
    """Decode and verify a JWT with the configured key and algorithm.

    Returns:
        The token's claims, or None when decoding raises ``JWTError``.
    """
    try:
        claims = jwt.decode(
            token,
            settings.JWT_SECRET_KEY,
            algorithms=[settings.JWT_ALGORITHM],
        )
    except JWTError:
        return None
    return claims
47
+
48
+
49
def register_user(db: Session, email: str, password: str, name: str | None) -> User:
    """Create and persist a new user.

    Args:
        db: Open SQLAlchemy session.
        email: Address to register; stripped and lower-cased before storing.
        password: Plain-text password; only its bcrypt hash is stored.
        name: Optional display name; empty values are stored as None.

    Returns:
        The persisted ``User``, refreshed from the database.

    Raises:
        Whatever ``db.commit()`` raises. The session is rolled back first and
        the original exception is re-raised unchanged.
    """
    normalized_email = email.strip().lower()
    user = User(
        email=normalized_email,
        password_hash=hash_password(password),
        name=(name or None),
    )
    db.add(user)
    try:
        db.commit()
    except Exception:
        # A failed commit leaves the session in a state that must be rolled
        # back before it can be used again; undo it, then let callers see
        # the original error.
        db.rollback()
        raise
    db.refresh(user)
    return user
56
+
57
+
58
def authenticate(db: Session, email: str, password: str) -> User | None:
    """Look up a user by email and verify the password.

    Args:
        db: Open SQLAlchemy session.
        email: Login address; stripped and lower-cased before the lookup.
        password: Plain-text password to verify.

    Returns:
        The matching ``User``, or None when the email is unknown or the
        password does not match.
    """
    normalized_email = email.strip().lower()
    user = db.query(User).filter(User.email == normalized_email).first()
    if user is None:
        # Spend a bcrypt hash on the unknown-user path too, so the response
        # time does not reveal whether the email is registered.
        hash_password(password)
        return None
    if not verify_password(password, user.password_hash):
        return None
    return user
64
+
65
+
66
def get_user(db: Session, user_id: int) -> User | None:
    """Return the user with the given primary key, or None if there is none."""
    by_id = db.query(User).filter(User.id == user_id)
    return by_id.first()
common.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Optional
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+
7
+
8
class Verdict(BaseModel):
    """Final verdict of an analysis together with the model output behind it."""

    # pydantic reserves the `model_` prefix by default; clearing the protected
    # namespaces lets `model_confidence` / `model_label` be declared as fields.
    model_config = ConfigDict(protected_namespaces=())

    label: str
    severity: str
    authenticity_score: int = Field(ge=0, le=100)
    model_confidence: float = Field(ge=0.0, le=1.0)
    model_label: str
16
+
17
+
18
class ArtifactIndicator(BaseModel):
    """One detected artifact signal with its severity and confidence."""

    type: str
    severity: str  # low | medium | high
    description: str
    confidence: float = Field(ge=0.0, le=1.0)
23
+
24
+
25
class TrustedSource(BaseModel):
    """A source article reference with a relevance score in [0, 1]."""

    source_name: str
    title: str
    url: str
    published_at: str | None = None
    relevance_score: float = Field(ge=0.0, le=1.0)
31
+
32
+
33
class ContradictingEvidence(BaseModel):
    """A reference to contradicting evidence; ``type`` defaults to "fact_check"."""

    source_name: str
    title: str
    url: str
    type: str = "fact_check"
38
+
39
+
40
class TruthOverride(BaseModel):
    """Record of a truth override and the fake probability before and after it.

    Every field has a default; a default instance has ``applied=False``.
    """

    applied: bool = False

    # Source associated with the override.
    source_url: str = ""
    source_name: str = ""
    similarity: float = 0.0

    # Fake probability on either side of the override.
    fake_prob_before: float = 0.0
    fake_prob_after: float = 0.0
47
+
48
+
49
class ExifSummary(BaseModel):
    """Selected EXIF fields plus the trust adjustment derived from them.

    All metadata fields are optional and default to None.
    """

    make: str | None = None
    model: str | None = None
    datetime_original: str | None = None
    gps_info: str | None = None
    software: str | None = None
    lens_model: str | None = None

    trust_adjustment: int = 0  # negative = more real, positive = more fake
    trust_reason: str = ""
58
+
59
+
60
class LLMExplainabilitySummary(BaseModel):
    """LLM-written explanation: a paragraph, bullet points, and provenance.

    Every field has a default, so an empty summary is valid.
    """

    paragraph: str = ""
    bullets: list[str] = []
    model_used: str = ""
    cached: bool = False
65
+
66
+
67
class VLMComponentScore(BaseModel):
    """A 0–100 score with free-form notes; the score defaults to 75."""

    score: int = Field(default=75, ge=0, le=100)
    notes: str = ""
70
+
71
+
72
class VLMBreakdown(BaseModel):
    """Per-aspect component scores plus provenance of the model that made them.

    Each aspect defaults to a default ``VLMComponentScore``.
    """

    # Scored aspects
    facial_symmetry: VLMComponentScore = VLMComponentScore()
    skin_texture: VLMComponentScore = VLMComponentScore()
    lighting_consistency: VLMComponentScore = VLMComponentScore()
    background_coherence: VLMComponentScore = VLMComponentScore()
    anatomy_hands_eyes: VLMComponentScore = VLMComponentScore()
    context_objects: VLMComponentScore = VLMComponentScore()

    # Provenance
    model_used: str = ""
    cached: bool = False
81
+
82
+
83
class ProcessingSummary(BaseModel):
    """Which stages ran, how long they took in total, and which model was used."""

    # pydantic reserves the `model_` prefix by default; clearing the protected
    # namespaces lets `model_used` be declared as a field.
    model_config = ConfigDict(protected_namespaces=())

    stages_completed: list[str]
    total_duration_ms: int
    model_used: str
config.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings, SettingsConfigDict
2
+
3
+
4
class Settings(BaseSettings):
    """Application configuration; every attribute is a setting with its default value.

    ``model_config`` at the bottom of the class names ``.env`` as the env file and
    sets ``extra="ignore"`` so unknown keys are not rejected.
    """

    # Server
    APP_HOST: str = "0.0.0.0"
    APP_PORT: int = 8000
    DEBUG: bool = False
    CORS_ORIGINS: list[str] = ["http://localhost:5173"]

    # Database
    DATABASE_URL: str = "sqlite:///./deepshield.db"

    # File Upload
    MAX_UPLOAD_SIZE_MB: int = 100
    UPLOAD_DIR: str = "./temp_uploads"
    ALLOWED_IMAGE_TYPES: list[str] = ["image/jpeg", "image/png", "image/webp"]
    # NOTE(review): "video/avi" and "video/mov" are not the registered MIME types
    # (clients commonly send "video/x-msvideo" / "video/quicktime") — confirm
    # against whatever validates uploads.
    ALLOWED_VIDEO_TYPES: list[str] = ["video/mp4", "video/avi", "video/mov", "video/webm"]
    FILE_RETENTION_SECONDS: int = 300

    # AI Models
    IMAGE_MODEL_ID: str = "prithivMLmods/Deep-Fake-Detector-v2-Model"
    TEXT_MODEL_ID: str = "jy46604790/Fake-News-Bert-Detect"
    # Multilingual text model for non-English (Hindi etc.). Leave empty to fall back to TEXT_MODEL_ID.
    TEXT_MULTILANG_MODEL_ID: str = ""
    DEVICE: str = "cpu"
    PRELOAD_MODELS: bool = True  # preload models at startup

    # Phase 13: OCR language list (comma-separated ISO codes, e.g. "en,hi")
    OCR_LANGS: str = "en,hi"

    # News API
    NEWS_API_KEY: str = ""
    NEWS_API_BASE_URL: str = "https://newsdata.io/api/1/news"

    # Reports
    REPORT_DIR: str = "./temp_reports"
    REPORT_TTL_SECONDS: int = 3600  # 1h expiry

    # LLM Explainability (Phase 12)
    LLM_PROVIDER: str = "gemini"  # "gemini" | "openai"
    LLM_API_KEY: str = ""
    LLM_MODEL: str = "gemini-1.5-flash"  # or "gpt-4o-mini"

    # Auth
    # NOTE(review): this default secret is a placeholder; a deployment that does
    # not override JWT_SECRET_KEY signs tokens with a publicly known key.
    JWT_SECRET_KEY: str = "change-me-in-production"
    JWT_ALGORITHM: str = "HS256"
    JWT_EXPIRATION_MINUTES: int = 1440

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")


# Module-level instance created at import time.
settings = Settings()
database.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import create_engine
2
+ from sqlalchemy.orm import DeclarativeBase, sessionmaker
3
+
4
+ from config import settings
5
+
6
# ``check_same_thread`` is only passed for SQLite URLs; every other backend
# gets an empty connect_args mapping.
_is_sqlite = settings.DATABASE_URL.startswith("sqlite")
_connect_args = {"check_same_thread": False} if _is_sqlite else {}

engine = create_engine(settings.DATABASE_URL, connect_args=_connect_args)

# Session factory bound to the engine; autocommit and autoflush are both off.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)
12
+
13
+
14
class Base(DeclarativeBase):
    """Declarative base class; ``init_db`` creates the tables in ``Base.metadata``."""

    pass
16
+
17
+
18
def get_db():
    """Yield a new ``SessionLocal`` session and always close it afterwards.

    Written as a generator: the session is yielded to the consumer and closed in
    the ``finally`` clause, so it is released even if the consumer raises.
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
24
+
25
+
26
def init_db():
    """Create all tables known to ``Base.metadata`` on the module-level engine."""
    # Imported only for its side effect (hence the unused-import waiver);
    # presumably this defines the ORM models on Base before create_all runs.
    from db import models  # noqa: F401
    Base.metadata.create_all(bind=engine)
datasets/__init__.py ADDED
File without changes
datasets/build_manifest.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build a unified train/val/test manifest (70/15/15) across all dataset buckets.
2
+
3
+ Expected input layout (produced by the other scripts in this package):
4
+
5
+ data_root/
6
+ real/
7
+ ffpp_youtube/*.jpg # frames from FFPP original_sequences
8
+ ffhq/*.jpg # FFHQ thumbnails
9
+
10
+ fake/
11
+ ffpp_deepfakes/*.jpg
12
+ ffpp_face2face/*.jpg
13
+ ffpp_faceswap/*.jpg
14
+ ffpp_neuraltextures/*.jpg
15
+ ffpp_faceshifter/*.jpg
16
+ dfdc/*.jpg
17
+
18
+ The manifest is stratified by (label, source) so FFHQ stays represented
19
+ in val/test.
20
+
21
+ Usage:
22
+ python -m backend.training.datasets.build_manifest \
23
+ --data ./data --out ./data/manifest.csv --seed 42
24
+ """
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import csv
29
+ import random
30
+ from collections import defaultdict
31
+ from pathlib import Path
32
+
33
IMG_EXTS = {".jpg", ".jpeg", ".png"}


def collect(data_root: Path) -> list[tuple[str, str, str]]:
    """Collect ``(absolute_path, label, source)`` rows for every image under *data_root*.

    Expects ``data_root/<label>/<source>/...`` with ``label`` in ``("real", "fake")``.
    Label directories that do not exist are skipped. Files are matched
    recursively by (case-insensitive) suffix against ``IMG_EXTS``.

    Rows are returned in a deterministic order (labels in fixed order, sources
    sorted, files sorted). ``Path.rglob`` yields entries in filesystem order,
    which differs between machines; without sorting, the seeded shuffle in
    ``split`` would not produce a reproducible manifest.

    Args:
        data_root: Root directory containing the ``real``/``fake`` folders.

    Returns:
        List of ``(resolved path string, label, source directory name)`` tuples.
    """
    rows: list[tuple[str, str, str]] = []
    for label in ("real", "fake"):
        label_root = data_root / label
        if not label_root.exists():
            continue
        for source_dir in sorted(p for p in label_root.iterdir() if p.is_dir()):
            # sorted() pins the traversal order so the same seed gives the same split.
            for img in sorted(source_dir.rglob("*")):
                if img.suffix.lower() in IMG_EXTS and img.is_file():
                    rows.append((str(img.resolve()), label, source_dir.name))
    return rows
47
+
48
+
49
def split(rows: list[tuple[str, str, str]], seed: int) -> dict[str, list[tuple[str, str, str]]]:
    """Split rows 70/15/15 into train/val/test, independently per ``(label, source)`` bucket.

    Buckets are formed in first-seen order, each bucket is shuffled with one
    shared ``random.Random(seed)``, and the remainder after the train and val
    cuts goes to test. The input list itself is not reordered.
    """
    grouped: dict[tuple[str, str], list[tuple[str, str, str]]] = {}
    for row in rows:
        _, label, source = row
        grouped.setdefault((label, source), []).append(row)

    shuffler = random.Random(seed)
    train: list[tuple[str, str, str]] = []
    val: list[tuple[str, str, str]] = []
    test: list[tuple[str, str, str]] = []
    for members in grouped.values():
        shuffler.shuffle(members)
        count = len(members)
        train_end = int(0.70 * count)
        val_end = train_end + int(0.15 * count)
        train += members[:train_end]
        val += members[train_end:val_end]
        test += members[val_end:]
    return {"train": train, "val": val, "test": test}
65
+
66
+
67
def main() -> None:
    """CLI entry point: scan ``--data``, split per bucket and write the CSV manifest to ``--out``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True, type=Path)
    parser.add_argument("--out", required=True, type=Path)
    parser.add_argument("--seed", type=int, default=42)
    opts = parser.parse_args()

    found = collect(opts.data)
    if not found:
        raise SystemExit(f"No images found under {opts.data}")

    by_split = split(found, opts.seed)
    opts.out.parent.mkdir(parents=True, exist_ok=True)
    with opts.out.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(["path", "label", "source", "split"])
        for split_name, members in by_split.items():
            writer.writerows(
                [path, label, source, split_name] for path, label, source in members
            )

    counts = {split_name: len(members) for split_name, members in by_split.items()}
    print(f"Manifest: {opts.out}")
    print(f"Totals: {counts} (overall {sum(counts.values())})")


if __name__ == "__main__":
    main()
datasets/download_dfdc_sample.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Download a sample of the DFDC (Deepfake Detection Challenge) Preview dataset.
2
+
3
+ The full DFDC is ~470GB; the *preview* release (~5GB, Kaggle) is enough for
4
+ diversity augmentation alongside FFPP.
5
+
6
+ Requires the Kaggle CLI (`pip install kaggle`) and ~/.kaggle/kaggle.json.
7
+
8
+ Usage:
9
+ python -m backend.training.datasets.download_dfdc_sample --output ./data/dfdc_preview
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import shutil
15
+ import subprocess
16
+ import sys
17
+ from pathlib import Path
18
+
19
+
20
def main() -> None:
    """CLI entry point: download a Kaggle competition archive into ``--output``.

    Exits with status 2 when the ``kaggle`` executable is not on PATH; a failing
    download raises ``subprocess.CalledProcessError`` (``check=True``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", required=True, type=Path)
    parser.add_argument(
        "--competition",
        default="deepfake-detection-challenge",
        help="Kaggle competition slug (default: deepfake-detection-challenge preview).",
    )
    opts = parser.parse_args()

    kaggle_bin = shutil.which("kaggle")
    if kaggle_bin is None:
        for hint in (
            "Kaggle CLI not found. Install with: pip install kaggle",
            "Then place kaggle.json in ~/.kaggle/ (chmod 600).",
        ):
            print(hint, file=sys.stderr)
        sys.exit(2)

    opts.output.mkdir(parents=True, exist_ok=True)
    command = [
        kaggle_bin,
        "competitions",
        "download",
        "-c",
        opts.competition,
        "-p",
        str(opts.output),
    ]
    print("Running:", " ".join(command))
    subprocess.run(command, check=True)
    print(f"Downloaded to {opts.output}. Unzip with: unzip *.zip")


if __name__ == "__main__":
    main()
datasets/download_ffhq.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Download the FFHQ 128x128 thumbnail subset from the official Google Drive mirror.
2
+
3
+ Pulls up to N images (default 10k) into the `real` bucket of the training set.
4
+ Falls back to the NVlabs 'ffhq-dataset' helper if available; otherwise expects
5
+ user to run the manual download once.
6
+
7
+ Usage:
8
+ python -m backend.training.datasets.download_ffhq --output ./data/real/ffhq -n 10000
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import shutil
14
+ import subprocess
15
+ import sys
16
+ from pathlib import Path
17
+
18
+
19
def try_nvlabs_helper(output: Path, num: int) -> bool:
    """Run the ``ffhq-dataset`` downloader in *output* if it is installed.

    Args:
        output: Directory used as the helper's working directory.
        num: Requested image count. NOTE(review): not forwarded to the helper;
            the command line below carries no count option.

    Returns:
        ``False`` when the helper executable is not on PATH, ``True`` when it
        was invoked (regardless of its exit status, which is reported on stderr).
    """
    helper = shutil.which("ffhq-dataset")
    if helper is None:
        return False
    cmd = [helper, "--json", "ffhq-dataset-v2.json", "--thumbs", "--num_threads", "4"]
    print("Running:", " ".join(cmd))
    # check=False keeps this best-effort, but a failed run must not be silent:
    # the caller returns right after this function reports True.
    result = subprocess.run(cmd, cwd=output, check=False)
    if result.returncode != 0:
        print(
            f"[!] ffhq-dataset exited with status {result.returncode}; "
            "the download may be incomplete.",
            file=sys.stderr,
        )
    return True
28
+
29
+
30
def main() -> None:
    """CLI entry point: create ``--output`` and run the helper, or print manual instructions.

    Returns normally when the helper was invoked; otherwise prints how to obtain
    the thumbnails and exits with status 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", required=True, type=Path)
    parser.add_argument("-n", "--num", type=int, default=10000)
    opts = parser.parse_args()
    opts.output.mkdir(parents=True, exist_ok=True)

    if not try_nvlabs_helper(opts.output, opts.num):
        instructions = (
            "[!] `ffhq-dataset` helper not installed.",
            "    Install via: pip install ffhq-dataset  (requires gdown)",
            "    Or download thumbnails128x128.zip manually from:",
            "    https://github.com/NVlabs/ffhq-dataset",
            f"    Extract into: {opts.output}",
        )
        for line in instructions:
            print(line)
        sys.exit(1)


if __name__ == "__main__":
    main()
datasets/extract_frames.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Convert FFPP / DFDC videos -> 16 sampled frames at 224x224 RGB.
2
+
3
+ Usage:
4
+ python -m backend.training.datasets.extract_frames \
5
+ --input ./ffpp_data/original_sequences/youtube/raw/videos \
6
+ --output ./ffpp_data/frames/real \
7
+ --label real --frames 16 --size 224
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import csv
13
+ from pathlib import Path
14
+
15
+ import cv2
16
+ import numpy as np
17
+ from tqdm import tqdm
18
+
19
+
20
def sample_frame_indices(total: int, n: int) -> list[int]:
    """Pick up to *n* evenly spaced frame indices from a video of *total* frames.

    The video is divided into *n* equal segments and the midpoint of each
    segment is chosen, clamped to ``total - 1``.

    Args:
        total: Number of frames in the video.
        n: Number of frames wanted.

    Returns:
        ``[]`` when *total* or *n* is not positive, every index when the video
        has at most *n* frames, otherwise *n* ascending indices.
    """
    # n == 0 previously reached the division below and raised ZeroDivisionError.
    if total <= 0 or n <= 0:
        return []
    if total <= n:
        return list(range(total))
    step = total / float(n)
    return [min(total - 1, int(step * i + step / 2)) for i in range(n)]
27
+
28
+
29
def extract_from_video(path: Path, out_dir: Path, n: int, size: int) -> int:
    """Write up to *n* sampled frames of the video at *path* as JPEGs into *out_dir*.

    Frames are chosen with ``sample_frame_indices`` from the frame count the
    container reports, resized to ``size x size`` and saved as
    ``<video stem>_f<index>.jpg`` at JPEG quality 95.

    Args:
        path: Video file to read.
        out_dir: Directory for the frames (created if the video opens).
        n: Number of frames to sample.
        size: Output width and height in pixels.

    Returns:
        Number of frames actually written; 0 if the video cannot be opened.
    """
    cap = cv2.VideoCapture(str(path))
    try:
        if not cap.isOpened():
            return 0
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        indices = set(sample_frame_indices(total, n))
        out_dir.mkdir(parents=True, exist_ok=True)
        # Nothing after the last wanted frame is used, so stop decoding there.
        last_wanted = max(indices, default=-1)

        saved = 0
        i = 0
        while i <= last_wanted:
            ok, frame = cap.read()
            if not ok:
                break
            if i in indices:
                frame = cv2.resize(frame, (size, size), interpolation=cv2.INTER_AREA)
                target = out_dir / f"{path.stem}_f{i:06d}.jpg"
                # imwrite reports failure via its return value; count real writes only.
                if cv2.imwrite(str(target), frame, [cv2.IMWRITE_JPEG_QUALITY, 95]):
                    saved += 1
            i += 1
        return saved
    finally:
        # Release the capture even if resizing or writing raises.
        cap.release()
50
+
51
+
52
def main() -> None:
    """CLI entry point: extract frames from every ``.mp4`` under ``--input``.

    Each video gets its own sub-directory of ``--output`` named after the video
    stem. With ``--manifest``, one ``(path, label, source_video)`` row per
    ``.jpg`` found in those sub-directories is appended to the CSV, and a header
    is written when the file did not exist yet.
    """
    parser = argparse.ArgumentParser(description="Sample N frames per video and resize.")
    parser.add_argument("--input", required=True, type=Path, help="Directory of .mp4 videos (recursive).")
    parser.add_argument("--output", required=True, type=Path, help="Directory to write .jpg frames.")
    parser.add_argument("--label", required=True, choices=["real", "fake"], help="Label tag for manifest.")
    parser.add_argument("--frames", type=int, default=16)
    parser.add_argument("--size", type=int, default=224)
    parser.add_argument("--manifest", type=Path, default=None, help="Optional CSV manifest append path.")
    opts = parser.parse_args()

    videos = list(opts.input.rglob("*.mp4"))
    if not videos:
        print(f"No .mp4 found under {opts.input}")
        return

    want_manifest = opts.manifest is not None
    manifest_rows: list[tuple[str, str, str]] = []
    frames_written = 0
    for video in tqdm(videos, desc=f"extract[{opts.label}]"):
        frame_dir = opts.output / video.stem
        frames_written += extract_from_video(video, frame_dir, opts.frames, opts.size)
        if want_manifest:
            manifest_rows.extend(
                (str(jpg), opts.label, video.stem) for jpg in frame_dir.glob("*.jpg")
            )

    if want_manifest and manifest_rows:
        opts.manifest.parent.mkdir(parents=True, exist_ok=True)
        write_header = not opts.manifest.exists()
        with opts.manifest.open("a", newline="", encoding="utf-8") as handle:
            writer = csv.writer(handle)
            if write_header:
                writer.writerow(["path", "label", "source_video"])
            writer.writerows(manifest_rows)

    print(f"Done. Videos: {len(videos)}, frames written: {frames_written}")


if __name__ == "__main__":
    main()
datasets/procure_all.ps1 ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Phase 11.1 orchestrator for Windows (PowerShell)
# Runs the dataset pipeline with reduced sizes: c40 videos with -n 10,
# 4 frames per video, and FFHQ with -n 100.
# NOTE(review): "Stop" presumably does not abort on a non-zero exit code from
# the python commands below — confirm whether $LASTEXITCODE checks are wanted.
$ErrorActionPreference = "Stop"

# Data roots; override through the ROOT / FFPP environment variables.
$ROOT = if ($env:ROOT) { $env:ROOT } else { ".\data" }
$FFPP = if ($env:FFPP) { $env:FFPP } else { ".\ffpp_data" }

New-Item -ItemType Directory -Force -Path "$ROOT\real" | Out-Null
New-Item -ItemType Directory -Force -Path "$ROOT\fake" | Out-Null
New-Item -ItemType Directory -Force -Path $FFPP | Out-Null

Write-Host "1. FaceForensics++ (highly compressed c40, 10 videos only) -- requires TOS keypress"
python backend\scripts\download_ffpp.py $FFPP -d all -c c40 -t videos -n 10

Write-Host "2. Frame extraction: real (original youtube)"
python -m backend.training.datasets.extract_frames `
    --input "$FFPP\original_sequences\youtube\c40\videos" `
    --output "$ROOT\real\ffpp_youtube" --label real --frames 4 --size 224

Write-Host "3. Frame extraction: fakes (each manipulation family)"
$Families = @("Deepfakes", "Face2Face", "FaceSwap", "NeuralTextures", "FaceShifter")
foreach ($fam in $Families) {
    # Output folders use the lower-cased family name, e.g. fake\ffpp_deepfakes.
    $famLower = $fam.ToLower()
    python -m backend.training.datasets.extract_frames `
        --input "$FFPP\manipulated_sequences\$fam\c40\videos" `
        --output "$ROOT\fake\ffpp_$famLower" --label fake --frames 4 --size 224
}

Write-Host "4. FFHQ thumbnails (real - limited to 100 items)"
python -m backend.training.datasets.download_ffhq --output "$ROOT\real\ffhq" -n 100


# The step labels jump from 4 to 6; there is no step 5 in this script.
Write-Host "6. DFDC preview sample (fake+real)"
python -m backend.training.datasets.download_dfdc_sample --output "$ROOT\_dfdc_raw"
Write-Host "NOTE: You will need to manually unzip + sort DFDC into $ROOT\fake\dfdc and $ROOT\real\dfdc"

Write-Host "7. Build manifest"
python -m backend.training.datasets.build_manifest `
    --data $ROOT --out "$ROOT\manifest.csv" --seed 42

Write-Host "Phase 11.1 complete. See $ROOT\manifest.csv"
datasets/procure_all.sh ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Phase 11.1 orchestrator: download + frame-extract + manifest.
# Total disk target: ~120k labeled images. Expect 60-80GB intermediate, ~30GB frames.
#
# Data roots can be overridden through the ROOT / FFPP environment variables.
# Any failing step aborts the script (set -e).

set -euo pipefail

ROOT="${ROOT:-./data}"
FFPP="${FFPP:-./ffpp_data}"
mkdir -p "$ROOT/real" "$ROOT/fake" "$FFPP"

# 1. FaceForensics++ (raw, videos) -- requires TOS keypress
python backend/scripts/download_ffpp.py "$FFPP" -d all -c raw -t videos

# 2. Frame extraction: real (original youtube)
python -m backend.training.datasets.extract_frames \
    --input "$FFPP/original_sequences/youtube/raw/videos" \
    --output "$ROOT/real/ffpp_youtube" --label real --frames 16 --size 224

# 3. Frame extraction: fakes (each manipulation family)
for fam in Deepfakes Face2Face FaceSwap NeuralTextures FaceShifter; do
    # Lower-case with tr instead of ${fam,,}: that expansion needs bash >= 4 and
    # fails with "bad substitution" on bash 3.2 (the stock macOS bash).
    fam_lower="$(printf '%s' "$fam" | tr '[:upper:]' '[:lower:]')"
    python -m backend.training.datasets.extract_frames \
        --input "$FFPP/manipulated_sequences/$fam/raw/videos" \
        --output "$ROOT/fake/ffpp_${fam_lower}" --label fake --frames 16 --size 224
done

# 4. FFHQ thumbnails (real)
python -m backend.training.datasets.download_ffhq --output "$ROOT/real/ffhq" -n 10000

# 6. DFDC preview sample (fake+real) -- needs Kaggle creds
python -m backend.training.datasets.download_dfdc_sample --output "$ROOT/_dfdc_raw"
# NOTE: unzip + sort into $ROOT/fake/dfdc and $ROOT/real/dfdc per DFDC metadata.json

# 7. Build manifest
python -m backend.training.datasets.build_manifest \
    --data "$ROOT" --out "$ROOT/manifest.csv" --seed 42

echo "Phase 11.1 complete. See $ROOT/manifest.csv"
deepshield_13_5bcf1328.pdf ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ %PDF-1.4
2
+ %���� ReportLab Generated PDF document (opensource)
3
+ 1 0 obj
4
+ <<
5
+ /F1 2 0 R /F2 3 0 R /F3 5 0 R
6
+ >>
7
+ endobj
8
+ 2 0 obj
9
+ <<
10
+ /BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
11
+ >>
12
+ endobj
13
+ 3 0 obj
14
+ <<
15
+ /BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
16
+ >>
17
+ endobj
18
+ 4 0 obj
19
+ <<
20
+ /Contents 18 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 17 0 R /Resources <<
21
+ /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
22
+ >> /Rotate 0 /Trans <<
23
+
24
+ >>
25
+ /Type /Page
26
+ >>
27
+ endobj
28
+ 5 0 obj
29
+ <<
30
+ /BaseFont /Symbol /Name /F3 /Subtype /Type1 /Type /Font
31
+ >>
32
+ endobj
33
+ 6 0 obj
34
+ <<
35
+ /Contents 19 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 17 0 R /Resources <<
36
+ /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
37
+ >> /Rotate 0 /Trans <<
38
+
39
+ >>
40
+ /Type /Page
41
+ >>
42
+ endobj
43
+ 7 0 obj
44
+ <<
45
+ /Outlines 9 0 R /PageMode /UseNone /Pages 17 0 R /Type /Catalog
46
+ >>
47
+ endobj
48
+ 8 0 obj
49
+ <<
50
+ /Author () /CreationDate (D:20260415181653+05'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20260415181653+05'00') /Producer (xhtml2pdf <https://github.com/xhtml2pdf/xhtml2pdf/>)
51
+ /Subject () /Title (DeepShield Analysis Report \204 7771f496-45b1-4c97-8a1a-d9d2492ca67d) /Trapped /False
52
+ >>
53
+ endobj
54
+ 9 0 obj
55
+ <<
56
+ /Count 3 /First 10 0 R /Last 10 0 R /Type /Outlines
57
+ >>
58
+ endobj
59
+ 10 0 obj
60
+ <<
61
+ /Count -4 /Dest [ 4 0 R /Fit ] /First 11 0 R /Last 16 0 R /Parent 9 0 R /Title (DeepShield Analysis Report)
62
+ >>
63
+ endobj
64
+ 11 0 obj
65
+ <<
66
+ /Dest [ 4 0 R /Fit ] /Next 12 0 R /Parent 10 0 R /Title (Verdict)
67
+ >>
68
+ endobj
69
+ 12 0 obj
70
+ <<
71
+ /Count -2 /Dest [ 4 0 R /Fit ] /First 13 0 R /Last 14 0 R /Next 15 0 R /Parent 10 0 R
72
+ /Prev 11 0 R /Title (Text Classification)
73
+ >>
74
+ endobj
75
+ 13 0 obj
76
+ <<
77
+ /Dest [ 4 0 R /Fit ] /Next 14 0 R /Parent 12 0 R /Title (Sensationalism Signals)
78
+ >>
79
+ endobj
80
+ 14 0 obj
81
+ <<
82
+ /Dest [ 4 0 R /Fit ] /Parent 12 0 R /Prev 13 0 R /Title (Extracted Keywords)
83
+ >>
84
+ endobj
85
+ 15 0 obj
86
+ <<
87
+ /Dest [ 4 0 R /Fit ] /Next 16 0 R /Parent 10 0 R /Prev 12 0 R /Title (Trusted Source Cross-Reference \(1\))
88
+ >>
89
+ endobj
90
+ 16 0 obj
91
+ <<
92
+ /Dest [ 6 0 R /Fit ] /Parent 10 0 R /Prev 15 0 R /Title (Processing Summary)
93
+ >>
94
+ endobj
95
+ 17 0 obj
96
+ <<
97
+ /Count 2 /Kids [ 4 0 R 6 0 R ] /Type /Pages
98
+ >>
99
+ endobj
100
+ 18 0 obj
101
+ <<
102
+ /Filter [ /ASCII85Decode /FlateDecode ] /Length 1750
103
+ >>
104
+ stream
105
+ Gb"/(9lo&I&A@sBlm4G[Acr2Y4p^$ca2t\gAsuiHo\c,I9gURE8lSA3M>qu?,XkR;()9nE&%0G$"Ts\%gUFdJ0E[3iXSb#I!k]Slq-+&^_fu5V&-:f'>`[5155TjpXI_!]U"iQd1qrcX0jNK021sk.K_S`f[kfkaR[pr2$LLU)UX&`3>7R17rJ3t':B_<4Kk*Grr8\a:5/Z<<[I]mbfHq28c@Y+3O)t)0k@mu0K^fiq^N*(u.%T.'jl<s/Nh4He2l7^V7l^6+r/e]g]la.!>S?L^o+>>SgBV8H:sX>5A0-l`)&\h4Lk6L5I=)ArV#_bh%^>M_c,"jSErfH[2A&CfKtLn_&K3h)!u;:i'6.H*(apE@/QWkIgF*OaTZ"ZT=me'_?iN-hL[(uHeb"'/B!\/7d068ieW>Y3P8NcsU#;"%eOe_!^-"Xsc?9a'H,u4"nMEm$3F[>c1S8J!`Sh;Ye8pG>de>ac3KpI*&j-(`*[@OB&i#OgJSl=(I-'<c@@S(D;k%W_$;Jl?$^4Y-G*rH-Rk_h_*=&9o`q/eu[3o$--Zc#XoX(sA&CI7RqS'cWBhG2:+ODa!):O6`^NT((K7(:%BVJ3=F%emKe-WmK3EIie5ZAbGXt^Hf,[uurZtImn"m<3AaU$p)@,./&T/aMg@_t-oU(Al5HTNb;0J4E-fqZg*4Y/o@,5%"0ObY@,kKsQdk#2'pZOD8tZrghVcMH[#FI&3f.,FmGKKKNo9?B[@`=FkP`:=oo>;4Vs.^rc%L+kt99^Gd]mfUsWoLD02jLH*WUl.Pb(oF^j?7RUN!m&Us22M!@A<RB<?,"#orPd]<&>ld**8+J._-f-FEVm$t<`HO6GNqd_[bhJ&8qK0d-ZKt;EB60u<VCgOQ;8F:jeCp]E2HpO&5==e.Z2c5.#%nBkfCHsrt>d0-2Z<CdP%-(PZ=R(ET3u6<D1@I(u[6LMn;M%:K3fl4ls;SX'd>:*Z]IT(dG)'7QU\#<V$$AmO6;HncG;?UO[<qf,QJem^o.f$D3^V'_h3dF.f82/[@>u^ecY/FgdnO#RWf_=Js*t;iiO?'fQ:g&@nC/Xhu.;&o1b+?_6-Z%i4;1H5GAUag0*4LfL'2;Sl`["O/H6p>jU\SO4%Ffq^-']m<b(Mo1Vg;h"E$f8Z?_AL@bH31kAKY%KEP\PmsdK2MJ^Dfb%0.sgc_9*[9&'t*;+>uUp/PKbuj>J71&Mh5t,WF_k&]O@P+do^;.WV"r6Kkb#5`,aF$-adPdc+'072](pse[q;.^?I#Q#kci1Qr9Z_U:Q_lQ53n!nIBHrchNfMeP-HF*=<22XdSrZ8j>sP4CR1SEP\Ge.aCh(VEW.)F'<]`"gVnaq<<]K,.uCIMlUqSgV3U</GlN`:3?Ft9S-uHH\_0/'rV&dUBe&=8^c)"F#b/Te`H6Yn1DnZc?T$IiaKe%'S][\*'W-]E<4.cnD8?.XB5)khib.oe$NkDa0D^I+$2a=[rbp"D3eQQqq@TO]aNHTMcGM3B3cn9,9'giRF__Y[<^:+bB3]sACEq,A$s%=n\8Vk/OM\c,W"mZ11,MaZ61]7"M`X1/qmcr-hH,#8+udNN9@p:IAM="9:b-RnD&FAVj^G'kW4tPgO+M25'hLH])Ped#fB*fOs>Te;V8("S^2/7e`3>4E]],alEY#@T-dG.(=/^7(s[bh3%omN/'WKl<"q_K`T7$VrMt.GfckX6]1EfAB]1F6o6g>\:2Etf)rD.XNrRc2pgl"Hr<(1MCd%~>endstream
106
+ endobj
107
+ 19 0 obj
108
+ <<
109
+ /Filter [ /ASCII85Decode /FlateDecode ] /Length 1251
110
+ >>
111
+ stream
112
+ Gau`R;01GN&:Vs/fU'm&SZsB\Z>@pd[^l$Ne'"!6Hco+&(^1n<bt7%'s8H%#$m^MQApR0<`)taLn([eaAHiiuRK&mT!C!?!I`[+[8FM*9+s?gk^Sb`ESFuBheu'`^-k@VZQnjgqaj:g4M2J-c)%`([:iWt%O9mV9ZO6(4"\bX`WWWGJ,s27(iVrdq]@Q&`bX7t`KV@dkk1#U3_]/$nF6>.H%;Q95P;kU[/"Vgs.N%@'=M6kAJN1afF&?E_+rA+1KE+S:4],1QpOr^qg01e<#d,;@\e=!\1-*,1T[41J&^DSg86dC5.#&+tMiZhie$%p]f=sWJ!9ni#^ZR?Gp5lVJY,M<YHnZf[nt2A3ZtRV6dLh4C-*^gI%O$[,o&o;u7[Nu/XEmkj&m4-UHNFF#I0VCUiaS-$S2Gs[@(=.(Fg-V>W+]dGA*V*5[2WS\gs>9t%t32b/^W)[_+r7&3kOLD>8WTI508QU_ZkVRb*l"j_,ie@Wk/$,J'=rjAsRr^aIAp,g4N\@rcW@_7fV)G7.f:C\2aDCnK2"(-Yh-fNKV4ogPJ_Bbno/AG^W)=l`02mHESBSd,2MW2Q,8S^O,7f_^Pj+'$c\[n!'TZ'8A[[6$M/6Vlo9egXU318J0Zl;rXSYgM=-\-3TecfRc]m]FKNI.=E4amT3\PSaWQi;TtrPVN"#t`E;<R<T0FHF)>bkNM&M.:/OC)MK2$$?Jp$`SY/%t"jbj6*+.%6.71qjEsp)j@\0#RIF/1!&^q"O7Ou;8DL^2(?$>18.AWa`<qQ;FS*8d605U,LRjPYl%CQZ"EZ)d6ggmR/\emf.%.#K=ZXlPbU\40kfi-URgEX``iXe1pOV?N=StFNQ>H$Fi,Ak&SQPl+Y^;rG>nArp/_q%9B[r]_;\_^p'[__7OH7)iuf]c[rld?RB/M<r(<QsU%pNedj)1NmPM-_fL1VD1tNQL&@c-=<:"`[Vpojg6J[HJ4:,T\L_]InN3jJke4J(kV<hYN(d]b#E=":iOW#=k#-U%PKO/p'+,)f951AW&jRK9')Q>rP3T8Xk7<ZOVAq$3lpK6YL6tc'D2V%1G(jM8"TncWs=[!hW2(D30g$5(Q/MN1htIgRt\ADhN@$l202Af7(c#1P6?P("GPEU+>VY%=qG1""FA,mioCp,lF3^-AZtKRg/NFX>&kA^rZpnFA<r!,IA42rZQ6YFrrrLL)tME=&"E=g6gSrChSiOfRe!l*<?[tTYGRI@6&N"%Fn3=3;X6Dm0TH~>endstream
113
+ endobj
114
+ xref
115
+ 0 20
116
+ 0000000000 65535 f
117
+ 0000000061 00000 n
118
+ 0000000112 00000 n
119
+ 0000000219 00000 n
120
+ 0000000331 00000 n
121
+ 0000000536 00000 n
122
+ 0000000613 00000 n
123
+ 0000000818 00000 n
124
+ 0000000903 00000 n
125
+ 0000001223 00000 n
126
+ 0000001296 00000 n
127
+ 0000001426 00000 n
128
+ 0000001514 00000 n
129
+ 0000001667 00000 n
130
+ 0000001770 00000 n
131
+ 0000001869 00000 n
132
+ 0000001999 00000 n
133
+ 0000002098 00000 n
134
+ 0000002164 00000 n
135
+ 0000004006 00000 n
136
+ trailer
137
+ <<
138
+ /ID
139
+ [<8e273c2672d813e3cd44109eb1edd604><8e273c2672d813e3cd44109eb1edd604>]
140
+ % ReportLab generated PDF document -- digest (opensource)
141
+
142
+ /Info 8 0 R
143
+ /Root 7 0 R
144
+ /Size 20
145
+ >>
146
+ startxref
147
+ 5349
148
+ %%EOF
deps.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from fastapi import Depends, Header, HTTPException, status
4
+ from sqlalchemy.orm import Session
5
+
6
+ from db.database import get_db
7
+ from db.models import User
8
+ from services.auth_service import decode_token, get_user
9
+
10
+
11
+ def _extract_bearer(authorization: str | None) -> str | None:
12
+ if not authorization:
13
+ return None
14
+ parts = authorization.split()
15
+ if len(parts) != 2 or parts[0].lower() != "bearer":
16
+ return None
17
+ return parts[1]
18
+
19
+
20
def get_current_user(
    authorization: str | None = Header(default=None),
    db: Session = Depends(get_db),
) -> User:
    """Resolve the authenticated user from the ``Authorization`` header.

    Args:
        authorization: Raw ``Authorization`` header value, if present.
        db: Database session supplied by ``get_db``.

    Returns:
        The ``User`` returned by ``get_user`` for the token's ``sub`` claim.

    Raises:
        HTTPException: 401 when the bearer token is missing, when
            ``decode_token`` yields no payload or no ``sub`` claim, when ``sub``
            is not an integer, or when no matching user exists.
    """
    token = _extract_bearer(authorization)
    if not token:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Missing bearer token")
    payload = decode_token(token)
    if not payload or "sub" not in payload:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid or expired token")
    try:
        user_id = int(payload["sub"])
    except (TypeError, ValueError):
        # A non-numeric subject is a bad token, not a server error.
        raise HTTPException(
            status.HTTP_401_UNAUTHORIZED, "Invalid or expired token"
        ) from None
    user = get_user(db, user_id)
    if not user:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "User not found")
    return user
34
+
35
+
36
def optional_current_user(
    authorization: str | None = Header(default=None),
    db: Session = Depends(get_db),
) -> User | None:
    """Resolve the user from the ``Authorization`` header without requiring one.

    Args:
        authorization: Raw ``Authorization`` header value, if present.
        db: Database session supplied by ``get_db``.

    Returns:
        The result of ``get_user`` for the token's ``sub`` claim, or ``None``
        when the bearer token is missing, cannot be decoded, has no ``sub``
        claim, or carries a ``sub`` that is not an integer.
    """
    token = _extract_bearer(authorization)
    if not token:
        return None
    payload = decode_token(token)
    if not payload or "sub" not in payload:
        return None
    try:
        user_id = int(payload["sub"])
    except (TypeError, ValueError):
        # This dependency is best-effort: a malformed subject means "anonymous",
        # not an unhandled exception.
        return None
    return get_user(db, user_id)
download_ffpp.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """ Downloads FaceForensics++ and Deep Fake Detection public data release
3
+ Example usage:
4
+ see -h or https://github.com/ondyari/FaceForensics
5
+ """
6
+ # -*- coding: utf-8 -*-
7
+ import argparse
8
+ import os
9
+ import urllib
10
+ import urllib.request
11
+ import tempfile
12
+ import time
13
+ import sys
14
+ import json
15
+ import random
16
+ from tqdm import tqdm
17
+ from os.path import join
18
+
19
+
20
# URLs and filenames
# Paths of the file-list JSON documents; main() joins them onto the base URL.
FILELIST_URL = 'misc/filelist.json'
DEEPFEAKES_DETECTION_URL = 'misc/deepfake_detection_filenames.json'
# Files downloaded from every Deepfakes model folder.
DEEPFAKES_MODEL_NAMES = ['decoder_A.h5', 'decoder_B.h5', 'encoder.h5',]

# Parameters
# Maps each --dataset choice to its path below the base URL (also used as the
# local sub-directory under the output path).
DATASETS = {
    'original_youtube_videos': 'misc/downloaded_youtube_videos.zip',
    'original_youtube_videos_info': 'misc/downloaded_youtube_videos_info.zip',
    'original': 'original_sequences/youtube',
    'DeepFakeDetection_original': 'original_sequences/actors',
    'Deepfakes': 'manipulated_sequences/Deepfakes',
    'DeepFakeDetection': 'manipulated_sequences/DeepFakeDetection',
    'Face2Face': 'manipulated_sequences/Face2Face',
    'FaceShifter': 'manipulated_sequences/FaceShifter',
    'FaceSwap': 'manipulated_sequences/FaceSwap',
    'NeuralTextures': 'manipulated_sequences/NeuralTextures'
}
# Datasets selected by "--dataset all"; the two original_youtube_videos* zip
# entries are not part of this list.
ALL_DATASETS = ['original', 'DeepFakeDetection_original', 'Deepfakes',
                'DeepFakeDetection', 'Face2Face', 'FaceShifter', 'FaceSwap',
                'NeuralTextures']
# Allowed values for --compression, --type and --server respectively.
COMPRESSION = ['raw', 'c23', 'c40']
TYPE = ['videos', 'masks', 'models']
SERVERS = ['EU', 'EU2', 'CA']
44
+
45
+
46
def parse_args():
    """Parse the command line and attach the server-derived URLs to the namespace.

    Besides the declared options, the returned namespace carries ``tos_url``,
    ``base_url`` and ``deepfakes_model_url``, all built from the chosen server.
    """
    cli = argparse.ArgumentParser(
        description='Downloads FaceForensics v2 public data release.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    cli.add_argument('output_path', type=str, help='Output directory.')
    cli.add_argument('-d', '--dataset', type=str, default='all',
                     help='Which dataset to download, either pristine or '
                          'manipulated data or the downloaded youtube '
                          'videos.',
                     choices=list(DATASETS.keys()) + ['all']
                     )
    cli.add_argument('-c', '--compression', type=str, default='raw',
                     help='Which compression degree. All videos '
                          'have been generated with h264 with a varying '
                          'codec. Raw (c0) videos are lossless compressed.',
                     choices=COMPRESSION
                     )
    cli.add_argument('-t', '--type', type=str, default='videos',
                     help='Which file type, i.e. videos, masks, for our '
                          'manipulation methods, models, for Deepfakes.',
                     choices=TYPE
                     )
    cli.add_argument('-n', '--num_videos', type=int, default=None,
                     help='Select a number of videos number to '
                          "download if you don't want to download the full"
                          ' dataset.')
    cli.add_argument('--server', type=str, default='EU',
                     help='Server to download the data from. If you '
                          'encounter a slow download speed, consider '
                          'changing the server.',
                     choices=SERVERS
                     )
    parsed = cli.parse_args()

    # Root URL for each server name accepted by --server.
    server_roots = {
        'EU': 'http://canis.vc.in.tum.de:8100/',
        'EU2': 'http://kaldir.vc.in.tum.de/faceforensics/',
        'CA': 'http://falas.cmpt.sfu.ca:8100/',
    }
    if parsed.server not in server_roots:
        raise Exception('Wrong server name. Choices: {}'.format(str(SERVERS)))
    root_url = server_roots[parsed.server]

    parsed.tos_url = root_url + 'webpage/FaceForensics_TOS.pdf'
    parsed.base_url = root_url + 'v3/'
    parsed.deepfakes_model_url = (
        root_url + 'v3/manipulated_sequences/' + 'Deepfakes/models/'
    )
    return parsed
97
+
98
+
99
def download_files(filenames, base_url, output_path, report_progress=True):
    """Download every name in *filenames* from *base_url* into *output_path*.

    The output directory is created if needed. With *report_progress* the
    iteration is wrapped in a tqdm progress bar.
    """
    os.makedirs(output_path, exist_ok=True)
    names = tqdm(filenames) if report_progress else filenames
    for name in names:
        download_file(base_url + name, join(output_path, name))
105
+
106
+
107
def reporthook(count, block_size, total_size):
    """Progress callback for ``urllib.request.urlretrieve``; prints one status line.

    Args:
        count: Number of blocks transferred so far; 0 on the first call, which
            only records the start time.
        block_size: Size of one block in bytes.
        total_size: Total size in bytes; may be zero or negative when the
            server does not report a size.
    """
    global start_time
    if count == 0:
        start_time = time.time()
        return
    # Guard against a zero elapsed time on very fast first blocks.
    duration = max(time.time() - start_time, 1e-6)
    progress_size = int(count * block_size)
    speed = int(progress_size / (1024 * duration))
    if total_size > 0:
        # The last block can overshoot the real size; cap the display at 100%.
        percent = min(100, int(progress_size * 100 / total_size))
    else:
        # Unknown total size: no meaningful percentage (and no division by zero).
        percent = 0
    sys.stdout.write("\rProgress: %d%%, %d MB, %d KB/s, %d seconds passed" %
                     (percent, progress_size / (1024 * 1024), speed, duration))
    sys.stdout.flush()
119
+
120
+
121
def download_file(url, out_file, report_progress=False):
    """Download *url* to *out_file* unless that file already exists.

    The data is first written to a temporary file in the destination directory
    and renamed into place once complete, so a partial download never appears
    under the final name. If the download fails or is interrupted, the
    temporary file is removed and the exception propagates.

    Args:
        url: Source URL.
        out_file: Destination path.
        report_progress: When true, ``reporthook`` prints transfer progress.
    """
    if os.path.isfile(out_file):
        tqdm.write('WARNING: skipping download of existing file ' + out_file)
        return

    out_dir = os.path.dirname(out_file)
    fh, out_file_tmp = tempfile.mkstemp(dir=out_dir)
    # Only the reserved name is needed; urlretrieve reopens the path itself.
    os.close(fh)
    try:
        if report_progress:
            urllib.request.urlretrieve(url, out_file_tmp,
                                       reporthook=reporthook)
        else:
            urllib.request.urlretrieve(url, out_file_tmp)
        os.rename(out_file_tmp, out_file)
    except BaseException:
        # Includes Ctrl-C during a long transfer: do not leave tmp files behind.
        if os.path.exists(out_file_tmp):
            os.remove(out_file_tmp)
        raise
135
+
136
+
137
def main(args):
    """Download the selected dataset(s) after the user acknowledges the terms of use.

    Args:
        args: Namespace from ``parse_args`` (reads ``dataset``, ``type``,
            ``compression``, ``num_videos``, ``output_path``, ``tos_url``,
            ``base_url`` and ``deepfakes_model_url``).

    For each selected dataset the file list is fetched from the server,
    optionally truncated to ``num_videos`` entries, and then videos, masks or
    Deepfakes models are downloaded depending on ``type``. Several branches
    ``return`` early, which ends processing of any remaining datasets.
    """
    # TOS
    # Blocks on stdin until the user presses Enter.
    print('By pressing any key to continue you confirm that you have agreed '\
          'to the FaceForensics terms of use as described at:')
    print(args.tos_url)
    print('***')
    print('Press any key to continue, or CTRL-C to exit.')
    _ = input('')

    # Extract arguments
    c_datasets = [args.dataset] if args.dataset != 'all' else ALL_DATASETS
    c_type = args.type
    c_compression = args.compression
    num_videos = args.num_videos
    output_path = args.output_path
    os.makedirs(output_path, exist_ok=True)

    # Check for special dataset cases
    for dataset in c_datasets:
        dataset_path = DATASETS[dataset]
        # Special cases
        if 'original_youtube_videos' in dataset:
            # Here we download the original youtube videos zip file
            print('Downloading original youtube videos.')
            if not 'info' in dataset_path:
                print('Please be patient, this may take a while (~40gb)')
                suffix = ''
            else:
                suffix = 'info'
            # NOTE(review): base_url already ends with '/', so this URL contains
            # a double slash — presumably tolerated by the server.
            download_file(args.base_url + '/' + dataset_path,
                          out_file=join(output_path,
                                        'downloaded_videos{}.zip'.format(
                                            suffix)),
                          report_progress=True)
            return

        # Else: regular datasets
        print('Downloading {} of dataset "{}"'.format(
            c_type, dataset_path
        ))

        # Get filelists and video lengths list from server
        if 'DeepFakeDetection' in dataset_path or 'actors' in dataset_path:
            filepaths = json.loads(urllib.request.urlopen(args.base_url + '/' +
                DEEPFEAKES_DETECTION_URL).read().decode("utf-8"))
            if 'actors' in dataset_path:
                filelist = filepaths['actors']
            else:
                filelist = filepaths['DeepFakesDetection']
        elif 'original' in dataset_path:
            # Load filelist from server
            file_pairs = json.loads(urllib.request.urlopen(args.base_url + '/' +
                FILELIST_URL).read().decode("utf-8"))
            # Flatten the pairs into a single list of names.
            filelist = []
            for pair in file_pairs:
                filelist += pair
        else:
            # Load filelist from server
            file_pairs = json.loads(urllib.request.urlopen(args.base_url + '/' +
                FILELIST_URL).read().decode("utf-8"))
            # Get filelist
            # Each pair yields "a_b"; for non-model types the reversed "b_a" too.
            filelist = []
            for pair in file_pairs:
                filelist.append('_'.join(pair))
                if c_type != 'models':
                    filelist.append('_'.join(pair[::-1]))
        # Maybe limit number of videos for download
        if num_videos is not None and num_videos > 0:
            print('Downloading the first {} videos'.format(num_videos))
            filelist = filelist[:num_videos]

        # Server and local paths
        dataset_videos_url = args.base_url + '{}/{}/{}/'.format(
            dataset_path, c_compression, c_type)
        # The template has two placeholders; the third argument (c_type) is unused.
        dataset_mask_url = args.base_url + '{}/{}/videos/'.format(
            dataset_path, 'masks', c_type)

        if c_type == 'videos':
            dataset_output_path = join(output_path, dataset_path, c_compression,
                                       c_type)
            print('Output path: {}'.format(dataset_output_path))
            filelist = [filename + '.mp4' for filename in filelist]
            download_files(filelist, dataset_videos_url, dataset_output_path)
        elif c_type == 'masks':
            dataset_output_path = join(output_path, dataset_path, c_type,
                                       'videos')
            print('Output path: {}'.format(dataset_output_path))
            if 'original' in dataset:
                # Abort when this dataset was requested explicitly; skip it
                # when iterating over "all".
                if args.dataset != 'all':
                    print('Only videos available for original data. Aborting.')
                    return
                else:
                    print('Only videos available for original data. '
                          'Skipping original.\n')
                    continue
            if 'FaceShifter' in dataset:
                print('Masks not available for FaceShifter. Aborting.')
                return
            filelist = [filename + '.mp4' for filename in filelist]
            download_files(filelist, dataset_mask_url, dataset_output_path)

        # Else: models for deepfakes
        else:
            if dataset != 'Deepfakes' and c_type == 'models':
                print('Models only available for Deepfakes. Aborting')
                return
            dataset_output_path = join(output_path, dataset_path, c_type)
            print('Output path: {}'.format(dataset_output_path))

            # Get Deepfakes models
            for folder in tqdm(filelist):
                folder_filelist = DEEPFAKES_MODEL_NAMES

                # Folder paths
                folder_base_url = args.deepfakes_model_url + folder + '/'
                folder_dataset_output_path = join(dataset_output_path,
                                                  folder)
                download_files(folder_filelist, folder_base_url,
                               folder_dataset_output_path,
                               report_progress=False)   # already done


if __name__ == "__main__":
    args = parse_args()
    main(args)
ela_service.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Error Level Analysis (ELA) — Phase 12.1
2
+
3
+ Re-saves an image at a fixed JPEG quality and diffs against the original to reveal
4
+ per-pixel manipulation artifacts. Regions that were recently edited will show
5
+ higher error levels than untouched areas.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import base64
11
+ import io
12
+
13
+ import cv2
14
+ import numpy as np
15
+ from loguru import logger
16
+ from PIL import Image
17
+
18
+
19
def _compute_ela(pil_img: Image.Image, quality: int = 90, scale: float = 15.0) -> np.ndarray:
    """Compute the Error Level Analysis difference map of an image.

    The image is converted to RGB, re-encoded as JPEG in memory at ``quality``,
    decoded again, and the absolute per-channel difference between the two
    versions is multiplied by ``scale`` and clipped to 0-255.

    Args:
        pil_img: Input image; converted to RGB before processing.
        quality: JPEG quality used for the in-memory re-save.
        scale: Multiplier applied to the raw difference.

    Returns:
        The amplified difference as a uint8 array.
    """
    source_rgb = pil_img.convert("RGB")

    # Round-trip the pixels through an in-memory JPEG at the requested quality
    jpeg_buffer = io.BytesIO()
    source_rgb.save(jpeg_buffer, format="JPEG", quality=quality)
    jpeg_buffer.seek(0)
    recompressed = Image.open(jpeg_buffer).convert("RGB")

    # Work in float32 so the subtraction cannot wrap around like uint8 would
    before = np.array(source_rgb, dtype=np.float32)
    after = np.array(recompressed, dtype=np.float32)

    amplified = np.abs(before - after) * scale
    return np.clip(amplified, 0, 255).astype(np.uint8)
46
+
47
+
48
def generate_ela_base64(pil_img: Image.Image, quality: int = 90, scale: float = 15.0) -> str:
    """Render the ELA difference map as a base64 PNG data URL.

    Args:
        pil_img: Image to analyse.
        quality: JPEG re-save quality forwarded to ``_compute_ela``.
        scale: Difference amplification forwarded to ``_compute_ela``.

    Returns:
        A ``data:image/png;base64,...`` string containing the ELA map.
    """
    ela_map = _compute_ela(pil_img, quality=quality, scale=scale)

    png_buffer = io.BytesIO()
    ela_image = Image.fromarray(ela_map)
    ela_image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode("ascii")

    # Logged as width x height (array shape is rows, columns)
    logger.info(f"ELA map generated ({ela_map.shape[1]}x{ela_map.shape[0]})")
    return f"data:image/png;base64,{encoded}"
62
+
63
+
64
def generate_blended_ela_base64(
    pil_img: Image.Image,
    gradcam_weight: float = 0.6,
    ela_weight: float = 0.4,
    quality: int = 90,
    scale: float = 15.0,
) -> str:
    """Composite the ELA difference map over the original image.

    No Grad-CAM map is computed here: the base layer is the original image,
    and ``gradcam_weight`` is the weight applied to that base layer. The two
    weights were previously ignored in favour of a fixed 0.5/0.5 mix; they are
    now honoured.

    Args:
        pil_img: Image to analyse.
        gradcam_weight: Weight of the base layer (the original RGB pixels).
        ela_weight: Weight of the ELA difference map.
        quality: JPEG re-save quality forwarded to ``_compute_ela``.
        scale: Difference amplification forwarded to ``_compute_ela``.

    Returns:
        A ``data:image/png;base64,...`` string containing the blended image.
    """
    rgb = pil_img.convert("RGB")
    original_arr = np.array(rgb, dtype=np.float32)
    ela_arr = _compute_ela(pil_img, quality=quality, scale=scale).astype(np.float32)

    # Weighted sum of base image and ELA map; clip because weights need not sum to 1
    mixed = original_arr * gradcam_weight + ela_arr * ela_weight
    blended = np.clip(mixed, 0, 255).astype(np.uint8)

    buf = io.BytesIO()
    Image.fromarray(blended).save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode("ascii")

    logger.info(f"Blended ELA generated ({blended.shape[1]}x{blended.shape[0]})")
    return f"data:image/png;base64,{b64}"
exif_service.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """EXIF Metadata Extraction — Phase 12.2
2
+
3
+ Extracts camera metadata from uploaded images and computes a trust adjustment
4
+ score: presence of authentic camera metadata lowers fake probability, while
5
+ evidence of editing software raises it.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Optional
11
+
12
+ from loguru import logger
13
+ from PIL import Image
14
+ from PIL.ExifTags import TAGS, GPSTAGS
15
+
16
+ from schemas.common import ExifSummary
17
+
18
+
19
+ # Software strings that suggest post-processing / generation
20
+ _SUSPICIOUS_SOFTWARE = {
21
+ "adobe photoshop", "photoshop", "gimp", "affinity photo",
22
+ "stable diffusion", "midjourney", "dall-e", "comfyui",
23
+ "automatic1111", "invokeai",
24
+ }
25
+
26
+ # Software strings that are normal camera firmware
27
+ _CAMERA_SOFTWARE = {
28
+ "ver.", "firmware", "camera", "dji", "gopro",
29
+ }
30
+
31
+
32
def _decode_gps(gps_info: dict) -> Optional[str]:
    """Decode an EXIF GPSInfo dict into a ``"lat, lon"`` decimal-degree string.

    The dict may be keyed either by the numeric GPS tag ids (1-4) or by the
    decoded tag names (``GPSLatitude`` etc.). Previously only numeric ids were
    looked up, so a name-keyed dict always produced ``"0.000000, 0.000000"``.

    Args:
        gps_info: GPS IFD contents.

    Returns:
        ``"<lat>, <lon>"`` with six decimals, or ``None`` when latitude or
        longitude is missing or cannot be converted.
    """
    def _lookup(tag_id, tag_name):
        # Prefer the raw numeric id, fall back to the decoded tag name
        value = gps_info.get(tag_id)
        return gps_info.get(tag_name) if value is None else value

    def _to_decimal(values, ref):
        d, m, s = (float(v) for v in values)
        decimal = d + m / 60.0 + s / 3600.0
        if isinstance(ref, bytes):
            ref = ref.decode("ascii", "ignore")
        # Southern and western hemispheres are negative
        return -decimal if ref in ("S", "W") else decimal

    try:
        lat_values = _lookup(2, "GPSLatitude")
        lon_values = _lookup(4, "GPSLongitude")
        if lat_values is None or lon_values is None:
            # No coordinates recorded: do not fabricate 0,0
            return None
        lat = _to_decimal(lat_values, _lookup(1, "GPSLatitudeRef") or "N")
        lon = _to_decimal(lon_values, _lookup(3, "GPSLongitudeRef") or "E")
    except (AttributeError, TypeError, ValueError, ZeroDivisionError):
        return None
    return f"{lat:.6f}, {lon:.6f}"
47
+
48
+
49
def extract_exif(pil_img: Image.Image, raw_bytes: bytes) -> ExifSummary:
    """Extract EXIF metadata and compute a trust adjustment score.

    Trust adjustment logic:
    - Valid Make + Model + DateTimeOriginal → -15 (more likely real camera photo)
    - GPS info present → -5 additional (real photos often have GPS)
    - Suspicious editing software detected → +10 (more likely manipulated)
    - Camera-firmware style Software string → -2
    - No EXIF at all → 0 (inconclusive — many platforms strip EXIF)

    Args:
        pil_img: Decoded image; its ``_getexif()`` is tried first.
        raw_bytes: Original file bytes, used by the optional ``exifread`` fallback.

    Returns:
        An ``ExifSummary`` with the extracted fields, ``trust_adjustment`` and
        ``trust_reason`` filled in.
    """
    summary = ExifSummary()

    try:
        exif_data = pil_img._getexif()
    except Exception:
        # Any failure to read EXIF through Pillow is treated as "no EXIF"
        exif_data = None

    if not exif_data:
        # Try exifread as fallback for formats Pillow doesn't handle well
        try:
            import exifread
            from io import BytesIO
            tags = exifread.process_file(BytesIO(raw_bytes), details=False)
            if tags:
                summary.make = str(tags.get("Image Make", "")).strip() or None
                summary.model = str(tags.get("Image Model", "")).strip() or None
                summary.datetime_original = str(tags.get("EXIF DateTimeOriginal", "")).strip() or None
                summary.software = str(tags.get("Image Software", "")).strip() or None
                summary.lens_model = str(tags.get("EXIF LensModel", "")).strip() or None
        except ImportError:
            logger.debug("exifread not installed, skipping fallback EXIF extraction")
        except Exception as e:
            logger.debug(f"exifread fallback failed: {e}")
    else:
        # Decode Pillow EXIF: map numeric tag ids to names where known
        decoded = {TAGS.get(tag_id, tag_id): value for tag_id, value in exif_data.items()}

        summary.make = str(decoded.get("Make", "")).strip() or None
        summary.model = str(decoded.get("Model", "")).strip() or None
        summary.datetime_original = str(decoded.get("DateTimeOriginal", "")).strip() or None
        summary.software = str(decoded.get("Software", "")).strip() or None
        summary.lens_model = str(decoded.get("LensModel", "")).strip() or None

        # GPS
        gps_raw = decoded.get("GPSInfo")
        if gps_raw and isinstance(gps_raw, dict):
            # _decode_gps looks coordinates up by numeric GPS tag id, so pass the
            # dict with its original keys. Re-keying it to tag names first made
            # every lookup miss and always produced "0.000000, 0.000000".
            summary.gps_info = _decode_gps(gps_raw)

    # ── Trust adjustment scoring ──
    adjustment = 0
    reasons = []

    has_camera_meta = summary.make and summary.model and summary.datetime_original
    if has_camera_meta:
        adjustment -= 15
        reasons.append("valid camera metadata (Make/Model/DateTime)")

    if summary.gps_info:
        adjustment -= 5
        reasons.append("GPS coordinates present")

    if summary.software:
        sw_lower = summary.software.lower()
        # Editing/generation tools take precedence over firmware-looking strings
        if any(s in sw_lower for s in _SUSPICIOUS_SOFTWARE):
            adjustment += 10
            reasons.append(f"editing software detected: {summary.software}")
        elif any(s in sw_lower for s in _CAMERA_SOFTWARE):
            adjustment -= 2
            reasons.append("camera firmware in Software field")

    summary.trust_adjustment = adjustment
    summary.trust_reason = "; ".join(reasons) if reasons else "no EXIF metadata found"

    logger.info(f"EXIF extracted: make={summary.make}, model={summary.model}, "
                f"adjustment={adjustment} ({summary.trust_reason})")
    return summary
file_handler.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import os
5
+ import tempfile
6
+ from typing import Iterable
7
+
8
+ from fastapi import HTTPException, UploadFile, status
9
+
10
+ from config import settings
11
+
12
+ IMAGE_MAGIC_BYTES: dict[bytes, str] = {
13
+ b"\xff\xd8\xff": "image/jpeg",
14
+ b"\x89PNG\r\n\x1a\n": "image/png",
15
+ b"RIFF": "image/webp", # partial; WEBP has 'RIFF....WEBP'
16
+ }
17
+
18
+
19
+ def _detect_mime_by_magic(head: bytes) -> str | None:
20
+ for sig, mime in IMAGE_MAGIC_BYTES.items():
21
+ if head.startswith(sig):
22
+ if mime == "image/webp" and b"WEBP" not in head[:16]:
23
+ continue
24
+ return mime
25
+ return None
26
+
27
+
28
async def read_upload_bytes(
    file: UploadFile,
    allowed_mimes: Iterable[str],
    max_size_mb: int,
) -> tuple[bytes, str]:
    """Read an UploadFile into memory after validating size and type.

    At most ``max_size_mb`` MiB plus one byte is read, so an oversized upload is
    rejected without first being loaded into memory in full.

    Args:
        file: The incoming upload.
        allowed_mimes: MIME types that are accepted.
        max_size_mb: Maximum accepted size in MiB.

    Returns:
        ``(raw_bytes, detected_mime)``.

    Raises:
        HTTPException: 413 when the upload exceeds the limit, 415 when the
            detected (or, failing that, client-declared) type is not allowed.
    """
    # Materialise once: the iterable is used for a membership test and in the
    # error message, which would exhaust a one-shot iterator.
    allowed = tuple(allowed_mimes)
    max_bytes = int(max_size_mb * 1024 * 1024)

    # One extra byte is enough to tell "exactly at the limit" from "over it"
    data = await file.read(max_bytes + 1)
    if len(data) > max_bytes:
        raise HTTPException(
            status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
            detail=f"File too large (> {max_size_mb} MB)",
        )

    # Prefer the magic-byte signature; fall back to the client-declared type,
    # lower-cased like save_upload_to_tempfile does.
    mime = _detect_mime_by_magic(data[:16]) or (file.content_type or "").lower()
    if mime not in allowed:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported type '{mime}'. Allowed: {list(allowed)}",
        )
    return data, mime
51
+
52
+
53
def bytes_to_buffer(data: bytes) -> io.BytesIO:
    """Wrap raw bytes in a seekable in-memory binary stream."""
    buffer = io.BytesIO(data)
    return buffer
55
+
56
+
57
async def save_upload_to_tempfile(
    file: UploadFile,
    allowed_mimes: Iterable[str],
    max_size_mb: int,
    suffix: str = ".mp4",
) -> tuple[str, str]:
    """Stream an UploadFile to a temp file on disk. Returns (path, mime).

    MIME is taken from the client's content_type (no magic-byte check for videos).
    Caller is responsible for deleting the temp file.

    Args:
        file: The incoming upload.
        allowed_mimes: MIME types that are accepted.
        max_size_mb: Maximum accepted size in MiB.
        suffix: File-name suffix for the temp file.

    Raises:
        HTTPException: 415 for a disallowed content type, 413 once more than
            ``max_size_mb`` MiB have been received.
    """
    # Materialise once so a one-shot iterable survives both uses below
    allowed = tuple(allowed_mimes)
    mime = (file.content_type or "").lower()
    if mime not in allowed:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported type '{mime}'. Allowed: {list(allowed)}",
        )

    max_bytes = max_size_mb * 1024 * 1024
    fd, path = tempfile.mkstemp(suffix=suffix, prefix="ds_vid_")
    written = 0
    try:
        with os.fdopen(fd, "wb") as out:
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                written += len(chunk)
                if written > max_bytes:
                    raise HTTPException(
                        status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
                        detail=f"File too large (> {max_size_mb} MB)",
                    )
                out.write(chunk)
    except BaseException:
        # BaseException rather than Exception: asyncio.CancelledError (raised at
        # the await above when the request is cancelled) is not an Exception
        # subclass, and the partial file must be removed in that case too.
        try:
            os.unlink(path)
        except OSError:
            pass
        raise
    return path, mime
generate_colab_nb.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nbformat as nbf
2
+ import os
3
+
4
+ nb = nbf.v4.new_notebook()
5
+
6
+ text = """\
7
+ # DeepShield: FaceForensics++ ViT Training
8
+ Run this entirely in Google Colab.
9
+ **Before running**:
10
+ 1. Go to `Runtime` -> `Change runtime type` -> select **T4 GPU**.
11
+ 2. Run the cells below sequentially.
12
+ """
13
+
14
+ code_install = """\
15
+ !pip install timm transformers datasets accelerate evaluate opencv-python
16
+ """
17
+
18
+ code_ffpp = """\
19
+ # We create the download script inside the Colab environment
20
+ download_script = '''#!/usr/bin/env python
21
+ import argparse
22
+ import os
23
+ import urllib.request
24
+ import tempfile
25
+ import time
26
+ import sys
27
+ import json
28
+ from tqdm import tqdm
29
+ from os.path import join
30
+
31
+ FILELIST_URL = 'misc/filelist.json'
32
+ DEEPFEAKES_DETECTION_URL = 'misc/deepfake_detection_filenames.json'
33
+ DEEPFAKES_MODEL_NAMES = ['decoder_A.h5', 'decoder_B.h5', 'encoder.h5',]
34
+ DATASETS = {
35
+ 'original': 'original_sequences/youtube',
36
+ 'Deepfakes': 'manipulated_sequences/Deepfakes',
37
+ 'Face2Face': 'manipulated_sequences/Face2Face',
38
+ 'FaceShifter': 'manipulated_sequences/FaceShifter',
39
+ 'FaceSwap': 'manipulated_sequences/FaceSwap',
40
+ 'NeuralTextures': 'manipulated_sequences/NeuralTextures'
41
+ }
42
+ ALL_DATASETS = ['original', 'Deepfakes', 'Face2Face', 'FaceShifter', 'FaceSwap', 'NeuralTextures']
43
+ COMPRESSION = ['raw', 'c23', 'c40']
44
+ TYPE = ['videos']
45
+
46
+ def download_file(url, out_file):
47
+ os.makedirs(os.path.dirname(out_file), exist_ok=True)
48
+ if not os.path.isfile(out_file):
49
+ urllib.request.urlretrieve(url, out_file)
50
+
51
+ def main():
52
+ parser = argparse.ArgumentParser()
53
+ parser.add_argument('output_path', type=str)
54
+ parser.add_argument('-d', '--dataset', type=str, default='all')
55
+ parser.add_argument('-c', '--compression', type=str, default='c40')
56
+ parser.add_argument('-t', '--type', type=str, default='videos')
57
+ parser.add_argument('-n', '--num_videos', type=int, default=50) # Small amount for tutorial
58
+ args = parser.parse_args()
59
+
60
+ base_url = 'http://kaldir.vc.in.tum.de/faceforensics/v3/'
61
+
62
+ datasets = [args.dataset] if args.dataset != 'all' else ALL_DATASETS
63
+ for dataset in datasets:
64
+ dataset_path = DATASETS[dataset]
65
+ print(f'Downloading {args.compression} of {dataset}')
66
+
67
+ file_pairs = json.loads(urllib.request.urlopen(base_url + FILELIST_URL).read().decode("utf-8"))
68
+ filelist = []
69
+ if 'original' in dataset_path:
70
+ for pair in file_pairs:
71
+ filelist += pair
72
+ else:
73
+ for pair in file_pairs:
74
+ filelist.append('_'.join(pair))
75
+ filelist.append('_'.join(pair[::-1]))
76
+
77
+ filelist = filelist[:args.num_videos]
78
+ dataset_videos_url = base_url + f'{dataset_path}/{args.compression}/{args.type}/'
79
+ dataset_output_path = join(args.output_path, dataset_path, args.compression, args.type)
80
+
81
+ for filename in tqdm(filelist):
82
+ download_file(dataset_videos_url + filename + ".mp4", join(dataset_output_path, filename + ".mp4"))
83
+
84
+ if __name__ == "__main__":
85
+ main()
86
+ '''
87
+
88
+ with open("download_ffpp.py", "w") as f:
89
+ f.write(download_script)
90
+
91
+ !python download_ffpp.py ./data -d all -c c40 -t videos -n 50
92
+ """
93
+
94
+ code_extract = """\
95
+ import cv2
96
+ import os
97
+ import glob
98
+ from tqdm import tqdm
99
+
100
+ def extract_frames(video_folder, output_folder, label, max_frames=4):
101
+ os.makedirs(output_folder, exist_ok=True)
102
+ videos = glob.glob(os.path.join(video_folder, "*.mp4"))
103
+
104
+ for vid_path in tqdm(videos, desc=f"Extracting {label}"):
105
+ vid_name = os.path.basename(vid_path).replace('.mp4','')
106
+ cap = cv2.VideoCapture(vid_path)
107
+ count = 0
108
+ while cap.isOpened() and count < max_frames:
109
+ ret, frame = cap.read()
110
+ if not ret: break
111
+ frame = cv2.resize(frame, (224, 224))
112
+ out_path = os.path.join(output_folder, f"{vid_name}_f{count}.jpg")
113
+ cv2.imwrite(out_path, frame)
114
+ count += 1
115
+ cap.release()
116
+
117
+ # Extract Real
118
+ extract_frames('./data/original_sequences/youtube/c40/videos', './dataset/real', 'real')
119
+
120
+ # Extract Fakes
121
+ fakes = ['Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures']
122
+ for f in fakes:
123
+ extract_frames(f'./data/manipulated_sequences/{f}/c40/videos', './dataset/fake', 'fake')
124
+ """
125
+
126
+ code_train = """\
127
+ import numpy as np
128
+ from datasets import load_dataset
129
+ from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer
130
+ import torch
131
+
132
+ # 1. Load Dataset
133
+ dataset = load_dataset('imagefolder', data_dir='./dataset')
134
+ # Split into train/validation
135
+ dataset = dataset['train'].train_test_split(test_size=0.1)
136
+
137
+ # 2. Preprocessor
138
+ model_name_or_path = 'google/vit-base-patch16-224-in21k'
139
+ processor = ViTImageProcessor.from_pretrained(model_name_or_path)
140
+
141
+ def transform(example_batch):
142
+ # Take a list of PIL images and turn them to pixel values
143
+ inputs = processor([x.convert("RGB") for x in example_batch['image']], return_tensors='pt')
144
+ inputs['labels'] = example_batch['label']
145
+ return inputs
146
+
147
+ prepared_ds = dataset.with_transform(transform)
148
+
149
+ def collate_fn(batch):
150
+ return {
151
+ 'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
152
+ 'labels': torch.tensor([x['labels'] for x in batch])
153
+ }
154
+
155
+ # 3. Load Model
156
+ labels = dataset['train'].features['label'].names
157
+ model = ViTForImageClassification.from_pretrained(
158
+ model_name_or_path,
159
+ num_labels=len(labels),
160
+ id2label={str(i): c for i, c in enumerate(labels)},
161
+ label2id={c: str(i) for i, c in enumerate(labels)}
162
+ )
163
+
164
+ training_args = TrainingArguments(
165
+ output_dir="./vit-deepshield",
166
+ per_device_train_batch_size=16,
167
+ eval_strategy="steps",
168
+ num_train_epochs=3,
169
+ fp16=True, # Mixed precision for speed
170
+ save_steps=100,
171
+ eval_steps=100,
172
+ logging_steps=10,
173
+ learning_rate=2e-4,
174
+ save_total_limit=2,
175
+ remove_unused_columns=False,
176
+ push_to_hub=False,
177
+ load_best_model_at_end=True,
178
+ )
179
+
180
+ import evaluate
181
+ metric = evaluate.load("accuracy")
182
+ def compute_metrics(p):
183
+ return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)
184
+
185
+ trainer = Trainer(
186
+ model=model,
187
+ args=training_args,
188
+ data_collator=collate_fn,
189
+ compute_metrics=compute_metrics,
190
+ train_dataset=prepared_ds["train"],
191
+ eval_dataset=prepared_ds["test"],
192
+ )
193
+
194
+ # 4. Train
195
+ train_results = trainer.train()
196
+ trainer.save_model("deepshield_vit_model")
197
+ processor.save_pretrained("deepshield_vit_model")
198
+ trainer.log_metrics("train", train_results.metrics)
199
+ trainer.save_metrics("train", train_results.metrics)
200
+ trainer.save_state()
201
+ print("Training Complete! The model is saved to ./deepshield_vit_model")
202
+ """
203
+
204
# Assemble the notebook: one markdown intro followed by the four code cells
nb['cells'] = [
    nbf.v4.new_markdown_cell(text),
    nbf.v4.new_code_cell(code_install),
    nbf.v4.new_code_cell(code_ffpp),
    nbf.v4.new_code_cell(code_extract),
    nbf.v4.new_code_cell(code_train)
]

# Write the notebook next to this script instead of a machine-specific
# absolute Windows path, so the generator runs on any checkout.
output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Colab_ViT_Training.ipynb')
with open(output_path, 'w', encoding='utf-8') as f:
    nbf.write(nb, f)
heatmap_generator.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import io
5
+ from typing import Optional
6
+
7
+ import cv2
8
+ import numpy as np
9
+ import torch
10
+ from loguru import logger
11
+ from PIL import Image
12
+ from pytorch_grad_cam import GradCAMPlusPlus
13
+ from pytorch_grad_cam.utils.image import show_cam_on_image
14
+ from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
15
+
16
+ from config import settings
17
+ from models.model_loader import get_model_loader
18
+
19
+
20
class _HFLogitsWrapper(torch.nn.Module):
    """Adapter that makes a HuggingFace classifier return a bare logits tensor.

    pytorch_grad_cam works on tensor outputs, whereas the wrapped model returns
    an output object; ``forward`` unwraps its ``.logits`` attribute.
    """

    def __init__(self, model: torch.nn.Module) -> None:
        super().__init__()
        # Registered as a submodule so hooks and parameters remain reachable
        self.model = model

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:  # type: ignore[override]
        outputs = self.model(pixel_values=pixel_values)
        return outputs.logits
31
+
32
+
33
def _vit_reshape_transform(tensor: torch.Tensor, height: int = 14, width: int = 14) -> torch.Tensor:
    """Turn ViT token activations into a channel-first spatial map.

    The first token along dim 1 (the CLS token) is discarded and the remaining
    tokens are laid out on a ``height`` x ``width`` grid.

    Args:
        tensor: Activations of shape (B, 1 + height*width, C).
        height: Number of patch rows.
        width: Number of patch columns.

    Returns:
        A tensor of shape (B, C, height, width).
    """
    patch_tokens = tensor[:, 1:, :]
    batch, _, channels = patch_tokens.shape
    spatial = patch_tokens.reshape(batch, height, width, channels)
    # (B, H, W, C) -> (B, C, H, W)
    return spatial.permute(0, 3, 1, 2)
42
+
43
+
44
def _preprocess_for_cam(pil_img: Image.Image, processor) -> tuple[torch.Tensor, np.ndarray]:
    """Prepare the model input and the matching background image for an overlay.

    Args:
        pil_img: Image to explain. Non-RGB images are converted to RGB so the
            returned array always has three channels.
        processor: Image processor called as ``processor(images=..., return_tensors="pt")``.

    Returns:
        ``(input_tensor, rgb)`` where ``input_tensor`` is the processor's
        ``pixel_values`` moved to ``settings.DEVICE`` and ``rgb`` is a float32
        (H, W, 3) array in [0, 1] resized to the processor's height/width.
    """
    # A grayscale or RGBA image would yield an (H, W) or (H, W, 4) array, which
    # cannot be blended with the three-channel CAM colour map downstream.
    if pil_img.mode != "RGB":
        pil_img = pil_img.convert("RGB")

    inputs = processor(images=pil_img, return_tensors="pt")
    input_tensor = inputs["pixel_values"].to(settings.DEVICE)

    # Fall back to 224x224 when the processor exposes no height/width dict
    size = getattr(processor, "size", {"height": 224, "width": 224})
    h = size.get("height", 224) if isinstance(size, dict) else 224
    w = size.get("width", 224) if isinstance(size, dict) else 224

    resized = pil_img.resize((w, h), Image.BILINEAR)
    rgb = np.array(resized).astype(np.float32) / 255.0  # (H,W,3) in [0,1]
    return input_tensor, rgb
58
+
59
+
60
def _encode_overlay_to_base64(overlay: np.ndarray) -> str:
    """Serialise an image array as a PNG and return it as a base64 data URL.

    Args:
        overlay: uint8 (H,W,3) RGB image array.

    Returns:
        A ``data:image/png;base64,...`` string.
    """
    png_buffer = io.BytesIO()
    image = Image.fromarray(overlay)
    image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode("ascii")
    return f"data:image/png;base64,{encoded}"
66
+
67
+
68
def _compute_gradcam_pp(
    pil_img: Image.Image,
    target_class_idx: Optional[int] = None,
) -> tuple[np.ndarray, np.ndarray]:
    """Run Grad-CAM++ on the image classifier using up to the last 3 encoder layers.

    Args:
        pil_img: Image to explain.
        target_class_idx: Class index to explain; when ``None`` no explicit
            target is passed to the CAM call.

    Returns:
        ``(grayscale_cam, rgb_float)``: the first CAM map of the batch and the
        float RGB image produced by ``_preprocess_for_cam``.
    """
    model, processor = get_model_loader().load_image_model()

    # Grad-CAM needs gradients, so enable them on every parameter
    model.eval()
    for param in model.parameters():
        param.requires_grad_(True)

    input_tensor, rgb_float = _preprocess_for_cam(pil_img, processor)

    # Patches per side of the token grid
    grid = int(model.config.image_size / model.config.patch_size)

    # Hook the pre-attention layer norm of the last (up to) three encoder
    # blocks, ordered from the final block backwards
    encoder_layers = model.vit.encoder.layer
    last_n = min(3, len(encoder_layers))
    target_layers = [
        encoder_layers[-offset].layernorm_before
        for offset in range(1, last_n + 1)
    ]

    def _to_spatial(hidden: torch.Tensor) -> torch.Tensor:
        return _vit_reshape_transform(hidden, grid, grid)

    if target_class_idx is None:
        targets = None
    else:
        targets = [ClassifierOutputTarget(int(target_class_idx))]

    with GradCAMPlusPlus(
        model=_HFLogitsWrapper(model),
        target_layers=target_layers,
        reshape_transform=_to_spatial,
    ) as cam:
        cam_batch = cam(input_tensor=input_tensor, targets=targets)
        grayscale_cam = cam_batch[0]  # (H,W) in [0,1]

    return grayscale_cam, rgb_float
108
+
109
+
110
def generate_heatmap_base64(
    pil_img: Image.Image,
    target_class_idx: Optional[int] = None,
) -> str:
    """Return the Grad-CAM++ overlay for ``pil_img`` as a base64 PNG data URL.

    Args:
        pil_img: Image to explain.
        target_class_idx: Optional class index forwarded to ``_compute_gradcam_pp``.
    """
    cam_map, background = _compute_gradcam_pp(pil_img, target_class_idx)
    overlay = show_cam_on_image(background, cam_map, use_rgb=True)
    logger.info(f"Heatmap generated ({overlay.shape[0]}x{overlay.shape[1]})")
    encoded_overlay = _encode_overlay_to_base64(overlay)
    return encoded_overlay
119
+
120
+
121
def generate_boxes_base64(
    pil_img: Image.Image,
    target_class_idx: Optional[int] = None,
    top_k: int = 5,
    threshold: float = 0.4,
) -> str:
    """Draw boxes around the strongest Grad-CAM++ activation regions.

    The CAM map is thresholded, external contours are extracted, and the
    ``top_k`` largest regions are outlined on the resized input image. Each box
    is coloured by its mean activation (red >= 0.7, orange >= 0.5, else yellow)
    and labelled with that mean as a percentage.

    Args:
        pil_img: Image to explain.
        target_class_idx: Optional class index forwarded to ``_compute_gradcam_pp``.
        top_k: Maximum number of regions to draw.
        threshold: Minimum CAM value for a pixel to count as activated.

    Returns:
        A ``data:image/png;base64,...`` string; the plain image when no region
        passes the threshold.
    """
    grayscale_cam, rgb_float = _compute_gradcam_pp(pil_img, target_class_idx)

    base_img = (rgb_float * 255).astype(np.uint8).copy()

    # Threshold the heatmap to find activated regions
    binary = (grayscale_cam >= threshold).astype(np.uint8) * 255
    # [-2] selects the contour list on both OpenCV 3.x (3 return values) and 4.x (2)
    contours = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    if not contours:
        logger.info("No significant activation regions found for bounding boxes")
        return _encode_overlay_to_base64(base_img)

    # Sort by area descending, take top_k
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:top_k]

    # Color by mean activation intensity within each box
    for cnt in contours:
        x, y, bw, bh = cv2.boundingRect(cnt)
        region_activation = grayscale_cam[y:y + bh, x:x + bw].mean()

        if region_activation >= 0.7:
            color = (220, 40, 40)    # red — high suspicion
        elif region_activation >= 0.5:
            color = (240, 140, 20)   # orange — medium
        else:
            color = (230, 200, 40)   # yellow — lower

        cv2.rectangle(base_img, (x, y), (x + bw, y + bh), color, 2)
        label = f"{region_activation * 100:.0f}%"
        # Keep the label at least 12 px from the top edge so it stays visible
        cv2.putText(base_img, label, (x, max(y - 6, 12)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1, cv2.LINE_AA)

    logger.info(f"Bounding boxes generated: {len(contours)} regions")
    return _encode_overlay_to_base64(base_img)
image_service.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ from dataclasses import dataclass
5
+ from typing import Tuple
6
+
7
+ import torch
8
+ from loguru import logger
9
+ from PIL import Image
10
+
11
+ from config import settings
12
+ from models.model_loader import get_model_loader
13
+
14
+
15
@dataclass
class ImageClassification:
    """Result of running the image classifier on one image."""

    # Name of the highest-probability class
    label: str
    # Probability assigned to `label`
    confidence: float
    # Probability of every class, keyed by class name
    all_scores: dict[str, float]
20
+
21
+
22
def load_image_from_bytes(data: bytes) -> Image.Image:
    """Decode encoded image bytes into an RGB PIL image.

    Images already in RGB mode are returned as opened; any other mode is
    converted to RGB.
    """
    image = Image.open(io.BytesIO(data))
    return image if image.mode == "RGB" else image.convert("RGB")
27
+
28
+
29
def classify_image(pil_img: Image.Image) -> ImageClassification:
    """Classify a PIL image with the loaded image model.

    Returns:
        An ``ImageClassification`` holding the top label, its softmax
        probability, and the probability of every class.
    """
    model, processor = get_model_loader().load_image_model()

    encoded = processor(images=pil_img, return_tensors="pt")
    model_inputs = {name: tensor.to(settings.DEVICE) for name, tensor in encoded.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = model(**model_inputs).logits  # (1, num_labels)
        probs = torch.softmax(logits, dim=-1)[0]

    # Fall back to the stringified index when the config has no name for a class
    id2label: dict[int, str] = getattr(model.config, "id2label", {})
    all_scores = {}
    for idx, prob in enumerate(probs):
        all_scores[id2label.get(idx, str(idx))] = float(prob.item())

    top_idx = int(torch.argmax(probs).item())
    top_label = id2label.get(top_idx, str(top_idx))
    top_conf = float(probs[top_idx].item())

    logger.info(f"Image classify → {top_label} @ {top_conf:.3f}")
    return ImageClassification(label=top_label, confidence=top_conf, all_scores=all_scores)
50
+
51
+
52
def preprocess_and_classify(raw_bytes: bytes) -> Tuple[Image.Image, ImageClassification]:
    """Decode image bytes and classify the result in one call.

    Returns:
        ``(image, classification)``. The decoded PIL image is returned as well
        so later steps can reuse it without decoding the bytes again.
    """
    image = load_image_from_bytes(raw_bytes)
    return image, classify_image(image)
llm_explainer.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LLM Explainability Card — Phase 12.3
2
+
3
+ Generates a plain-English summary paragraph + 3 key-signal bullets from the
4
+ full analysis payload. Supports Gemini (default) and OpenAI providers.
5
+ Results are cached per record_id to avoid re-spending tokens.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from abc import ABC, abstractmethod
12
+ from functools import lru_cache
13
+ from typing import Any
14
+
15
+ from loguru import logger
16
+
17
+ from config import settings
18
+ from schemas.common import LLMExplainabilitySummary
19
+
20
+ # ── In-memory cache keyed by record_id ──
21
+ _cache: dict[str, LLMExplainabilitySummary] = {}
22
+
23
+
24
+ _PROMPT_TEMPLATE = """\
25
+ You are DeepShield's explainability engine. Given the JSON analysis payload below,
26
+ write a concise, accessible summary for a non-technical user.
27
+
28
+ **Output format (strict JSON only — no markdown fences):**
29
+ {{
30
+ "paragraph": "<2-3 sentence plain-English summary of the verdict and key signals>",
31
+ "bullets": [
32
+ "<key signal 1>",
33
+ "<key signal 2>",
34
+ "<key signal 3>"
35
+ ]
36
+ }}
37
+
38
+ Rules:
39
+ - Be factual. State what the analysis found, not what you speculate.
40
+ - Reference specific indicators (e.g. "GAN artifact score", "EXIF metadata", "sensationalism level").
41
+ - If the verdict is "Likely Authentic", reassure the user and explain why.
42
+ - If the verdict is "Likely Manipulated" or "Suspicious", highlight the strongest evidence.
43
+ - Keep the paragraph under 60 words. Each bullet under 20 words.
44
+
45
+ **Analysis payload:**
46
+ {payload_json}
47
+ """
48
+
49
+
50
class _LLMProvider(ABC):
    """Interface implemented by every LLM backend used for explanations."""

    @abstractmethod
    def generate(self, prompt: str) -> str:
        """Send prompt to LLM and return raw text response."""
        ...
54
+
55
+
56
class _GeminiProvider(_LLMProvider):
    """LLM backend backed by the ``google.generativeai`` client."""

    def __init__(self) -> None:
        # Imported lazily so the dependency is only needed when this provider is used
        import google.generativeai as genai

        genai.configure(api_key=settings.LLM_API_KEY)
        self._model = genai.GenerativeModel(settings.LLM_MODEL)

    def generate(self, prompt: str) -> str:
        """Return the text of the model's response to ``prompt``."""
        return self._model.generate_content(prompt).text
65
+
66
+
67
class _OpenAIProvider(_LLMProvider):
    """LLM backend backed by the OpenAI chat-completions client."""

    def __init__(self) -> None:
        # Imported lazily so the dependency is only needed when this provider is used
        from openai import OpenAI
        self._client = OpenAI(api_key=settings.LLM_API_KEY)

    def generate(self, prompt: str) -> str:
        """Return the first choice's message text for ``prompt``.

        An absent message body is returned as an empty string so the declared
        ``str`` return type always holds.
        """
        response = self._client.chat.completions.create(
            model=settings.LLM_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=300,
        )
        content = response.choices[0].message.content
        # content can be None; normalise it instead of leaking None to the caller
        return content or ""
80
+
81
+
82
@lru_cache(maxsize=1)
def _get_provider() -> _LLMProvider:
    """Build the configured LLM provider once and reuse it on later calls.

    ``settings.LLM_PROVIDER`` equal to "openai" (case-insensitive) selects the
    OpenAI backend; any other value selects Gemini.
    """
    use_openai = settings.LLM_PROVIDER.lower() == "openai"
    provider_cls = _OpenAIProvider if use_openai else _GeminiProvider
    return provider_cls()
89
+
90
+
91
def _parse_llm_response(raw: str) -> tuple[str, list[str]]:
    """Parse the LLM's JSON response into (paragraph, bullets).

    Handles responses wrapped in markdown fences and responses that surround
    the JSON object with extra prose.

    Args:
        raw: Raw text returned by the provider.

    Returns:
        ``(paragraph, bullets)`` with at most three bullets, all as strings.

    Raises:
        json.JSONDecodeError: When no JSON object can be extracted.
    """
    stripped = raw.strip()
    text = stripped
    # Strip markdown code fences if present
    if text.startswith("```"):
        text = "\n".join(
            line for line in text.split("\n") if not line.strip().startswith("```")
        ).strip()

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span of the raw reply; this covers
        # prose around the JSON and fences written on a single line.
        start, end = stripped.find("{"), stripped.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(stripped[start:end + 1])

    if not isinstance(parsed, dict):
        # A bare list/string/number is valid JSON but not the requested shape
        raise json.JSONDecodeError("expected a JSON object", text, 0)

    paragraph = parsed.get("paragraph") or ""
    if not isinstance(paragraph, str):
        paragraph = str(paragraph)

    bullets = parsed.get("bullets")
    if bullets is None:
        bullets = []
    elif not isinstance(bullets, list):
        bullets = [bullets]
    return paragraph, [str(item) for item in bullets[:3]]
109
+
110
+
111
def generate_llm_summary(
    payload: dict[str, Any],
    record_id: str | None = None,
) -> LLMExplainabilitySummary:
    """Produce a plain-English explanation card for an analysis result.

    Args:
        payload: The analysis response dict that is serialised into the prompt.
        record_id: Optional cache key; a previously generated summary for the
            same id is returned with ``cached`` set to True.

    Returns:
        An ``LLMExplainabilitySummary``. When no API key is configured, or the
        provider call or parsing fails, a placeholder summary is returned
        instead of raising.
    """
    # Serve from the in-memory cache when possible
    if record_id and record_id in _cache:
        logger.debug(f"LLM summary cache hit for record_id={record_id}")
        hit = _cache[record_id]
        hit.cached = True
        return hit

    # Without credentials there is nothing to call
    if not settings.LLM_API_KEY:
        logger.warning("LLM_API_KEY not set — skipping LLM explainability card")
        return LLMExplainabilitySummary(
            paragraph="LLM explanation unavailable (no API key configured).",
            bullets=[],
            model_used="none",
        )

    # Copy everything except "explainability", then re-add that section minus
    # its *_base64 entries to keep the prompt small. A non-dict explainability
    # value is left out entirely.
    slim_payload = {}
    for key, value in payload.items():
        if key != "explainability":
            slim_payload[key] = value
    explainability = payload.get("explainability")
    if isinstance(explainability, dict):
        slim_payload["explainability"] = {
            name: item
            for name, item in explainability.items()
            if not name.endswith("_base64")
        }

    payload_json = json.dumps(slim_payload, indent=2, default=str)
    prompt = _PROMPT_TEMPLATE.format(payload_json=payload_json)

    try:
        raw_response = _get_provider().generate(prompt)
        paragraph, bullets = _parse_llm_response(raw_response)

        summary = LLMExplainabilitySummary(
            paragraph=paragraph,
            bullets=bullets,
            model_used=f"{settings.LLM_PROVIDER}/{settings.LLM_MODEL}",
        )

        # Remember the result so the same record is not explained twice
        if record_id:
            _cache[record_id] = summary

        logger.info(f"LLM summary generated via {settings.LLM_PROVIDER}/{settings.LLM_MODEL}")
        return summary

    except json.JSONDecodeError as exc:
        logger.error(f"LLM returned unparseable JSON: {exc}")
        return LLMExplainabilitySummary(
            paragraph="Analysis complete. See the detailed indicators below for specifics.",
            bullets=["LLM explanation could not be parsed"],
            model_used=f"{settings.LLM_PROVIDER}/{settings.LLM_MODEL}",
        )
    except Exception as exc:
        logger.error(f"LLM explainer failed: {exc}")
        return LLMExplainabilitySummary(
            paragraph="Analysis complete. See the detailed indicators below for specifics.",
            bullets=["LLM explanation temporarily unavailable"],
            model_used="error",
        )
main.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from contextlib import asynccontextmanager
3
+
4
+ from fastapi import FastAPI
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ from loguru import logger
7
+
8
+ from api.router import api_router
9
+ from config import settings
10
+ from db.database import init_db
11
+ from models.model_loader import get_model_loader
12
+ from services.report_service import cleanup_expired
13
+
14
+
15
async def _report_cleanup_loop():
    """Call ``cleanup_expired`` forever, pausing ten minutes between passes.

    A pass that raises is logged as a warning and the loop keeps going; the
    coroutine only ends when its task is cancelled.
    """
    pause_seconds = 600  # every 10 min
    while True:
        try:
            cleanup_expired()
        except Exception as exc:  # noqa: BLE001
            logger.warning(f"Report cleanup error: {exc}")
        await asyncio.sleep(pause_seconds)
22
+
23
+
24
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup and shutdown work around the application's lifetime.

    Startup: initialise the database, optionally preload the phase-1 models
    (controlled by ``settings.PRELOAD_MODELS``) and start the background
    report-cleanup task.

    Shutdown: cancel the cleanup task and wait for it to finish unwinding.
    The teardown sits in ``finally`` so it also runs when an exception is
    thrown into the context manager at the ``yield``.
    """
    logger.info("Starting DeepShield backend")
    init_db()
    logger.info("Database initialized")
    if settings.PRELOAD_MODELS:
        get_model_loader().preload_phase1()
    else:
        logger.info("PRELOAD_MODELS=false — models will load on first use")
    task = asyncio.create_task(_report_cleanup_loop())
    try:
        yield
    finally:
        task.cancel()
        try:
            # cancel() only *requests* cancellation; awaiting the task lets it
            # actually finish so it is not left pending at loop shutdown.
            await task
        except asyncio.CancelledError:
            pass
        logger.info("Shutting down DeepShield backend")
37
+
38
+
39
# Application object; startup/shutdown work is delegated to ``lifespan``.
app = FastAPI(
    lifespan=lifespan,
    title="DeepShield API",
    description="Explainable AI-based multimodal misinformation detection",
    version="0.1.0",
)

# CORS: allowed origins come from configuration; every method and header is
# accepted and credentials are allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.CORS_ORIGINS,
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)

# Mount all API routes.
app.include_router(api_router)
55
+
56
+
57
@app.get("/")
def root():
    """Return a small index document with the service name and key paths."""
    index = {"service": "DeepShield"}
    index["docs"] = "/docs"
    index["health"] = "/api/v1/health"
    return index
model_loader.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from threading import Lock
4
+ from typing import Optional, Tuple
5
+
6
+ from loguru import logger
7
+
8
+ from config import settings
9
+
10
+
11
class ModelLoader:
    """Singleton holder for lazily loaded AI models.

    Both the singleton construction and every ``load_*`` method use
    double-checked locking, so concurrent first calls build each object once.
    All loaders share a single lock: model loads are therefore serialised,
    which also avoids several heavyweight loads running at the same time.
    """

    _instance: Optional["ModelLoader"] = None
    # Guards creation of the singleton instance.
    _lock: Lock = Lock()
    # Guards the lazy model loads. It is a plain (non-reentrant) lock, so no
    # loader may call another loader while holding it.
    _load_lock: Lock = Lock()

    def __new__(cls) -> "ModelLoader":
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    instance = super().__new__(cls)
                    instance._image_model = None
                    instance._image_processor = None
                    instance._text_pipeline = None
                    instance._multilang_text_pipeline = None
                    instance._ocr_reader = None
                    instance._face_detector = None
                    instance._spacy_nlp = None
                    instance._sentence_transformer = None
                    # Publish only after every slot exists, so a thread that
                    # skips the lock never sees a half-initialised instance.
                    cls._instance = instance
        return cls._instance

    @classmethod
    def get_instance(cls) -> "ModelLoader":
        """Return the shared instance, creating it on first use."""
        return cls()

    @staticmethod
    def _pipeline_device() -> int:
        """Map ``settings.DEVICE`` to the pipeline ``device`` value (0 for cuda, else -1)."""
        return 0 if settings.DEVICE == "cuda" else -1

    # ---------- Image (ViT deepfake classifier) ----------
    def load_image_model(self) -> Tuple[object, object]:
        """Return ``(model, processor)`` for ``settings.IMAGE_MODEL_ID``, loading once."""
        if self._image_model is None:
            with self._load_lock:
                if self._image_model is None:
                    logger.info(f"Loading image model: {settings.IMAGE_MODEL_ID}")
                    from transformers import AutoImageProcessor, AutoModelForImageClassification

                    self._image_processor = AutoImageProcessor.from_pretrained(settings.IMAGE_MODEL_ID)
                    model = AutoModelForImageClassification.from_pretrained(settings.IMAGE_MODEL_ID)
                    model.to(settings.DEVICE)
                    model.eval()
                    # Assigned last: the unlocked fast path keys on
                    # ``_image_model``, so the processor must already be set.
                    self._image_model = model
                    logger.info("Image model loaded")
        return self._image_model, self._image_processor

    # ---------- Text (BERT fake-news classifier — English) ----------
    def load_text_model(self):
        """Return the text-classification pipeline for ``settings.TEXT_MODEL_ID``."""
        if self._text_pipeline is None:
            with self._load_lock:
                if self._text_pipeline is None:
                    logger.info(f"Loading text model: {settings.TEXT_MODEL_ID}")
                    from transformers import pipeline

                    self._text_pipeline = pipeline(
                        "text-classification",
                        model=settings.TEXT_MODEL_ID,
                        device=self._pipeline_device(),
                    )
                    logger.info("Text model loaded")
        return self._text_pipeline

    # ---------- Multilingual text model (Phase 13) ----------
    def load_multilang_text_model(self):
        """Load multilingual fake-news classifier. Falls back to English model if not configured."""
        model_id = settings.TEXT_MULTILANG_MODEL_ID
        if not model_id:
            logger.debug("TEXT_MULTILANG_MODEL_ID not set — falling back to English text model")
            # Called before taking the load lock: load_text_model acquires it.
            return self.load_text_model()

        if self._multilang_text_pipeline is None:
            with self._load_lock:
                if self._multilang_text_pipeline is None:
                    logger.info(f"Loading multilingual text model: {model_id}")
                    from transformers import pipeline

                    self._multilang_text_pipeline = pipeline(
                        "text-classification",
                        model=model_id,
                        device=self._pipeline_device(),
                    )
                    logger.info("Multilingual text model loaded")
        return self._multilang_text_pipeline

    # ---------- spaCy NLP (Phase 13 NER) ----------
    def load_spacy_nlp(self):
        """Lazy-load spaCy English NLP model. Returns None if spaCy or the model is unavailable."""
        if self._spacy_nlp is None:
            with self._load_lock:
                if self._spacy_nlp is None:
                    try:
                        import spacy  # type: ignore
                        try:
                            self._spacy_nlp = spacy.load("en_core_web_sm")
                            logger.info("spaCy en_core_web_sm loaded")
                        except OSError:
                            logger.warning(
                                "spaCy model 'en_core_web_sm' not found. "
                                "Run: python -m spacy download en_core_web_sm"
                            )
                            return None
                    except ImportError:
                        logger.warning("spaCy not installed — NER keyword extraction disabled")
                        return None
        return self._spacy_nlp

    # ---------- Sentence-Transformer (Phase 13 truth-override) ----------
    def load_sentence_transformer(self):
        """Lazy-load sentence-transformers/all-MiniLM-L6-v2. Returns None if unavailable."""
        if self._sentence_transformer is None:
            with self._load_lock:
                if self._sentence_transformer is None:
                    try:
                        from sentence_transformers import SentenceTransformer  # type: ignore
                        self._sentence_transformer = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
                        logger.info("Sentence-transformer (all-MiniLM-L6-v2) loaded")
                    except ImportError:
                        logger.warning("sentence-transformers not installed — truth-override disabled")
                        return None
                    except Exception as e:
                        logger.warning(f"Sentence-transformer load failed: {e}")
                        return None
        return self._sentence_transformer

    # ---------- OCR (EasyOCR) — Phase 13: use OCR_LANGS from config ----------
    def load_ocr_engine(self):
        """Return the EasyOCR reader for the comma-separated ``settings.OCR_LANGS``."""
        if self._ocr_reader is None:
            with self._load_lock:
                if self._ocr_reader is None:
                    langs = [lang.strip() for lang in settings.OCR_LANGS.split(",") if lang.strip()]
                    if not langs:
                        # Nothing usable configured: default to English.
                        langs = ["en"]
                    logger.info(f"Loading EasyOCR reader (langs: {langs})")
                    import easyocr  # type: ignore

                    self._ocr_reader = easyocr.Reader(
                        langs, gpu=(settings.DEVICE == "cuda"), verbose=False, download_enabled=True,
                    )
                    logger.info("EasyOCR loaded")
        return self._ocr_reader

    # ---------- Face detector (MediaPipe) ----------
    def load_face_detector(self):
        """Return the MediaPipe FaceMesh detector, creating it on first use."""
        if self._face_detector is None:
            with self._load_lock:
                if self._face_detector is None:
                    logger.info("Loading MediaPipe FaceMesh")
                    import mediapipe as mp  # type: ignore

                    self._face_detector = mp.solutions.face_mesh.FaceMesh(
                        static_image_mode=True,
                        max_num_faces=5,
                        min_detection_confidence=0.5,
                    )
                    logger.info("MediaPipe FaceMesh loaded")
        return self._face_detector

    # ---------- Preload ----------
    def preload_phase1(self) -> None:
        """Preload only what Phase 1 needs (image model)."""
        self.load_image_model()
153
+
154
+
155
def get_model_loader() -> ModelLoader:
    """Return the shared ``ModelLoader`` instance."""
    loader = ModelLoader.get_instance()
    return loader
models.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+
3
+ from sqlalchemy import DateTime, ForeignKey, Integer, String, Text
4
+ from sqlalchemy.orm import Mapped, mapped_column, relationship
5
+
6
+ from db.database import Base
7
+
8
+
9
class User(Base):
    """ORM mapping for the ``users`` table."""

    __tablename__ = "users"

    # Integer primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
    # Required e-mail address; unique and indexed.
    email: Mapped[str] = mapped_column(
        String(255), nullable=False, unique=True, index=True
    )
    # Required password hash (hashing is done by callers, not by this model).
    password_hash: Mapped[str] = mapped_column(String(255), nullable=False)
    # Optional display name.
    name: Mapped[str | None] = mapped_column(String(255), nullable=True)
    # Defaults to the naive UTC timestamp at insert time.
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)

    # Analyses owned by this user; mirrors ``AnalysisRecord.user``.
    analyses: Mapped[list["AnalysisRecord"]] = relationship(back_populates="user")
19
+
20
+
21
class AnalysisRecord(Base):
    """ORM mapping for the ``analyses`` table: one stored analysis result."""

    __tablename__ = "analyses"

    # Integer primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
    # Owning user; nullable, so a record may exist without a user.
    user_id: Mapped[int | None] = mapped_column(ForeignKey("users.id"), nullable=True)
    # One of: image | video | text | screenshot
    media_type: Mapped[str] = mapped_column(String(32), nullable=False)
    verdict: Mapped[str] = mapped_column(String(32), nullable=False)
    authenticity_score: Mapped[float] = mapped_column(nullable=False)
    # Result payload stored as text (presumably serialised JSON — confirm with the writer).
    result_json: Mapped[str] = mapped_column(Text, nullable=False)
    # Defaults to the naive UTC timestamp at insert time.
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)

    # Mirrors ``User.analyses``.
    user: Mapped["User | None"] = relationship(back_populates="analyses")
    # At most one report per analysis (scalar side of the relationship).
    report: Mapped["Report | None"] = relationship(
        back_populates="analysis", uselist=False
    )
34
+
35
+
36
class Report(Base):
    """ORM mapping for the ``reports`` table: a generated report file for an analysis."""

    __tablename__ = "reports"

    # Integer primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
    # Required link to the analysis this report was generated for.
    analysis_id: Mapped[int] = mapped_column(ForeignKey("analyses.id"), nullable=False)
    # Location of the report file.
    file_path: Mapped[str] = mapped_column(String(512), nullable=False)
    # Defaults to the naive UTC timestamp at insert time.
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
    # Optional expiry timestamp; NULL when none is set.
    expires_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)

    # Mirrors ``AnalysisRecord.report``.
    analysis: Mapped["AnalysisRecord"] = relationship(back_populates="report")
news_lookup.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import List, Optional, Tuple
5
+ from urllib.parse import urlparse
6
+
7
+ import httpx
8
+ from loguru import logger
9
+
10
+ from config import settings
11
+ from schemas.common import ContradictingEvidence, TrustedSource, TruthOverride
12
+
13
# Trusted news domains — higher relevance boost (1.0 is the highest weight).
TRUSTED_DOMAINS = {
    "reuters.com": 1.0, "apnews.com": 1.0, "bbc.com": 1.0, "bbc.co.uk": 1.0,
    "theguardian.com": 0.95, "nytimes.com": 0.95, "washingtonpost.com": 0.95,
    "cnn.com": 0.9, "npr.org": 0.95, "aljazeera.com": 0.9,
    "thehindu.com": 0.9, "indianexpress.com": 0.9, "ndtv.com": 0.85,
    "hindustantimes.com": 0.85, "pti.news": 0.95,
}

# Fact-check / contradiction sources. An entry is either a bare domain or
# "domain/path-prefix"; the latter only matches URLs under that path.
FACTCHECK_DOMAINS = {
    "factcheck.org", "snopes.com", "politifact.com", "fullfact.org",
    "reuters.com/fact-check", "apnews.com/hub/ap-fact-check",
    "factly.in", "altnews.in", "boomlive.in", "vishvasnews.com",
}

# Domains eligible for truth-override (weight >= 0.9 per BUILD_PLAN spec)
_HIGH_TRUST_DOMAINS = {d for d, w in TRUSTED_DOMAINS.items() if w >= 0.9}

# Thresholds per BUILD_PLAN §13.2
_OVERRIDE_SIMILARITY_THRESHOLD = 0.6
_OVERRIDE_FAKE_PROB_CAP = 0.15
_OVERRIDE_FAKE_PROB_MULTIPLIER = 0.3


@dataclass
class NewsLookupResult:
    """Outcome of one news lookup."""

    # Non-fact-check articles, as built by the lookup.
    trusted_sources: List[TrustedSource]
    # Articles recognised as fact-checks.
    contradicting_evidence: List[ContradictingEvidence]
    # Number of raw articles returned by the news API.
    total_articles: int
    # Truth-override decision, or None when it was not evaluated.
    truth_override: Optional[TruthOverride] = None


def _domain_of(url: str) -> str:
    """Return the lower-cased host of *url* without a leading ``www.``.

    Port and userinfo are not part of the result. Returns ``""`` when the URL
    has no host or cannot be parsed.
    """
    try:
        host = urlparse(url).hostname or ""
    except (ValueError, TypeError, AttributeError):
        return ""
    if isinstance(host, bytes):
        # bytes input yields a bytes host; callers expect str.
        return ""
    # Strip only a leading "www." — a blanket replace would also mangle hosts
    # such as "awww.example.com".
    if host.startswith("www."):
        host = host[4:]
    return host


def _host_matches(host: str, domain: str) -> bool:
    """Return True when *host* is *domain* itself or one of its subdomains."""
    return bool(host) and (host == domain or host.endswith("." + domain))


def _is_factcheck(url: str, title: str) -> bool:
    """Return True when the article looks like a fact-check.

    An article qualifies when its URL belongs to a ``FACTCHECK_DOMAINS`` entry
    (host match, plus path-prefix match for "domain/path" entries) or when its
    title contains a fact-check keyword.
    """
    host = _domain_of(url)
    if host:
        path = urlparse(url).path.lower()
        for entry in FACTCHECK_DOMAINS:
            fc_host, _, fc_path = entry.partition("/")
            if not _host_matches(host, fc_host):
                continue
            if not fc_path:
                return True
            prefix = "/" + fc_path
            # Require a whole path segment so "/fact-checking" does not match.
            if path == prefix or path.startswith(prefix + "/"):
                return True
    tl = (title or "").lower()
    return any(kw in tl for kw in ("fact check", "fact-check", "debunked", "false claim", "misleading", "hoax"))


def _relevance(url: str) -> float:
    """Return the trust weight of *url*'s domain, or 0.5 for unknown domains.

    Matching is on the exact domain or a subdomain, so look-alike hosts such
    as ``notcnn.com`` do not inherit a trusted weight.
    """
    host = _domain_of(url)
    for trusted_domain, score in TRUSTED_DOMAINS.items():
        if _host_matches(host, trusted_domain):
            return score
    return 0.5


def _is_high_trust(url: str) -> bool:
    """Return True when *url* belongs to a domain eligible for truth-override."""
    host = _domain_of(url)
    return any(_host_matches(host, domain) for domain in _HIGH_TRUST_DOMAINS)
72
+
73
+
74
def _compute_truth_override(
    input_text: str,
    trusted_sources: List[TrustedSource],
    current_fake_prob: float,
) -> Optional[TruthOverride]:
    """Decide whether a high-trust headline corroborates ``input_text``.

    Per BUILD_PLAN §13.2: the first 512 characters of the input and the title
    of every high-trust source are embedded with the sentence-transformer and
    compared by cosine similarity. When the best similarity reaches the
    threshold, the fake probability is multiplied by 0.3 and capped at 0.15.

    Returns:
        A ``TruthOverride`` describing the best-matching source (``applied``
        tells whether the probability was lowered), or ``None`` when there is
        no input, no high-trust source, no encoder, or the computation fails.
    """
    if not (input_text and trusted_sources):
        return None

    # Only sources on the high-trust list may trigger an override.
    candidates = [src for src in trusted_sources if _is_high_trust(src.url)]
    if not candidates:
        return None

    # Lazy import; the encoder is loaded on first use.
    from models.model_loader import get_model_loader
    encoder = get_model_loader().load_sentence_transformer()
    if encoder is None:
        return None

    try:
        import numpy as np

        headlines = [f"{src.title}" for src in candidates]
        vectors = encoder.encode(
            [input_text[:512]] + headlines,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )

        # Embeddings are normalized, so the dot product is the cosine similarity.
        scores = np.dot(vectors[1:], vectors[0])

        top = int(np.argmax(scores))
        top_score = float(scores[top])
        winner = candidates[top]

        logger.info(
            f"Truth-override: best similarity={top_score:.3f} "
            f"source={winner.source_name} url={winner.url}"
        )

        applied = top_score >= _OVERRIDE_SIMILARITY_THRESHOLD
        if applied:
            adjusted_prob = min(
                current_fake_prob * _OVERRIDE_FAKE_PROB_MULTIPLIER,
                _OVERRIDE_FAKE_PROB_CAP,
            )
            logger.info(
                f"Truth-override APPLIED: fake_prob {current_fake_prob:.3f} → {adjusted_prob:.3f}"
            )
        else:
            # Below threshold: report the best match but leave the score alone.
            adjusted_prob = current_fake_prob

        return TruthOverride(
            applied=applied,
            source_url=winner.url,
            source_name=winner.source_name,
            similarity=round(top_score, 4),
            fake_prob_before=round(current_fake_prob, 4),
            fake_prob_after=round(adjusted_prob, 4),
        )

    except Exception as e:
        logger.warning(f"Truth-override computation failed: {e}")
        return None
153
+
154
+
155
async def _fetch(q: str, country: Optional[str]) -> list[dict]:
    """Query the news API and return its raw ``results`` list.

    Args:
        q: Search query string.
        country: Country code to restrict results to; falls back to ``"in"``
            when missing or blank.

    Returns:
        The list under the response's ``results`` key, or ``[]`` when the
        request fails or the response carries no results. Failures are logged
        as warnings rather than raised.
    """
    # Honour the caller's country; previously "in" was always sent.
    country_code = (country or "").strip().lower() or "in"
    params = {
        "apikey": settings.NEWS_API_KEY,
        "q": q,
        "language": "en",
        "size": 10,
        "country": country_code,
    }

    try:
        async with httpx.AsyncClient(timeout=8.0) as client:
            response = await client.get(settings.NEWS_API_BASE_URL, params=params)
            response.raise_for_status()
            return (response.json() or {}).get("results") or []
    except Exception as e:
        # Best-effort lookup: a failed call must not break the analysis.
        logger.warning(f"News lookup failed: {e}")
        return []
167
+
168
+
169
async def search_news(
    keywords: List[str],
    limit: int = 6,
    country: Optional[str] = None,
) -> List[TrustedSource]:
    """Back-compat wrapper around ``search_news_full`` that returns only the trusted sources."""
    return (
        await search_news_full(keywords, limit=limit, country=country)
    ).trusted_sources
177
+
178
+
179
async def search_news_full(
    keywords: List[str],
    limit: int = 6,
    country: Optional[str] = None,
    original_text: Optional[str] = None,
    current_fake_prob: float = 0.5,
) -> NewsLookupResult:
    """Look up news for the given keywords and evaluate the truth-override.

    Args:
        keywords: Search keywords; only the first four are used in the query.
        limit: Max trusted sources (and max contradictions) to return.
        country: Country code passed on to the news fetch.
        original_text: Input text compared against headlines for truth-override;
            the override is skipped when this is empty.
        current_fake_prob: Current fake probability handed to the override check.

    Returns:
        A ``NewsLookupResult``; empty when no API key is configured or no
        keywords were supplied.
    """
    if not (settings.NEWS_API_KEY and keywords):
        return NewsLookupResult([], [], 0)

    query = " ".join(keywords[:4])
    articles = await _fetch(query, country)

    visited_urls: set[str] = set()
    sources: List[TrustedSource] = []
    fact_checks: List[ContradictingEvidence] = []

    for article in articles:
        link = article.get("link") or ""
        # Skip entries without a link and duplicates of an earlier link.
        if not link or link in visited_urls:
            continue
        visited_urls.add(link)

        headline = article.get("title") or ""
        host = _domain_of(link)
        publisher = article.get("source_id") or host or "news"

        if _is_factcheck(link, headline):
            # Fact-check articles are reported as contradicting evidence only.
            fact_checks.append(ContradictingEvidence(
                source_name=publisher, title=headline, url=link, type="fact_check",
            ))
        else:
            sources.append(TrustedSource(
                source_name=publisher,
                title=headline,
                url=link,
                published_at=article.get("pubDate"),
                relevance_score=_relevance(link),
            ))

    # Highest relevance first; ties keep their API order (stable sort).
    sources = sorted(sources, key=lambda src: src.relevance_score, reverse=True)[:limit]

    # ── Phase 13.2: Truth-override ──
    override = (
        _compute_truth_override(original_text, sources, current_fake_prob)
        if original_text and sources
        else None
    )

    return NewsLookupResult(
        trusted_sources=sources,
        contradicting_evidence=fact_checks[:limit],
        total_articles=len(articles),
        truth_override=override,
    )
report.html ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <title>DeepShield Analysis Report — {{ analysis_id }}</title>
6
+ <style>
7
+ @page { size: A4; margin: 16mm 18mm; }
8
+ body { font-family: Helvetica, Arial, sans-serif; color: #1A202C; font-size: 10pt; line-height: 1.45; }
9
+
10
+ /* ── Typography ── */
11
+ h1 { color: #4F46E5; margin: 0 0 2pt 0; font-size: 18pt; letter-spacing: -0.3pt; }
12
+ h2 { color: #4F46E5; margin: 14pt 0 5pt 0; font-size: 12pt; border-bottom: 1pt solid #E5E7EB; padding-bottom: 2pt; }
13
+ h3 { margin: 10pt 0 4pt 0; font-size: 10.5pt; color: #374151; }
14
+ .muted { color: #6B7280; font-size: 8.5pt; }
15
+
16
+ /* ── Header / logo row ── */
17
+ .header-table { width: 100%; border-collapse: collapse; border-bottom: 2pt solid #4F46E5; padding-bottom: 6pt; margin-bottom: 10pt; }
18
+ .logo-cell { font-size: 22pt; font-weight: bold; color: #4F46E5; width: 1%; white-space: nowrap; padding-right: 8pt; }
19
+ .logo-shield { color: #6366F1; }
20
+ .meta-cell { font-size: 8.5pt; color: #6B7280; vertical-align: bottom; }
21
+
22
+ /* ── Verdict row ── */
23
+ .verdict-table { width: 100%; border-collapse: collapse; margin: 6pt 0 10pt 0; background: #F9FAFB; }
24
+ .verdict-score-cell { width: 90pt; text-align: center; vertical-align: middle; padding: 8pt; }
25
+ .score-num { font-size: 26pt; font-weight: bold; }
26
+ .score-denom { font-size: 9pt; color: #6B7280; }
27
+ .score.real { color: #43A047; }
28
+ .score.warn { color: #FB8C00; }
29
+ .score.fake { color: #E53935; }
30
+ .verdict-detail-cell { padding: 8pt 10pt; vertical-align: middle; }
31
+ .verdict-label { font-size: 13pt; font-weight: bold; color: #1A202C; }
32
+ .verdict-sub { font-size: 8.5pt; color: #6B7280; margin-top: 2pt; }
33
+ .donut-cell { width: 75pt; text-align: center; vertical-align: middle; padding: 4pt; }
34
+ .donut-cell img { width: 72pt; }
35
+
36
+ /* ── LLM card ── */
37
+ .llm-box { background: #EEF2FF; border-left: 3pt solid #4F46E5; padding: 7pt 9pt; margin: 6pt 0; border-radius: 2pt; }
38
+ .llm-para { font-size: 9.5pt; color: #1A202C; margin: 0 0 5pt 0; }
39
+ .llm-bullets { margin: 0; padding-left: 14pt; }
40
+ .llm-bullets li { font-size: 9pt; color: #374151; margin-bottom: 2pt; }
41
+
42
+ /* ── Tables ── */
43
+ table.data { width: 100%; border-collapse: collapse; margin: 5pt 0; }
44
+ table.data th { background: #F3F4F6; color: #374151; font-size: 8.5pt; text-align: left; padding: 3pt 6pt; border-bottom: 1pt solid #E5E7EB; }
45
+ table.data td { font-size: 9pt; padding: 3pt 6pt; border-bottom: 1pt solid #F3F4F6; vertical-align: top; }
46
+ table.data tr:last-child td { border-bottom: none; }
47
+
48
+ /* ── VLM breakdown ── */
49
+ .vlm-score-bar-wrap { background: #E5E7EB; border-radius: 3pt; height: 5pt; width: 70pt; display: inline-block; vertical-align: middle; overflow: hidden; }
50
+ .vlm-score-bar { height: 5pt; border-radius: 3pt; }
51
+ .vlm-real { background: #43A047; }
52
+ .vlm-warn { background: #FB8C00; }
53
+ .vlm-fake { background: #E53935; }
54
+
55
+ /* ── Badges ── */
56
+ .badge { display: inline-block; padding: 1pt 5pt; border-radius: 3pt; font-size: 8pt; font-weight: bold; }
57
+ .sev-high { background: #FEE2E2; color: #B91C1C; }
58
+ .sev-medium { background: #FEF3C7; color: #92400E; }
59
+ .sev-low { background: #DBEAFE; color: #1E40AF; }
60
+ .badge-green { background: #DCFCE7; color: #166534; }
61
+ .badge-red { background: #FEE2E2; color: #991B1B; }
62
+
63
+ /* ── Keywords ── */
64
+ .keyword { display: inline-block; background: #EEF2FF; color: #4F46E5; padding: 1pt 6pt; border-radius: 3pt; margin: 1pt; font-size: 8.5pt; }
65
+
66
+ /* ── Truth-override ── */
67
+ .truth-box { background: #DCFCE7; border-left: 3pt solid #16A34A; padding: 5pt 8pt; margin: 5pt 0; font-size: 9pt; border-radius: 2pt; }
68
+
69
+ /* ── Footer ── */
70
+ .footer { margin-top: 16pt; padding-top: 5pt; border-top: 1pt solid #E5E7EB; color: #9CA3AF; font-size: 8pt; }
71
+ </style>
72
+ </head>
73
+ <body>
74
+
75
+ {# ── Header ── #}
76
+ <table class="header-table">
77
+ <tr>
78
+ <td class="logo-cell"><span class="logo-shield">&#9646;</span> DeepShield</td>
79
+ <td class="meta-cell">
80
+ Analysis Report &nbsp;·&nbsp; ID: {{ analysis_id }}<br />
81
+ Media: <b>{{ media_type | upper }}</b> &nbsp;·&nbsp; Generated: {{ generated_at }}
82
+ </td>
83
+ </tr>
84
+ </table>
85
+
86
+ {# ── Verdict ── #}
87
+ <h2>Verdict</h2>
88
+ <table class="verdict-table">
89
+ <tr>
90
+ <td class="verdict-score-cell">
91
+ <div class="score-num score {{ score_class }}">{{ verdict.authenticity_score }}</div>
92
+ <div class="score-denom">/ 100</div>
93
+ </td>
94
+ <td class="verdict-detail-cell">
95
+ <div class="verdict-label">{{ verdict.label }}</div>
96
+ <div class="verdict-sub">Severity: {{ verdict.severity }}</div>
97
+ <div class="verdict-sub">Model: {{ verdict.model_label }} &nbsp;({{ '%.1f' | format(verdict.model_confidence * 100) }}% confidence)</div>
98
+ </td>
99
+ {% if donut_b64 %}
100
+ <td class="donut-cell">
101
+ <img src="data:image/png;base64,{{ donut_b64 }}" alt="score donut" />
102
+ </td>
103
+ {% endif %}
104
+ </tr>
105
+ </table>
106
+
107
+ {# ── LLM Explanation ── #}
108
+ {% if llm_summary and llm_summary.paragraph %}
109
+ <h2>AI Explanation</h2>
110
+ <div class="llm-box">
111
+ <p class="llm-para">{{ llm_summary.paragraph }}</p>
112
+ {% if llm_summary.bullets %}
113
+ <ul class="llm-bullets">
114
+ {% for b in llm_summary.bullets %}<li>{{ b }}</li>{% endfor %}
115
+ </ul>
116
+ {% endif %}
117
+ {% if llm_summary.model_used %}
118
+ <div class="muted" style="margin-top:4pt;">via {{ llm_summary.model_used }}</div>
119
+ {% endif %}
120
+ </div>
121
+ {% endif %}
122
+
123
+ {# ══════════ IMAGE ══════════ #}
124
+ {% if media_type == 'image' %}
125
+
126
+ {# EXIF #}
127
+ {% if explainability.exif %}
128
+ <h2>EXIF Metadata</h2>
129
+ <table class="data">
130
+ <tr><th>Field</th><th>Value</th><th>Trust Signal</th></tr>
131
+ {% if explainability.exif.make %}
132
+ <tr><td>Camera Make</td><td>{{ explainability.exif.make }}</td><td><span class="badge badge-green">+real</span></td></tr>
133
+ {% endif %}
134
+ {% if explainability.exif.model %}
135
+ <tr><td>Camera Model</td><td>{{ explainability.exif.model }}</td><td></td></tr>
136
+ {% endif %}
137
+ {% if explainability.exif.datetime_original %}
138
+ <tr><td>Date Taken</td><td>{{ explainability.exif.datetime_original }}</td><td><span class="badge badge-green">+real</span></td></tr>
139
+ {% endif %}
140
+ {% if explainability.exif.software %}
141
+ <tr><td>Software</td><td>{{ explainability.exif.software }}</td>
142
+ <td>{% if 'photoshop' in explainability.exif.software | lower %}<span class="badge badge-red">+fake</span>{% endif %}</td></tr>
143
+ {% endif %}
144
+ {% if explainability.exif.lens_model %}
145
+ <tr><td>Lens Model</td><td>{{ explainability.exif.lens_model }}</td><td></td></tr>
146
+ {% endif %}
147
+ {% if explainability.exif.gps_info %}
148
+ <tr><td>GPS</td><td>{{ explainability.exif.gps_info }}</td><td></td></tr>
149
+ {% endif %}
150
+ <tr>
151
+ <td colspan="2"><b>Trust adjustment</b></td>
152
+ <td>
153
+ {% if explainability.exif.trust_adjustment > 0 %}
154
+ <span class="badge badge-red">+{{ explainability.exif.trust_adjustment }} (fake signal)</span>
155
+ {% elif explainability.exif.trust_adjustment < 0 %}
156
+ <span class="badge badge-green">{{ explainability.exif.trust_adjustment }} (real signal)</span>
157
+ {% else %}
158
+ neutral
159
+ {% endif %}
160
+ </td>
161
+ </tr>
162
+ </table>
163
+ {% endif %}
164
+
165
+ {# Artifact indicators #}
166
+ {% if explainability.artifact_indicators %}
167
+ <h2>Artifact Indicators</h2>
168
+ <table class="data">
169
+ <tr><th>Type</th><th>Severity</th><th>Confidence</th><th>Description</th></tr>
170
+ {% for ind in explainability.artifact_indicators %}
171
+ <tr>
172
+ <td>{{ ind.type }}</td>
173
+ <td><span class="badge sev-{{ ind.severity }}">{{ ind.severity }}</span></td>
174
+ <td>{{ '%.0f' | format(ind.confidence * 100) }}%</td>
175
+ <td>{{ ind.description }}</td>
176
+ </tr>
177
+ {% endfor %}
178
+ </table>
179
+ {% else %}
180
+ <h2>Artifact Indicators</h2>
181
+ <div class="muted">No artifacts detected.</div>
182
+ {% endif %}
183
+
184
+ {# VLM Detailed Breakdown #}
185
+ {% if explainability.vlm_breakdown %}
186
+ <h2>Detailed Breakdown</h2>
187
+ {% if explainability.vlm_breakdown.model_used %}
188
+ <div class="muted" style="margin-bottom:5pt;">Scored by {{ explainability.vlm_breakdown.model_used }}</div>
189
+ {% endif %}
190
+ <table class="data">
191
+ <tr><th>Component</th><th>Score</th><th>Bar</th><th>Notes</th></tr>
192
+ {% set bd = explainability.vlm_breakdown %}
193
+ {% for comp_key, comp_label in [
194
+ ('facial_symmetry', 'Facial Symmetry'),
195
+ ('skin_texture', 'Skin Texture'),
196
+ ('lighting_consistency', 'Lighting Consistency'),
197
+ ('background_coherence', 'Background Coherence'),
198
+ ('anatomy_hands_eyes', 'Anatomy / Hands & Eyes'),
199
+ ('context_objects', 'Context & Objects')
200
+ ] %}
201
+ {% set comp = bd[comp_key] %}
202
+ {% set sc2 = comp.score if comp else 75 %}
203
+ {% set bar_cls = 'vlm-real' if sc2 >= 70 else ('vlm-warn' if sc2 >= 40 else 'vlm-fake') %}
204
+ <tr>
205
+ <td>{{ comp_label }}</td>
206
+ <td><b>{{ sc2 }}</b>/100</td>
207
+ <td>
208
+ <span class="vlm-score-bar-wrap">
209
+ <span class="vlm-score-bar {{ bar_cls }}" style="width:{{ sc2 }}%;display:block;"></span>
210
+ </span>
211
+ </td>
212
+ <td class="muted">{{ comp.notes if comp else '' }}</td>
213
+ </tr>
214
+ {% endfor %}
215
+ </table>
216
+ {% endif %}
217
+
218
+ {% endif %}{# end image #}
219
+
220
+ {# ══════════ VIDEO ══════════ #}
221
+ {% if media_type == 'video' %}
222
+ <h2>Frame-Level Analysis</h2>
223
+ <table class="data">
224
+ <tr><th>Metric</th><th>Value</th></tr>
225
+ <tr><td>Frames sampled</td><td>{{ explainability.num_frames_sampled }}</td></tr>
226
+ <tr><td>Frames with face</td><td>{{ explainability.num_face_frames }}</td></tr>
227
+ <tr><td>Suspicious frames</td><td>{{ explainability.num_suspicious_frames }}</td></tr>
228
+ <tr><td>Mean suspicious prob</td><td>{{ '%.1f' | format(explainability.mean_suspicious_prob * 100) }}%</td></tr>
229
+ <tr><td>Max suspicious prob</td><td>{{ '%.1f' | format(explainability.max_suspicious_prob * 100) }}%</td></tr>
230
+ <tr><td>Insufficient faces</td><td>{{ explainability.insufficient_faces }}</td></tr>
231
+ </table>
232
+ {% endif %}
233
+
234
+ {# ══════════ TEXT ══════════ #}
235
+ {% if media_type == 'text' %}
236
+
237
+ {# Language + truth-override #}
238
+ {% if explainability.detected_language and explainability.detected_language != 'en' %}
239
+ <h2>Language</h2>
240
+ <div class="muted">Detected: <b>{{ explainability.detected_language | upper }}</b> — analysed via multilingual model</div>
241
+ {% endif %}
242
+ {% if explainability.truth_override and explainability.truth_override.applied %}
243
+ <div class="truth-box">
244
+ <b>Truth-override applied.</b>
245
+ Corroborated by {{ explainability.truth_override.source_name }}
246
+ ({{ '%.0f' | format(explainability.truth_override.similarity * 100) }}% similarity).
247
+ Fake probability reduced from {{ '%.1f' | format(explainability.truth_override.fake_prob_before * 100) }}%
248
+ to {{ '%.1f' | format(explainability.truth_override.fake_prob_after * 100) }}%.
249
+ </div>
250
+ {% endif %}
251
+
252
+ <h2>Text Classification</h2>
253
+ <table class="data">
254
+ <tr><th>Metric</th><th>Value</th></tr>
255
+ <tr><td>Fake probability</td><td>{{ '%.1f' | format(explainability.fake_probability * 100) }}%</td></tr>
256
+ <tr><td>Top label</td><td>{{ explainability.top_label }}</td></tr>
257
+ <tr><td>Sensationalism score</td><td>{{ explainability.sensationalism.score }}/100 ({{ explainability.sensationalism.level }})</td></tr>
258
+ <tr><td>Exclamations</td><td>{{ explainability.sensationalism.exclamation_count }}</td></tr>
259
+ <tr><td>ALL CAPS words</td><td>{{ explainability.sensationalism.caps_word_count }}</td></tr>
260
+ <tr><td>Clickbait matches</td><td>{{ explainability.sensationalism.clickbait_matches }}</td></tr>
261
+ <tr><td>Emotional words</td><td>{{ explainability.sensationalism.emotional_word_count }}</td></tr>
262
+ </table>
263
+
264
+ {% if explainability.manipulation_indicators %}
265
+ <h3>Manipulation Indicators ({{ explainability.manipulation_indicators | length }})</h3>
266
+ <table class="data">
267
+ <tr><th>Pattern</th><th>Severity</th><th>Matched text</th></tr>
268
+ {% for m in explainability.manipulation_indicators %}
269
+ <tr>
270
+ <td>{{ m.pattern_type }}</td>
271
+ <td><span class="badge sev-{{ m.severity }}">{{ m.severity }}</span></td>
272
+ <td>{{ m.matched_text }}</td>
273
+ </tr>
274
+ {% endfor %}
275
+ </table>
276
+ {% endif %}
277
+
278
+ {% if explainability.keywords %}
279
+ <h3>Extracted Keywords</h3>
280
+ <div>{% for kw in explainability.keywords %}<span class="keyword">{{ kw }}</span>{% endfor %}</div>
281
+ {% endif %}
282
+
283
+ {% endif %}{# end text #}
284
+
285
+ {# ══════════ SCREENSHOT ══════════ #}
286
+ {% if media_type == 'screenshot' %}
287
+
288
+ {% if explainability.detected_language and explainability.detected_language != 'en' %}
289
+ <div class="muted" style="margin-bottom:4pt;">Detected language: <b>{{ explainability.detected_language | upper }}</b></div>
290
+ {% endif %}
291
+ {% if explainability.truth_override and explainability.truth_override.applied %}
292
+ <div class="truth-box">
293
+ <b>Truth-override applied.</b> {{ explainability.truth_override.source_name }}
294
+ ({{ '%.0f' | format(explainability.truth_override.similarity * 100) }}% similarity)
295
+ </div>
296
+ {% endif %}
297
+
298
+ <h2>Extracted Text</h2>
299
+ <div class="muted">{{ explainability.ocr_boxes | length }} OCR regions detected</div>
300
+ <table class="data">
301
+ <tr><td style="white-space:pre-wrap; font-size:8.5pt; padding:6pt;">{{ explainability.extracted_text }}</td></tr>
302
+ </table>
303
+
304
+ <h3>Analysis Summary</h3>
305
+ <table class="data">
306
+ <tr><th>Metric</th><th>Value</th></tr>
307
+ <tr><td>Fake probability</td><td>{{ '%.1f' | format(explainability.fake_probability * 100) }}%</td></tr>
308
+ <tr><td>Sensationalism</td><td>{{ explainability.sensationalism.score }}/100 ({{ explainability.sensationalism.level }})</td></tr>
309
+ <tr><td>Suspicious phrases</td><td>{{ explainability.suspicious_phrases | length }}</td></tr>
310
+ <tr><td>Layout anomalies</td><td>{{ explainability.layout_anomalies | length }}</td></tr>
311
+ </table>
312
+
313
+ {% if explainability.suspicious_phrases %}
314
+ <h3>Suspicious Phrases</h3>
315
+ <table class="data">
316
+ <tr><th>Text</th><th>Pattern</th><th>Severity</th></tr>
317
+ {% for p in explainability.suspicious_phrases %}
318
+ <tr>
319
+ <td>{{ p.text }}</td>
320
+ <td>{{ p.pattern_type }}</td>
321
+ <td><span class="badge sev-{{ p.severity }}">{{ p.severity }}</span></td>
322
+ </tr>
323
+ {% endfor %}
324
+ </table>
325
+ {% endif %}
326
+
327
+ {% endif %}{# end screenshot #}
328
+
329
+ {# ══════════ SOURCES (all types) ══════════ #}
330
+ {% if trusted_sources %}
331
+ <h2>Trusted Source Cross-Reference ({{ trusted_sources | length }})</h2>
332
+ <table class="data">
333
+ <tr><th>Source</th><th>Title</th><th>Relevance</th></tr>
334
+ {% for s in trusted_sources %}
335
+ <tr>
336
+ <td>{{ s.source_name }}</td>
337
+ <td>{{ s.title }}</td>
338
+ <td>{{ '%.0f' | format(s.relevance_score * 100) }}%</td>
339
+ </tr>
340
+ {% endfor %}
341
+ </table>
342
+ {% endif %}
343
+
344
+ {% if contradicting_evidence %}
345
+ <h2 style="color:#B91C1C;">Contradicting Evidence ({{ contradicting_evidence | length }})</h2>
346
+ <table class="data">
347
+ <tr><th>Source</th><th>Title</th><th>Type</th></tr>
348
+ {% for c in contradicting_evidence %}
349
+ <tr><td>{{ c.source_name }}</td><td>{{ c.title }}</td><td>{{ c.type }}</td></tr>
350
+ {% endfor %}
351
+ </table>
352
+ {% endif %}
353
+
354
+ {# ══════════ PROCESSING ══════════ #}
355
+ <h2>Processing Summary</h2>
356
+ <div class="muted">Model: {{ processing_summary.model_used }} &nbsp;·&nbsp; Duration: {{ processing_summary.total_duration_ms }} ms</div>
357
+ <div style="font-size:8.5pt; margin-top:3pt;">{{ processing_summary.stages_completed | join(' → ') }}</div>
358
+
359
+ {# ══════════ FOOTER ══════════ #}
360
+ <div class="footer">
361
+ <b>DeepShield Responsible-AI Notice.</b> {{ responsible_ai_notice }}<br />
362
+ Generated {{ generated_at }}. Report expires in 1 hour.
363
+ AI-assisted analysis — cross-check with trusted sources before sharing.
364
+ </div>
365
+
366
+ </body>
367
+ </html>
report_service.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import json
5
+ import os
6
+ import time
7
+ import uuid
8
+ from datetime import datetime, timedelta, timezone
9
+ from io import BytesIO
10
+ from pathlib import Path
11
+ from typing import Any, Optional
12
+
13
+ from jinja2 import Environment, FileSystemLoader, select_autoescape
14
+ from loguru import logger
15
+ from xhtml2pdf import pisa # type: ignore
16
+
17
+ from config import settings
18
+ from db.models import AnalysisRecord, Report
19
+
20
+ TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"
21
+
22
+ _env = Environment(
23
+ loader=FileSystemLoader(str(TEMPLATES_DIR)),
24
+ autoescape=select_autoescape(["html", "xml"]),
25
+ )
26
+
27
+
28
+ def _score_class(score: int) -> str:
29
+ if score >= 70:
30
+ return "real"
31
+ if score >= 40:
32
+ return "warn"
33
+ return "fake"
34
+
35
+
36
def _ensure_dir() -> Path:
    """Create the configured report directory (with parents) if needed and return it."""
    report_dir = Path(settings.REPORT_DIR)
    report_dir.mkdir(exist_ok=True, parents=True)
    return report_dir
40
+
41
+
42
def _make_donut_chart(score: int, score_cls: str) -> str:
    """Render the authenticity score as a donut-chart PNG.

    Args:
        score: Authenticity score. It is clamped to 0-100 before plotting so an
            out-of-range value cannot produce a negative wedge.
        score_cls: One of "real", "warn" or "fake"; any other value falls back
            to a neutral grey.

    Returns:
        The PNG encoded as a base64 string, or "" when the chart cannot be
        produced (matplotlib missing, bad input, rendering failure). Chart
        generation is best-effort and never raises.
    """
    try:
        import matplotlib  # type: ignore
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt  # type: ignore
    except Exception as e:
        logger.debug(f"Donut chart skipped: {e}")
        return ""

    color_map = {"real": "#43A047", "warn": "#FB8C00", "fake": "#E53935"}
    color = color_map.get(score_cls, "#6B7280")

    fig = None
    try:
        score = max(0, min(100, score))

        fig, ax = plt.subplots(figsize=(2.2, 2.2), dpi=96)
        sizes = [score, 100 - score]
        wedge_colors = [color, "#F3F4F6"]
        ax.pie(sizes, colors=wedge_colors, startangle=90,
               wedgeprops=dict(width=0.42, edgecolor="white", linewidth=1))
        ax.text(0, 0, str(score), ha="center", va="center",
                fontsize=20, fontweight="bold", color=color)
        ax.set_aspect("equal")
        fig.tight_layout(pad=0.05)

        buf = BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight", transparent=True)
        return base64.b64encode(buf.getvalue()).decode()
    except Exception as e:
        logger.debug(f"Donut chart skipped: {e}")
        return ""
    finally:
        # Always release the figure; pyplot keeps a global reference to every
        # open figure, so a failure mid-render would otherwise leak it.
        if fig is not None:
            plt.close(fig)
70
+
71
+
72
+ def _extract_llm_summary(analysis_json: dict) -> dict | None:
73
+ """Extract llm_summary from either top-level or inside explainability (images)."""
74
+ top = analysis_json.get("llm_summary")
75
+ if top:
76
+ return top
77
+ return (analysis_json.get("explainability") or {}).get("llm_summary")
78
+
79
+
80
def render_html(analysis_json: dict) -> str:
    """Render the stored analysis result into the HTML report.

    Args:
        analysis_json: Decoded analysis payload. Missing keys and keys that are
            explicitly null are both replaced by empty defaults, so a partial
            payload still renders.

    Returns:
        The rendered ``report.html`` template as a string.
    """
    # Use `or` rather than a .get() default: a key present with a null value
    # would otherwise bypass the default and crash on the next attribute access.
    verdict: dict[str, Any] = analysis_json.get("verdict") or {}
    score = verdict.get("authenticity_score")
    if score is None:
        score = 50  # neutral midpoint when no score was recorded
    sc = _score_class(score)
    donut_b64 = _make_donut_chart(score, sc)
    llm_summary = _extract_llm_summary(analysis_json)
    expl: dict[str, Any] = analysis_json.get("explainability") or {}

    tmpl = _env.get_template("report.html")
    return tmpl.render(
        analysis_id=analysis_json.get("analysis_id", ""),
        media_type=analysis_json.get("media_type", "unknown"),
        verdict=verdict,
        explainability=expl,
        trusted_sources=analysis_json.get("trusted_sources") or [],
        contradicting_evidence=analysis_json.get("contradicting_evidence") or [],
        processing_summary=analysis_json.get("processing_summary") or {},
        responsible_ai_notice=analysis_json.get(
            "responsible_ai_notice",
            "AI-based analysis may not be 100% accurate.",
        ),
        score_class=sc,
        generated_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC"),
        donut_b64=donut_b64,
        llm_summary=llm_summary,
    )
105
+
106
+
107
def html_to_pdf(html: str, out_path: Path) -> None:
    """Convert an HTML string to a PDF file at ``out_path`` using xhtml2pdf.

    Raises:
        RuntimeError: If xhtml2pdf reports conversion errors.

    On any failure the partially written output file is removed, so a broken
    PDF is never left behind in the report directory.
    """
    try:
        with open(out_path, "wb") as f:
            result = pisa.CreatePDF(html, dest=f)
        if result.err:
            raise RuntimeError(f"xhtml2pdf failed with {result.err} errors")
    except Exception:
        Path(out_path).unlink(missing_ok=True)
        raise
112
+
113
+
114
def generate_report(record: AnalysisRecord) -> Path:
    """Build the PDF report for a stored analysis record and return its path.

    The file name combines the record id with a short random suffix.
    """
    target_dir = _ensure_dir()
    suffix = uuid.uuid4().hex[:8]
    out_path = target_dir / f"deepshield_{record.id}_{suffix}.pdf"

    analysis = json.loads(record.result_json)
    html_to_pdf(render_html(analysis), out_path)
    logger.info(f"Report generated id={record.id} path={out_path} size={out_path.stat().st_size}B")
    return out_path
124
+
125
+
126
def create_report_row(analysis_id: int, path: Path) -> Report:
    """Build (but do not persist) the ``Report`` row for a generated PDF.

    ``expires_at`` is a naive UTC timestamp ``REPORT_TTL_SECONDS`` from now.
    """
    # datetime.utcnow() is deprecated; take an aware UTC "now" and strip the
    # tzinfo so the stored value stays naive UTC exactly as before.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    return Report(
        analysis_id=analysis_id,
        file_path=str(path),
        expires_at=now_utc + timedelta(seconds=settings.REPORT_TTL_SECONDS),
    )
132
+
133
+
134
def cleanup_expired(now: Optional[datetime] = None) -> int:
    """Delete expired PDFs from the report directory.

    A file is expired when its modification time is more than
    ``REPORT_TTL_SECONDS`` before ``now``.

    Args:
        now: Reference time. Defaults to the current UTC time. A naive value is
            interpreted as UTC (the previous behaviour); an aware value is
            accepted as-is.

    Returns:
        The number of files deleted.
    """
    if now is None:
        now = datetime.now(timezone.utc)
    elif now.tzinfo is None:
        now = now.replace(tzinfo=timezone.utc)

    d = Path(settings.REPORT_DIR)
    if not d.exists():
        return 0
    deleted = 0
    ttl = timedelta(seconds=settings.REPORT_TTL_SECONDS)
    for f in d.glob("*.pdf"):
        try:
            # Aware UTC mtime: replaces the deprecated utcfromtimestamp() and
            # keeps the subtraction valid for an aware `now`.
            mtime = datetime.fromtimestamp(f.stat().st_mtime, tz=timezone.utc)
            if now - mtime > ttl:
                f.unlink()
                deleted += 1
        except OSError as e:
            logger.warning(f"Cleanup failed for {f}: {e}")
    if deleted:
        logger.info(f"Cleaned up {deleted} expired reports")
    return deleted
requirements.txt ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn[standard]==0.32.0
3
+ pydantic==2.9.2
4
+ pydantic-settings==2.6.0
5
+ python-multipart==0.0.12
6
+ python-dotenv==1.0.1
7
+ loguru==0.7.2
8
+ SQLAlchemy==2.0.35
9
+ psycopg2-binary==2.9.9
10
+ alembic==1.13.3
11
+ python-jose[cryptography]==3.3.0
12
+ bcrypt==4.2.0
13
+
14
+ # === Phase 1: Image Detection ===
15
+ # Install torch separately with CPU index first (see README): pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
16
+ torch==2.4.1
17
+ torchvision==0.19.1
18
+ transformers==4.44.2
19
+ Pillow>=10.4.0
20
+ numpy>=1.26,<3
21
+ opencv-python==4.10.0.84
22
+ grad-cam==1.5.4
23
+ mediapipe==0.10.14
24
+
25
+ # === Phase 12: Explainability v2 ===
26
+ exifread==3.0.0
27
+ google-generativeai>=0.3.0 # Gemini provider for LLM explainability
28
+ openai>=1.0.0 # OpenAI provider (alternative to Gemini)
29
+
30
+ # === Phase 14: PDF v2 donut chart ===
31
+ matplotlib>=3.9.0
32
+
33
+ # === Phase 13: Text Pipeline Hardening ===
34
+ # After installing, run: python -m spacy download en_core_web_sm
35
+ spacy>=3.7.0,<4.0.0
36
+ sentence-transformers>=2.7.0 # for truth-override cosine similarity (all-MiniLM-L6-v2)
37
+ langdetect==1.0.9 # lightweight language detection
38
+
39
+ # === Phase 3: Text / News ===
40
+ httpx==0.27.2
41
+
42
+ # === Phase 4: Screenshot / OCR ===
43
+ easyocr==1.7.2
44
+
45
+ # === Phase 7: PDF Reports ===
46
+ Jinja2==3.1.4
47
+ xhtml2pdf==0.2.16
48
+
49
+ # === Phase 8: Auth ===
50
+ email-validator==2.2.0
router.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+
3
+ from api.v1 import analyze, auth, health, history, report
4
+
5
# Aggregate router for the v1 API; every sub-router is mounted under /api/v1.
api_router = APIRouter(prefix="/api/v1")

# Registration order is preserved: health, analyze, report, auth, history.
for _module in (health, analyze, report, auth, history):
    api_router.include_router(_module.router)
scoring.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Tuple
4
+
5
+ TRUST_SCALE = [
6
+ (0, 20, "Very Likely Fake", "critical"),
7
+ (21, 40, "Likely Fake", "danger"),
8
+ (41, 60, "Possibly Manipulated", "warning"),
9
+ (61, 80, "Likely Real", "positive"),
10
+ (81, 100, "Very Likely Real", "safe"),
11
+ ]
12
+
13
+
14
def compute_authenticity_score(model_confidence: float, label: str) -> int:
    """Map (confidence, label) to a 0-100 authenticity score.

    Real-ish labels give a high score; fake-ish labels give a low score.

    Args:
        model_confidence: Classifier confidence for ``label``, expected in [0, 1].
        label: Predicted class label (case-insensitive).

    Returns:
        Integer score clamped to 0-100.
    """
    label_l = label.lower()
    # Distinctive tokens are safe to match anywhere in the label.
    substring_tokens = ("fake", "deepfake", "manipulated", "generated", "synthetic")
    # "ai" is only meaningful as a whole word: as a substring it also occurs in
    # ordinary labels such as "plain", "portrait" or "painting".
    words = "".join(ch if ch.isalnum() else " " for ch in label_l).split()
    is_fake = "ai" in words or any(tok in label_l for tok in substring_tokens)

    confidence = float(model_confidence)
    score = (1.0 - confidence) * 100.0 if is_fake else confidence * 100.0
    return int(round(max(0.0, min(100.0, score))))
25
+
26
+
27
def get_verdict_label(score: int) -> Tuple[str, str]:
    """Return the (label, severity) pair of the TRUST_SCALE band containing ``score``.

    Falls back to ("Unknown", "warning") when no band matches.
    """
    match = next(
        ((name, level) for low, high, name, level in TRUST_SCALE if low <= score <= high),
        None,
    )
    if match is None:
        return "Unknown", "warning"
    return match
32
+
33
+
34
def get_score_color(score: int) -> str:
    """Linearly interpolate Red (#E53935) → Amber (#FFA726) → Green (#43A047).

    The score is clamped to 0-100; 0-50 blends red into amber and 50-100 blends
    amber into green. Returns an upper-case "#RRGGBB" string.
    """
    red = (0xE5, 0x39, 0x35)
    amber = (0xFF, 0xA7, 0x26)
    green = (0x43, 0xA0, 0x47)

    clamped = max(0, min(100, score))
    if clamped <= 50:
        start, end, t = red, amber, clamped / 50.0
    else:
        start, end, t = amber, green, (clamped - 50) / 50.0

    channels = [int(round(a + (b - a) * t)) for a, b in zip(start, end)]
    return "#{:02X}{:02X}{:02X}".format(*channels)
screenshot_service.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import List, Tuple
5
+
6
+ import numpy as np
7
+ from loguru import logger
8
+ from PIL import Image
9
+
10
+ from models.model_loader import get_model_loader
11
+
12
+
13
@dataclass
class OCRBox:
    """One text region returned by the OCR engine."""

    # Recognised text for the region.
    text: str
    # Four corner points of the region: [[x, y], ...].
    bbox: List[List[int]]
    # Confidence value reported by the OCR engine for this region.
    confidence: float
18
+
19
+
20
@dataclass
class SuspiciousPhrase:
    """A manipulation indicator located on the screenshot via its OCR box."""

    # The matched phrase.
    text: str
    # Bounding box of the OCR region that contains the phrase.
    bbox: List[List[int]]
    # Indicator metadata copied from the matching manipulation indicator.
    pattern_type: str
    severity: str
    description: str
27
+
28
+
29
@dataclass
class LayoutAnomaly:
    """A heuristic layout finding derived from OCR bounding boxes."""

    # Anomaly kind: misalignment / font_mismatch / uneven_spacing.
    type: str
    severity: str
    description: str
    confidence: float
35
+
36
+
37
def run_ocr(pil_img: Image.Image) -> List[OCRBox]:
    """Run the OCR engine on an image and return one OCRBox per detected region.

    The image is converted to RGB first; coordinates are coerced to int and the
    confidence to float.
    """
    ocr_engine = get_model_loader().load_ocr_engine()
    rgb_pixels = np.array(pil_img.convert("RGB"))
    detections = ocr_engine.readtext(rgb_pixels, detail=1, paragraph=False)
    boxes: List[OCRBox] = [
        OCRBox(
            text=str(raw_text),
            bbox=[[int(pt[0]), int(pt[1])] for pt in quad],
            confidence=float(score),
        )
        for quad, raw_text, score in detections
    ]
    logger.info(f"OCR extracted {len(boxes)} text regions")
    return boxes
50
+
51
+
52
def extract_full_text(boxes: List[OCRBox]) -> str:
    """Join the text of all non-blank OCR boxes with single spaces, in order."""
    fragments = [box.text for box in boxes if box.text.strip()]
    return " ".join(fragments)
54
+
55
+
56
def map_phrases_to_boxes(boxes: List[OCRBox], manipulation_indicators) -> List[SuspiciousPhrase]:
    """Attach each manipulation indicator to the first OCR box containing its text.

    Matching is a case-insensitive substring test; indicators with no matching
    box are dropped.
    """
    phrases: List[SuspiciousPhrase] = []
    for indicator in manipulation_indicators:
        wanted = indicator.matched_text.lower()
        host = next((box for box in boxes if wanted in box.text.lower()), None)
        if host is None:
            continue
        phrases.append(SuspiciousPhrase(
            text=indicator.matched_text,
            bbox=host.bbox,
            pattern_type=indicator.pattern_type,
            severity=indicator.severity,
            description=indicator.description,
        ))
    return phrases
72
+
73
+
74
def detect_layout_anomalies(boxes: List[OCRBox]) -> List[LayoutAnomaly]:
    """Run heuristic layout checks on OCR bounding boxes.

    Three independent checks, each adding at most one anomaly:
      * font_mismatch  - coefficient of variation of box heights above 0.7
      * misalignment   - fewer than 40% of left edges lie within 20 px of the
                         median left edge (only with more than 4 boxes)
      * uneven_spacing - coefficient of variation of the positive vertical gaps
                         between sorted box tops above 1.5 (needs 4+ boxes and
                         3+ positive gaps)

    Returns an empty list when fewer than 3 boxes are supplied.
    """
    out: List[LayoutAnomaly] = []
    if len(boxes) < 3:
        return out

    heights = []
    x_lefts = []
    for b in boxes:
        ys = [p[1] for p in b.bbox]
        xs = [p[0] for p in b.bbox]
        heights.append(max(ys) - min(ys))
        x_lefts.append(min(xs))

    h_arr = np.array(heights, dtype=float)
    if h_arr.mean() > 0:
        cv_h = float(h_arr.std() / h_arr.mean())
        if cv_h > 0.7:
            out.append(LayoutAnomaly(
                type="font_mismatch",
                severity="medium" if cv_h < 1.2 else "high",
                description=f"High variance in text heights (cv={cv_h:.2f}) — mixed fonts/sizes possible",
                confidence=min(cv_h / 1.5, 1.0),
            ))

    x_arr = np.array(x_lefts, dtype=float)
    if x_arr.std() > 0 and len(x_arr) > 4:
        # The median is loop-invariant: compute it once rather than per element.
        median_left = np.median(x_arr)
        clustered = sum(1 for x in x_arr if abs(x - median_left) < 20)
        align_ratio = clustered / len(x_arr)
        if align_ratio < 0.4:
            out.append(LayoutAnomaly(
                type="misalignment",
                severity="low",
                description=f"Only {align_ratio*100:.0f}% of text blocks share left-alignment — unusual layout",
                confidence=1.0 - align_ratio,
            ))

    if len(boxes) >= 4:
        tops = sorted(min(p[1] for p in b.bbox) for b in boxes)
        gaps = np.diff(tops)
        gaps = gaps[gaps > 0]  # boxes sharing a top edge contribute no gap
        if len(gaps) >= 3 and gaps.mean() > 0:
            cv_g = float(gaps.std() / gaps.mean())
            if cv_g > 1.5:
                out.append(LayoutAnomaly(
                    type="uneven_spacing",
                    severity="low",
                    description=f"Irregular vertical spacing between text blocks (cv={cv_g:.2f})",
                    confidence=min(cv_g / 2.5, 1.0),
                ))

    return out
test_image_classify.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Phase 1.2 smoke test: download a sample image and run the ViT classifier.
2
+
3
+ Run from backend/:
4
+ .venv/Scripts/python.exe scripts/test_image_classify.py
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import sys
9
+ import urllib.request
10
+ from pathlib import Path
11
+
12
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
13
+
14
+ import base64
15
+
16
+ from models.heatmap_generator import generate_heatmap_base64
17
+ from services.artifact_detector import scan_artifacts
18
+ from services.image_service import preprocess_and_classify
19
+ from utils.scoring import compute_authenticity_score, get_verdict_label
20
+
21
+ SAMPLE_URL = "https://picsum.photos/seed/deepshield/512/512"
22
+
23
+
24
def main() -> int:
    """Fetch the sample image, classify it, scan artifacts and save a heatmap.

    Prints each stage's result and returns 0 as the process exit code.
    """
    print(f"Fetching sample image: {SAMPLE_URL}")
    request = urllib.request.Request(SAMPLE_URL, headers={"User-Agent": "DeepShield/0.1"})
    with urllib.request.urlopen(request, timeout=30) as response:
        image_bytes = response.read()
    print(f" got {len(image_bytes)} bytes")

    print("Running classifier (first run will download model ~350MB)…")
    image, classification = preprocess_and_classify(image_bytes)
    print(f" image size: {image.size}")
    print(f" label: {classification.label}")
    print(f" confidence: {classification.confidence:.4f}")
    print(f" all scores: {classification.all_scores}")

    authenticity = compute_authenticity_score(classification.confidence, classification.label)
    verdict_text, verdict_severity = get_verdict_label(authenticity)
    print(f"\n authenticity_score: {authenticity}")
    print(f" verdict: {verdict_text} ({verdict_severity})")

    print("\nScanning artifact indicators\u2026")
    for indicator in scan_artifacts(image, image_bytes):
        print(f" [{indicator.severity.upper():6s}] {indicator.type}: {indicator.description} (conf {indicator.confidence:.2f})")

    print("\nGenerating Grad-CAM heatmap\u2026")
    data_url = generate_heatmap_base64(image)
    # The data URL is "<header>,<base64 payload>"; only the payload is written out.
    _header, payload = data_url.split(",", 1)
    target = Path(__file__).resolve().parent.parent / "heatmap_smoketest.png"
    target.write_bytes(base64.b64decode(payload))
    print(f" saved: {target}")
    print(f" data URL length: {len(data_url)} chars")
    return 0
55
+
56
+
57
+ if __name__ == "__main__":
58
+ raise SystemExit(main())
test_news_api.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Test script for the NewsData API integration."""
2
+ import asyncio
3
+ import sys
4
+ import os
5
+
6
+ # Add backend directory to sys.path so we can import modules
7
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
8
+
9
+ from config import settings
10
+ from services.news_lookup import search_news_full
11
+
12
async def test_news():
    """Manually exercise the NewsData lookup and print what it returns.

    Exits early with an error message when NEWS_API_KEY is not configured.
    """
    # Validate before slicing: an unset key may be None, which cannot be
    # sliced, and there is nothing useful to print for an empty key.
    if not settings.NEWS_API_KEY:
        print("ERROR: NEWS_API_KEY is empty in .env")
        return

    print(f"Testing News API Integration with key: {settings.NEWS_API_KEY[:6]}... (masked)")

    keywords = ["modi", "election", "bjp", "congress"]
    print(f"Searching for keywords: {keywords}")

    try:
        result = await search_news_full(keywords, limit=5)

        print("\n=== RAW RESULT ===")
        print(f"Total articles found: {result.total_articles}")

        print("\n=== TRUSTED SOURCES ===")
        for i, source in enumerate(result.trusted_sources, 1):
            date_str = str(source.published_at)[:10] if source.published_at else "Unknown date"
            print(f"{i}. [{source.relevance_score}] {source.source_name}: {source.title[:60]}... ({date_str})")

        print("\n=== CONTRADICTING EVIDENCE / FACT CHECKS ===")
        if not result.contradicting_evidence:
            print("No fact-check articles found for these keywords.")
        for i, ev in enumerate(result.contradicting_evidence, 1):
            print(f"{i}. {ev.source_name}: {ev.title[:60]}...")

    except Exception as e:
        # Top-level boundary of a manual script: report the failure and stop.
        print(f"\nERROR running test: {e}")
41
+
42
+ if __name__ == "__main__":
43
+ asyncio.run(test_news())
test_phase5.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Phase 5 smoke: unit-test news_lookup classification + endpoint wiring."""
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
9
+
10
+ from services.news_lookup import (
11
+ _domain_of, _is_factcheck, _relevance, search_news_full,
12
+ )
13
+
14
+
15
def test_domain():
    """_domain_of strips the scheme, path and a leading "www."."""
    expectations = {
        "https://www.reuters.com/article/x": "reuters.com",
        "https://snopes.com/fact-check/abc": "snopes.com",
    }
    for url, domain in expectations.items():
        assert _domain_of(url) == domain
    print("[OK] _domain_of")
19
+
20
+
21
def test_factcheck_detection():
    """_is_factcheck accepts fact-check domains or titles and rejects plain news."""
    positives = [
        ("https://snopes.com/x", "Claim about moon"),
        ("https://factly.in/x", ""),
        ("https://example.com/x", "FACT CHECK: viral video debunked"),
    ]
    for url, title in positives:
        assert _is_factcheck(url, title)
    assert not _is_factcheck("https://bbc.com/news/world-123", "Election results")
    print("[OK] _is_factcheck")
27
+
28
+
29
def test_relevance():
    """_relevance returns the expected weight for three sample domains."""
    expected_weights = (
        ("https://reuters.com/x", 1.0),
        ("https://ndtv.com/x", 0.85),
        ("https://random-blog.xyz/x", 0.5),
    )
    for url, weight in expected_weights:
        assert _relevance(url) == weight
    print("[OK] _relevance weights")
34
+
35
+
36
async def test_empty_key_returns_empty():
    """With no API key configured the lookup is expected to return an empty result."""
    outcome = await search_news_full(["modi", "election"])
    assert outcome.trusted_sources == []
    assert outcome.contradicting_evidence == []
    assert outcome.total_articles == 0
    print(f"[OK] empty-key path -> {outcome}")
42
+
43
+
44
async def test_endpoint_wiring():
    """POST a sensational text to the locally running API and check the response shape."""
    import httpx

    payload = {"text": "BREAKING!!! You won't BELIEVE this SHOCKING miracle cure doctors don't want you to know!!! Click now!"}
    async with httpx.AsyncClient(timeout=180.0) as client:
        response = await client.post("http://127.0.0.1:8000/api/v1/analyze/text", json=payload)
    response.raise_for_status()
    body = response.json()

    assert body["media_type"] == "text"
    for required_key in ("trusted_sources", "contradicting_evidence"):
        assert required_key in body
    assert "news_lookup" in body["processing_summary"]["stages_completed"]

    verdict = body["verdict"]
    print(f"[OK] /analyze/text -> verdict={verdict['label']} "
          f"score={verdict['authenticity_score']} "
          f"trusted={len(body['trusted_sources'])} contradictions={len(body['contradicting_evidence'])}")
58
+
59
+
60
async def main():
    """Run the synchronous unit checks, then the async checks, in order."""
    for sync_check in (test_domain, test_factcheck_detection, test_relevance):
        sync_check()
    for async_check in (test_empty_key_returns_empty, test_endpoint_wiring):
        await async_check()
    print("\n=== Phase 5 smoke PASS ===")
67
+
68
+
69
+ if __name__ == "__main__":
70
+ asyncio.run(main())
test_text_analysis.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quick smoke test for sensationalism + manipulation detection."""
2
+ import sys
3
+ sys.path.insert(0, ".")
4
+
5
+ from services.text_service import score_sensationalism, detect_manipulation_indicators
6
+
7
# --- Sensationalism: a loud headline must score above 50 ---
loud_text = "BREAKING: You wont believe this SHOCKING truth! Experts confirm the most DEVASTATING scandal exposed!!!"
loud = score_sensationalism(loud_text)
print(f"Sensationalism: score={loud.score} level={loud.level}")
print(f" excl={loud.exclamation_count} caps={loud.caps_word_count} clickbait={loud.clickbait_matches} emotional={loud.emotional_word_count} superlative={loud.superlative_count}")
assert loud.score > 50, f"Expected high sensationalism, got {loud.score}"
assert loud.level in ("Medium", "High"), f"Expected Medium/High, got {loud.level}"
print(" PASS")

# --- Manipulation: vague attributions must yield at least three indicators ---
vague_text = "Sources say that experts confirm the shocking truth. Allegedly, everyone knows this is a proven fact."
indicators = detect_manipulation_indicators(vague_text)
print(f"\nManipulation indicators: {len(indicators)} found")
for found in indicators:
    print(f" [{found.severity}] {found.pattern_type}: \"{found.matched_text}\"")
assert len(indicators) >= 3, f"Expected >=3 indicators, got {len(indicators)}"
print(" PASS")

# --- Clean text: a neutral sentence must trigger neither detector ---
neutral_text = "The weather today is sunny with clear skies in New Delhi."
neutral_score = score_sensationalism(neutral_text)
neutral_hits = detect_manipulation_indicators(neutral_text)
print(f"\nClean text: sensationalism={neutral_score.score} ({neutral_score.level}), manipulation={len(neutral_hits)}")
assert neutral_score.score < 20, f"Expected low sensationalism for clean text, got {neutral_score.score}"
assert len(neutral_hits) == 0, f"Expected 0 manipulation indicators for clean text, got {len(neutral_hits)}"
print(" PASS")

print("\nAll tests passed!")
text_service.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass, field
5
+ from typing import List, Optional
6
+
7
+ from loguru import logger
8
+
9
+ from models.model_loader import get_model_loader
10
+
11
+ FAKE_TOKENS = ("fake", "false", "unreliable", "misinformation")
12
+
13
+ # --- Sensationalism patterns ---
14
+ CLICKBAIT_PATTERNS = [
15
+ (r"\byou won'?t believe\b", "clickbait"),
16
+ (r"\bbreaking\s*:", "clickbait"),
17
+ (r"\bshocking\s*:", "clickbait"),
18
+ (r"\bexclusive\s*:", "clickbait"),
19
+ (r"\bjust\s+in\s*:", "clickbait"),
20
+ (r"\burgent\s*:", "clickbait"),
21
+ (r"\bwhat\s+happens\s+next\b", "clickbait"),
22
+ (r"\bthis\s+will\s+change\b", "clickbait"),
23
+ (r"\b(?:everyone|nobody)\s+(?:is|was)\s+talking\b", "clickbait"),
24
+ ]
25
+ EMOTIONAL_WORDS = {
26
+ "outrage", "shocking", "horrifying", "disgusting", "amazing", "incredible",
27
+ "unbelievable", "devastating", "terrifying", "explosive", "bombshell",
28
+ "jaw-dropping", "heartbreaking", "furious", "scandal", "crisis",
29
+ "chaos", "destroyed", "slammed", "blasted", "exposed", "revealed",
30
+ }
31
+ SUPERLATIVES = {
32
+ "best", "worst", "greatest", "biggest", "most", "least",
33
+ "fastest", "deadliest", "largest", "smallest", "ultimate",
34
+ }
35
+
36
+ # --- Manipulation indicator patterns ---
37
# Each entry is (regex, pattern_type, severity, description).
MANIPULATION_PATTERNS = [
    # Unverified claims
    (r"\bsources?\s+(?:say|said|claim|report)\b", "unverified_claim", "medium",
     "Unverified source attribution without specific citation"),
    (r"\ballegedly\b", "unverified_claim", "low",
     "Hedging language suggests unverified information"),
    (r"\breports?\s+suggest\b", "unverified_claim", "medium",
     "Vague report attribution"),
    (r"\baccording\s+to\s+(?:some|many|several)\b", "unverified_claim", "medium",
     "Non-specific source attribution"),
    # Was r"\brunconfirmed\b": the stray "r" meant it could never match "unconfirmed".
    (r"\bunconfirmed\b", "unverified_claim", "medium",
     "Explicitly unconfirmed information"),
    # Emotional manipulation
    (r"\boutrage\b", "emotional_manipulation", "medium",
     "Emotional trigger word designed to provoke reaction"),
    (r"\bshocking\s+truth\b", "emotional_manipulation", "high",
     "Sensationalist phrase designed to manipulate reader emotion"),
    (r"\bwake\s+up\b", "emotional_manipulation", "medium",
     "Call-to-action implying hidden knowledge"),
    (r"\bthey\s+don'?t\s+want\s+you\s+to\s+know\b", "emotional_manipulation", "high",
     "Conspiracy framing language"),
    (r"\bopen\s+your\s+eyes\b", "emotional_manipulation", "medium",
     "Implies audience ignorance"),
    # False authority
    (r"\bexperts?\s+(?:confirm|say|agree|warn)\b", "false_authority", "medium",
     "Unnamed expert citation without specific attribution"),
    (r"\bscientists?\s+(?:confirm|prove|say)\b", "false_authority", "medium",
     "Unnamed scientist citation"),
    (r"\bstudies?\s+(?:show|prove|confirm)\b", "false_authority", "low",
     "Vague study reference without citation"),
    (r"\beveryone\s+knows\b", "false_authority", "medium",
     "Appeal to common knowledge fallacy"),
    (r"\bit'?s\s+(?:a\s+)?(?:well-?known|proven)\s+fact\b", "false_authority", "medium",
     "Assertion of fact without evidence"),
]
72
+
73
+ # NER entity labels to prefer for keyword extraction
74
+ _NER_PREFERRED = {"PERSON", "ORG", "GPE", "EVENT", "PRODUCT", "NORP"}
75
+
76
+
77
@dataclass
class TextClassification:
    """Result of running the fake/real text classifier."""

    # Highest-scoring label and its score.
    label: str
    confidence: float
    # Probability assigned to the fake class.
    fake_prob: float
    # Every label the model returned, mapped to its score.
    all_scores: dict[str, float]
83
+
84
+
85
@dataclass
class SensationalismResult:
    """Sensationalism score together with the raw counts behind it."""

    # Overall score, 0-100.
    score: int
    # Bucketed level: Low / Medium / High.
    level: str
    # Raw signal counts.
    exclamation_count: int
    caps_word_count: int
    clickbait_matches: int
    emotional_word_count: int
    superlative_count: int
94
+
95
+
96
@dataclass
class ManipulationIndicator:
    """One manipulation-pattern match found in a text."""

    # Category: unverified_claim / emotional_manipulation / false_authority.
    pattern_type: str
    # The matched text and its start/end positions.
    matched_text: str
    start_pos: int
    end_pos: int
    # Severity: low / medium / high.
    severity: str
    description: str
104
+
105
+
106
def detect_language(text: str) -> str:
    """Detect the primary language of text using langdetect.

    Returns the code reported by langdetect (e.g. 'en', 'hi'). Falls back to
    'en' when the text is empty or shorter than 10 characters after stripping,
    when langdetect is not installed, or when detection fails.
    """
    if not text or len(text.strip()) < 10:
        return "en"
    try:
        from langdetect import DetectorFactory, detect  # type: ignore

        # langdetect is non-deterministic unless seeded; pin the seed so the
        # same text is always routed to the same model.
        DetectorFactory.seed = 0
        lang = detect(text.strip())
        logger.info(f"Language detected: {lang}")
        return lang
    except ImportError:
        logger.debug("langdetect not installed — defaulting to 'en'")
        return "en"
    except Exception as e:
        # Best-effort: detection failure must not break the analysis pipeline.
        logger.debug(f"Language detection failed: {e} — defaulting to 'en'")
        return "en"
123
+
124
+
125
def _scores_to_classification(items) -> TextClassification:
    """Convert pipeline output to a TextClassification.

    Args:
        items: Iterable of ``{"label": ..., "score": ...}`` mappings.

    Returns:
        The top label and its score, the fake probability and the full score
        map. Empty input yields the same "unknown" result that classify_text
        returns for empty text, instead of raising from ``max()``.
    """
    scores = {i["label"]: float(i["score"]) for i in items}
    if not scores:
        return TextClassification("unknown", 0.0, 0.0, {})

    top_label, top_conf = max(scores.items(), key=lambda kv: kv[1])

    if "LABEL_0" in scores:
        # Generic label names: LABEL_0 is taken as the fake class.
        fake_prob = scores["LABEL_0"]
    else:
        # Descriptive label names: highest score among labels containing a fake token.
        fake_prob = max(
            (p for lbl, p in scores.items() if any(t in lbl.lower() for t in FAKE_TOKENS)),
            default=0.0,
        )
    return TextClassification(top_label, top_conf, fake_prob, scores)
139
+
140
+
141
def classify_text(text: str, language: Optional[str] = None) -> TextClassification:
    """Classify text as fake/real.

    Args:
        text: the text to classify; only the first 2000 characters of the
            stripped text are sent to the model.
        language: optional language code. Any value other than None, "" or
            "en" requests the multilingual model.

    Returns:
        The TextClassification for the text, or an "unknown" result with zero
        scores when the text is empty or whitespace-only.

    Routes to the multilingual model when language is non-English and that
    model is available; otherwise the default text model is used.
    """
    text = (text or "").strip()
    if not text:
        return TextClassification("unknown", 0.0, 0.0, {})

    loader = get_model_loader()

    pipe = None
    if language and language != "en":
        pipe = loader.load_multilang_text_model()
        if pipe is None:
            # NOTE(review): assumes the loader signals an unconfigured
            # multilingual model by returning None, as load_spacy_nlp does —
            # confirm. Falling back avoids calling None as a pipeline.
            logger.debug("Multilingual text model unavailable — using default text model")
    if pipe is None:
        pipe = loader.load_text_model()

    out = pipe(text[:2000], truncation=True, top_k=None)
    # The pipeline may wrap the per-label list in an outer list; unwrap it.
    items = out[0] if isinstance(out[0], list) else out
    clf = _scores_to_classification(items)
    logger.info(
        f"Text classify [{language or 'en'}] → {clf.label} @ {clf.confidence:.3f} "
        f"fake_p={clf.fake_prob:.3f}"
    )
    return clf
164
+
165
+
166
def score_sensationalism(text: str) -> SensationalismResult:
    """Rate how sensational *text* reads on a 0-100 scale.

    Five capped components are added together:
      * exclamation marks: 8 points each, at most 25
      * share of all-uppercase words longer than two characters:
        ratio x 200, at most 25
      * clickbait patterns found in the text: 12 points each, at most 25
      * emotional words: 6 points each, at most 15
      * superlatives: 5 points each, at most 10

    The level is "Low" below 30, "Medium" below 60, otherwise "High".
    Empty text yields an all-zero "Low" result.
    """
    if not text:
        return SensationalismResult(0, "Low", 0, 0, 0, 0, 0)

    tokens = text.split()
    word_total = max(len(tokens), 1)  # guards the ratio against division by zero

    excl = text.count("!")
    caps = len([tok for tok in tokens if len(tok) > 2 and tok.isupper()])

    clickbait = 0
    for pattern, _ in CLICKBAIT_PATTERNS:
        if re.search(pattern, text, re.IGNORECASE):
            clickbait += 1

    # Lower-case each token and trim surrounding punctuation once,
    # then reuse the result for both word-list lookups.
    bare_words = [tok.lower().strip(".,!?;:") for tok in tokens]
    emotional = len([w for w in bare_words if w in EMOTIONAL_WORDS])
    superlative = len([w for w in bare_words if w in SUPERLATIVES])

    excl_part = min(excl * 8, 25)
    caps_part = min(caps / word_total * 200, 25)
    clickbait_part = min(clickbait * 12, 25)
    emotional_part = min(emotional * 6, 15)
    superlative_part = min(superlative * 5, 10)
    raw = excl_part + caps_part + clickbait_part + emotional_part + superlative_part

    score = int(min(100, max(0, raw)))
    if score < 30:
        level = "Low"
    elif score < 60:
        level = "Medium"
    else:
        level = "High"

    logger.info(f"Sensationalism → {score} ({level}) excl={excl} caps={caps} cb={clickbait} emo={emotional}")
    return SensationalismResult(score, level, excl, caps, clickbait, emotional, superlative)
195
+
196
+
197
def detect_manipulation_indicators(text: str) -> List[ManipulationIndicator]:
    """Find every manipulation pattern occurrence in *text*.

    Each entry of MANIPULATION_PATTERNS is matched case-insensitively; every
    match becomes a ManipulationIndicator carrying the matched substring and
    its start/end offsets. The result is ordered by start offset. Empty text
    yields an empty list.
    """
    if not text:
        return []

    hits: List[ManipulationIndicator] = [
        ManipulationIndicator(
            pattern_type=kind,
            matched_text=match.group(),
            start_pos=match.start(),
            end_pos=match.end(),
            severity=level,
            description=note,
        )
        for regex, kind, level, note in MANIPULATION_PATTERNS
        for match in re.finditer(regex, text, re.IGNORECASE)
    ]
    # Stable sort: matches starting at the same offset keep pattern order
    ordered = sorted(hits, key=lambda hit: hit.start_pos)
    logger.info(f"Manipulation indicators → {len(ordered)} found")
    return ordered
215
+
216
+
217
def extract_entities(text: str, max_k: int = 6) -> List[str]:
    """Return up to *max_k* keywords for *text*, named entities first.

    Entities come from the spaCy pipeline provided by the model loader, run
    on the first 5000 characters. Entities whose label is in _NER_PREFERRED
    are listed before all others, and duplicates (compared case-insensitively)
    are dropped. If fewer than two entities are found, frequency-based
    keywords are appended.

    Frequency-based extraction is used on its own when the text is empty or
    shorter than 20 characters after stripping, when no spaCy pipeline is
    available, or when NER raises.
    """
    too_short = not text or len(text.strip()) < 20
    if too_short:
        return _extract_keywords_freq(text, max_k)

    nlp = get_model_loader().load_spacy_nlp()
    if nlp is None:
        # No spaCy pipeline available — frequency fallback
        return _extract_keywords_freq(text, max_k)

    try:
        doc = nlp(text[:5000])  # capped for performance

        high_value: List[str] = []
        remaining: List[str] = []
        seen: set[str] = set()

        for ent in doc.ents:
            surface = ent.text.strip()
            key = surface.lower()
            # Skip empty or one-character entities and case-insensitive repeats
            if len(surface) < 2 or key in seen:
                continue
            seen.add(key)
            bucket = high_value if ent.label_ in _NER_PREFERRED else remaining
            bucket.append(surface)

        entities = high_value + remaining

        if len(entities) < 2:
            # Too few entities — top up with frequency keywords not already present
            extras = [kw for kw in _extract_keywords_freq(text, max_k) if kw.lower() not in seen]
            return (entities + extras)[:max_k]

        logger.info(f"NER extracted {len(entities)} entities: {entities[:max_k]}")
        return entities[:max_k]

    except Exception as e:
        logger.warning(f"spaCy NER failed: {e} — falling back to frequency extraction")
        return _extract_keywords_freq(text, max_k)
264
+
265
+
266
+ def _extract_keywords_freq(text: str, max_k: int = 6) -> List[str]:
267
+ """Frequency-based keyword extraction (original implementation, kept as fallback)."""
268
+ stop = {
269
+ "the","a","an","is","are","was","were","be","been","being","to","of","and","or","but",
270
+ "in","on","at","for","with","by","from","as","that","this","it","its","has","have","had",
271
+ "will","would","can","could","should","may","might","do","does","did","not","no","so",
272
+ "than","then","there","their","they","them","we","our","you","your","he","she","his","her",
273
+ }
274
+ words = re.findall(r"[A-Za-z][A-Za-z\-']{2,}", text or "")
275
+ freq: dict[str, int] = {}
276
+ for w in words:
277
+ wl = w.lower()
278
+ if wl in stop:
279
+ continue
280
+ freq[wl] = freq.get(wl, 0) + 1
281
+ return [w for w, _ in sorted(freq.items(), key=lambda kv: (-kv[1], kv[0]))[:max_k]]
282
+
283
+
284
+ # Backward-compatible alias: code that still calls extract_keywords gets the NER-first behaviour of extract_entities
285
+ extract_keywords = extract_entities
v1/__init__.py ADDED
File without changes
v1/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (165 Bytes). View file
 
v1/__pycache__/analyze.cpython-311.pyc ADDED
Binary file (21.6 kB). View file
 
v1/__pycache__/auth.cpython-311.pyc ADDED
Binary file (3.82 kB). View file
 
v1/__pycache__/health.cpython-311.pyc ADDED
Binary file (556 Bytes). View file
 
v1/__pycache__/history.cpython-311.pyc ADDED
Binary file (5.19 kB). View file