Spaces:
Running
Running
Spyderzz commited on
Commit ·
0853b44
1
Parent(s): a648128
Initial deployment of DeepShield backend
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .env +39 -0
- Colab_ViT_Training.ipynb +233 -0
- Dockerfile +31 -0
- README.md +11 -10
- __init__.py +0 -0
- analyze.py +177 -0
- artifact_detector.py +229 -0
- auth.py +30 -0
- auth_service.py +67 -0
- common.py +88 -0
- config.py +53 -0
- database.py +28 -0
- datasets/__init__.py +0 -0
- datasets/build_manifest.py +93 -0
- datasets/download_dfdc_sample.py +44 -0
- datasets/download_ffhq.py +49 -0
- datasets/extract_frames.py +90 -0
- datasets/procure_all.ps1 +40 -0
- datasets/procure_all.sh +37 -0
- deepshield_13_5bcf1328.pdf +148 -0
- deps.py +46 -0
- download_ffpp.py +261 -0
- ela_service.py +88 -0
- exif_service.py +129 -0
- file_handler.py +96 -0
- generate_colab_nb.py +213 -0
- heatmap_generator.py +164 -0
- image_service.py +58 -0
- llm_explainer.py +182 -0
- main.py +59 -0
- model_loader.py +156 -0
- models.py +45 -0
- news_lookup.py +242 -0
- report.html +367 -0
- report_service.py +152 -0
- requirements.txt +50 -0
- router.py +10 -0
- scoring.py +46 -0
- screenshot_service.py +126 -0
- test_image_classify.py +58 -0
- test_news_api.py +43 -0
- test_phase5.py +70 -0
- test_text_analysis.py +34 -0
- text_service.py +285 -0
- v1/__init__.py +0 -0
- v1/__pycache__/__init__.cpython-311.pyc +0 -0
- v1/__pycache__/analyze.cpython-311.pyc +0 -0
- v1/__pycache__/auth.cpython-311.pyc +0 -0
- v1/__pycache__/health.cpython-311.pyc +0 -0
- v1/__pycache__/history.cpython-311.pyc +0 -0
.env
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ═══════════════════════════════════════
# DeepShield — Environment Configuration
# ═══════════════════════════════════════
# Copy this file to backend/.env and customize
#
# SECURITY: this template is tracked in version control. Never commit real
# credentials here — supply them through the deployment platform's secret
# store or an untracked local .env. Any credential that was ever committed
# to this file must be treated as leaked and rotated.

# Server
APP_HOST=0.0.0.0
APP_PORT=8000
DEBUG=true
CORS_ORIGINS=["http://localhost:5173"]

# Database
# For local dev: sqlite:///./deepshield.db
# For production (Neon/Supabase): postgresql://username:password@ep-cool...
# The production URL contains a password; set it as a deployment secret only.
DATABASE_URL=sqlite:///./deepshield.db

# File Upload
MAX_UPLOAD_SIZE_MB=100
UPLOAD_DIR=./temp_uploads
FILE_RETENTION_SECONDS=300

# AI Models
IMAGE_MODEL_ID=prithivMLmods/Deep-Fake-Detector-v2-Model
TEXT_MODEL_ID=jy46604790/Fake-News-Bert-Detect
DEVICE=cpu
PRELOAD_MODELS=true

# News API (optional — sign up at https://newsdata.io)
# Leave empty in the template; provide the real key as a deployment secret.
NEWS_API_KEY=
NEWS_API_BASE_URL=https://newsdata.io/api/1/news

# PDF Reports
REPORT_DIR=./temp_reports
REPORT_TTL_SECONDS=3600

# Auth — CHANGE JWT_SECRET_KEY IN PRODUCTION!
JWT_SECRET_KEY=change-me-in-production
JWT_ALGORITHM=HS256
JWT_EXPIRATION_MINUTES=1440
|
Colab_ViT_Training.ipynb
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "1e0e7b4a",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# DeepShield: FaceForensics++ ViT Training \n",
|
| 9 |
+
"Run this entirely in Google Colab.\n",
|
| 10 |
+
"**Before running**:\n",
|
| 11 |
+
"1. Go to `Runtime` -> `Change runtime type` -> select **T4 GPU**.\n",
|
| 12 |
+
"2. Run the cells below sequentially.\n"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"cell_type": "code",
|
| 17 |
+
"execution_count": null,
|
| 18 |
+
"id": "4fe293e7",
|
| 19 |
+
"metadata": {},
|
| 20 |
+
"outputs": [],
|
| 21 |
+
"source": [
|
| 22 |
+
"!pip install timm transformers datasets accelerate evaluate opencv-python\n"
|
| 23 |
+
]
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"cell_type": "code",
|
| 27 |
+
"execution_count": null,
|
| 28 |
+
"id": "c9387c0f",
|
| 29 |
+
"metadata": {},
|
| 30 |
+
"outputs": [],
|
| 31 |
+
"source": [
|
| 32 |
+
"# We create the download script inside the Colab environment\n",
|
| 33 |
+
"download_script = '''#!/usr/bin/env python\n",
|
| 34 |
+
"import argparse\n",
|
| 35 |
+
"import os\n",
|
| 36 |
+
"import urllib.request\n",
|
| 37 |
+
"import tempfile\n",
|
| 38 |
+
"import time\n",
|
| 39 |
+
"import sys\n",
|
| 40 |
+
"import json\n",
|
| 41 |
+
"from tqdm import tqdm\n",
|
| 42 |
+
"from os.path import join\n",
|
| 43 |
+
"\n",
|
| 44 |
+
"FILELIST_URL = 'misc/filelist.json'\n",
|
| 45 |
+
"DEEPFEAKES_DETECTION_URL = 'misc/deepfake_detection_filenames.json'\n",
|
| 46 |
+
"DEEPFAKES_MODEL_NAMES = ['decoder_A.h5', 'decoder_B.h5', 'encoder.h5',]\n",
|
| 47 |
+
"DATASETS = {\n",
|
| 48 |
+
" 'original': 'original_sequences/youtube',\n",
|
| 49 |
+
" 'Deepfakes': 'manipulated_sequences/Deepfakes',\n",
|
| 50 |
+
" 'Face2Face': 'manipulated_sequences/Face2Face',\n",
|
| 51 |
+
" 'FaceShifter': 'manipulated_sequences/FaceShifter',\n",
|
| 52 |
+
" 'FaceSwap': 'manipulated_sequences/FaceSwap',\n",
|
| 53 |
+
" 'NeuralTextures': 'manipulated_sequences/NeuralTextures'\n",
|
| 54 |
+
"}\n",
|
| 55 |
+
"ALL_DATASETS = ['original', 'Deepfakes', 'Face2Face', 'FaceShifter', 'FaceSwap', 'NeuralTextures']\n",
|
| 56 |
+
"COMPRESSION = ['raw', 'c23', 'c40']\n",
|
| 57 |
+
"TYPE = ['videos']\n",
|
| 58 |
+
"\n",
|
| 59 |
+
"def download_file(url, out_file):\n",
|
| 60 |
+
" os.makedirs(os.path.dirname(out_file), exist_ok=True)\n",
|
| 61 |
+
" if not os.path.isfile(out_file):\n",
|
| 62 |
+
" urllib.request.urlretrieve(url, out_file)\n",
|
| 63 |
+
"\n",
|
| 64 |
+
"def main():\n",
|
| 65 |
+
" parser = argparse.ArgumentParser()\n",
|
| 66 |
+
" parser.add_argument('output_path', type=str)\n",
|
| 67 |
+
" parser.add_argument('-d', '--dataset', type=str, default='all')\n",
|
| 68 |
+
" parser.add_argument('-c', '--compression', type=str, default='c40')\n",
|
| 69 |
+
" parser.add_argument('-t', '--type', type=str, default='videos')\n",
|
| 70 |
+
" parser.add_argument('-n', '--num_videos', type=int, default=50) # Small amount for tutorial\n",
|
| 71 |
+
" args = parser.parse_args()\n",
|
| 72 |
+
" \n",
|
| 73 |
+
" base_url = 'http://kaldir.vc.in.tum.de/faceforensics/v3/'\n",
|
| 74 |
+
" \n",
|
| 75 |
+
" datasets = [args.dataset] if args.dataset != 'all' else ALL_DATASETS\n",
|
| 76 |
+
" for dataset in datasets:\n",
|
| 77 |
+
" dataset_path = DATASETS[dataset]\n",
|
| 78 |
+
" print(f'Downloading {args.compression} of {dataset}')\n",
|
| 79 |
+
" \n",
|
| 80 |
+
" file_pairs = json.loads(urllib.request.urlopen(base_url + FILELIST_URL).read().decode(\"utf-8\"))\n",
|
| 81 |
+
" filelist = []\n",
|
| 82 |
+
" if 'original' in dataset_path:\n",
|
| 83 |
+
" for pair in file_pairs:\n",
|
| 84 |
+
" filelist += pair\n",
|
| 85 |
+
" else:\n",
|
| 86 |
+
" for pair in file_pairs:\n",
|
| 87 |
+
" filelist.append('_'.join(pair))\n",
|
| 88 |
+
" filelist.append('_'.join(pair[::-1]))\n",
|
| 89 |
+
" \n",
|
| 90 |
+
" filelist = filelist[:args.num_videos]\n",
|
| 91 |
+
" dataset_videos_url = base_url + f'{dataset_path}/{args.compression}/{args.type}/'\n",
|
| 92 |
+
" dataset_output_path = join(args.output_path, dataset_path, args.compression, args.type)\n",
|
| 93 |
+
" \n",
|
| 94 |
+
" for filename in tqdm(filelist):\n",
|
| 95 |
+
" download_file(dataset_videos_url + filename + \".mp4\", join(dataset_output_path, filename + \".mp4\"))\n",
|
| 96 |
+
"\n",
|
| 97 |
+
"if __name__ == \"__main__\":\n",
|
| 98 |
+
" main()\n",
|
| 99 |
+
"'''\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"with open(\"download_ffpp.py\", \"w\") as f:\n",
|
| 102 |
+
" f.write(download_script)\n",
|
| 103 |
+
"\n",
|
| 104 |
+
"!python download_ffpp.py ./data -d all -c c40 -t videos -n 50\n"
|
| 105 |
+
]
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"cell_type": "code",
|
| 109 |
+
"execution_count": null,
|
| 110 |
+
"id": "f33716f6",
|
| 111 |
+
"metadata": {},
|
| 112 |
+
"outputs": [],
|
| 113 |
+
"source": [
|
| 114 |
+
"import cv2\n",
|
| 115 |
+
"import os\n",
|
| 116 |
+
"import glob\n",
|
| 117 |
+
"from tqdm import tqdm\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"def extract_frames(video_folder, output_folder, label, max_frames=4):\n",
|
| 120 |
+
" os.makedirs(output_folder, exist_ok=True)\n",
|
| 121 |
+
" videos = glob.glob(os.path.join(video_folder, \"*.mp4\"))\n",
|
| 122 |
+
" \n",
|
| 123 |
+
" for vid_path in tqdm(videos, desc=f\"Extracting {label}\"):\n",
|
| 124 |
+
" vid_name = os.path.basename(vid_path).replace('.mp4','')\n",
|
| 125 |
+
" cap = cv2.VideoCapture(vid_path)\n",
|
| 126 |
+
" count = 0\n",
|
| 127 |
+
" while cap.isOpened() and count < max_frames:\n",
|
| 128 |
+
" ret, frame = cap.read()\n",
|
| 129 |
+
" if not ret: break\n",
|
| 130 |
+
" frame = cv2.resize(frame, (224, 224))\n",
|
| 131 |
+
" out_path = os.path.join(output_folder, f\"{vid_name}_f{count}.jpg\")\n",
|
| 132 |
+
" cv2.imwrite(out_path, frame)\n",
|
| 133 |
+
" count += 1\n",
|
| 134 |
+
" cap.release()\n",
|
| 135 |
+
"\n",
|
| 136 |
+
"# Extract Real\n",
|
| 137 |
+
"extract_frames('./data/original_sequences/youtube/c40/videos', './dataset/real', 'real')\n",
|
| 138 |
+
"\n",
|
| 139 |
+
"# Extract Fakes\n",
|
| 140 |
+
"fakes = ['Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures']\n",
|
| 141 |
+
"for f in fakes:\n",
|
| 142 |
+
" extract_frames(f'./data/manipulated_sequences/{f}/c40/videos', './dataset/fake', 'fake')\n"
|
| 143 |
+
]
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"cell_type": "code",
|
| 147 |
+
"execution_count": null,
|
| 148 |
+
"id": "b79cdd85",
|
| 149 |
+
"metadata": {},
|
| 150 |
+
"outputs": [],
|
| 151 |
+
"source": [
|
| 152 |
+
"import numpy as np\n",
|
| 153 |
+
"from datasets import load_dataset\n",
|
| 154 |
+
"from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer\n",
|
| 155 |
+
"import torch\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"# 1. Load Dataset\n",
|
| 158 |
+
"dataset = load_dataset('imagefolder', data_dir='./dataset')\n",
|
| 159 |
+
"# Split into train/validation\n",
|
| 160 |
+
"dataset = dataset['train'].train_test_split(test_size=0.1)\n",
|
| 161 |
+
"\n",
|
| 162 |
+
"# 2. Preprocessor\n",
|
| 163 |
+
"model_name_or_path = 'google/vit-base-patch16-224-in21k'\n",
|
| 164 |
+
"processor = ViTImageProcessor.from_pretrained(model_name_or_path)\n",
|
| 165 |
+
"\n",
|
| 166 |
+
"def transform(example_batch):\n",
|
| 167 |
+
" # Take a list of PIL images and turn them to pixel values\n",
|
| 168 |
+
" inputs = processor([x.convert(\"RGB\") for x in example_batch['image']], return_tensors='pt')\n",
|
| 169 |
+
" inputs['labels'] = example_batch['label']\n",
|
| 170 |
+
" return inputs\n",
|
| 171 |
+
"\n",
|
| 172 |
+
"prepared_ds = dataset.with_transform(transform)\n",
|
| 173 |
+
"\n",
|
| 174 |
+
"def collate_fn(batch):\n",
|
| 175 |
+
" return {\n",
|
| 176 |
+
" 'pixel_values': torch.stack([x['pixel_values'] for x in batch]),\n",
|
| 177 |
+
" 'labels': torch.tensor([x['labels'] for x in batch])\n",
|
| 178 |
+
" }\n",
|
| 179 |
+
"\n",
|
| 180 |
+
"# 3. Load Model\n",
|
| 181 |
+
"labels = dataset['train'].features['label'].names\n",
|
| 182 |
+
"model = ViTForImageClassification.from_pretrained(\n",
|
| 183 |
+
" model_name_or_path,\n",
|
| 184 |
+
" num_labels=len(labels),\n",
|
| 185 |
+
" id2label={str(i): c for i, c in enumerate(labels)},\n",
|
| 186 |
+
" label2id={c: str(i) for i, c in enumerate(labels)}\n",
|
| 187 |
+
")\n",
|
| 188 |
+
"\n",
|
| 189 |
+
"training_args = TrainingArguments(\n",
|
| 190 |
+
" output_dir=\"./vit-deepshield\",\n",
|
| 191 |
+
" per_device_train_batch_size=16,\n",
|
| 192 |
+
" eval_strategy=\"steps\",\n",
|
| 193 |
+
" num_train_epochs=3,\n",
|
| 194 |
+
" fp16=True, # Mixed precision for speed\n",
|
| 195 |
+
" save_steps=100,\n",
|
| 196 |
+
" eval_steps=100,\n",
|
| 197 |
+
" logging_steps=10,\n",
|
| 198 |
+
" learning_rate=2e-4,\n",
|
| 199 |
+
" save_total_limit=2,\n",
|
| 200 |
+
" remove_unused_columns=False,\n",
|
| 201 |
+
" push_to_hub=False,\n",
|
| 202 |
+
" load_best_model_at_end=True,\n",
|
| 203 |
+
")\n",
|
| 204 |
+
"\n",
|
| 205 |
+
"import evaluate\n",
|
| 206 |
+
"metric = evaluate.load(\"accuracy\")\n",
|
| 207 |
+
"def compute_metrics(p):\n",
|
| 208 |
+
" return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)\n",
|
| 209 |
+
"\n",
|
| 210 |
+
"trainer = Trainer(\n",
|
| 211 |
+
" model=model,\n",
|
| 212 |
+
" args=training_args,\n",
|
| 213 |
+
" data_collator=collate_fn,\n",
|
| 214 |
+
" compute_metrics=compute_metrics,\n",
|
| 215 |
+
" train_dataset=prepared_ds[\"train\"],\n",
|
| 216 |
+
" eval_dataset=prepared_ds[\"test\"],\n",
|
| 217 |
+
")\n",
|
| 218 |
+
"\n",
|
| 219 |
+
"# 4. Train\n",
|
| 220 |
+
"train_results = trainer.train()\n",
|
| 221 |
+
"trainer.save_model(\"deepshield_vit_model\")\n",
|
| 222 |
+
"processor.save_pretrained(\"deepshield_vit_model\")\n",
|
| 223 |
+
"trainer.log_metrics(\"train\", train_results.metrics)\n",
|
| 224 |
+
"trainer.save_metrics(\"train\", train_results.metrics)\n",
|
| 225 |
+
"trainer.save_state()\n",
|
| 226 |
+
"print(\"Training Complete! The model is saved to ./deepshield_vit_model\")\n"
|
| 227 |
+
]
|
| 228 |
+
}
|
| 229 |
+
],
|
| 230 |
+
"metadata": {},
|
| 231 |
+
"nbformat": 4,
|
| 232 |
+
"nbformat_minor": 5
|
| 233 |
+
}
|
Dockerfile
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base image with Python 3.10
FROM python:3.10-slim

# Set the working directory
WORKDIR /app

# Install system dependencies required for OpenCV, PyTorch, etc.
# libgl1 is used instead of libgl1-mesa-glx: the latter is a transitional
# package that newer Debian releases no longer provide, which makes
# `apt-get install` fail when the slim base image moves to such a release.
RUN apt-get update && apt-get install -y \
    libgl1 \
    libglib2.0-0 \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy the requirements file into the container
COPY requirements.txt .

# Install Python dependencies
# Using --no-cache-dir keeps the Docker image smaller
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the backend code
# NOTE(review): this copies the whole build context, including any .env file
# present in it — consider a .dockerignore so secrets are not baked into the image.
COPY . .

# Create directories for models and temporary uploads if they don't exist
RUN mkdir -p /app/temp_uploads /app/models

# Expose port 7860 (This is the default port required by Hugging Face Spaces)
EXPOSE 7860

# Run the FastAPI server on port 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
-
|
| 2 |
-
title: Deepshield
|
| 3 |
-
emoji: 🏆
|
| 4 |
-
colorFrom: yellow
|
| 5 |
-
colorTo: purple
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
license: mit
|
| 9 |
-
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# backend/training
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
Training pipeline for the DeepShield image detector (BUILD_PLAN2 Phase 11).
|
| 4 |
+
|
| 5 |
+
| Phase | Module |
|
| 6 |
+
|---|---|
|
| 7 |
+
| 11.1 Dataset procurement | [`datasets/`](./datasets/) — see [../../docs/datasets.md](../../docs/datasets.md) |
|
| 8 |
+
| 11.2 Training | `dataset.py`, `train_convnext.py` (pending) |
|
| 9 |
+
| 11.2 Calibration | `calibrate.py` (pending) |
|
| 10 |
+
| 11.2 Evaluation | `eval.py` (pending) |
|
| 11 |
+
|
| 12 |
+
Run `bash datasets/procure_all.sh` to build `./data/manifest.csv`.
|
__init__.py
ADDED
|
File without changes
|
analyze.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
|
| 7 |
+
from schemas.common import (
|
| 8 |
+
ArtifactIndicator,
|
| 9 |
+
ContradictingEvidence,
|
| 10 |
+
ExifSummary,
|
| 11 |
+
LLMExplainabilitySummary,
|
| 12 |
+
ProcessingSummary,
|
| 13 |
+
TrustedSource,
|
| 14 |
+
TruthOverride,
|
| 15 |
+
Verdict,
|
| 16 |
+
VLMBreakdown,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class SensationalismBreakdown(BaseModel):
    # Sensationalism summary: an overall score/level plus per-signal counters.
    # Every field has a default, so the model can be built with no arguments
    # (the explainability models below use it that way as a default value).
    score: int = 0
    level: str = "Low"
    exclamation_count: int = 0
    caps_word_count: int = 0
    clickbait_matches: int = 0
    emotional_word_count: int = 0
    superlative_count: int = 0
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ManipulationIndicatorOut(BaseModel):
    # One matched manipulation pattern; all fields are required.
    pattern_type: str
    matched_text: str
    # NOTE(review): presumably character offsets of matched_text in the
    # analysed text — confirm against the producer.
    start_pos: int
    end_pos: int
    severity: str
    description: str
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class TextExplainability(BaseModel):
    # Explainability payload used as TextAnalysisResponse.explainability.
    # Only fake_probability and top_label are required; the rest have defaults.
    fake_probability: float
    top_label: str
    all_scores: dict = {}
    keywords: List[str] = []
    sensationalism: SensationalismBreakdown = SensationalismBreakdown()
    manipulation_indicators: List[ManipulationIndicatorOut] = []
    detected_language: str = "en"  # ISO 639-1 code, e.g. "en", "hi"
    truth_override: TruthOverride | None = None
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class TextAnalysisResponse(BaseModel):
    # Response body for a text analysis; media_type defaults to "text".
    # Required: analysis_id, timestamp, verdict, explainability,
    # processing_summary. Everything else has a default.
    analysis_id: str
    record_id: int = 0
    media_type: str = "text"
    timestamp: str
    verdict: Verdict
    explainability: TextExplainability
    llm_summary: LLMExplainabilitySummary | None = None
    trusted_sources: List[TrustedSource] = []
    contradicting_evidence: List[ContradictingEvidence] = []
    processing_summary: ProcessingSummary
    # Fixed disclaimer included by default in every response.
    responsible_ai_notice: str = (
        "AI-based analysis may not be 100% accurate. Cross-check with trusted sources before sharing."
    )
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class OCRBoxOut(BaseModel):
    # One OCR text region with its bounding box and confidence.
    text: str
    # NOTE(review): presumably a list of [x, y] corner points — confirm
    # against the OCR producer.
    bbox: List[List[int]]
    confidence: float
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class SuspiciousPhraseOut(BaseModel):
    # A flagged phrase together with its bounding box; all fields are required.
    text: str
    bbox: List[List[int]]  # same nested-int shape as OCRBoxOut.bbox
    pattern_type: str
    severity: str
    description: str
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class LayoutAnomalyOut(BaseModel):
    # One layout anomaly finding; all fields are required.
    type: str
    severity: str
    description: str
    confidence: float
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class ScreenshotExplainability(BaseModel):
    # Explainability payload used as ScreenshotAnalysisResponse.explainability.
    # Every field has a default, so an empty instance is valid.
    extracted_text: str = ""
    ocr_boxes: List[OCRBoxOut] = []
    fake_probability: float = 0.0
    sensationalism: SensationalismBreakdown = SensationalismBreakdown()
    suspicious_phrases: List[SuspiciousPhraseOut] = []
    layout_anomalies: List[LayoutAnomalyOut] = []
    keywords: List[str] = []
    detected_language: str = "en"
    truth_override: TruthOverride | None = None
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class ScreenshotAnalysisResponse(BaseModel):
    # Response body for a screenshot analysis; media_type defaults to
    # "screenshot". Required: analysis_id, timestamp, verdict,
    # explainability, processing_summary.
    analysis_id: str
    record_id: int = 0
    media_type: str = "screenshot"
    timestamp: str
    verdict: Verdict
    explainability: ScreenshotExplainability
    llm_summary: LLMExplainabilitySummary | None = None
    trusted_sources: List[TrustedSource] = []
    contradicting_evidence: List[ContradictingEvidence] = []
    processing_summary: ProcessingSummary
    # Fixed disclaimer included by default in every response.
    responsible_ai_notice: str = (
        "AI-based analysis may not be 100% accurate. Cross-check with trusted sources before sharing."
    )
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
class ImageExplainability(BaseModel):
    # Explainability payload used as ImageAnalysisResponse.explainability.
    # Every field has a default. Unlike the text/screenshot/video responses,
    # the LLM summary is carried here rather than on the response model.
    heatmap_base64: str = ""
    ela_base64: str = ""
    boxes_base64: str = ""
    heatmap_status: str = "success"  # success | failed | degraded
    artifact_indicators: List[ArtifactIndicator] = []
    exif: ExifSummary | None = None
    llm_summary: LLMExplainabilitySummary | None = None
    vlm_breakdown: VLMBreakdown | None = None
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class FrameAnalysisOut(BaseModel):
    # Per-frame result listed in VideoExplainability.frames.
    index: int
    timestamp_s: float
    label: str
    confidence: float
    suspicious_prob: float
    is_suspicious: bool
    # Both flags default to False when the producer does not set them.
    has_face: bool = False
    scored: bool = False
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
class VideoExplainability(BaseModel):
    # Aggregate statistics over sampled video frames, plus the per-frame list.
    # Counts and probability aggregates are required; face-related fields and
    # the two lists have defaults.
    num_frames_sampled: int
    num_face_frames: int = 0
    num_suspicious_frames: int
    mean_suspicious_prob: float
    max_suspicious_prob: float
    suspicious_ratio: float
    insufficient_faces: bool = False
    suspicious_timestamps: List[float] = []
    frames: List[FrameAnalysisOut] = []
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class VideoAnalysisResponse(BaseModel):
    # Response body for a video analysis; media_type defaults to "video".
    # Required: analysis_id, timestamp, verdict, explainability,
    # processing_summary.
    analysis_id: str
    record_id: int = 0
    media_type: str = "video"
    timestamp: str
    verdict: Verdict
    explainability: VideoExplainability
    llm_summary: LLMExplainabilitySummary | None = None
    trusted_sources: List[TrustedSource] = []
    contradicting_evidence: List[ContradictingEvidence] = []
    processing_summary: ProcessingSummary
    # Fixed disclaimer included by default in every response.
    responsible_ai_notice: str = (
        "AI-based analysis may not be 100% accurate. Cross-check with trusted sources before sharing."
    )
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
class ImageAnalysisResponse(BaseModel):
    # Response body for an image analysis; media_type defaults to "image".
    # Has no top-level llm_summary field: ImageExplainability declares one.
    analysis_id: str
    record_id: int = 0
    media_type: str = "image"
    timestamp: str
    verdict: Verdict
    explainability: ImageExplainability
    trusted_sources: List[TrustedSource] = []
    contradicting_evidence: List[ContradictingEvidence] = []
    processing_summary: ProcessingSummary
    # Fixed disclaimer included by default in every response.
    responsible_ai_notice: str = (
        "AI-based analysis may not be 100% accurate. Cross-check with trusted sources before sharing."
    )
|
artifact_detector.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import io
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
from loguru import logger
|
| 8 |
+
from PIL import Image
|
| 9 |
+
|
| 10 |
+
from schemas.common import ArtifactIndicator
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _severity_from_score(score: float) -> str:
    """Bucket a suspiciousness score into a severity label.

    Returns ``"high"`` for scores of at least 0.7, ``"medium"`` for scores of
    at least 0.4, and ``"low"`` for everything else.
    """
    # Thresholds are checked from the strictest down; the first match wins.
    for threshold, label in ((0.7, "high"), (0.4, "medium")):
        if score >= threshold:
            return label
    return "low"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ---------- 1. GAN high-frequency signature (FFT) ----------
|
| 22 |
+
def detect_gan_hf_artifact(pil_img: Image.Image) -> ArtifactIndicator | None:
    """Score how much of the image's spectral energy sits at high frequencies.

    The image is converted to luminance, downsampled so its longest side is at
    most 512 px, and transformed with a centred 2-D FFT. The share of spectral
    magnitude lying beyond half the maximum radius is mapped linearly onto a
    [0, 1] suspiciousness score (ratio 0.10 -> 0.0, ratio 0.30 -> 1.0).

    Real photos typically follow a ~1/f spectrum; many GAN outputs show
    elevated HF energy or spectral peaks.

    Args:
        pil_img: Image to analyse.

    Returns:
        An ``ArtifactIndicator`` of type ``"gan_artifact"`` whose confidence is
        the score, or ``None`` if any step raises (the failure is logged as a
        warning rather than propagated).
    """
    try:
        gray = np.asarray(pil_img.convert("L"), dtype=np.float32)
        # downsample for speed
        longest_side = max(gray.shape)
        if longest_side > 512:
            import cv2

            scale = 512 / longest_side
            # Clamp to at least 1 px: on extreme aspect ratios the short side
            # would otherwise truncate to 0, making cv2.resize raise and the
            # whole detector silently return None.
            target_w = max(1, int(gray.shape[1] * scale))
            target_h = max(1, int(gray.shape[0] * scale))
            gray = cv2.resize(gray, (target_w, target_h))

        # Magnitude spectrum with the zero-frequency bin shifted to the centre.
        mag = np.abs(np.fft.fftshift(np.fft.fft2(gray)))
        h, w = mag.shape
        cy, cx = h // 2, w // 2

        # Radial distance of every bin from the spectrum centre.
        y, x = np.ogrid[:h, :w]
        r = np.sqrt((x - cx) ** 2 + (y - cy) ** 2)
        r_max = np.sqrt(cx * cx + cy * cy)
        hf_mask = r > (0.5 * r_max)

        total = float(mag.sum() + 1e-9)  # epsilon avoids division by zero
        hf = float(mag[hf_mask].sum())
        ratio = hf / total  # typically 0.05–0.20 for natural photos

        # normalize to [0,1] suspiciousness
        score = max(0.0, min(1.0, (ratio - 0.10) / 0.20))
        sev = _severity_from_score(score)
        return ArtifactIndicator(
            type="gan_artifact",
            severity=sev,
            description=(
                f"High-frequency energy ratio {ratio:.3f} — "
                + ("elevated HF energy consistent with GAN/diffusion outputs" if score > 0.4
                   else "natural frequency falloff")
            ),
            confidence=float(score),
        )
    except Exception as e:  # noqa: BLE001
        # Best-effort detector: a failure here must not abort the analysis.
        logger.warning(f"GAN HF detection failed: {e}")
        return None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ---------- 2. JPEG quantization table anomaly ----------
|
| 68 |
+
# Maps an integer key to a (low, high) range of quantization-table sums.
# NOTE(review): keys look like JPEG quality levels — confirm.
# NOTE(review): not referenced by any code visible in this part of the file;
# confirm it is still needed or wire it into detect_compression_anomaly.
_STANDARD_Q_SUMS = {  # rough heuristic: camera JPEGs fall in these ranges
    50: (1500, 4500),
    75: (600, 2500),
    90: (200, 1000),
    95: (100, 600),
}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def detect_compression_anomaly(raw_bytes: bytes) -> ArtifactIndicator | None:
    """Heuristically score an image's JPEG quantization tables.

    Non-JPEG input and JPEGs without readable tables yield a fixed
    low-severity indicator. Otherwise each table's entries are summed and
    three rules add to a suspiciousness score that is clamped to [0, 1]:
    an unusual table count, a very low sum, and a very high sum.

    Args:
        raw_bytes: Encoded image file contents.

    Returns:
        An ``ArtifactIndicator`` of type ``"compression"``, or ``None`` if any
        step raises (the failure is logged as a warning).
    """
    try:
        image = Image.open(io.BytesIO(raw_bytes))

        if image.format != "JPEG":
            return ArtifactIndicator(
                type="compression",
                severity="low",
                description=f"Non-JPEG format ({image.format}); compression signature not available",
                confidence=0.1,
            )

        q_tables = getattr(image, "quantization", None)
        if not q_tables:
            return ArtifactIndicator(
                type="compression",
                severity="low",
                description="No JPEG quantization tables readable",
                confidence=0.2,
            )

        sums = [int(sum(table)) for table in q_tables.values()]
        table_count = len(sums)

        # (triggered, weight, reason) — evaluated in this order so the reasons
        # appear in a stable order in the description.
        # NOTE(review): a standard 8x8 table has 64 entries, each presumably
        # >= 1, so a sum below 60 looks unreachable — confirm the intended
        # threshold for the "very low" rule.
        rules = (
            (table_count not in (1, 2), 0.4, f"unusual table count ({table_count})"),
            (any(s < 60 for s in sums), 0.3, "very low quantization sums (possible re-encoding)"),
            (any(s > 8000 for s in sums), 0.2, "very high quantization sums"),
        )

        suspicious = 0.0
        reasons: list[str] = []
        for triggered, weight, reason in rules:
            if triggered:
                suspicious += weight
                reasons.append(reason)

        score = max(0.0, min(1.0, suspicious))

        if reasons:
            suffix = f"; {', '.join(reasons)}"
        else:
            suffix = "; within typical camera range"

        return ArtifactIndicator(
            type="compression",
            severity=_severity_from_score(score),
            description=f"JPEG Q-table sums {sums}" + suffix,
            confidence=float(score),
        )
    except Exception as e:  # noqa: BLE001
        # Best-effort detector: log and let the caller carry on without it.
        logger.warning(f"Compression anomaly detection failed: {e}")
        return None
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# ---------- 3. Facial boundary + 4. Lighting (MediaPipe) ----------
|
| 135 |
+
def detect_face_based_artifacts(pil_img: Image.Image) -> List[ArtifactIndicator]:
    """Compute face-contour and face-lighting indicators for an image.

    Runs the shared face detector on the image. When no face landmarks are
    found the result is empty. Otherwise a "facial_boundary" indicator is
    always appended, and a "lighting" indicator is appended when the landmark
    bounding box is larger than 4 px in both directions and its mean
    luminance is above 1e-3.

    Any exception (including a missing ``mediapipe`` install) is logged as a
    warning and whatever indicators were collected so far are returned.

    Args:
        pil_img: Image to analyze; it is converted to RGB before use.

    Returns:
        A list with 0, 1, or 2 indicators.
    """
    results: List[ArtifactIndicator] = []
    try:
        # NOTE(review): `mp` is not referenced below; the import appears to act
        # as an availability check (an ImportError lands in the except branch).
        import mediapipe as mp  # type: ignore

        from models.model_loader import get_model_loader

        detector = get_model_loader().load_face_detector()
        rgb = np.asarray(pil_img.convert("RGB"))
        h, w = rgb.shape[:2]
        mp_result = detector.process(rgb)

        if not mp_result.multi_face_landmarks:
            return results

        # Only the first detected face is analyzed.
        landmarks = mp_result.multi_face_landmarks[0].landmark

        # ----- Jaw boundary jitter -----
        # FaceMesh jaw/oval landmark indices (approximate face contour)
        JAW_IDX = [
            10, 338, 297, 332, 284, 251, 389, 356, 454, 323, 361,
            288, 397, 365, 379, 378, 400, 377, 152, 148, 176, 149,
            150, 136, 172, 58, 132, 93, 234, 127, 162, 21, 54, 103, 67, 109,
        ]
        # Landmark x/y are scaled by image width/height to get pixel positions.
        pts = np.array([(landmarks[i].x * w, landmarks[i].y * h) for i in JAW_IDX])
        # Second-difference magnitude = local curvature jitter
        diffs = np.diff(pts, axis=0)
        seconds = np.diff(diffs, axis=0)
        # Mean second-difference length, normalized by the longer image side.
        jitter = float(np.linalg.norm(seconds, axis=1).mean()) / max(w, h)
        # Map jitter linearly so 0.003 -> 0.0 and 0.013 -> 1.0, clamped to [0, 1].
        jitter_score = max(0.0, min(1.0, (jitter - 0.003) / 0.010))
        results.append(
            ArtifactIndicator(
                type="facial_boundary",
                severity=_severity_from_score(jitter_score),
                description=(
                    f"Jaw-contour jitter {jitter:.4f} (normalized) — "
                    + ("inconsistent boundary blending detected" if jitter_score > 0.4
                       else "face boundary appears smooth")
                ),
                confidence=float(jitter_score),
            )
        )

        # ----- Lighting inconsistency (per-quadrant luminance) -----
        # Bounding box of all landmarks, clipped to the image.
        xs = np.array([lm.x * w for lm in landmarks])
        ys = np.array([lm.y * h for lm in landmarks])
        x0, x1 = int(max(0, xs.min())), int(min(w, xs.max()))
        y0, y1 = int(max(0, ys.min())), int(min(h, ys.max()))
        if x1 > x0 + 4 and y1 > y0 + 4:
            face_crop = rgb[y0:y1, x0:x1]
            # Weighted RGB sum (0.299 / 0.587 / 0.114) as a luminance estimate.
            gray = 0.299 * face_crop[..., 0] + 0.587 * face_crop[..., 1] + 0.114 * face_crop[..., 2]
            hh, ww = gray.shape
            # Top-left, top-right, bottom-left, bottom-right quadrants.
            quads = [
                gray[: hh // 2, : ww // 2],
                gray[: hh // 2, ww // 2 :],
                gray[hh // 2 :, : ww // 2],
                gray[hh // 2 :, ww // 2 :],
            ]
            means = np.array([q.mean() for q in quads if q.size > 0])
            if means.size == 4 and means.mean() > 1e-3:
                # Coefficient of variation of the four quadrant means.
                imbalance = float(means.std() / means.mean())
                # Map linearly so 0.08 -> 0.0 and 0.28 -> 1.0, clamped to [0, 1].
                lighting_score = max(0.0, min(1.0, (imbalance - 0.08) / 0.20))
                results.append(
                    ArtifactIndicator(
                        type="lighting",
                        severity=_severity_from_score(lighting_score),
                        description=(
                            f"Luminance imbalance across face quadrants {imbalance:.3f} — "
                            + ("inconsistent lighting direction" if lighting_score > 0.4
                               else "lighting appears uniform")
                        ),
                        confidence=float(lighting_score),
                    )
                )
    except Exception as e:  # noqa: BLE001
        # Best-effort detector: failures are logged, never raised to the caller.
        logger.warning(f"Face-based artifact detection failed: {e}")

    return results
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
# ---------- Orchestrator ----------
|
| 219 |
+
def scan_artifacts(pil_img: Image.Image, raw_bytes: bytes) -> List[ArtifactIndicator]:
    """Run all artifact detectors and return the indicators they produce.

    The GAN high-frequency and compression detectors each contribute at most
    one indicator (``None`` results are dropped); the face-based detector
    contributes a list that is appended afterwards.
    """
    single_results = [
        detect_gan_hf_artifact(pil_img),
        detect_compression_anomaly(raw_bytes),
    ]
    found: List[ArtifactIndicator] = [res for res in single_results if res is not None]
    found.extend(detect_face_based_artifacts(pil_img))
    return found
|
auth.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, EmailStr, Field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class RegisterBody(BaseModel):
    """Fields accepted when registering a user.

    Presumably the body of the registration endpoint; the route itself is not
    visible in this module.
    """

    email: EmailStr  # validated as an e-mail address
    password: str = Field(min_length=6, max_length=128)  # 6 to 128 characters
    name: str | None = Field(default=None, max_length=255)  # optional, at most 255 characters
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class LoginBody(BaseModel):
    """Credentials submitted for login: e-mail address plus plain password."""

    email: EmailStr  # validated as an e-mail address
    password: str  # no length constraint is applied here
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class UserOut(BaseModel):
    """User fields exposed to clients; it carries no password field."""

    id: int
    email: str
    name: str | None = None  # optional display name
    created_at: datetime
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class TokenResponse(BaseModel):
    """Access token together with the user it was issued for."""

    access_token: str
    token_type: str = "bearer"  # defaults to the bearer scheme
    expires_in_minutes: int  # token lifetime, expressed in minutes
    user: UserOut
|
auth_service.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from datetime import datetime, timedelta, timezone
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
import bcrypt
|
| 7 |
+
from jose import JWTError, jwt
|
| 8 |
+
from sqlalchemy.orm import Session
|
| 9 |
+
|
| 10 |
+
from config import settings
|
| 11 |
+
from db.models import User
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _encode_pw(plain: str) -> bytes:
|
| 15 |
+
# bcrypt truncates to 72 bytes silently in some builds and hard-errors in others.
|
| 16 |
+
# Truncate explicitly so behavior is deterministic across versions.
|
| 17 |
+
return plain.encode("utf-8")[:72]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def hash_password(plain: str) -> str:
    """Hash *plain* with bcrypt using a fresh salt and return it as text."""
    secret = _encode_pw(plain)
    salt = bcrypt.gensalt()
    digest = bcrypt.hashpw(secret, salt)
    return digest.decode("utf-8")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def verify_password(plain: str, hashed: str) -> bool:
    """Return whether *plain* matches the stored bcrypt hash *hashed*.

    Any error raised while checking (for example a malformed hash) is treated
    as a failed verification rather than propagated.
    """
    try:
        matches = bcrypt.checkpw(_encode_pw(plain), hashed.encode("utf-8"))
    except Exception:
        return False
    return matches
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def create_access_token(user_id: int, email: str) -> str:
    """Build a signed JWT for the given user.

    The token carries ``sub`` (user id as a string), ``email``, ``iat`` and
    ``exp`` claims; the lifetime comes from ``settings.JWT_EXPIRATION_MINUTES``.
    """
    issued_at = datetime.now(timezone.utc)
    expires_at = issued_at + timedelta(minutes=settings.JWT_EXPIRATION_MINUTES)
    claims = {
        "sub": str(user_id),
        "email": email,
        "iat": int(issued_at.timestamp()),
        "exp": int(expires_at.timestamp()),
    }
    return jwt.encode(claims, settings.JWT_SECRET_KEY, algorithm=settings.JWT_ALGORITHM)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def decode_token(token: str) -> dict[str, Any] | None:
    """Decode and verify *token*; return its claims, or ``None`` on a JWT error."""
    try:
        claims = jwt.decode(token, settings.JWT_SECRET_KEY, algorithms=[settings.JWT_ALGORITHM])
    except JWTError:
        return None
    return claims
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def register_user(db: Session, email: str, password: str, name: str | None) -> User:
    """Create and persist a new user.

    The e-mail is stripped and lower-cased before storing, the password is
    stored only as a bcrypt hash, and an empty *name* is stored as ``None``.

    Raises:
        Exception: whatever ``db.commit()`` raises (for example a constraint
            violation on the e-mail column). The session is rolled back
            before the exception is re-raised so it remains usable.
    """
    email = email.strip().lower()
    user = User(email=email, password_hash=hash_password(password), name=(name or None))
    db.add(user)
    try:
        db.commit()
    except Exception:
        # A failed flush/commit leaves the session in a state where further
        # use raises until it is rolled back; do that here, then propagate.
        db.rollback()
        raise
    db.refresh(user)
    return user
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def authenticate(db: Session, email: str, password: str) -> User | None:
    """Return the user matching the credentials, or ``None`` if they are wrong.

    The e-mail is stripped and lower-cased before lookup.
    """
    normalized = email.strip().lower()
    candidate = db.query(User).filter(User.email == normalized).first()
    if candidate and verify_password(password, candidate.password_hash):
        return candidate
    return None
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def get_user(db: Session, user_id: int) -> User | None:
    """Look up a user by primary key; ``None`` when no row matches."""
    by_id = db.query(User).filter(User.id == user_id)
    return by_id.first()
|
common.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List, Optional
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Verdict(BaseModel):
    """Overall verdict for one analysis."""

    # Clear pydantic's protected namespaces so the `model_*` field names below
    # do not collide with the reserved "model_" prefix.
    model_config = ConfigDict(protected_namespaces=())

    label: str
    severity: str
    authenticity_score: int = Field(ge=0, le=100)  # integer in 0..100
    model_confidence: float = Field(ge=0.0, le=1.0)  # value in 0.0..1.0
    model_label: str
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ArtifactIndicator(BaseModel):
    """One detected artifact signal with a severity and a confidence."""

    type: str  # detector category, e.g. "compression", "facial_boundary", "lighting"
    severity: str  # low | medium | high
    description: str  # human-readable explanation of the finding
    confidence: float = Field(ge=0.0, le=1.0)  # value in 0.0..1.0
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class TrustedSource(BaseModel):
    """A source article with a relevance score."""

    source_name: str
    title: str
    url: str
    published_at: Optional[str] = None  # kept as a string; format not enforced here
    relevance_score: float = Field(ge=0.0, le=1.0)  # value in 0.0..1.0
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class ContradictingEvidence(BaseModel):
    """A source that contradicts the analyzed content."""

    source_name: str
    title: str
    url: str
    type: str = "fact_check"  # evidence category; defaults to "fact_check"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class TruthOverride(BaseModel):
    """Record of a score override; all fields default to "not applied" values."""

    applied: bool = False  # whether an override took place
    source_url: str = ""
    source_name: str = ""
    similarity: float = 0.0
    fake_prob_before: float = 0.0  # fake probability before the override
    fake_prob_after: float = 0.0  # fake probability after the override
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class ExifSummary(BaseModel):
    """Selected EXIF fields plus the trust adjustment derived from them."""

    # Every metadata field is optional and None when absent.
    make: Optional[str] = None
    model: Optional[str] = None
    datetime_original: Optional[str] = None
    gps_info: Optional[str] = None
    software: Optional[str] = None
    lens_model: Optional[str] = None
    trust_adjustment: int = 0  # negative = more real, positive = more fake
    trust_reason: str = ""  # explanation for trust_adjustment
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class LLMExplainabilitySummary(BaseModel):
    """LLM-generated explanation: a paragraph plus bullet points."""

    # Same setting Verdict and ProcessingSummary use: clear pydantic's
    # protected namespaces so the `model_used` field does not collide with
    # the reserved "model_" prefix.
    model_config = ConfigDict(protected_namespaces=())

    paragraph: str = ""
    bullets: List[str] = []
    model_used: str = ""  # identifier of the model that produced the text
    cached: bool = False  # whether the summary was served from a cache
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class VLMComponentScore(BaseModel):
    """Score and free-text notes for one visual component."""

    score: int = Field(ge=0, le=100, default=75)  # integer in 0..100; defaults to 75
    notes: str = ""
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class VLMBreakdown(BaseModel):
    """Per-component scores; each defaults to a default VLMComponentScore."""

    # Same setting Verdict and ProcessingSummary use: clear pydantic's
    # protected namespaces so the `model_used` field does not collide with
    # the reserved "model_" prefix.
    model_config = ConfigDict(protected_namespaces=())

    facial_symmetry: VLMComponentScore = VLMComponentScore()
    skin_texture: VLMComponentScore = VLMComponentScore()
    lighting_consistency: VLMComponentScore = VLMComponentScore()
    background_coherence: VLMComponentScore = VLMComponentScore()
    anatomy_hands_eyes: VLMComponentScore = VLMComponentScore()
    context_objects: VLMComponentScore = VLMComponentScore()
    model_used: str = ""  # identifier of the model that produced the scores
    cached: bool = False  # whether the breakdown was served from a cache
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class ProcessingSummary(BaseModel):
    """Bookkeeping for one analysis run: stages, duration and model."""

    # Clear pydantic's protected namespaces so the `model_used` field does not
    # collide with the reserved "model_" prefix.
    model_config = ConfigDict(protected_namespaces=())

    stages_completed: List[str]
    total_duration_ms: int  # duration in milliseconds
    model_used: str
|
config.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class Settings(BaseSettings):
    """Application configuration.

    Values are read from the environment and from a ``.env`` file (see
    ``model_config`` at the bottom); the literals below are the defaults.
    Unknown keys in the environment are ignored.
    """

    # Server
    APP_HOST: str = "0.0.0.0"
    APP_PORT: int = 8000
    DEBUG: bool = False
    CORS_ORIGINS: list[str] = ["http://localhost:5173"]

    # Database
    DATABASE_URL: str = "sqlite:///./deepshield.db"

    # File Upload
    MAX_UPLOAD_SIZE_MB: int = 100
    UPLOAD_DIR: str = "./temp_uploads"
    ALLOWED_IMAGE_TYPES: list[str] = ["image/jpeg", "image/png", "image/webp"]
    ALLOWED_VIDEO_TYPES: list[str] = ["video/mp4", "video/avi", "video/mov", "video/webm"]
    FILE_RETENTION_SECONDS: int = 300  # 5 minutes

    # AI Models
    IMAGE_MODEL_ID: str = "prithivMLmods/Deep-Fake-Detector-v2-Model"
    TEXT_MODEL_ID: str = "jy46604790/Fake-News-Bert-Detect"
    # Multilingual text model for non-English (Hindi etc.). Leave empty to fall back to TEXT_MODEL_ID.
    TEXT_MULTILANG_MODEL_ID: str = ""
    DEVICE: str = "cpu"
    PRELOAD_MODELS: bool = True  # preload models at startup

    # Phase 13: OCR language list (comma-separated ISO codes, e.g. "en,hi")
    OCR_LANGS: str = "en,hi"

    # News API
    NEWS_API_KEY: str = ""
    NEWS_API_BASE_URL: str = "https://newsdata.io/api/1/news"

    # Reports
    REPORT_DIR: str = "./temp_reports"
    REPORT_TTL_SECONDS: int = 3600  # 1h expiry

    # LLM Explainability (Phase 12)
    LLM_PROVIDER: str = "gemini"  # "gemini" | "openai"
    LLM_API_KEY: str = ""
    LLM_MODEL: str = "gemini-1.5-flash"  # or "gpt-4o-mini"

    # Auth
    # NOTE(review): the secret below is a placeholder default; it must be
    # overridden through the environment or .env for any real deployment.
    JWT_SECRET_KEY: str = "change-me-in-production"
    JWT_ALGORITHM: str = "HS256"
    JWT_EXPIRATION_MINUTES: int = 1440  # 24 hours

    # Load overrides from .env and ignore keys that are not declared above.
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
settings = Settings()
|
database.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy import create_engine
|
| 2 |
+
from sqlalchemy.orm import DeclarativeBase, sessionmaker
|
| 3 |
+
|
| 4 |
+
from config import settings
|
| 5 |
+
|
| 6 |
+
# Engine for the configured database URL. For SQLite URLs, check_same_thread
# is disabled so a connection may be used from a thread other than the one
# that created it; other backends get no extra connect arguments.
engine = create_engine(
    settings.DATABASE_URL,
    connect_args={"check_same_thread": False} if settings.DATABASE_URL.startswith("sqlite") else {},
)

# Session factory bound to the engine, with autocommit and autoflush disabled.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class Base(DeclarativeBase):
    """Declarative base class; init_db() creates tables from its metadata."""

    pass
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_db():
    """Yield a new database session and close it when the consumer is done.

    Written as a generator so the ``finally`` clause runs even when the
    consumer raises while holding the session.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def init_db():
    """Create every table registered on ``Base.metadata``."""
    # Importing the models module registers its tables on Base.metadata; the
    # import is local, presumably to avoid a circular import at module load.
    from db import models  # noqa: F401

    metadata = Base.metadata
    metadata.create_all(bind=engine)
|
datasets/__init__.py
ADDED
|
File without changes
|
datasets/build_manifest.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build a unified train/val/test manifest (70/15/15) across all dataset buckets.
|
| 2 |
+
|
| 3 |
+
Expected input layout (produced by the other scripts in this package):
|
| 4 |
+
|
| 5 |
+
data_root/
|
| 6 |
+
real/
|
| 7 |
+
ffpp_youtube/*.jpg # frames from FFPP original_sequences
|
| 8 |
+
ffhq/*.jpg # FFHQ thumbnails
|
| 9 |
+
|
| 10 |
+
fake/
|
| 11 |
+
ffpp_deepfakes/*.jpg
|
| 12 |
+
ffpp_face2face/*.jpg
|
| 13 |
+
ffpp_faceswap/*.jpg
|
| 14 |
+
ffpp_neuraltextures/*.jpg
|
| 15 |
+
ffpp_faceshifter/*.jpg
|
| 16 |
+
dfdc/*.jpg
|
| 17 |
+
|
| 18 |
+
The manifest is stratified by (label, source) so FFHQ stays represented
|
| 19 |
+
in val/test.
|
| 20 |
+
|
| 21 |
+
Usage:
|
| 22 |
+
python -m backend.training.datasets.build_manifest \
|
| 23 |
+
--data ./data --out ./data/manifest.csv --seed 42
|
| 24 |
+
"""
|
| 25 |
+
from __future__ import annotations
|
| 26 |
+
|
| 27 |
+
import argparse
|
| 28 |
+
import csv
|
| 29 |
+
import random
|
| 30 |
+
from collections import defaultdict
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
|
| 33 |
+
IMG_EXTS = {".jpg", ".jpeg", ".png"}


def collect(data_root: Path) -> list[tuple[str, str, str]]:
    """Collect labeled image files below *data_root*.

    Looks at ``data_root/real`` and ``data_root/fake``; each immediate
    sub-directory is one source bucket and is searched recursively for files
    whose suffix (case-insensitive) is in ``IMG_EXTS``.

    Returns:
        ``(absolute_path, label, source)`` tuples, ordered by label ("real"
        first), then source directory, then file path. The ordering is
        deterministic so that a seeded split of the result is reproducible
        regardless of filesystem enumeration order.
    """
    rows: list[tuple[str, str, str]] = []
    for label in ("real", "fake"):
        label_root = data_root / label
        # Skip missing label roots, and also entries that are not directories
        # (iterdir() would raise on a plain file).
        if not label_root.is_dir():
            continue
        for source_dir in sorted(p for p in label_root.iterdir() if p.is_dir()):
            # rglob() yields in filesystem order; sort it for reproducibility.
            for img in sorted(source_dir.rglob("*")):
                if img.suffix.lower() in IMG_EXTS and img.is_file():
                    rows.append((str(img.resolve()), label, source_dir.name))
    return rows
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def split(rows: list[tuple[str, str, str]], seed: int) -> dict[str, list[tuple[str, str, str]]]:
    """Split rows 70/15/15 into train/val/test, stratified by (label, source).

    Each (label, source) bucket is shuffled with one RNG seeded by *seed* and
    cut independently. Sizes are truncated, so the remainder of every bucket
    goes to "test".
    """
    grouped: dict[tuple[str, str], list[tuple[str, str, str]]] = defaultdict(list)
    for row in rows:
        grouped[(row[1], row[2])].append(row)

    rng = random.Random(seed)
    partitions = {"train": [], "val": [], "test": []}
    for members in grouped.values():
        rng.shuffle(members)
        count = len(members)
        train_end = int(0.70 * count)
        val_end = train_end + int(0.15 * count)
        partitions["train"] += members[:train_end]
        partitions["val"] += members[train_end:val_end]
        partitions["test"] += members[val_end:]
    return partitions
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def main() -> None:
    """CLI entry point: collect images, split them, and write the manifest CSV.

    Exits with an error message when no images are found under ``--data``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True, type=Path)
    parser.add_argument("--out", required=True, type=Path)
    parser.add_argument("--seed", type=int, default=42)
    opts = parser.parse_args()

    rows = collect(opts.data)
    if not rows:
        raise SystemExit(f"No images found under {opts.data}")

    splits = split(rows, opts.seed)
    opts.out.parent.mkdir(parents=True, exist_ok=True)
    with opts.out.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(["path", "label", "source", "split"])
        # One CSV row per image, tagged with the split it landed in.
        writer.writerows(
            [path, label, source, split_name]
            for split_name, items in splits.items()
            for path, label, source in items
        )

    counts = {split_name: len(items) for split_name, items in splits.items()}
    print(f"Manifest: {opts.out}")
    print(f"Totals: {counts} (overall {sum(counts.values())})")
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
if __name__ == "__main__":
|
| 93 |
+
main()
|
datasets/download_dfdc_sample.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Download a sample of the DFDC (Deepfake Detection Challenge) Preview dataset.
|
| 2 |
+
|
| 3 |
+
The full DFDC is ~470GB; the *preview* release (~5GB, Kaggle) is enough for
|
| 4 |
+
diversity augmentation alongside FFPP.
|
| 5 |
+
|
| 6 |
+
Requires the Kaggle CLI (`pip install kaggle`) and ~/.kaggle/kaggle.json.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python -m backend.training.datasets.download_dfdc_sample --output ./data/dfdc_preview
|
| 10 |
+
"""
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import shutil
|
| 15 |
+
import subprocess
|
| 16 |
+
import sys
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def main() -> None:
    """CLI entry point: download a Kaggle competition archive via the Kaggle CLI.

    Exits with status 2 when the ``kaggle`` executable is not on PATH; a
    failing download raises ``subprocess.CalledProcessError``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", required=True, type=Path)
    parser.add_argument(
        "--competition",
        default="deepfake-detection-challenge",
        help="Kaggle competition slug (default: deepfake-detection-challenge preview).",
    )
    opts = parser.parse_args()

    kaggle_bin = shutil.which("kaggle")
    if kaggle_bin is None:
        for hint in (
            "Kaggle CLI not found. Install with: pip install kaggle",
            "Then place kaggle.json in ~/.kaggle/ (chmod 600).",
        ):
            print(hint, file=sys.stderr)
        sys.exit(2)

    opts.output.mkdir(parents=True, exist_ok=True)
    command = [kaggle_bin, "competitions", "download", "-c", opts.competition, "-p", str(opts.output)]
    print("Running:", " ".join(command))
    subprocess.run(command, check=True)
    print(f"Downloaded to {opts.output}. Unzip with: unzip *.zip")
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
if __name__ == "__main__":
|
| 44 |
+
main()
|
datasets/download_ffhq.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Download the FFHQ 128x128 thumbnail subset from the official Google Drive mirror.
|
| 2 |
+
|
| 3 |
+
Pulls up to N images (default 10k) into the `real` bucket of the training set.
|
| 4 |
+
Falls back to the NVlabs 'ffhq-dataset' helper if available; otherwise expects
|
| 5 |
+
user to run the manual download once.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
python -m backend.training.datasets.download_ffhq --output ./data/real/ffhq -n 10000
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import argparse
|
| 13 |
+
import shutil
|
| 14 |
+
import subprocess
|
| 15 |
+
import sys
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def try_nvlabs_helper(output: Path, num: int) -> bool:
    """Run the ``ffhq-dataset`` downloader in *output* if it is installed.

    Args:
        output: Directory used as the helper's working directory.
        num: Requested image count. NOTE(review): currently not forwarded to
            the helper, so the helper decides how much it downloads.

    Returns:
        ``False`` when the helper executable is not on PATH, ``True`` when it
        was found and run. A non-zero exit status does not change the return
        value, but is reported on stderr instead of being silently discarded.
    """
    helper = shutil.which("ffhq-dataset")
    if helper is None:
        return False
    cmd = [helper, "--json", "ffhq-dataset-v2.json", "--thumbs", "--num_threads", "4"]
    print("Running:", " ".join(cmd))
    completed = subprocess.run(cmd, cwd=output, check=False)
    if completed.returncode != 0:
        print(
            f"[!] ffhq-dataset exited with status {completed.returncode}; "
            "the download may be incomplete.",
            file=sys.stderr,
        )
    return True
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def main() -> None:
    """CLI entry point: fetch FFHQ thumbnails through the helper if available.

    Creates the output directory, then tries the ``ffhq-dataset`` helper.
    When the helper is not installed, prints manual download instructions and
    exits with status 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", required=True, type=Path)
    parser.add_argument("-n", "--num", type=int, default=10000)
    opts = parser.parse_args()
    opts.output.mkdir(parents=True, exist_ok=True)

    if not try_nvlabs_helper(opts.output, opts.num):
        for line in (
            "[!] `ffhq-dataset` helper not installed.",
            "    Install via: pip install ffhq-dataset (requires gdown)",
            "    Or download thumbnails128x128.zip manually from:",
            "    https://github.com/NVlabs/ffhq-dataset",
            f"    Extract into: {opts.output}",
        ):
            print(line)
        sys.exit(1)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
if __name__ == "__main__":
|
| 49 |
+
main()
|
datasets/extract_frames.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Convert FFPP / DFDC videos -> 16 sampled frames at 224x224 RGB.
|
| 2 |
+
|
| 3 |
+
Usage:
|
| 4 |
+
python -m backend.training.datasets.extract_frames \
|
| 5 |
+
--input ./ffpp_data/original_sequences/youtube/raw/videos \
|
| 6 |
+
--output ./ffpp_data/frames/real \
|
| 7 |
+
--label real --frames 16 --size 224
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import argparse
|
| 12 |
+
import csv
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
import cv2
|
| 16 |
+
import numpy as np
|
| 17 |
+
from tqdm import tqdm
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def sample_frame_indices(total: int, n: int) -> list[int]:
    """Pick up to *n* evenly spaced frame indices out of *total* frames.

    Returns an empty list when either argument is zero or negative, every
    index when the video has at most *n* frames, and otherwise the index at
    the centre of each of *n* equal-width segments.
    """
    # n == 0 would otherwise divide by zero below.
    if total <= 0 or n <= 0:
        return []
    if total <= n:
        return list(range(total))
    step = total / n
    # Centre of each segment, clamped to the last valid index.
    return [min(total - 1, int(step * i + step / 2)) for i in range(n)]
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def extract_from_video(path: Path, out_dir: Path, n: int, size: int) -> int:
    """Write up to *n* sampled frames of a video as ``size`` x ``size`` JPEGs.

    Frames are named ``<video stem>_f<index:06d>.jpg`` inside *out_dir*,
    which is created if needed.

    Returns:
        The number of frames actually written; 0 when the video cannot be
        opened or reports no frames.
    """
    cap = cv2.VideoCapture(str(path))
    try:
        if not cap.isOpened():
            return 0
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        indices = set(sample_frame_indices(total, n))
        out_dir.mkdir(parents=True, exist_ok=True)
        if not indices:
            return 0

        last_wanted = max(indices)
        saved = 0
        i = 0
        # Stop decoding once the last sampled frame has been passed.
        while i <= last_wanted:
            ok, frame = cap.read()
            if not ok:
                break
            if i in indices:
                frame = cv2.resize(frame, (size, size), interpolation=cv2.INTER_AREA)
                target = out_dir / f"{path.stem}_f{i:06d}.jpg"
                # imwrite reports failure through its return value; only
                # count frames that were really written.
                if cv2.imwrite(str(target), frame, [cv2.IMWRITE_JPEG_QUALITY, 95]):
                    saved += 1
            i += 1
        return saved
    finally:
        # Release the capture on every path, including exceptions.
        cap.release()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def main() -> None:
    """CLI entry point: extract frames from every ``.mp4`` under ``--input``.

    Each video gets its own sub-directory of ``--output``. When ``--manifest``
    is given, one row per ``.jpg`` found in those sub-directories is appended
    to it (with a header if the file is new).
    """
    parser = argparse.ArgumentParser(description="Sample N frames per video and resize.")
    parser.add_argument("--input", required=True, type=Path, help="Directory of .mp4 videos (recursive).")
    parser.add_argument("--output", required=True, type=Path, help="Directory to write .jpg frames.")
    parser.add_argument("--label", required=True, choices=["real", "fake"], help="Label tag for manifest.")
    parser.add_argument("--frames", type=int, default=16)
    parser.add_argument("--size", type=int, default=224)
    parser.add_argument("--manifest", type=Path, default=None, help="Optional CSV manifest append path.")
    opts = parser.parse_args()

    videos = list(opts.input.rglob("*.mp4"))
    if not videos:
        print(f"No .mp4 found under {opts.input}")
        return

    want_manifest = opts.manifest is not None
    manifest_rows: list[tuple[str, str, str]] = []
    frames_written = 0
    for video in tqdm(videos, desc=f"extract[{opts.label}]"):
        frame_dir = opts.output / video.stem
        frames_written += extract_from_video(video, frame_dir, opts.frames, opts.size)
        if want_manifest:
            manifest_rows.extend(
                (str(jpg), opts.label, video.stem) for jpg in frame_dir.glob("*.jpg")
            )

    if want_manifest and manifest_rows:
        opts.manifest.parent.mkdir(parents=True, exist_ok=True)
        # Decide on the header before opening: append mode creates the file.
        write_header = not opts.manifest.exists()
        with opts.manifest.open("a", newline="", encoding="utf-8") as handle:
            writer = csv.writer(handle)
            if write_header:
                writer.writerow(["path", "label", "source_video"])
            writer.writerows(manifest_rows)

    print(f"Done. Videos: {len(videos)}, frames written: {frames_written}")
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
if __name__ == "__main__":
|
| 90 |
+
main()
|
datasets/procure_all.ps1
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Phase 11.1 orchestrator for Windows (PowerShell)
$ErrorActionPreference = "Stop"

# $ErrorActionPreference does not react to the exit code of native programs
# such as python, so every step is checked explicitly and the script stops at
# the first failing step instead of continuing with missing data.
function Assert-StepSucceeded {
    param([string]$Step)
    if ($LASTEXITCODE -ne 0) {
        throw "$Step failed with exit code $LASTEXITCODE"
    }
}

# Both roots can be overridden through environment variables.
$ROOT = if ($env:ROOT) { $env:ROOT } else { ".\data" }
$FFPP = if ($env:FFPP) { $env:FFPP } else { ".\ffpp_data" }

New-Item -ItemType Directory -Force -Path "$ROOT\real" | Out-Null
New-Item -ItemType Directory -Force -Path "$ROOT\fake" | Out-Null
New-Item -ItemType Directory -Force -Path $FFPP | Out-Null

Write-Host "1. FaceForensics++ (highly compressed c40, 10 videos only) -- requires TOS keypress"
python backend\scripts\download_ffpp.py $FFPP -d all -c c40 -t videos -n 10
Assert-StepSucceeded "FaceForensics++ download"

Write-Host "2. Frame extraction: real (original youtube)"
python -m backend.training.datasets.extract_frames `
    --input "$FFPP\original_sequences\youtube\c40\videos" `
    --output "$ROOT\real\ffpp_youtube" --label real --frames 4 --size 224
Assert-StepSucceeded "Frame extraction (real)"

Write-Host "3. Frame extraction: fakes (each manipulation family)"
$Families = @("Deepfakes", "Face2Face", "FaceSwap", "NeuralTextures", "FaceShifter")
foreach ($fam in $Families) {
    $famLower = $fam.ToLower()
    python -m backend.training.datasets.extract_frames `
        --input "$FFPP\manipulated_sequences\$fam\c40\videos" `
        --output "$ROOT\fake\ffpp_$famLower" --label fake --frames 4 --size 224
    Assert-StepSucceeded "Frame extraction ($fam)"
}

Write-Host "4. FFHQ thumbnails (real - limited to 100 items)"
python -m backend.training.datasets.download_ffhq --output "$ROOT\real\ffhq" -n 100
Assert-StepSucceeded "FFHQ download"


Write-Host "6. DFDC preview sample (fake+real)"
python -m backend.training.datasets.download_dfdc_sample --output "$ROOT\_dfdc_raw"
Assert-StepSucceeded "DFDC download"
Write-Host "NOTE: You will need to manually unzip + sort DFDC into $ROOT\fake\dfdc and $ROOT\real\dfdc"

Write-Host "7. Build manifest"
python -m backend.training.datasets.build_manifest `
    --data $ROOT --out "$ROOT\manifest.csv" --seed 42
Assert-StepSucceeded "Manifest build"

Write-Host "Phase 11.1 complete. See $ROOT\manifest.csv"
|
datasets/procure_all.sh
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Phase 11.1 orchestrator: download + frame-extract + manifest.
# Total disk target: ~120k labeled images. Expect 60-80GB intermediate, ~30GB frames.

set -euo pipefail

# Both roots can be overridden from the environment.
ROOT="${ROOT:-./data}"
FFPP="${FFPP:-./ffpp_data}"
mkdir -p "$ROOT/real" "$ROOT/fake" "$FFPP"

# 1. FaceForensics++ (raw, videos) -- requires TOS keypress
python backend/scripts/download_ffpp.py "$FFPP" -d all -c raw -t videos

# 2. Frame extraction: real (original youtube)
python -m backend.training.datasets.extract_frames \
  --input "$FFPP/original_sequences/youtube/raw/videos" \
  --output "$ROOT/real/ffpp_youtube" --label real --frames 16 --size 224

# 3. Frame extraction: fakes (each manipulation family)
for fam in Deepfakes Face2Face FaceSwap NeuralTextures FaceShifter; do
  # Lower-case with tr rather than ${fam,,}: that expansion needs bash >= 4
  # and fails with "bad substitution" on older shells (e.g. bash 3.2).
  fam_lower="$(printf '%s' "$fam" | tr '[:upper:]' '[:lower:]')"
  python -m backend.training.datasets.extract_frames \
    --input "$FFPP/manipulated_sequences/$fam/raw/videos" \
    --output "$ROOT/fake/ffpp_${fam_lower}" --label fake --frames 16 --size 224
done

# 4. FFHQ thumbnails (real)
python -m backend.training.datasets.download_ffhq --output "$ROOT/real/ffhq" -n 10000

# 6. DFDC preview sample (fake+real) -- needs Kaggle creds
python -m backend.training.datasets.download_dfdc_sample --output "$ROOT/_dfdc_raw"
# NOTE: unzip + sort into $ROOT/fake/dfdc and $ROOT/real/dfdc per DFDC metadata.json

# 7. Build manifest
python -m backend.training.datasets.build_manifest \
  --data "$ROOT" --out "$ROOT/manifest.csv" --seed 42

echo "Phase 11.1 complete. See $ROOT/manifest.csv"
|
deepshield_13_5bcf1328.pdf
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
%PDF-1.4
|
| 2 |
+
%���� ReportLab Generated PDF document (opensource)
|
| 3 |
+
1 0 obj
|
| 4 |
+
<<
|
| 5 |
+
/F1 2 0 R /F2 3 0 R /F3 5 0 R
|
| 6 |
+
>>
|
| 7 |
+
endobj
|
| 8 |
+
2 0 obj
|
| 9 |
+
<<
|
| 10 |
+
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
| 11 |
+
>>
|
| 12 |
+
endobj
|
| 13 |
+
3 0 obj
|
| 14 |
+
<<
|
| 15 |
+
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
|
| 16 |
+
>>
|
| 17 |
+
endobj
|
| 18 |
+
4 0 obj
|
| 19 |
+
<<
|
| 20 |
+
/Contents 18 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 17 0 R /Resources <<
|
| 21 |
+
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
| 22 |
+
>> /Rotate 0 /Trans <<
|
| 23 |
+
|
| 24 |
+
>>
|
| 25 |
+
/Type /Page
|
| 26 |
+
>>
|
| 27 |
+
endobj
|
| 28 |
+
5 0 obj
|
| 29 |
+
<<
|
| 30 |
+
/BaseFont /Symbol /Name /F3 /Subtype /Type1 /Type /Font
|
| 31 |
+
>>
|
| 32 |
+
endobj
|
| 33 |
+
6 0 obj
|
| 34 |
+
<<
|
| 35 |
+
/Contents 19 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 17 0 R /Resources <<
|
| 36 |
+
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
| 37 |
+
>> /Rotate 0 /Trans <<
|
| 38 |
+
|
| 39 |
+
>>
|
| 40 |
+
/Type /Page
|
| 41 |
+
>>
|
| 42 |
+
endobj
|
| 43 |
+
7 0 obj
|
| 44 |
+
<<
|
| 45 |
+
/Outlines 9 0 R /PageMode /UseNone /Pages 17 0 R /Type /Catalog
|
| 46 |
+
>>
|
| 47 |
+
endobj
|
| 48 |
+
8 0 obj
|
| 49 |
+
<<
|
| 50 |
+
/Author () /CreationDate (D:20260415181653+05'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20260415181653+05'00') /Producer (xhtml2pdf <https://github.com/xhtml2pdf/xhtml2pdf/>)
|
| 51 |
+
/Subject () /Title (DeepShield Analysis Report \204 7771f496-45b1-4c97-8a1a-d9d2492ca67d) /Trapped /False
|
| 52 |
+
>>
|
| 53 |
+
endobj
|
| 54 |
+
9 0 obj
|
| 55 |
+
<<
|
| 56 |
+
/Count 3 /First 10 0 R /Last 10 0 R /Type /Outlines
|
| 57 |
+
>>
|
| 58 |
+
endobj
|
| 59 |
+
10 0 obj
|
| 60 |
+
<<
|
| 61 |
+
/Count -4 /Dest [ 4 0 R /Fit ] /First 11 0 R /Last 16 0 R /Parent 9 0 R /Title (DeepShield Analysis Report)
|
| 62 |
+
>>
|
| 63 |
+
endobj
|
| 64 |
+
11 0 obj
|
| 65 |
+
<<
|
| 66 |
+
/Dest [ 4 0 R /Fit ] /Next 12 0 R /Parent 10 0 R /Title (Verdict)
|
| 67 |
+
>>
|
| 68 |
+
endobj
|
| 69 |
+
12 0 obj
|
| 70 |
+
<<
|
| 71 |
+
/Count -2 /Dest [ 4 0 R /Fit ] /First 13 0 R /Last 14 0 R /Next 15 0 R /Parent 10 0 R
|
| 72 |
+
/Prev 11 0 R /Title (Text Classification)
|
| 73 |
+
>>
|
| 74 |
+
endobj
|
| 75 |
+
13 0 obj
|
| 76 |
+
<<
|
| 77 |
+
/Dest [ 4 0 R /Fit ] /Next 14 0 R /Parent 12 0 R /Title (Sensationalism Signals)
|
| 78 |
+
>>
|
| 79 |
+
endobj
|
| 80 |
+
14 0 obj
|
| 81 |
+
<<
|
| 82 |
+
/Dest [ 4 0 R /Fit ] /Parent 12 0 R /Prev 13 0 R /Title (Extracted Keywords)
|
| 83 |
+
>>
|
| 84 |
+
endobj
|
| 85 |
+
15 0 obj
|
| 86 |
+
<<
|
| 87 |
+
/Dest [ 4 0 R /Fit ] /Next 16 0 R /Parent 10 0 R /Prev 12 0 R /Title (Trusted Source Cross-Reference \(1\))
|
| 88 |
+
>>
|
| 89 |
+
endobj
|
| 90 |
+
16 0 obj
|
| 91 |
+
<<
|
| 92 |
+
/Dest [ 6 0 R /Fit ] /Parent 10 0 R /Prev 15 0 R /Title (Processing Summary)
|
| 93 |
+
>>
|
| 94 |
+
endobj
|
| 95 |
+
17 0 obj
|
| 96 |
+
<<
|
| 97 |
+
/Count 2 /Kids [ 4 0 R 6 0 R ] /Type /Pages
|
| 98 |
+
>>
|
| 99 |
+
endobj
|
| 100 |
+
18 0 obj
|
| 101 |
+
<<
|
| 102 |
+
/Filter [ /ASCII85Decode /FlateDecode ] /Length 1750
|
| 103 |
+
>>
|
| 104 |
+
stream
|
| 105 |
+
Gb"/(9lo&I&A@sBlm4G[Acr2Y4p^$ca2t\gAsuiHo\c,I9gURE8lSA3M>qu?,XkR;()9nE&%0G$"Ts\%gUFdJ0E[3iXSb#I!k]Slq-+&^_fu5V&-:f'>`[5155TjpXI_!]U"iQd1qrcX0jNK021sk.K_S`f[kfkaR[pr2$LLU)UX&`3>7R17rJ3t':B_<4Kk*Grr8\a:5/Z<<[I]mbfHq28c@Y+3O)t)0k@mu0K^fiq^N*(u.%T.'jl<s/Nh4He2l7^V7l^6+r/e]g]la.!>S?L^o+>>SgBV8H:sX>5A0-l`)&\h4Lk6L5I=)ArV#_bh%^>M_c,"jSErfH[2A&CfKtLn_&K3h)!u;:i'6.H*(apE@/QWkIgF*OaTZ"ZT=me'_?iN-hL[(uHeb"'/B!\/7d068ieW>Y3P8NcsU#;"%eOe_!^-"Xsc?9a'H,u4"nMEm$3F[>c1S8J!`Sh;Ye8pG>de>ac3KpI*&j-(`*[@OB&i#OgJSl=(I-'<c@@S(D;k%W_$;Jl?$^4Y-G*rH-Rk_h_*=&9o`q/eu[3o$--Zc#XoX(sA&CI7RqS'cWBhG2:+ODa!):O6`^NT((K7(:%BVJ3=F%emKe-WmK3EIie5ZAbGXt^Hf,[uurZtImn"m<3AaU$p)@,./&T/aMg@_t-oU(Al5HTNb;0J4E-fqZg*4Y/o@,5%"0ObY@,kKsQdk#2'pZOD8tZrghVcMH[#FI&3f.,FmGKKKNo9?B[@`=FkP`:=oo>;4Vs.^rc%L+kt99^Gd]mfUsWoLD02jLH*WUl.Pb(oF^j?7RUN!m&Us22M!@A<RB<?,"#orPd]<&>ld**8+J._-f-FEVm$t<`HO6GNqd_[bhJ&8qK0d-ZKt;EB60u<VCgOQ;8F:jeCp]E2HpO&5==e.Z2c5.#%nBkfCHsrt>d0-2Z<CdP%-(PZ=R(ET3u6<D1@I(u[6LMn;M%:K3fl4ls;SX'd>:*Z]IT(dG)'7QU\#<V$$AmO6;HncG;?UO[<qf,QJem^o.f$D3^V'_h3dF.f82/[@>u^ecY/FgdnO#RWf_=Js*t;iiO?'fQ:g&@nC/Xhu.;&o1b+?_6-Z%i4;1H5GAUag0*4LfL'2;Sl`["O/H6p>jU\SO4%Ffq^-']m<b(Mo1Vg;h"E$f8Z?_AL@bH31kAKY%KEP\PmsdK2MJ^Dfb%0.sgc_9*[9&'t*;+>uUp/PKbuj>J71&Mh5t,WF_k&]O@P+do^;.WV"r6Kkb#5`,aF$-adPdc+'072](pse[q;.^?I#Q#kci1Qr9Z_U:Q_lQ53n!nIBHrchNfMeP-HF*=<22XdSrZ8j>sP4CR1SEP\Ge.aCh(VEW.)F'<]`"gVnaq<<]K,.uCIMlUqSgV3U</GlN`:3?Ft9S-uHH\_0/'rV&dUBe&=8^c)"F#b/Te`H6Yn1DnZc?T$IiaKe%'S][\*'W-]E<4.cnD8?.XB5)khib.oe$NkDa0D^I+$2a=[rbp"D3eQQqq@TO]aNHTMcGM3B3cn9,9'giRF__Y[<^:+bB3]sACEq,A$s%=n\8Vk/OM\c,W"mZ11,MaZ61]7"M`X1/qmcr-hH,#8+udNN9@p:IAM="9:b-RnD&FAVj^G'kW4tPgO+M25'hLH])Ped#fB*fOs>Te;V8("S^2/7e`3>4E]],alEY#@T-dG.(=/^7(s[bh3%omN/'WKl<"q_K`T7$VrMt.GfckX6]1EfAB]1F6o6g>\:2Etf)rD.XNrRc2pgl"Hr<(1MCd%~>endstream
|
| 106 |
+
endobj
|
| 107 |
+
19 0 obj
|
| 108 |
+
<<
|
| 109 |
+
/Filter [ /ASCII85Decode /FlateDecode ] /Length 1251
|
| 110 |
+
>>
|
| 111 |
+
stream
|
| 112 |
+
Gau`R;01GN&:Vs/fU'm&SZsB\Z>@pd[^l$Ne'"!6Hco+&(^1n<bt7%'s8H%#$m^MQApR0<`)taLn([eaAHiiuRK&mT!C!?!I`[+[8FM*9+s?gk^Sb`ESFuBheu'`^-k@VZQnjgqaj:g4M2J-c)%`([:iWt%O9mV9ZO6(4"\bX`WWWGJ,s27(iVrdq]@Q&`bX7t`KV@dkk1#U3_]/$nF6>.H%;Q95P;kU[/"Vgs.N%@'=M6kAJN1afF&?E_+rA+1KE+S:4],1QpOr^qg01e<#d,;@\e=!\1-*,1T[41J&^DSg86dC5.#&+tMiZhie$%p]f=sWJ!9ni#^ZR?Gp5lVJY,M<YHnZf[nt2A3ZtRV6dLh4C-*^gI%O$[,o&o;u7[Nu/XEmkj&m4-UHNFF#I0VCUiaS-$S2Gs[@(=.(Fg-V>W+]dGA*V*5[2WS\gs>9t%t32b/^W)[_+r7&3kOLD>8WTI508QU_ZkVRb*l"j_,ie@Wk/$,J'=rjAsRr^aIAp,g4N\@rcW@_7fV)G7.f:C\2aDCnK2"(-Yh-fNKV4ogPJ_Bbno/AG^W)=l`02mHESBSd,2MW2Q,8S^O,7f_^Pj+'$c\[n!'TZ'8A[[6$M/6Vlo9egXU318J0Zl;rXSYgM=-\-3TecfRc]m]FKNI.=E4amT3\PSaWQi;TtrPVN"#t`E;<R<T0FHF)>bkNM&M.:/OC)MK2$$?Jp$`SY/%t"jbj6*+.%6.71qjEsp)j@\0#RIF/1!&^q"O7Ou;8DL^2(?$>18.AWa`<qQ;FS*8d605U,LRjPYl%CQZ"EZ)d6ggmR/\emf.%.#K=ZXlPbU\40kfi-URgEX``iXe1pOV?N=StFNQ>H$Fi,Ak&SQPl+Y^;rG>nArp/_q%9B[r]_;\_^p'[__7OH7)iuf]c[rld?RB/M<r(<QsU%pNedj)1NmPM-_fL1VD1tNQL&@c-=<:"`[Vpojg6J[HJ4:,T\L_]InN3jJke4J(kV<hYN(d]b#E=":iOW#=k#-U%PKO/p'+,)f951AW&jRK9')Q>rP3T8Xk7<ZOVAq$3lpK6YL6tc'D2V%1G(jM8"TncWs=[!hW2(D30g$5(Q/MN1htIgRt\ADhN@$l202Af7(c#1P6?P("GPEU+>VY%=qG1""FA,mioCp,lF3^-AZtKRg/NFX>&kA^rZpnFA<r!,IA42rZQ6YFrrrLL)tME=&"E=g6gSrChSiOfRe!l*<?[tTYGRI@6&N"%Fn3=3;X6Dm0TH~>endstream
|
| 113 |
+
endobj
|
| 114 |
+
xref
|
| 115 |
+
0 20
|
| 116 |
+
0000000000 65535 f
|
| 117 |
+
0000000061 00000 n
|
| 118 |
+
0000000112 00000 n
|
| 119 |
+
0000000219 00000 n
|
| 120 |
+
0000000331 00000 n
|
| 121 |
+
0000000536 00000 n
|
| 122 |
+
0000000613 00000 n
|
| 123 |
+
0000000818 00000 n
|
| 124 |
+
0000000903 00000 n
|
| 125 |
+
0000001223 00000 n
|
| 126 |
+
0000001296 00000 n
|
| 127 |
+
0000001426 00000 n
|
| 128 |
+
0000001514 00000 n
|
| 129 |
+
0000001667 00000 n
|
| 130 |
+
0000001770 00000 n
|
| 131 |
+
0000001869 00000 n
|
| 132 |
+
0000001999 00000 n
|
| 133 |
+
0000002098 00000 n
|
| 134 |
+
0000002164 00000 n
|
| 135 |
+
0000004006 00000 n
|
| 136 |
+
trailer
|
| 137 |
+
<<
|
| 138 |
+
/ID
|
| 139 |
+
[<8e273c2672d813e3cd44109eb1edd604><8e273c2672d813e3cd44109eb1edd604>]
|
| 140 |
+
% ReportLab generated PDF document -- digest (opensource)
|
| 141 |
+
|
| 142 |
+
/Info 8 0 R
|
| 143 |
+
/Root 7 0 R
|
| 144 |
+
/Size 20
|
| 145 |
+
>>
|
| 146 |
+
startxref
|
| 147 |
+
5349
|
| 148 |
+
%%EOF
|
deps.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from fastapi import Depends, Header, HTTPException, status
|
| 4 |
+
from sqlalchemy.orm import Session
|
| 5 |
+
|
| 6 |
+
from db.database import get_db
|
| 7 |
+
from db.models import User
|
| 8 |
+
from services.auth_service import decode_token, get_user
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _extract_bearer(authorization: str | None) -> str | None:
|
| 12 |
+
if not authorization:
|
| 13 |
+
return None
|
| 14 |
+
parts = authorization.split()
|
| 15 |
+
if len(parts) != 2 or parts[0].lower() != "bearer":
|
| 16 |
+
return None
|
| 17 |
+
return parts[1]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_current_user(
    authorization: str | None = Header(default=None),
    db: Session = Depends(get_db),
) -> User:
    """FastAPI dependency that resolves the authenticated user or rejects the request.

    Args:
        authorization: Value of the ``Authorization`` request header, if any.
        db: Database session supplied by ``get_db``.

    Returns:
        The ``User`` returned by ``get_user`` for the token's ``sub`` claim.

    Raises:
        HTTPException: 401 when the bearer token is missing, when
            ``decode_token`` yields a falsy payload or one without ``sub``,
            when ``sub`` is not an integer, or when no user is found.
    """
    token = _extract_bearer(authorization)
    if not token:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Missing bearer token")

    payload = decode_token(token)
    if not payload or "sub" not in payload:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid or expired token")

    # A token whose subject is not numeric is a bad credential, not a server
    # fault: answer 401 instead of letting int() raise and surface as a 500.
    try:
        user_id = int(payload["sub"])
    except (TypeError, ValueError):
        raise HTTPException(
            status.HTTP_401_UNAUTHORIZED, "Invalid or expired token"
        ) from None

    user = get_user(db, user_id)
    if not user:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "User not found")
    return user
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def optional_current_user(
    authorization: str | None = Header(default=None),
    db: Session = Depends(get_db),
) -> User | None:
    """FastAPI dependency that resolves the user when possible, else ``None``.

    Unlike a strict authentication dependency this never raises for bad
    credentials: a missing header, a falsy or ``sub``-less payload from
    ``decode_token``, or a non-integer ``sub`` all yield ``None``.

    Args:
        authorization: Value of the ``Authorization`` request header, if any.
        db: Database session supplied by ``get_db``.

    Returns:
        Whatever ``get_user`` returns for the token's ``sub`` claim, or
        ``None`` when the request carries no usable token.
    """
    token = _extract_bearer(authorization)
    if not token:
        return None

    payload = decode_token(token)
    if not payload or "sub" not in payload:
        return None

    # A malformed subject must degrade to "anonymous", not crash the request.
    try:
        user_id = int(payload["sub"])
    except (TypeError, ValueError):
        return None
    return get_user(db, user_id)
|
download_ffpp.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
""" Downloads FaceForensics++ and Deep Fake Detection public data release
|
| 3 |
+
Example usage:
|
| 4 |
+
see -h or https://github.com/ondyari/FaceForensics
|
| 5 |
+
"""
|
| 6 |
+
# -*- coding: utf-8 -*-
|
| 7 |
+
import argparse
|
| 8 |
+
import os
|
| 9 |
+
import urllib
|
| 10 |
+
import urllib.request
|
| 11 |
+
import tempfile
|
| 12 |
+
import time
|
| 13 |
+
import sys
|
| 14 |
+
import json
|
| 15 |
+
import random
|
| 16 |
+
from tqdm import tqdm
|
| 17 |
+
from os.path import join
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# URLs and filenames
# Server-relative paths of the JSON file lists; main() appends them to the
# selected server's base URL.
FILELIST_URL = 'misc/filelist.json'
# NOTE(review): "DEEPFEAKES" looks like a typo of "DEEPFAKES"; the name is
# referenced by main(), so it is kept as-is here.
DEEPFEAKES_DETECTION_URL = 'misc/deepfake_detection_filenames.json'
# Files fetched per folder when downloading the Deepfakes models.
DEEPFAKES_MODEL_NAMES = ['decoder_A.h5', 'decoder_B.h5', 'encoder.h5',]

# Parameters
# Maps each --dataset choice to its server-relative path.
DATASETS = {
    'original_youtube_videos': 'misc/downloaded_youtube_videos.zip',
    'original_youtube_videos_info': 'misc/downloaded_youtube_videos_info.zip',
    'original': 'original_sequences/youtube',
    'DeepFakeDetection_original': 'original_sequences/actors',
    'Deepfakes': 'manipulated_sequences/Deepfakes',
    'DeepFakeDetection': 'manipulated_sequences/DeepFakeDetection',
    'Face2Face': 'manipulated_sequences/Face2Face',
    'FaceShifter': 'manipulated_sequences/FaceShifter',
    'FaceSwap': 'manipulated_sequences/FaceSwap',
    'NeuralTextures': 'manipulated_sequences/NeuralTextures'
}
# Datasets iterated when --dataset is 'all'; the two original_youtube_videos
# zip entries above are not part of this list.
ALL_DATASETS = ['original', 'DeepFakeDetection_original', 'Deepfakes',
                'DeepFakeDetection', 'Face2Face', 'FaceShifter', 'FaceSwap',
                'NeuralTextures']
# Allowed values for --compression, --type and --server respectively.
COMPRESSION = ['raw', 'c23', 'c40']
TYPE = ['videos', 'masks', 'models']
SERVERS = ['EU', 'EU2', 'CA']
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def parse_args():
    """Parse the command line and attach the server-specific URLs.

    Returns:
        The ``argparse.Namespace`` with the parsed options plus three derived
        attributes: ``tos_url``, ``base_url`` and ``deepfakes_model_url``.

    Raises:
        Exception: If the server name has no known URL (argparse's ``choices``
            normally rejects such input first).
    """
    cli = argparse.ArgumentParser(
        description='Downloads FaceForensics v2 public data release.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    cli.add_argument('output_path', type=str, help='Output directory.')
    cli.add_argument(
        '-d', '--dataset', type=str, default='all',
        choices=list(DATASETS.keys()) + ['all'],
        help='Which dataset to download, either pristine or '
             'manipulated data or the downloaded youtube '
             'videos.',
    )
    cli.add_argument(
        '-c', '--compression', type=str, default='raw',
        choices=COMPRESSION,
        help='Which compression degree. All videos '
             'have been generated with h264 with a varying '
             'codec. Raw (c0) videos are lossless compressed.',
    )
    cli.add_argument(
        '-t', '--type', type=str, default='videos',
        choices=TYPE,
        help='Which file type, i.e. videos, masks, for our '
             'manipulation methods, models, for Deepfakes.',
    )
    cli.add_argument(
        '-n', '--num_videos', type=int, default=None,
        help='Select a number of videos number to '
             "download if you don't want to download the full"
             ' dataset.',
    )
    cli.add_argument(
        '--server', type=str, default='EU',
        choices=SERVERS,
        help='Server to download the data from. If you '
             'encounter a slow download speed, consider '
             'changing the server.',
    )
    args = cli.parse_args()

    # URLs: one root per mirror.
    mirror_roots = {
        'EU': 'http://canis.vc.in.tum.de:8100/',
        'EU2': 'http://kaldir.vc.in.tum.de/faceforensics/',
        'CA': 'http://falas.cmpt.sfu.ca:8100/',
    }
    server_url = mirror_roots.get(args.server)
    if server_url is None:
        raise Exception('Wrong server name. Choices: {}'.format(str(SERVERS)))

    args.tos_url = server_url + 'webpage/FaceForensics_TOS.pdf'
    args.base_url = server_url + 'v3/'
    args.deepfakes_model_url = (
        args.base_url + 'manipulated_sequences/' + 'Deepfakes/models/'
    )
    return args
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def download_files(filenames, base_url, output_path, report_progress=True):
    """Download every name in *filenames* from *base_url* into *output_path*.

    Args:
        filenames: Iterable of file names; each is appended to ``base_url``
            for the source and joined onto ``output_path`` for the target.
        base_url: URL prefix the file names are appended to.
        output_path: Target directory; created if it does not exist.
        report_progress: Wrap the iteration in a ``tqdm`` progress bar.
    """
    os.makedirs(output_path, exist_ok=True)
    pending = tqdm(filenames) if report_progress else filenames
    for name in pending:
        download_file(base_url + name, join(output_path, name))
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def reporthook(count, block_size, total_size):
    """Progress callback for ``urllib.request.urlretrieve``.

    Writes a single self-overwriting status line to stdout. The first call
    (``count == 0``) only records the start time in the module-level
    ``start_time`` global.

    Args:
        count: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        total_size: Total size in bytes; values <= 0 are treated as
            "size unknown" and the percentage is omitted from the line.
    """
    global start_time
    if count == 0:
        start_time = time.time()
        return

    duration = time.time() - start_time
    progress_size = int(count * block_size)
    # A coarse clock can report zero elapsed time on the first blocks.
    speed = int(progress_size / (1024 * duration)) if duration > 0 else 0
    megabytes = progress_size / (1024 * 1024)

    if total_size > 0:
        # The last block usually overshoots the real size; cap at 100%.
        percent = min(100, int(progress_size * 100 / total_size))
        sys.stdout.write(
            "\rProgress: %d%%, %d MB, %d KB/s, %d seconds passed" %
            (percent, megabytes, speed, duration))
    else:
        # Size unknown (or zero): no meaningful percentage can be shown.
        sys.stdout.write(
            "\rProgress: %d MB, %d KB/s, %d seconds passed" %
            (megabytes, speed, duration))
    sys.stdout.flush()
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def download_file(url, out_file, report_progress=False):
    """Download *url* to *out_file* unless that file already exists.

    The data is first written to a temporary file in the target directory and
    only moved into place once the transfer has finished, so an aborted
    download never leaves a partial file under the final name.

    Args:
        url: Source URL.
        out_file: Destination path; its directory must already exist.
        report_progress: Print progress through ``reporthook`` while
            downloading.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or the final move raises; the
        temporary file is removed before the error propagates.
    """
    if os.path.isfile(out_file):
        tqdm.write('WARNING: skipping download of existing file ' + out_file)
        return

    # Keep the temp file next to the target so the final move stays on one
    # filesystem; a bare file name means the current directory.
    out_dir = os.path.dirname(out_file) or os.curdir
    fd, out_file_tmp = tempfile.mkstemp(dir=out_dir)
    os.close(fd)  # urlretrieve reopens the path itself
    try:
        if report_progress:
            urllib.request.urlretrieve(url, out_file_tmp,
                                       reporthook=reporthook)
        else:
            urllib.request.urlretrieve(url, out_file_tmp)
        os.replace(out_file_tmp, out_file)
    except BaseException:
        # Also covers Ctrl-C: do not leave orphaned temp files behind.
        try:
            os.remove(out_file_tmp)
        except OSError:
            pass
        raise
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def main(args):
    """Download the requested FaceForensics data into ``args.output_path``.

    Waits for the user to confirm the terms of use, then loops over the
    selected dataset(s), fetches the matching file list from the server and
    downloads videos, masks or Deepfakes models depending on ``args.type``.

    Args:
        args: Namespace produced by ``parse_args`` (reads ``dataset``,
            ``type``, ``compression``, ``num_videos``, ``output_path``,
            ``tos_url``, ``base_url`` and ``deepfakes_model_url``).
    """
    # TOS
    print('By pressing any key to continue you confirm that you have agreed '\
          'to the FaceForensics terms of use as described at:')
    print(args.tos_url)
    print('***')
    print('Press any key to continue, or CTRL-C to exit.')
    # Blocks until a line is read from stdin.
    _ = input('')

    # Extract arguments
    c_datasets = [args.dataset] if args.dataset != 'all' else ALL_DATASETS
    c_type = args.type
    c_compression = args.compression
    num_videos = args.num_videos
    output_path = args.output_path
    os.makedirs(output_path, exist_ok=True)

    # Check for special dataset cases
    for dataset in c_datasets:
        dataset_path = DATASETS[dataset]
        # Special cases
        if 'original_youtube_videos' in dataset:
            # Here we download the original youtube videos zip file
            print('Downloading original youtube videos.')
            if not 'info' in dataset_path:
                print('Please be patient, this may take a while (~40gb)')
                suffix = ''
            else:
                suffix = 'info'
            download_file(args.base_url + '/' + dataset_path,
                          out_file=join(output_path,
                                        'downloaded_videos{}.zip'.format(
                                            suffix)),
                          report_progress=True)
            # The zip datasets are single files: nothing else to do.
            return

        # Else: regular datasets
        print('Downloading {} of dataset "{}"'.format(
            c_type, dataset_path
        ))

        # Get filelists and video lengths list from server
        if 'DeepFakeDetection' in dataset_path or 'actors' in dataset_path:
            filepaths = json.loads(urllib.request.urlopen(args.base_url + '/' +
                DEEPFEAKES_DETECTION_URL).read().decode("utf-8"))
            if 'actors' in dataset_path:
                filelist = filepaths['actors']
            else:
                filelist = filepaths['DeepFakesDetection']
        elif 'original' in dataset_path:
            # Load filelist from server
            file_pairs = json.loads(urllib.request.urlopen(args.base_url + '/' +
                FILELIST_URL).read().decode("utf-8"))
            # Originals: flatten the pairs into individual names.
            filelist = []
            for pair in file_pairs:
                filelist += pair
        else:
            # Load filelist from server
            file_pairs = json.loads(urllib.request.urlopen(args.base_url + '/' +
                FILELIST_URL).read().decode("utf-8"))
            # Get filelist
            # Manipulated data: names are "<a>_<b>"; the reversed pair
            # "<b>_<a>" is added too, except when downloading models.
            filelist = []
            for pair in file_pairs:
                filelist.append('_'.join(pair))
                if c_type != 'models':
                    filelist.append('_'.join(pair[::-1]))
        # Maybe limit number of videos for download
        if num_videos is not None and num_videos > 0:
            print('Downloading the first {} videos'.format(num_videos))
            filelist = filelist[:num_videos]

        # Server and local paths
        dataset_videos_url = args.base_url + '{}/{}/{}/'.format(
            dataset_path, c_compression, c_type)
        # NOTE(review): the format string has two placeholders, so the third
        # argument (c_type) is ignored here.
        dataset_mask_url = args.base_url + '{}/{}/videos/'.format(
            dataset_path, 'masks', c_type)

        if c_type == 'videos':
            dataset_output_path = join(output_path, dataset_path, c_compression,
                                       c_type)
            print('Output path: {}'.format(dataset_output_path))
            filelist = [filename + '.mp4' for filename in filelist]
            download_files(filelist, dataset_videos_url, dataset_output_path)
        elif c_type == 'masks':
            dataset_output_path = join(output_path, dataset_path, c_type,
                                       'videos')
            print('Output path: {}'.format(dataset_output_path))
            if 'original' in dataset:
                # Abort when originals were requested explicitly; skip them
                # when iterating over all datasets.
                if args.dataset != 'all':
                    print('Only videos available for original data. Aborting.')
                    return
                else:
                    print('Only videos available for original data. '
                          'Skipping original.\n')
                    continue
            if 'FaceShifter' in dataset:
                # This returns from main(), so any datasets after FaceShifter
                # in the loop are not processed.
                print('Masks not available for FaceShifter. Aborting.')
                return
            filelist = [filename + '.mp4' for filename in filelist]
            download_files(filelist, dataset_mask_url, dataset_output_path)

        # Else: models for deepfakes
        else:
            if dataset != 'Deepfakes' and c_type == 'models':
                print('Models only available for Deepfakes. Aborting')
                return
            dataset_output_path = join(output_path, dataset_path, c_type)
            print('Output path: {}'.format(dataset_output_path))

            # Get Deepfakes models
            for folder in tqdm(filelist):
                folder_filelist = DEEPFAKES_MODEL_NAMES

                # Folder paths
                folder_base_url = args.deepfakes_model_url + folder + '/'
                folder_dataset_output_path = join(dataset_output_path,
                                                  folder)
                download_files(folder_filelist, folder_base_url,
                               folder_dataset_output_path,
                               report_progress=False)   # already done
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# Script entry point: parse the command line, then run the download.
if __name__ == "__main__":
    args = parse_args()
    main(args)
|
ela_service.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Error Level Analysis (ELA) — Phase 12.1
|
| 2 |
+
|
| 3 |
+
Re-saves an image at a fixed JPEG quality and diffs against the original to reveal
|
| 4 |
+
per-pixel manipulation artifacts. Regions that were recently edited will show
|
| 5 |
+
higher error levels than untouched areas.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import base64
|
| 11 |
+
import io
|
| 12 |
+
|
| 13 |
+
import cv2
|
| 14 |
+
import numpy as np
|
| 15 |
+
from loguru import logger
|
| 16 |
+
from PIL import Image
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _compute_ela(pil_img: Image.Image, quality: int = 90, scale: float = 15.0) -> np.ndarray:
    """Compute the Error Level Analysis difference map of an image.

    The image is converted to RGB, re-encoded as JPEG in memory, and the
    absolute per-pixel difference between the two versions is amplified.

    Args:
        pil_img: Input image (any mode; converted to RGB internally).
        quality: JPEG quality used for the re-encode.
        scale: Multiplier applied to the absolute difference.

    Returns:
        The amplified difference, clipped to [0, 255], as a uint8 (H,W,3)
        RGB array.
    """
    source = pil_img.convert("RGB")

    # Round-trip the image through an in-memory JPEG at the requested quality.
    jpeg_buffer = io.BytesIO()
    source.save(jpeg_buffer, format="JPEG", quality=quality)
    jpeg_buffer.seek(0)
    recompressed = Image.open(jpeg_buffer).convert("RGB")

    # Work in float32 so the subtraction cannot wrap around as uint8 would.
    before = np.array(source, dtype=np.float32)
    after = np.array(recompressed, dtype=np.float32)
    amplified = np.abs(before - after) * scale
    return np.clip(amplified, 0, 255).astype(np.uint8)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def generate_ela_base64(pil_img: Image.Image, quality: int = 90, scale: float = 15.0) -> str:
    """Render the ELA difference map as a base64 PNG data URL.

    Brighter regions in the output correspond to higher error levels, which
    are more likely to have been digitally manipulated.

    Args:
        pil_img: Input image.
        quality: JPEG re-save quality forwarded to ``_compute_ela``.
        scale: Difference amplification forwarded to ``_compute_ela``.

    Returns:
        A ``data:image/png;base64,...`` string.
    """
    ela_map = _compute_ela(pil_img, quality=quality, scale=scale)
    height, width = ela_map.shape[0], ela_map.shape[1]

    png_buffer = io.BytesIO()
    Image.fromarray(ela_map).save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode("ascii")

    logger.info(f"ELA map generated ({width}x{height})")
    return f"data:image/png;base64,{encoded}"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def generate_blended_ela_base64(
    pil_img: Image.Image,
    gradcam_weight: float = 0.6,
    ela_weight: float = 0.4,
    quality: int = 90,
    scale: float = 15.0,
) -> str:
    """Composite the ELA difference map over the original image.

    This is a utility for the 'blended' mode. No Grad-CAM heatmap is passed
    in, so the blend is between the original RGB image and its ELA map.
    The two weight parameters are now honored; they were previously ignored
    in favor of a fixed 0.5/0.5 mix.

    Args:
        pil_img: Input image (converted to RGB internally).
        gradcam_weight: Weight of the base (original image) layer. The name
            is kept for backward compatibility with existing callers.
        ela_weight: Weight of the ELA difference layer.
        quality: JPEG re-save quality forwarded to ``_compute_ela``.
        scale: Difference amplification forwarded to ``_compute_ela``.

    Returns:
        A ``data:image/png;base64,...`` string of the blended image.
    """
    base_layer = np.array(pil_img.convert("RGB"), dtype=np.float32)
    ela_layer = _compute_ela(pil_img, quality=quality, scale=scale).astype(np.float32)

    # Weighted overlay of ELA on the original for visual context; clip so
    # weights summing above 1 cannot overflow the uint8 range.
    blended = np.clip(
        base_layer * gradcam_weight + ela_layer * ela_weight, 0, 255
    ).astype(np.uint8)

    buf = io.BytesIO()
    Image.fromarray(blended).save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode("ascii")

    logger.info(f"Blended ELA generated ({blended.shape[1]}x{blended.shape[0]})")
    return f"data:image/png;base64,{b64}"
|
exif_service.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""EXIF Metadata Extraction — Phase 12.2
|
| 2 |
+
|
| 3 |
+
Extracts camera metadata from uploaded images and computes a trust adjustment
|
| 4 |
+
score: presence of authentic camera metadata lowers fake probability, while
|
| 5 |
+
evidence of editing software raises it.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
+
from loguru import logger
|
| 13 |
+
from PIL import Image
|
| 14 |
+
from PIL.ExifTags import TAGS, GPSTAGS
|
| 15 |
+
|
| 16 |
+
from schemas.common import ExifSummary
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Software strings that suggest post-processing / generation
# (image editors and AI image generators). Entries are lowercase —
# presumably compared against a lowercased Software tag; confirm at the use site.
_SUSPICIOUS_SOFTWARE = {
    "adobe photoshop", "photoshop", "gimp", "affinity photo",
    "stable diffusion", "midjourney", "dall-e", "comfyui",
    "automatic1111", "invokeai",
}

# Software strings that are normal camera firmware
# (generic fragments plus a few vendor names).
_CAMERA_SOFTWARE = {
    "ver.", "firmware", "camera", "dji", "gopro",
}
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _decode_gps(gps_info: dict) -> Optional[str]:
|
| 33 |
+
"""Decode EXIF GPSInfo dict into a human-readable lat/lon string."""
|
| 34 |
+
try:
|
| 35 |
+
def _to_decimal(values, ref):
|
| 36 |
+
d, m, s = [float(v) for v in values]
|
| 37 |
+
decimal = d + m / 60.0 + s / 3600.0
|
| 38 |
+
if ref in ("S", "W"):
|
| 39 |
+
decimal = -decimal
|
| 40 |
+
return decimal
|
| 41 |
+
|
| 42 |
+
lat = _to_decimal(gps_info.get(2, (0, 0, 0)), gps_info.get(1, "N"))
|
| 43 |
+
lon = _to_decimal(gps_info.get(4, (0, 0, 0)), gps_info.get(3, "E"))
|
| 44 |
+
return f"{lat:.6f}, {lon:.6f}"
|
| 45 |
+
except Exception:
|
| 46 |
+
return None
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def extract_exif(pil_img: Image.Image, raw_bytes: bytes) -> ExifSummary:
    """Extract EXIF metadata and compute a trust adjustment score.

    Metadata is read from Pillow first; when Pillow yields nothing, the
    optional ``exifread`` package is tried on the raw bytes.

    Trust adjustment logic:
    - Valid Make + Model + DateTimeOriginal → -15 (more likely real camera photo)
    - GPS info present → -5 additional (real photos often have GPS)
    - Suspicious editing software detected → +10 (more likely manipulated)
    - Camera firmware string in the Software field → -2
    - No EXIF at all → 0 (inconclusive — many platforms strip EXIF)

    Args:
        pil_img: Decoded image whose EXIF block should be inspected.
        raw_bytes: Original encoded file bytes, used by the fallback reader.

    Returns:
        An ExifSummary with the extracted fields, ``trust_adjustment`` and a
        human-readable ``trust_reason``.
    """
    summary = ExifSummary()

    def _clean(value) -> Optional[str]:
        # Stringify, trim, and collapse empty values to None.
        return str(value).strip() or None

    try:
        # NOTE(review): _getexif is a private Pillow API that only some image
        # classes provide; any failure is treated as "no EXIF".
        exif_data = pil_img._getexif()
    except Exception:
        exif_data = None

    if not exif_data:
        # Try exifread as fallback for formats Pillow doesn't handle well
        try:
            import exifread
            from io import BytesIO

            tags = exifread.process_file(BytesIO(raw_bytes), details=False)
            if tags:
                summary.make = _clean(tags.get("Image Make", ""))
                summary.model = _clean(tags.get("Image Model", ""))
                summary.datetime_original = _clean(tags.get("EXIF DateTimeOriginal", ""))
                summary.software = _clean(tags.get("Image Software", ""))
                summary.lens_model = _clean(tags.get("EXIF LensModel", ""))
        except ImportError:
            logger.debug("exifread not installed, skipping fallback EXIF extraction")
        except Exception as e:
            logger.debug(f"exifread fallback failed: {e}")
    else:
        # Decode Pillow EXIF: translate numeric tag ids to names where known.
        decoded = {TAGS.get(tag_id, tag_id): value for tag_id, value in exif_data.items()}

        summary.make = _clean(decoded.get("Make", ""))
        summary.model = _clean(decoded.get("Model", ""))
        summary.datetime_original = _clean(decoded.get("DateTimeOriginal", ""))
        summary.software = _clean(decoded.get("Software", ""))
        summary.lens_model = _clean(decoded.get("LensModel", ""))

        # GPS: hand the dict over with its raw numeric tag ids. Renaming the
        # keys first made the decoder's numeric lookups miss, so every image
        # with a GPS block was reported at "0.000000, 0.000000".
        gps_raw = decoded.get("GPSInfo")
        if gps_raw and isinstance(gps_raw, dict):
            summary.gps_info = _decode_gps(gps_raw)

    # ── Trust adjustment scoring ──
    adjustment = 0
    reasons = []

    has_camera_meta = summary.make and summary.model and summary.datetime_original
    if has_camera_meta:
        adjustment -= 15
        reasons.append("valid camera metadata (Make/Model/DateTime)")

    if summary.gps_info:
        adjustment -= 5
        reasons.append("GPS coordinates present")

    if summary.software:
        sw_lower = summary.software.lower()
        if any(s in sw_lower for s in _SUSPICIOUS_SOFTWARE):
            adjustment += 10
            reasons.append(f"editing software detected: {summary.software}")
        elif any(s in sw_lower for s in _CAMERA_SOFTWARE):
            adjustment -= 2
            reasons.append("camera firmware in Software field")

    summary.trust_adjustment = adjustment

    # Distinguish "nothing found" from "metadata found but neutral" so the
    # reason never claims EXIF is absent when fields were extracted.
    has_any_metadata = any((
        summary.make,
        summary.model,
        summary.datetime_original,
        summary.software,
        summary.lens_model,
    ))
    if reasons:
        summary.trust_reason = "; ".join(reasons)
    elif has_any_metadata:
        summary.trust_reason = "EXIF metadata present but no trust signals"
    else:
        summary.trust_reason = "no EXIF metadata found"

    logger.info(f"EXIF extracted: make={summary.make}, model={summary.model}, "
                f"adjustment={adjustment} ({summary.trust_reason})")
    return summary
|
file_handler.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import io
|
| 4 |
+
import os
|
| 5 |
+
import tempfile
|
| 6 |
+
from typing import Iterable
|
| 7 |
+
|
| 8 |
+
from fastapi import HTTPException, UploadFile, status
|
| 9 |
+
|
| 10 |
+
from config import settings
|
| 11 |
+
|
| 12 |
+
IMAGE_MAGIC_BYTES: dict[bytes, str] = {
|
| 13 |
+
b"\xff\xd8\xff": "image/jpeg",
|
| 14 |
+
b"\x89PNG\r\n\x1a\n": "image/png",
|
| 15 |
+
b"RIFF": "image/webp", # partial; WEBP has 'RIFF....WEBP'
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _detect_mime_by_magic(head: bytes) -> str | None:
|
| 20 |
+
for sig, mime in IMAGE_MAGIC_BYTES.items():
|
| 21 |
+
if head.startswith(sig):
|
| 22 |
+
if mime == "image/webp" and b"WEBP" not in head[:16]:
|
| 23 |
+
continue
|
| 24 |
+
return mime
|
| 25 |
+
return None
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
async def read_upload_bytes(
    file: UploadFile,
    allowed_mimes: Iterable[str],
    max_size_mb: int,
) -> tuple[bytes, str]:
    """Read an UploadFile into memory after validating type and size.

    At most ``max_size_mb`` MiB plus one byte are read, so an oversized
    upload is rejected without buffering all of it.

    Args:
        file: The incoming upload.
        allowed_mimes: MIME types the caller accepts.
        max_size_mb: Maximum accepted size in MiB.

    Returns:
        (raw_bytes, detected_mime). The MIME type comes from the file's magic
        bytes when recognised, otherwise from the client-declared content
        type (lower-cased).

    Raises:
        HTTPException: 413 when the upload exceeds the limit, 415 when the
            resolved MIME type is not allowed.
    """
    # Materialise once: an arbitrary iterable may be single-pass, and it is
    # needed both for the membership test and for the error message.
    allowed = list(allowed_mimes)

    max_bytes = int(max_size_mb * 1024 * 1024)
    # Reading one byte past the limit is enough to detect an oversized file.
    data = await file.read(max_bytes + 1)
    if len(data) > max_bytes:
        raise HTTPException(
            status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
            detail=f"File too large (> {max_size_mb} MB)",
        )

    # NOTE(review): when the magic bytes are not recognised, the
    # client-supplied content type is trusted as-is.
    mime = _detect_mime_by_magic(data[:16]) or (file.content_type or "").lower()
    if mime not in allowed:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported type '{mime}'. Allowed: {allowed}",
        )
    return data, mime
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def bytes_to_buffer(data: bytes) -> io.BytesIO:
    """Wrap raw bytes in a new in-memory binary stream."""
    buffer = io.BytesIO(data)
    return buffer
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
async def save_upload_to_tempfile(
    file: UploadFile,
    allowed_mimes: Iterable[str],
    max_size_mb: int,
    suffix: str = ".mp4",
) -> tuple[str, str]:
    """Stream an UploadFile to a temp file on disk. Returns (path, mime).

    MIME is taken from the client's content_type (no magic-byte check for videos).
    Caller is responsible for deleting the temp file.

    Args:
        file: The incoming upload.
        allowed_mimes: MIME types the caller accepts.
        max_size_mb: Maximum accepted size in MiB.
        suffix: File-name suffix for the temp file.

    Raises:
        HTTPException: 415 when the declared MIME type is not allowed, 413
            when more than ``max_size_mb`` MiB are received.
    """
    mime = (file.content_type or "").lower()
    # Materialise once: an arbitrary iterable may be single-pass, and it is
    # needed both for the membership test and for the error message.
    allowed = list(allowed_mimes)
    if mime not in allowed:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported type '{mime}'. Allowed: {allowed}",
        )

    max_bytes = max_size_mb * 1024 * 1024
    fd, path = tempfile.mkstemp(suffix=suffix, prefix="ds_vid_")
    written = 0
    try:
        with os.fdopen(fd, "wb") as out:
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                written += len(chunk)
                if written > max_bytes:
                    raise HTTPException(
                        status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
                        detail=f"File too large (> {max_size_mb} MB)",
                    )
                out.write(chunk)
    except BaseException:
        # BaseException rather than Exception: task cancellation (for
        # example a client disconnect during the awaited read) is not an
        # Exception subclass, and the partial file must be removed then too.
        try:
            os.unlink(path)
        except OSError:
            pass
        raise
    return path, mime
|
generate_colab_nb.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nbformat as nbf
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
nb = nbf.v4.new_notebook()
|
| 5 |
+
|
| 6 |
+
text = """\
|
| 7 |
+
# DeepShield: FaceForensics++ ViT Training
|
| 8 |
+
Run this entirely in Google Colab.
|
| 9 |
+
**Before running**:
|
| 10 |
+
1. Go to `Runtime` -> `Change runtime type` -> select **T4 GPU**.
|
| 11 |
+
2. Run the cells below sequentially.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
code_install = """\
|
| 15 |
+
!pip install timm transformers datasets accelerate evaluate opencv-python
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
code_ffpp = """\
|
| 19 |
+
# We create the download script inside the Colab environment
|
| 20 |
+
download_script = '''#!/usr/bin/env python
|
| 21 |
+
import argparse
|
| 22 |
+
import os
|
| 23 |
+
import urllib.request
|
| 24 |
+
import tempfile
|
| 25 |
+
import time
|
| 26 |
+
import sys
|
| 27 |
+
import json
|
| 28 |
+
from tqdm import tqdm
|
| 29 |
+
from os.path import join
|
| 30 |
+
|
| 31 |
+
FILELIST_URL = 'misc/filelist.json'
|
| 32 |
+
DEEPFEAKES_DETECTION_URL = 'misc/deepfake_detection_filenames.json'
|
| 33 |
+
DEEPFAKES_MODEL_NAMES = ['decoder_A.h5', 'decoder_B.h5', 'encoder.h5',]
|
| 34 |
+
DATASETS = {
|
| 35 |
+
'original': 'original_sequences/youtube',
|
| 36 |
+
'Deepfakes': 'manipulated_sequences/Deepfakes',
|
| 37 |
+
'Face2Face': 'manipulated_sequences/Face2Face',
|
| 38 |
+
'FaceShifter': 'manipulated_sequences/FaceShifter',
|
| 39 |
+
'FaceSwap': 'manipulated_sequences/FaceSwap',
|
| 40 |
+
'NeuralTextures': 'manipulated_sequences/NeuralTextures'
|
| 41 |
+
}
|
| 42 |
+
ALL_DATASETS = ['original', 'Deepfakes', 'Face2Face', 'FaceShifter', 'FaceSwap', 'NeuralTextures']
|
| 43 |
+
COMPRESSION = ['raw', 'c23', 'c40']
|
| 44 |
+
TYPE = ['videos']
|
| 45 |
+
|
| 46 |
+
def download_file(url, out_file):
|
| 47 |
+
os.makedirs(os.path.dirname(out_file), exist_ok=True)
|
| 48 |
+
if not os.path.isfile(out_file):
|
| 49 |
+
urllib.request.urlretrieve(url, out_file)
|
| 50 |
+
|
| 51 |
+
def main():
|
| 52 |
+
parser = argparse.ArgumentParser()
|
| 53 |
+
parser.add_argument('output_path', type=str)
|
| 54 |
+
parser.add_argument('-d', '--dataset', type=str, default='all')
|
| 55 |
+
parser.add_argument('-c', '--compression', type=str, default='c40')
|
| 56 |
+
parser.add_argument('-t', '--type', type=str, default='videos')
|
| 57 |
+
parser.add_argument('-n', '--num_videos', type=int, default=50) # Small amount for tutorial
|
| 58 |
+
args = parser.parse_args()
|
| 59 |
+
|
| 60 |
+
base_url = 'http://kaldir.vc.in.tum.de/faceforensics/v3/'
|
| 61 |
+
|
| 62 |
+
datasets = [args.dataset] if args.dataset != 'all' else ALL_DATASETS
|
| 63 |
+
for dataset in datasets:
|
| 64 |
+
dataset_path = DATASETS[dataset]
|
| 65 |
+
print(f'Downloading {args.compression} of {dataset}')
|
| 66 |
+
|
| 67 |
+
file_pairs = json.loads(urllib.request.urlopen(base_url + FILELIST_URL).read().decode("utf-8"))
|
| 68 |
+
filelist = []
|
| 69 |
+
if 'original' in dataset_path:
|
| 70 |
+
for pair in file_pairs:
|
| 71 |
+
filelist += pair
|
| 72 |
+
else:
|
| 73 |
+
for pair in file_pairs:
|
| 74 |
+
filelist.append('_'.join(pair))
|
| 75 |
+
filelist.append('_'.join(pair[::-1]))
|
| 76 |
+
|
| 77 |
+
filelist = filelist[:args.num_videos]
|
| 78 |
+
dataset_videos_url = base_url + f'{dataset_path}/{args.compression}/{args.type}/'
|
| 79 |
+
dataset_output_path = join(args.output_path, dataset_path, args.compression, args.type)
|
| 80 |
+
|
| 81 |
+
for filename in tqdm(filelist):
|
| 82 |
+
download_file(dataset_videos_url + filename + ".mp4", join(dataset_output_path, filename + ".mp4"))
|
| 83 |
+
|
| 84 |
+
if __name__ == "__main__":
|
| 85 |
+
main()
|
| 86 |
+
'''
|
| 87 |
+
|
| 88 |
+
with open("download_ffpp.py", "w") as f:
|
| 89 |
+
f.write(download_script)
|
| 90 |
+
|
| 91 |
+
!python download_ffpp.py ./data -d all -c c40 -t videos -n 50
|
| 92 |
+
"""
|
| 93 |
+
|
| 94 |
+
code_extract = """\
|
| 95 |
+
import cv2
|
| 96 |
+
import os
|
| 97 |
+
import glob
|
| 98 |
+
from tqdm import tqdm
|
| 99 |
+
|
| 100 |
+
def extract_frames(video_folder, output_folder, label, max_frames=4):
|
| 101 |
+
os.makedirs(output_folder, exist_ok=True)
|
| 102 |
+
videos = glob.glob(os.path.join(video_folder, "*.mp4"))
|
| 103 |
+
|
| 104 |
+
for vid_path in tqdm(videos, desc=f"Extracting {label}"):
|
| 105 |
+
vid_name = os.path.basename(vid_path).replace('.mp4','')
|
| 106 |
+
cap = cv2.VideoCapture(vid_path)
|
| 107 |
+
count = 0
|
| 108 |
+
while cap.isOpened() and count < max_frames:
|
| 109 |
+
ret, frame = cap.read()
|
| 110 |
+
if not ret: break
|
| 111 |
+
frame = cv2.resize(frame, (224, 224))
|
| 112 |
+
out_path = os.path.join(output_folder, f"{vid_name}_f{count}.jpg")
|
| 113 |
+
cv2.imwrite(out_path, frame)
|
| 114 |
+
count += 1
|
| 115 |
+
cap.release()
|
| 116 |
+
|
| 117 |
+
# Extract Real
|
| 118 |
+
extract_frames('./data/original_sequences/youtube/c40/videos', './dataset/real', 'real')
|
| 119 |
+
|
| 120 |
+
# Extract Fakes
|
| 121 |
+
fakes = ['Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures']
|
| 122 |
+
for f in fakes:
|
| 123 |
+
extract_frames(f'./data/manipulated_sequences/{f}/c40/videos', './dataset/fake', 'fake')
|
| 124 |
+
"""
|
| 125 |
+
|
| 126 |
+
code_train = """\
|
| 127 |
+
import numpy as np
|
| 128 |
+
from datasets import load_dataset
|
| 129 |
+
from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer
|
| 130 |
+
import torch
|
| 131 |
+
|
| 132 |
+
# 1. Load Dataset
|
| 133 |
+
dataset = load_dataset('imagefolder', data_dir='./dataset')
|
| 134 |
+
# Split into train/validation
|
| 135 |
+
dataset = dataset['train'].train_test_split(test_size=0.1)
|
| 136 |
+
|
| 137 |
+
# 2. Preprocessor
|
| 138 |
+
model_name_or_path = 'google/vit-base-patch16-224-in21k'
|
| 139 |
+
processor = ViTImageProcessor.from_pretrained(model_name_or_path)
|
| 140 |
+
|
| 141 |
+
def transform(example_batch):
|
| 142 |
+
# Take a list of PIL images and turn them to pixel values
|
| 143 |
+
inputs = processor([x.convert("RGB") for x in example_batch['image']], return_tensors='pt')
|
| 144 |
+
inputs['labels'] = example_batch['label']
|
| 145 |
+
return inputs
|
| 146 |
+
|
| 147 |
+
prepared_ds = dataset.with_transform(transform)
|
| 148 |
+
|
| 149 |
+
def collate_fn(batch):
|
| 150 |
+
return {
|
| 151 |
+
'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
|
| 152 |
+
'labels': torch.tensor([x['labels'] for x in batch])
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
# 3. Load Model
|
| 156 |
+
labels = dataset['train'].features['label'].names
|
| 157 |
+
model = ViTForImageClassification.from_pretrained(
|
| 158 |
+
model_name_or_path,
|
| 159 |
+
num_labels=len(labels),
|
| 160 |
+
id2label={str(i): c for i, c in enumerate(labels)},
|
| 161 |
+
label2id={c: str(i) for i, c in enumerate(labels)}
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
training_args = TrainingArguments(
|
| 165 |
+
output_dir="./vit-deepshield",
|
| 166 |
+
per_device_train_batch_size=16,
|
| 167 |
+
eval_strategy="steps",
|
| 168 |
+
num_train_epochs=3,
|
| 169 |
+
fp16=True, # Mixed precision for speed
|
| 170 |
+
save_steps=100,
|
| 171 |
+
eval_steps=100,
|
| 172 |
+
logging_steps=10,
|
| 173 |
+
learning_rate=2e-4,
|
| 174 |
+
save_total_limit=2,
|
| 175 |
+
remove_unused_columns=False,
|
| 176 |
+
push_to_hub=False,
|
| 177 |
+
load_best_model_at_end=True,
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
import evaluate
|
| 181 |
+
metric = evaluate.load("accuracy")
|
| 182 |
+
def compute_metrics(p):
|
| 183 |
+
return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)
|
| 184 |
+
|
| 185 |
+
trainer = Trainer(
|
| 186 |
+
model=model,
|
| 187 |
+
args=training_args,
|
| 188 |
+
data_collator=collate_fn,
|
| 189 |
+
compute_metrics=compute_metrics,
|
| 190 |
+
train_dataset=prepared_ds["train"],
|
| 191 |
+
eval_dataset=prepared_ds["test"],
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
# 4. Train
|
| 195 |
+
train_results = trainer.train()
|
| 196 |
+
trainer.save_model("deepshield_vit_model")
|
| 197 |
+
processor.save_pretrained("deepshield_vit_model")
|
| 198 |
+
trainer.log_metrics("train", train_results.metrics)
|
| 199 |
+
trainer.save_metrics("train", train_results.metrics)
|
| 200 |
+
trainer.save_state()
|
| 201 |
+
print("Training Complete! The model is saved to ./deepshield_vit_model")
|
| 202 |
+
"""
|
| 203 |
+
|
| 204 |
+
# Assemble the notebook: one markdown intro followed by the install,
# download, frame-extraction and training code cells, in execution order.
nb['cells'] = [
    nbf.v4.new_markdown_cell(text),
    nbf.v4.new_code_cell(code_install),
    nbf.v4.new_code_cell(code_ffpp),
    nbf.v4.new_code_cell(code_extract),
    nbf.v4.new_code_cell(code_train)
]

# Write the notebook next to this script instead of to a machine-specific
# absolute path, so the generator works on any checkout and operating system.
_output_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'Colab_ViT_Training.ipynb',
)
with open(_output_path, 'w', encoding='utf-8') as f:
    nbf.write(nb, f)
|
heatmap_generator.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import base64
|
| 4 |
+
import io
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
import cv2
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch
|
| 10 |
+
from loguru import logger
|
| 11 |
+
from PIL import Image
|
| 12 |
+
from pytorch_grad_cam import GradCAMPlusPlus
|
| 13 |
+
from pytorch_grad_cam.utils.image import show_cam_on_image
|
| 14 |
+
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
|
| 15 |
+
|
| 16 |
+
from config import settings
|
| 17 |
+
from models.model_loader import get_model_loader
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class _HFLogitsWrapper(torch.nn.Module):
|
| 21 |
+
"""Wrap a HuggingFace image classification model so forward() returns logits
|
| 22 |
+
as a plain tensor (pytorch_grad_cam expects tensor outputs, not dicts/dataclasses).
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
def __init__(self, model: torch.nn.Module) -> None:
|
| 26 |
+
super().__init__()
|
| 27 |
+
self.model = model
|
| 28 |
+
|
| 29 |
+
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: # type: ignore[override]
|
| 30 |
+
return self.model(pixel_values=pixel_values).logits
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _vit_reshape_transform(tensor: torch.Tensor, height: int = 14, width: int = 14) -> torch.Tensor:
|
| 34 |
+
"""Grad-CAM expects (B, C, H, W); ViT hidden states are (B, 1+H*W, C).
|
| 35 |
+
Drop the CLS token and reshape tokens into a spatial grid.
|
| 36 |
+
"""
|
| 37 |
+
result = tensor[:, 1:, :]
|
| 38 |
+
b, n, c = result.shape
|
| 39 |
+
result = result.reshape(b, height, width, c)
|
| 40 |
+
result = result.permute(0, 3, 1, 2) # (B, C, H, W)
|
| 41 |
+
return result
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _preprocess_for_cam(pil_img: Image.Image, processor) -> tuple[torch.Tensor, np.ndarray]:
    """Return (input_tensor, rgb_float) for Grad-CAM computation and overlaying.

    Args:
        pil_img: Source image; converted to RGB first when in another mode.
        processor: Image processor called as
            ``processor(images=..., return_tensors="pt")``.

    Returns:
        input_tensor: The processor's ``pixel_values`` moved to settings.DEVICE.
        rgb_float: (H,W,3) float32 array in [0,1] of the image resized to the
            processor's configured size (224x224 when that is not a dict
            with height/width) — needed for overlaying.
    """
    # The overlay array must be exactly three channels; RGBA, grayscale or
    # palette images would otherwise give a (H,W,4) or (H,W) array that
    # cannot be blended with the colour heatmap.
    if pil_img.mode != "RGB":
        pil_img = pil_img.convert("RGB")

    inputs = processor(images=pil_img, return_tensors="pt")
    input_tensor = inputs["pixel_values"].to(settings.DEVICE)

    size = getattr(processor, "size", {"height": 224, "width": 224})
    if isinstance(size, dict):
        h = size.get("height", 224)
        w = size.get("width", 224)
    else:
        h = w = 224

    resized = pil_img.resize((w, h), Image.BILINEAR)
    rgb = np.array(resized).astype(np.float32) / 255.0  # (H,W,3) in [0,1]
    return input_tensor, rgb
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _encode_overlay_to_base64(overlay: np.ndarray) -> str:
    """Serialise a uint8 (H,W,3) RGB array as a PNG ``data:`` URL (base64)."""
    png_buffer = io.BytesIO()
    image = Image.fromarray(overlay)
    image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode("ascii")
    return "data:image/png;base64," + encoded
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _compute_gradcam_pp(
    pil_img: Image.Image,
    target_class_idx: Optional[int] = None,
) -> tuple[np.ndarray, np.ndarray]:
    """Compute Grad-CAM++ averaged across the last 3 ViT encoder layers.

    Args:
        pil_img: Image to explain.
        target_class_idx: Class index to explain. When None, ``targets=None``
            is passed through and the CAM library picks the target
            (presumably the top-scoring class — confirm against
            pytorch_grad_cam).

    Returns (grayscale_cam, rgb_float) where grayscale_cam is (H,W) in [0,1].
    """
    loader = get_model_loader()
    model, processor = loader.load_image_model()

    model.eval()
    # Gradients must be available for Grad-CAM. NOTE(review): this flips
    # requires_grad on the model object returned by the loader; if that
    # object is shared, the change persists after this call.
    for p in model.parameters():
        p.requires_grad_(True)

    input_tensor, rgb_float = _preprocess_for_cam(pil_img, processor)

    # Number of patches per side, used to reshape the token sequence.
    grid = int(model.config.image_size / model.config.patch_size)

    # Average across last 3 ViT encoder layers for smoother heatmaps
    num_layers = len(model.vit.encoder.layer)
    last_n = min(3, num_layers)
    target_layers = [
        model.vit.encoder.layer[-(i + 1)].layernorm_before
        for i in range(last_n)
    ]

    # Wrap so the CAM library receives a plain logits tensor.
    wrapped = _HFLogitsWrapper(model)

    targets = None
    if target_class_idx is not None:
        targets = [ClassifierOutputTarget(int(target_class_idx))]

    with GradCAMPlusPlus(
        model=wrapped,
        target_layers=target_layers,
        reshape_transform=lambda t: _vit_reshape_transform(t, grid, grid),
    ) as cam:
        # [0] selects the map for the single image in the batch.
        grayscale_cam = cam(input_tensor=input_tensor, targets=targets)[0]  # (H,W) in [0,1]

    return grayscale_cam, rgb_float
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def generate_heatmap_base64(
    pil_img: Image.Image,
    target_class_idx: Optional[int] = None,
) -> str:
    """Render the Grad-CAM++ heatmap blended over the image as a base64 PNG data URL."""
    cam_map, image_float = _compute_gradcam_pp(pil_img, target_class_idx)
    blended = show_cam_on_image(image_float, cam_map, use_rgb=True)
    rows, cols = blended.shape[0], blended.shape[1]
    logger.info(f"Heatmap generated ({rows}x{cols})")
    return _encode_overlay_to_base64(blended)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def generate_boxes_base64(
    pil_img: Image.Image,
    target_class_idx: Optional[int] = None,
    top_k: int = 5,
    threshold: float = 0.4,
) -> str:
    """Produce bounding boxes around top-K connected components from Grad-CAM++ activation.

    Renders colored boxes (red/orange/yellow by intensity) on the resized
    copy of the image returned by _compute_gradcam_pp, not on the
    full-resolution input.

    Args:
        pil_img: Image to explain.
        target_class_idx: Class index forwarded to _compute_gradcam_pp.
        top_k: Maximum number of regions to draw, largest contour area first.
        threshold: Minimum activation for a pixel to count as activated.

    Returns:
        Base64 PNG data URL; the unannotated image when no region passes the
        threshold.
    """
    grayscale_cam, rgb_float = _compute_gradcam_pp(pil_img, target_class_idx)

    # NOTE(review): h and w are not used further down in this function.
    h, w = rgb_float.shape[:2]
    base_img = (rgb_float * 255).astype(np.uint8).copy()

    # Threshold the heatmap to find activated regions
    binary = (grayscale_cam >= threshold).astype(np.uint8) * 255
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if not contours:
        logger.info("No significant activation regions found for bounding boxes")
        return _encode_overlay_to_base64(base_img)

    # Sort by area descending, take top_k
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:top_k]

    # Color by mean activation intensity within each box
    for cnt in contours:
        x, y, bw, bh = cv2.boundingRect(cnt)
        # Mean over the whole bounding rectangle, not only the contour pixels.
        region_activation = grayscale_cam[y:y + bh, x:x + bw].mean()

        if region_activation >= 0.7:
            color = (220, 40, 40)    # red — high suspicion
        elif region_activation >= 0.5:
            color = (240, 140, 20)   # orange — medium
        else:
            color = (230, 200, 40)   # yellow — lower

        cv2.rectangle(base_img, (x, y), (x + bw, y + bh), color, 2)
        label = f"{region_activation * 100:.0f}%"
        # Keep the label at least 12 px from the top so it stays visible.
        cv2.putText(base_img, label, (x, max(y - 6, 12)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1, cv2.LINE_AA)

    logger.info(f"Bounding boxes generated: {len(contours)} regions")
    return _encode_overlay_to_base64(base_img)
|
image_service.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import io
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Tuple
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from loguru import logger
|
| 9 |
+
from PIL import Image
|
| 10 |
+
|
| 11 |
+
from config import settings
|
| 12 |
+
from models.model_loader import get_model_loader
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class ImageClassification:
    """Result of one classifier pass, as built by classify_image."""

    # Label of the highest-probability class.
    label: str
    # Softmax probability of that class.
    confidence: float
    # Softmax probability of every class, keyed by label (or by the stringified
    # class index when the model config has no name for it).
    all_scores: dict[str, float]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_image_from_bytes(data: bytes) -> Image.Image:
    """Decode encoded image bytes into a PIL image in RGB mode."""
    image = Image.open(io.BytesIO(data))
    return image if image.mode == "RGB" else image.convert("RGB")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def classify_image(pil_img: Image.Image) -> ImageClassification:
    """Classify a PIL image with the loaded image model.

    Returns the top label, its softmax probability, and the probability of
    every class keyed by label.
    """
    model, processor = get_model_loader().load_image_model()

    encoded = processor(images=pil_img, return_tensors="pt")
    model_inputs = {name: tensor.to(settings.DEVICE) for name, tensor in encoded.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**model_inputs).logits  # (1, num_labels)
        probs = torch.softmax(logits, dim=-1)[0]

    id2label: dict[int, str] = getattr(model.config, "id2label", {})

    def _label_for(index: int) -> str:
        # Fall back to the stringified index when the config has no name.
        return id2label.get(index, str(index))

    all_scores = {}
    for index, prob in enumerate(probs):
        all_scores[_label_for(index)] = float(prob.item())

    top_idx = int(torch.argmax(probs).item())
    top_label = _label_for(top_idx)
    top_conf = float(probs[top_idx].item())

    logger.info(f"Image classify → {top_label} @ {top_conf:.3f}")
    return ImageClassification(label=top_label, confidence=top_conf, all_scores=all_scores)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def preprocess_and_classify(raw_bytes: bytes) -> Tuple[Image.Image, ImageClassification]:
    """Decode raw bytes to an RGB PIL image and classify it.

    The decoded image is returned with the result so later steps (heatmap,
    artifact scan) can reuse it instead of decoding again.
    """
    image = load_image_from_bytes(raw_bytes)
    return image, classify_image(image)
|
llm_explainer.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LLM Explainability Card — Phase 12.3
|
| 2 |
+
|
| 3 |
+
Generates a plain-English summary paragraph + 3 key-signal bullets from the
|
| 4 |
+
full analysis payload. Supports Gemini (default) and OpenAI providers.
|
| 5 |
+
Results are cached per record_id to avoid re-spending tokens.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
from abc import ABC, abstractmethod
|
| 12 |
+
from functools import lru_cache
|
| 13 |
+
from typing import Any
|
| 14 |
+
|
| 15 |
+
from loguru import logger
|
| 16 |
+
|
| 17 |
+
from config import settings
|
| 18 |
+
from schemas.common import LLMExplainabilitySummary
|
| 19 |
+
|
| 20 |
+
# ── In-memory cache keyed by record_id ──
|
| 21 |
+
_cache: dict[str, LLMExplainabilitySummary] = {}
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
_PROMPT_TEMPLATE = """\
|
| 25 |
+
You are DeepShield's explainability engine. Given the JSON analysis payload below,
|
| 26 |
+
write a concise, accessible summary for a non-technical user.
|
| 27 |
+
|
| 28 |
+
**Output format (strict JSON only — no markdown fences):**
|
| 29 |
+
{{
|
| 30 |
+
"paragraph": "<2-3 sentence plain-English summary of the verdict and key signals>",
|
| 31 |
+
"bullets": [
|
| 32 |
+
"<key signal 1>",
|
| 33 |
+
"<key signal 2>",
|
| 34 |
+
"<key signal 3>"
|
| 35 |
+
]
|
| 36 |
+
}}
|
| 37 |
+
|
| 38 |
+
Rules:
|
| 39 |
+
- Be factual. State what the analysis found, not what you speculate.
|
| 40 |
+
- Reference specific indicators (e.g. "GAN artifact score", "EXIF metadata", "sensationalism level").
|
| 41 |
+
- If the verdict is "Likely Authentic", reassure the user and explain why.
|
| 42 |
+
- If the verdict is "Likely Manipulated" or "Suspicious", highlight the strongest evidence.
|
| 43 |
+
- Keep the paragraph under 60 words. Each bullet under 20 words.
|
| 44 |
+
|
| 45 |
+
**Analysis payload:**
|
| 46 |
+
{payload_json}
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class _LLMProvider(ABC):
    """Abstract interface implemented by each LLM backend in this module."""

    @abstractmethod
    def generate(self, prompt: str) -> str:
        """Send prompt to LLM and return raw text response."""
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class _GeminiProvider(_LLMProvider):
    """LLM backend backed by the google.generativeai client."""

    def __init__(self) -> None:
        # Imported lazily so the dependency is only needed when this
        # provider is actually selected.
        import google.generativeai as genai

        genai.configure(api_key=settings.LLM_API_KEY)
        model = genai.GenerativeModel(settings.LLM_MODEL)
        self._model = model

    def generate(self, prompt: str) -> str:
        """Send the prompt to the configured model and return its text."""
        return self._model.generate_content(prompt).text
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class _OpenAIProvider(_LLMProvider):
    """LLM backend backed by the OpenAI chat-completions client."""

    def __init__(self) -> None:
        # Imported lazily so the dependency is only needed when this
        # provider is actually selected.
        from openai import OpenAI

        client = OpenAI(api_key=settings.LLM_API_KEY)
        self._client = client

    def generate(self, prompt: str) -> str:
        """Send the prompt as a single user message and return the first choice's content."""
        user_message = {"role": "user", "content": prompt}
        completion = self._client.chat.completions.create(
            model=settings.LLM_MODEL,
            messages=[user_message],
            temperature=0.3,
            max_tokens=300,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@lru_cache(maxsize=1)
def _get_provider() -> _LLMProvider:
    """Build the configured LLM provider once and reuse it afterwards.

    ``settings.LLM_PROVIDER`` equal to ``"openai"`` (case-insensitive) selects
    the OpenAI backend; any other value selects Gemini.
    """
    wants_openai = settings.LLM_PROVIDER.lower() == "openai"
    provider_cls = _OpenAIProvider if wants_openai else _GeminiProvider
    return provider_cls()
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _parse_llm_response(raw: str) -> tuple[str, list[str]]:
    """Parse the LLM's JSON response into ``(paragraph, bullets)``.

    Tolerates the common ways a model deviates from "strict JSON only":
    markdown code fences around the payload and explanatory prose before or
    after the JSON object.

    Args:
        raw: Raw text returned by the LLM provider.

    Returns:
        The ``paragraph`` string (``""`` when missing or null) and at most
        three bullets, each coerced to ``str``.

    Raises:
        json.JSONDecodeError: If no JSON object can be extracted from ``raw``,
            including when the reply is valid JSON but not an object.
    """
    text = (raw or "").strip()

    # Strip markdown code fences if present.
    if text.startswith("```"):
        text = "\n".join(
            line for line in text.split("\n") if not line.strip().startswith("```")
        ).strip()

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span when the model added prose.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start:end + 1])

    if not isinstance(parsed, dict):
        # Surface as a parse failure rather than an AttributeError on .get().
        raise json.JSONDecodeError("LLM response is not a JSON object", text, 0)

    paragraph = parsed.get("paragraph", "")
    if paragraph is None:
        paragraph = ""
    elif not isinstance(paragraph, str):
        paragraph = str(paragraph)

    bullets = parsed.get("bullets", [])
    if bullets is None:
        bullets = []
    elif not isinstance(bullets, list):
        bullets = [bullets]

    return paragraph, [str(item) for item in bullets[:3]]
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def generate_llm_summary(
    payload: dict[str, Any],
    record_id: str | None = None,
) -> LLMExplainabilitySummary:
    """Produce a plain-English explanation of an analysis result via the LLM.

    Args:
        payload: The analysis response dict that is serialised into the prompt.
        record_id: Optional cache key; a stored summary for this id is returned
            (flagged ``cached=True``) instead of calling the LLM again.

    Returns:
        An ``LLMExplainabilitySummary``. When no API key is configured, or the
        provider call / parsing fails, a placeholder summary is returned
        instead of raising.
    """
    fallback_paragraph = "Analysis complete. See the detailed indicators below for specifics."

    # 1. Cache lookup.
    if record_id and record_id in _cache:
        logger.debug(f"LLM summary cache hit for record_id={record_id}")
        hit = _cache[record_id]
        hit.cached = True
        return hit

    # 2. Without an API key there is nothing to call.
    if not settings.LLM_API_KEY:
        logger.warning("LLM_API_KEY not set — skipping LLM explainability card")
        return LLMExplainabilitySummary(
            paragraph="LLM explanation unavailable (no API key configured).",
            bullets=[],
            model_used="none",
        )

    # 3. Build a trimmed payload: everything except "explainability", then the
    #    explainability dict re-added without its "*_base64" entries.
    slim_payload = {
        key: value for key, value in payload.items() if key != "explainability"
    }
    explainability = payload.get("explainability")
    if isinstance(explainability, dict):
        slim_payload["explainability"] = {
            key: value
            for key, value in explainability.items()
            if not key.endswith("_base64")
        }

    payload_json = json.dumps(slim_payload, indent=2, default=str)
    prompt = _PROMPT_TEMPLATE.format(payload_json=payload_json)

    # 4. Call the provider and parse its reply.
    try:
        raw_response = _get_provider().generate(prompt)
        paragraph, bullets = _parse_llm_response(raw_response)

        summary = LLMExplainabilitySummary(
            paragraph=paragraph,
            bullets=bullets,
            model_used=f"{settings.LLM_PROVIDER}/{settings.LLM_MODEL}",
        )

        if record_id:
            _cache[record_id] = summary

        logger.info(f"LLM summary generated via {settings.LLM_PROVIDER}/{settings.LLM_MODEL}")
        return summary

    except json.JSONDecodeError as parse_err:
        logger.error(f"LLM returned unparseable JSON: {parse_err}")
        return LLMExplainabilitySummary(
            paragraph=fallback_paragraph,
            bullets=["LLM explanation could not be parsed"],
            model_used=f"{settings.LLM_PROVIDER}/{settings.LLM_MODEL}",
        )
    except Exception as err:
        logger.error(f"LLM explainer failed: {err}")
        return LLMExplainabilitySummary(
            paragraph=fallback_paragraph,
            bullets=["LLM explanation temporarily unavailable"],
            model_used="error",
        )
|
main.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
from contextlib import asynccontextmanager
|
| 3 |
+
|
| 4 |
+
from fastapi import FastAPI
|
| 5 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 6 |
+
from loguru import logger
|
| 7 |
+
|
| 8 |
+
from api.router import api_router
|
| 9 |
+
from config import settings
|
| 10 |
+
from db.database import init_db
|
| 11 |
+
from models.model_loader import get_model_loader
|
| 12 |
+
from services.report_service import cleanup_expired
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
async def _report_cleanup_loop():
    """Call ``cleanup_expired`` forever, pausing 600 seconds between runs.

    Any exception raised by a cleanup pass is logged as a warning and the
    loop carries on with the next pass.
    """
    pause_seconds = 600  # every 10 min
    while True:
        try:
            cleanup_expired()
        except Exception as exc:  # noqa: BLE001
            logger.warning(f"Report cleanup error: {exc}")
        await asyncio.sleep(pause_seconds)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: initialise resources, then clean up on shutdown.

    Startup initialises the database, optionally preloads models when
    ``settings.PRELOAD_MODELS`` is set, and starts the periodic report-cleanup
    task. Shutdown cancels that task and waits for it to finish, even when
    the application exits with an error.
    """
    logger.info("Starting DeepShield backend")
    init_db()
    logger.info("Database initialized")
    if settings.PRELOAD_MODELS:
        get_model_loader().preload_phase1()
    else:
        logger.info("PRELOAD_MODELS=false — models will load on first use")

    task = asyncio.create_task(_report_cleanup_loop())
    try:
        yield
    finally:
        # cancel() only *requests* cancellation; await the task so it is
        # actually finished before the event loop shuts down.
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
        logger.info("Shutting down DeepShield backend")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ASGI application object; startup/shutdown work is handled by `lifespan`.
app = FastAPI(
    title="DeepShield API",
    description="Explainable AI-based multimodal misinformation detection",
    version="0.1.0",
    lifespan=lifespan,
)

# CORS: allowed origins come from settings; credentials, all methods and all
# headers are permitted for those origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount the API routes defined in api.router.
app.include_router(api_router)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@app.get("/")
def root():
    """Return the service name plus the docs and health-check paths."""
    return dict(service="DeepShield", docs="/docs", health="/api/v1/health")
|
model_loader.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from threading import Lock
|
| 4 |
+
from typing import Optional, Tuple
|
| 5 |
+
|
| 6 |
+
from loguru import logger
|
| 7 |
+
|
| 8 |
+
from config import settings
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class ModelLoader:
    """Singleton holder for lazily loaded AI models.

    One instance exists per process. Each ``load_*`` method loads its model on
    first use and returns the cached object afterwards. Both instance creation
    and model loading use double-checked locking so concurrent first calls
    (e.g. from a request thread pool) do not load the same model twice.
    """

    _instance: Optional["ModelLoader"] = None
    _lock: Lock = Lock()       # guards singleton creation
    _load_lock: Lock = Lock()  # guards lazy model loading (never held re-entrantly)

    def __new__(cls) -> "ModelLoader":
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    instance = super().__new__(cls)
                    instance._image_model = None
                    instance._image_processor = None
                    instance._text_pipeline = None
                    instance._multilang_text_pipeline = None
                    instance._ocr_reader = None
                    instance._face_detector = None
                    instance._spacy_nlp = None
                    instance._sentence_transformer = None
                    # Publish only once fully initialised, so the unlocked
                    # fast path never sees an instance missing attributes.
                    cls._instance = instance
        return cls._instance

    @classmethod
    def get_instance(cls) -> "ModelLoader":
        """Return the process-wide instance, creating it if needed."""
        return cls()

    # ---------- Image (ViT deepfake classifier) ----------
    def load_image_model(self) -> Tuple[object, object]:
        """Return ``(model, processor)`` for ``settings.IMAGE_MODEL_ID``."""
        if self._image_model is None:
            with self._load_lock:
                if self._image_model is None:
                    logger.info(f"Loading image model: {settings.IMAGE_MODEL_ID}")
                    from transformers import AutoImageProcessor, AutoModelForImageClassification

                    self._image_processor = AutoImageProcessor.from_pretrained(settings.IMAGE_MODEL_ID)
                    model = AutoModelForImageClassification.from_pretrained(settings.IMAGE_MODEL_ID)
                    model.to(settings.DEVICE)
                    model.eval()
                    # Assigned last: a non-None model implies the processor is set.
                    self._image_model = model
                    logger.info("Image model loaded")
        return self._image_model, self._image_processor

    # ---------- Text (BERT fake-news classifier — English) ----------
    def load_text_model(self):
        """Return the text-classification pipeline for ``settings.TEXT_MODEL_ID``."""
        if self._text_pipeline is None:
            with self._load_lock:
                if self._text_pipeline is None:
                    logger.info(f"Loading text model: {settings.TEXT_MODEL_ID}")
                    from transformers import pipeline

                    self._text_pipeline = pipeline(
                        "text-classification",
                        model=settings.TEXT_MODEL_ID,
                        device=0 if settings.DEVICE == "cuda" else -1,
                    )
                    logger.info("Text model loaded")
        return self._text_pipeline

    # ---------- Multilingual text model (Phase 13) ----------
    def load_multilang_text_model(self):
        """Load multilingual fake-news classifier. Falls back to English model if not configured."""
        model_id = settings.TEXT_MULTILANG_MODEL_ID
        if not model_id:
            logger.debug("TEXT_MULTILANG_MODEL_ID not set — falling back to English text model")
            # Called outside the load lock: load_text_model takes it itself.
            return self.load_text_model()

        if self._multilang_text_pipeline is None:
            with self._load_lock:
                if self._multilang_text_pipeline is None:
                    logger.info(f"Loading multilingual text model: {model_id}")
                    from transformers import pipeline

                    self._multilang_text_pipeline = pipeline(
                        "text-classification",
                        model=model_id,
                        device=0 if settings.DEVICE == "cuda" else -1,
                    )
                    logger.info("Multilingual text model loaded")
        return self._multilang_text_pipeline

    # ---------- spaCy NLP (Phase 13 NER) ----------
    def load_spacy_nlp(self):
        """Lazy-load spaCy English NLP model. Returns None if spaCy or the model is unavailable."""
        if self._spacy_nlp is None:
            with self._load_lock:
                if self._spacy_nlp is None:
                    try:
                        import spacy  # type: ignore
                        try:
                            self._spacy_nlp = spacy.load("en_core_web_sm")
                            logger.info("spaCy en_core_web_sm loaded")
                        except OSError:
                            logger.warning(
                                "spaCy model 'en_core_web_sm' not found. "
                                "Run: python -m spacy download en_core_web_sm"
                            )
                            return None
                    except ImportError:
                        logger.warning("spaCy not installed — NER keyword extraction disabled")
                        return None
        return self._spacy_nlp

    # ---------- Sentence-Transformer (Phase 13 truth-override) ----------
    def load_sentence_transformer(self):
        """Lazy-load sentence-transformers/all-MiniLM-L6-v2. Returns None if it cannot be loaded."""
        if self._sentence_transformer is None:
            with self._load_lock:
                if self._sentence_transformer is None:
                    try:
                        from sentence_transformers import SentenceTransformer  # type: ignore
                        self._sentence_transformer = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
                        logger.info("Sentence-transformer (all-MiniLM-L6-v2) loaded")
                    except ImportError:
                        logger.warning("sentence-transformers not installed — truth-override disabled")
                        return None
                    except Exception as e:
                        logger.warning(f"Sentence-transformer load failed: {e}")
                        return None
        return self._sentence_transformer

    # ---------- OCR (EasyOCR) — Phase 13: use OCR_LANGS from config ----------
    def load_ocr_engine(self):
        """Return the EasyOCR reader for the comma-separated ``settings.OCR_LANGS``.

        Falls back to ``["en"]`` when the setting yields no language codes.
        """
        if self._ocr_reader is None:
            with self._load_lock:
                if self._ocr_reader is None:
                    langs = [code.strip() for code in settings.OCR_LANGS.split(",") if code.strip()]
                    if not langs:
                        langs = ["en"]
                    logger.info(f"Loading EasyOCR reader (langs: {langs})")
                    import easyocr  # type: ignore

                    self._ocr_reader = easyocr.Reader(
                        langs, gpu=(settings.DEVICE == "cuda"), verbose=False, download_enabled=True,
                    )
                    logger.info("EasyOCR loaded")
        return self._ocr_reader

    # ---------- Face detector (MediaPipe) ----------
    def load_face_detector(self):
        """Return the MediaPipe FaceMesh detector (static-image mode, up to 5 faces)."""
        if self._face_detector is None:
            with self._load_lock:
                if self._face_detector is None:
                    logger.info("Loading MediaPipe FaceMesh")
                    import mediapipe as mp  # type: ignore

                    self._face_detector = mp.solutions.face_mesh.FaceMesh(
                        static_image_mode=True,
                        max_num_faces=5,
                        min_detection_confidence=0.5,
                    )
                    logger.info("MediaPipe FaceMesh loaded")
        return self._face_detector

    # ---------- Preload ----------
    def preload_phase1(self) -> None:
        """Preload only what Phase 1 needs (image model)."""
        self.load_image_model()
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def get_model_loader() -> ModelLoader:
    """Return the shared ``ModelLoader`` instance."""
    loader = ModelLoader.get_instance()
    return loader
|
models.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
|
| 3 |
+
from sqlalchemy import DateTime, ForeignKey, Integer, String, Text
|
| 4 |
+
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
| 5 |
+
|
| 6 |
+
from db.database import Base
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class User(Base):
    """ORM model for the ``users`` table."""

    __tablename__ = "users"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
    # Unique, indexed and required.
    email: Mapped[str] = mapped_column(String(255), unique=True, index=True, nullable=False)
    password_hash: Mapped[str] = mapped_column(String(255), nullable=False)
    name: Mapped[str | None] = mapped_column(String(255), nullable=True)
    # Defaults to datetime.utcnow at insert time (naive UTC timestamp).
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)

    # Analyses linked to this user (other side: AnalysisRecord.user).
    analyses: Mapped[list["AnalysisRecord"]] = relationship(back_populates="user")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class AnalysisRecord(Base):
    """ORM model for the ``analyses`` table: one stored analysis result."""

    __tablename__ = "analyses"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
    # Nullable foreign key, so a record may exist without an owning user.
    user_id: Mapped[int | None] = mapped_column(ForeignKey("users.id"), nullable=True)
    media_type: Mapped[str] = mapped_column(String(32), nullable=False)  # image|video|text|screenshot
    verdict: Mapped[str] = mapped_column(String(32), nullable=False)
    authenticity_score: Mapped[float] = mapped_column(nullable=False)
    # Result payload stored as text; presumably serialized JSON — confirm against the writer.
    result_json: Mapped[str] = mapped_column(Text, nullable=False)
    # Defaults to datetime.utcnow at insert time (naive UTC timestamp).
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)

    user: Mapped["User | None"] = relationship(back_populates="analyses")
    # Scalar (uselist=False) link to the report generated for this analysis, if any.
    report: Mapped["Report | None"] = relationship(back_populates="analysis", uselist=False)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class Report(Base):
    """ORM model for the ``reports`` table: a report file tied to one analysis."""

    __tablename__ = "reports"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
    analysis_id: Mapped[int] = mapped_column(ForeignKey("analyses.id"), nullable=False)
    # Location of the report file (up to 512 characters).
    file_path: Mapped[str] = mapped_column(String(512), nullable=False)
    # Defaults to datetime.utcnow at insert time (naive UTC timestamp).
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
    # Optional expiry timestamp; NULL when none is set.
    expires_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)

    analysis: Mapped["AnalysisRecord"] = relationship(back_populates="report")
|
news_lookup.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import List, Optional, Tuple
|
| 5 |
+
from urllib.parse import urlparse
|
| 6 |
+
|
| 7 |
+
import httpx
|
| 8 |
+
from loguru import logger
|
| 9 |
+
|
| 10 |
+
from config import settings
|
| 11 |
+
from schemas.common import ContradictingEvidence, TrustedSource, TruthOverride
|
| 12 |
+
|
| 13 |
+
# Trusted news domains — higher relevance boost.
# Maps a registrable domain to the relevance score assigned to its articles.
TRUSTED_DOMAINS = {
    "reuters.com": 1.0, "apnews.com": 1.0, "bbc.com": 1.0, "bbc.co.uk": 1.0,
    "theguardian.com": 0.95, "nytimes.com": 0.95, "washingtonpost.com": 0.95,
    "cnn.com": 0.9, "npr.org": 0.95, "aljazeera.com": 0.9,
    "thehindu.com": 0.9, "indianexpress.com": 0.9, "ndtv.com": 0.85,
    "hindustantimes.com": 0.85, "pti.news": 0.95,
}

# Fact-check / contradiction sources.
# Note: some entries carry a path component (e.g. "reuters.com/fact-check"),
# i.e. they name a section of a site rather than a whole domain.
FACTCHECK_DOMAINS = {
    "factcheck.org", "snopes.com", "politifact.com", "fullfact.org",
    "reuters.com/fact-check", "apnews.com/hub/ap-fact-check",
    "factly.in", "altnews.in", "boomlive.in", "vishvasnews.com",
}

# Domains eligible for truth-override (weight >= 0.9 per BUILD_PLAN spec)
_HIGH_TRUST_DOMAINS = {d for d, w in TRUSTED_DOMAINS.items() if w >= 0.9}

# Thresholds per BUILD_PLAN §13.2
_OVERRIDE_SIMILARITY_THRESHOLD = 0.6   # minimum cosine similarity to apply the override
_OVERRIDE_FAKE_PROB_CAP = 0.15         # upper bound on fake probability after override
_OVERRIDE_FAKE_PROB_MULTIPLIER = 0.3   # factor applied to fake probability on override
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@dataclass
class NewsLookupResult:
    """Outcome of a news lookup, as returned by ``search_news_full``."""

    # Non-fact-check articles, sorted by relevance score and truncated to the limit.
    trusted_sources: List[TrustedSource]
    # Articles classified as fact-checks, truncated to the limit.
    contradicting_evidence: List[ContradictingEvidence]
    # Number of articles the API returned, before de-duplication and truncation.
    total_articles: int
    # Truth-override evaluation, or None when it was not computed.
    truth_override: Optional[TruthOverride] = None
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _domain_of(url: str) -> str:
    """Return the lower-cased host of ``url`` without a leading ``www.``.

    The port and any userinfo are excluded. Only a *leading* ``www.`` label is
    removed, so hosts that merely contain that text (``awww.example.com``)
    are left intact.

    Returns:
        The host name, or ``""`` when ``url`` has no host or cannot be parsed.
    """
    try:
        # .hostname is already lower-cased and excludes port/userinfo.
        host = urlparse(url).hostname or ""
        return host.removeprefix("www.")
    except Exception:
        # Best-effort helper: any unparseable input maps to "no domain".
        return ""
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _is_factcheck(url: str, title: str) -> bool:
    """Return True when an article looks like a fact-check.

    An article qualifies when its URL belongs to an entry of
    ``FACTCHECK_DOMAINS`` or its title contains a fact-check keyword.

    Entries are matched on host boundaries (the host equals the entry or is a
    subdomain of it), so look-alike hosts such as ``notsnopes.com`` do not
    match. Entries that include a path (``reuters.com/fact-check``) also
    require the URL path to start with that path.
    """
    host = _domain_of(url).split(":", 1)[0]
    try:
        path = urlparse(url).path.lower()
    except ValueError:
        path = ""

    for entry in FACTCHECK_DOMAINS:
        fc_host, _, fc_path = entry.partition("/")
        if host != fc_host and not host.endswith("." + fc_host):
            continue
        if not fc_path or path.startswith("/" + fc_path):
            return True

    tl = (title or "").lower()
    return any(kw in tl for kw in ("fact check", "fact-check", "debunked", "false claim", "misleading", "hoax"))
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _relevance(url: str) -> float:
    """Return the relevance score for the site hosting ``url``.

    The score comes from ``TRUSTED_DOMAINS`` when the host equals a trusted
    domain or is a subdomain of one; every other host scores ``0.5``.
    Matching is done on label boundaries rather than by substring, so spoofed
    hosts such as ``bbc.com.example.xyz`` or ``fakecnn.com`` are not treated
    as trusted.
    """
    host = _domain_of(url).split(":", 1)[0]  # drop a port if one is present
    for td, score in TRUSTED_DOMAINS.items():
        if host == td or host.endswith("." + td):
            return score
    return 0.5
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _is_high_trust(url: str) -> bool:
    """Return True when ``url`` is hosted on a domain in ``_HIGH_TRUST_DOMAINS``.

    The host must equal a high-trust domain or be a subdomain of one.
    Substring matches are rejected because this check gates the
    truth-override, and a spoofed host such as ``reuters.com.example.xyz``
    must not be able to lower a fake-probability score.
    """
    host = _domain_of(url).split(":", 1)[0]  # drop a port if one is present
    return any(host == ht or host.endswith("." + ht) for ht in _HIGH_TRUST_DOMAINS)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _compute_truth_override(
    input_text: str,
    trusted_sources: List[TrustedSource],
    current_fake_prob: float,
) -> Optional[TruthOverride]:
    """Evaluate whether a high-trust headline corroborates the input text.

    The first 512 characters of ``input_text`` and the title of every
    high-trust source are embedded with the sentence-transformer; the source
    with the highest cosine similarity is selected. When that similarity
    reaches ``_OVERRIDE_SIMILARITY_THRESHOLD``, the fake probability is
    multiplied by ``_OVERRIDE_FAKE_PROB_MULTIPLIER`` and capped at
    ``_OVERRIDE_FAKE_PROB_CAP``.

    Returns:
        A ``TruthOverride`` describing the best match (``applied`` tells
        whether the probability was adjusted), or ``None`` when there is no
        input text, no high-trust source, no model, or the computation fails.
    """
    if not input_text or not trusted_sources:
        return None

    # Only sources on high-trust domains may trigger the override.
    candidates = [src for src in trusted_sources if _is_high_trust(src.url)]
    if not candidates:
        return None

    # Lazy import + lazy load of the sentence-transformer.
    from models.model_loader import get_model_loader
    encoder = get_model_loader().load_sentence_transformer()
    if encoder is None:
        return None

    try:
        import numpy as np

        headlines = [f"{src.title}" for src in candidates]
        corpus = [input_text[:512]] + headlines

        vectors = encoder.encode(corpus, convert_to_numpy=True, normalize_embeddings=True)
        query_vec, headline_vecs = vectors[0], vectors[1:]

        # Embeddings are normalized, so the dot product is the cosine similarity.
        scores = np.dot(headline_vecs, query_vec)

        winner = int(np.argmax(scores))
        best_sim = float(scores[winner])
        best_source = candidates[winner]

        logger.info(
            f"Truth-override: best similarity={best_sim:.3f} "
            f"source={best_source.source_name} url={best_source.url}"
        )

        applied = best_sim >= _OVERRIDE_SIMILARITY_THRESHOLD
        if applied:
            adjusted_prob = min(
                current_fake_prob * _OVERRIDE_FAKE_PROB_MULTIPLIER,
                _OVERRIDE_FAKE_PROB_CAP,
            )
            logger.info(
                f"Truth-override APPLIED: fake_prob {current_fake_prob:.3f} → {adjusted_prob:.3f}"
            )
        else:
            adjusted_prob = current_fake_prob

        return TruthOverride(
            applied=applied,
            source_url=best_source.url,
            source_name=best_source.source_name,
            similarity=round(best_sim, 4),
            fake_prob_before=round(current_fake_prob, 4),
            fake_prob_after=round(adjusted_prob, 4),
        )

    except Exception as err:
        logger.warning(f"Truth-override computation failed: {err}")
        return None
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
async def _fetch(q: str, country: Optional[str]) -> list[dict]:
    """Query the news API and return its raw article dicts.

    Args:
        q: Search query string.
        country: Country code to restrict results to; defaults to ``"in"``
            when ``None`` or empty.

    Returns:
        The ``results`` list from the JSON response, or ``[]`` when the
        request fails or the response carries no results.
    """
    target_country = country or "in"
    params = {
        "apikey": settings.NEWS_API_KEY,
        "q": q,
        "language": "en",
        "size": 10,
        # Honour the caller's country instead of always querying "in".
        "country": target_country,
    }

    try:
        async with httpx.AsyncClient(timeout=8.0) as c:
            r = await c.get(settings.NEWS_API_BASE_URL, params=params)
            r.raise_for_status()
            return (r.json() or {}).get("results") or []
    except Exception as e:
        # Best-effort lookup: a failed request degrades to "no articles".
        logger.warning(f"News lookup failed: {e}")
        return []
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
async def search_news(
    keywords: List[str],
    limit: int = 6,
    country: Optional[str] = None,
) -> List[TrustedSource]:
    """Backward-compatible wrapper that returns only the trusted sources.

    Delegates to ``search_news_full`` and discards everything except its
    ``trusted_sources`` list.
    """
    lookup = await search_news_full(keywords, limit=limit, country=country)
    sources = lookup.trusted_sources
    return sources
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
async def search_news_full(
    keywords: List[str],
    limit: int = 6,
    country: Optional[str] = None,
    original_text: Optional[str] = None,
    current_fake_prob: float = 0.5,
) -> NewsLookupResult:
    """Look up news for the keywords and classify the returned articles.

    Args:
        keywords: Search keywords; only the first four are used in the query.
        limit: Maximum number of trusted sources and of contradictions returned.
        country: Country code forwarded to the news API (newsdata.io).
        original_text: Input text compared against headlines for truth-override.
        current_fake_prob: Fake probability handed to the truth-override check.

    Returns:
        A ``NewsLookupResult``; empty when no API key is configured or no
        keywords were given.
    """
    if not (settings.NEWS_API_KEY and keywords):
        return NewsLookupResult([], [], 0)

    query = " ".join(keywords[:4])
    articles = await _fetch(query, country)

    visited_urls: set[str] = set()
    sources: List[TrustedSource] = []
    fact_checks: List[ContradictingEvidence] = []

    for article in articles:
        link = article.get("link") or ""
        # Skip articles without a link and links already processed.
        if not link or link in visited_urls:
            continue
        visited_urls.add(link)

        headline = article.get("title") or ""
        publisher = article.get("source_id") or _domain_of(link) or "news"

        if _is_factcheck(link, headline):
            fact_checks.append(ContradictingEvidence(
                source_name=publisher, title=headline, url=link, type="fact_check",
            ))
        else:
            sources.append(TrustedSource(
                source_name=publisher,
                title=headline,
                url=link,
                published_at=article.get("pubDate"),
                relevance_score=_relevance(link),
            ))

    # Highest relevance first (stable for ties), then keep the top `limit`.
    ranked = sorted(sources, key=lambda src: src.relevance_score, reverse=True)[:limit]

    # ── Phase 13.2: Truth-override ──
    override = (
        _compute_truth_override(original_text, ranked, current_fake_prob)
        if original_text and ranked
        else None
    )

    return NewsLookupResult(
        trusted_sources=ranked,
        contradicting_evidence=fact_checks[:limit],
        total_articles=len(articles),
        truth_override=override,
    )
|
report.html
ADDED
|
@@ -0,0 +1,367 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html>
<head>
{# Document head: page geometry and all CSS used by the PDF report. #}
<meta charset="utf-8" />
<title>DeepShield Analysis Report — {{ analysis_id }}</title>
<style>
  @page { size: A4; margin: 16mm 18mm; }
  body { font-family: Helvetica, Arial, sans-serif; color: #1A202C; font-size: 10pt; line-height: 1.45; }

  /* ── Typography ── */
  h1 { color: #4F46E5; margin: 0 0 2pt 0; font-size: 18pt; letter-spacing: -0.3pt; }
  h2 { color: #4F46E5; margin: 14pt 0 5pt 0; font-size: 12pt; border-bottom: 1pt solid #E5E7EB; padding-bottom: 2pt; }
  h3 { margin: 10pt 0 4pt 0; font-size: 10.5pt; color: #374151; }
  .muted { color: #6B7280; font-size: 8.5pt; }

  /* ── Header / logo row ── */
  .header-table { width: 100%; border-collapse: collapse; border-bottom: 2pt solid #4F46E5; padding-bottom: 6pt; margin-bottom: 10pt; }
  .logo-cell { font-size: 22pt; font-weight: bold; color: #4F46E5; width: 1%; white-space: nowrap; padding-right: 8pt; }
  .logo-shield { color: #6366F1; }
  .meta-cell { font-size: 8.5pt; color: #6B7280; vertical-align: bottom; }

  /* ── Verdict row ── */
  .verdict-table { width: 100%; border-collapse: collapse; margin: 6pt 0 10pt 0; background: #F9FAFB; }
  .verdict-score-cell { width: 90pt; text-align: center; vertical-align: middle; padding: 8pt; }
  .score-num { font-size: 26pt; font-weight: bold; }
  .score-denom { font-size: 9pt; color: #6B7280; }
  .score.real { color: #43A047; }
  .score.warn { color: #FB8C00; }
  .score.fake { color: #E53935; }
  .verdict-detail-cell { padding: 8pt 10pt; vertical-align: middle; }
  .verdict-label { font-size: 13pt; font-weight: bold; color: #1A202C; }
  .verdict-sub { font-size: 8.5pt; color: #6B7280; margin-top: 2pt; }
  .donut-cell { width: 75pt; text-align: center; vertical-align: middle; padding: 4pt; }
  .donut-cell img { width: 72pt; }

  /* ── LLM card ── */
  .llm-box { background: #EEF2FF; border-left: 3pt solid #4F46E5; padding: 7pt 9pt; margin: 6pt 0; border-radius: 2pt; }
  .llm-para { font-size: 9.5pt; color: #1A202C; margin: 0 0 5pt 0; }
  .llm-bullets { margin: 0; padding-left: 14pt; }
  .llm-bullets li { font-size: 9pt; color: #374151; margin-bottom: 2pt; }

  /* ── Tables ── */
  table.data { width: 100%; border-collapse: collapse; margin: 5pt 0; }
  table.data th { background: #F3F4F6; color: #374151; font-size: 8.5pt; text-align: left; padding: 3pt 6pt; border-bottom: 1pt solid #E5E7EB; }
  table.data td { font-size: 9pt; padding: 3pt 6pt; border-bottom: 1pt solid #F3F4F6; vertical-align: top; }
  table.data tr:last-child td { border-bottom: none; }

  /* ── VLM breakdown ── */
  .vlm-score-bar-wrap { background: #E5E7EB; border-radius: 3pt; height: 5pt; width: 70pt; display: inline-block; vertical-align: middle; overflow: hidden; }
  .vlm-score-bar { height: 5pt; border-radius: 3pt; }
  .vlm-real { background: #43A047; }
  .vlm-warn { background: #FB8C00; }
  .vlm-fake { background: #E53935; }

  /* ── Badges ── */
  .badge { display: inline-block; padding: 1pt 5pt; border-radius: 3pt; font-size: 8pt; font-weight: bold; }
  .sev-high { background: #FEE2E2; color: #B91C1C; }
  .sev-medium { background: #FEF3C7; color: #92400E; }
  .sev-low { background: #DBEAFE; color: #1E40AF; }
  .badge-green { background: #DCFCE7; color: #166534; }
  .badge-red { background: #FEE2E2; color: #991B1B; }

  /* ── Keywords ── */
  .keyword { display: inline-block; background: #EEF2FF; color: #4F46E5; padding: 1pt 6pt; border-radius: 3pt; margin: 1pt; font-size: 8.5pt; }

  /* ── Truth-override ── */
  .truth-box { background: #DCFCE7; border-left: 3pt solid #16A34A; padding: 5pt 8pt; margin: 5pt 0; font-size: 9pt; border-radius: 2pt; }

  /* ── Footer ── */
  .footer { margin-top: 16pt; padding-top: 5pt; border-top: 1pt solid #E5E7EB; color: #9CA3AF; font-size: 8pt; }
</style>
</head>
|
| 73 |
+
<body>

{# ── Header: logo on the left, analysis id / media type / timestamp on the right ── #}
<table class="header-table">
  <tr>
    <td class="logo-cell"><span class="logo-shield">▮</span> DeepShield</td>
    <td class="meta-cell">
      Analysis Report · ID: {{ analysis_id }}<br />
      Media: <b>{{ media_type | upper }}</b> · Generated: {{ generated_at }}
    </td>
  </tr>
</table>
|
| 85 |
+
|
| 86 |
+
{# ── Verdict: score, label/severity/model line, optional donut image ── #}
<h2>Verdict</h2>
<table class="verdict-table">
  <tr>
    <td class="verdict-score-cell">
      <div class="score-num score {{ score_class }}">{{ verdict.authenticity_score }}</div>
      <div class="score-denom">/ 100</div>
    </td>
    <td class="verdict-detail-cell">
      <div class="verdict-label">{{ verdict.label }}</div>
      <div class="verdict-sub">Severity: {{ verdict.severity }}</div>
      {# default(0, true): arithmetic on a missing/None confidence would abort the whole render #}
      <div class="verdict-sub">Model: {{ verdict.model_label }} ({{ '%.1f' | format((verdict.model_confidence | default(0, true)) * 100) }}% confidence)</div>
    </td>
    {% if donut_b64 %}
    <td class="donut-cell">
      <img src="data:image/png;base64,{{ donut_b64 }}" alt="score donut" />
    </td>
    {% endif %}
  </tr>
</table>
|
| 106 |
+
|
| 107 |
+
{# ── LLM Explanation: rendered only when a summary paragraph is present ── #}
{% if llm_summary and llm_summary.paragraph %}
<h2>AI Explanation</h2>
<div class="llm-box">
  <p class="llm-para">{{ llm_summary.paragraph }}</p>
  {% if llm_summary.bullets %}
  <ul class="llm-bullets">
    {% for b in llm_summary.bullets %}<li>{{ b }}</li>{% endfor %}
  </ul>
  {% endif %}
  {% if llm_summary.model_used %}
  <div class="muted" style="margin-top:4pt;">via {{ llm_summary.model_used }}</div>
  {% endif %}
</div>
{% endif %}
|
| 122 |
+
|
| 123 |
+
{# ══════════ IMAGE ══════════ #}
{# Image-only sections: EXIF metadata, artifact indicators, VLM component breakdown. #}
{% if media_type == 'image' %}

{# EXIF #}
{% if explainability.exif %}
<h2>EXIF Metadata</h2>
<table class="data">
  <tr><th>Field</th><th>Value</th><th>Trust Signal</th></tr>
  {% if explainability.exif.make %}
  <tr><td>Camera Make</td><td>{{ explainability.exif.make }}</td><td><span class="badge badge-green">+real</span></td></tr>
  {% endif %}
  {% if explainability.exif.model %}
  <tr><td>Camera Model</td><td>{{ explainability.exif.model }}</td><td></td></tr>
  {% endif %}
  {% if explainability.exif.datetime_original %}
  <tr><td>Date Taken</td><td>{{ explainability.exif.datetime_original }}</td><td><span class="badge badge-green">+real</span></td></tr>
  {% endif %}
  {% if explainability.exif.software %}
  <tr><td>Software</td><td>{{ explainability.exif.software }}</td>
    <td>{% if 'photoshop' in explainability.exif.software | lower %}<span class="badge badge-red">+fake</span>{% endif %}</td></tr>
  {% endif %}
  {% if explainability.exif.lens_model %}
  <tr><td>Lens Model</td><td>{{ explainability.exif.lens_model }}</td><td></td></tr>
  {% endif %}
  {% if explainability.exif.gps_info %}
  <tr><td>GPS</td><td>{{ explainability.exif.gps_info }}</td><td></td></tr>
  {% endif %}
  <tr>
    <td colspan="2"><b>Trust adjustment</b></td>
    <td>
      {# Comparing a missing/None value with 0 raises in Jinja; treat it as neutral (0). #}
      {% set adj = explainability.exif.trust_adjustment | default(0, true) %}
      {% if adj > 0 %}
      <span class="badge badge-red">+{{ adj }} (fake signal)</span>
      {% elif adj < 0 %}
      <span class="badge badge-green">{{ adj }} (real signal)</span>
      {% else %}
      neutral
      {% endif %}
    </td>
  </tr>
</table>
{% endif %}

{# Artifact indicators #}
{% if explainability.artifact_indicators %}
<h2>Artifact Indicators</h2>
<table class="data">
  <tr><th>Type</th><th>Severity</th><th>Confidence</th><th>Description</th></tr>
  {% for ind in explainability.artifact_indicators %}
  <tr>
    <td>{{ ind.type }}</td>
    <td><span class="badge sev-{{ ind.severity }}">{{ ind.severity }}</span></td>
    <td>{{ '%.0f' | format((ind.confidence | default(0, true)) * 100) }}%</td>
    <td>{{ ind.description }}</td>
  </tr>
  {% endfor %}
</table>
{% else %}
<h2>Artifact Indicators</h2>
<div class="muted">No artifacts detected.</div>
{% endif %}

{# VLM Detailed Breakdown #}
{% if explainability.vlm_breakdown %}
<h2>Detailed Breakdown</h2>
{% if explainability.vlm_breakdown.model_used %}
<div class="muted" style="margin-bottom:5pt;">Scored by {{ explainability.vlm_breakdown.model_used }}</div>
{% endif %}
<table class="data">
  <tr><th>Component</th><th>Score</th><th>Bar</th><th>Notes</th></tr>
  {% set bd = explainability.vlm_breakdown %}
  {% for comp_key, comp_label in [
    ('facial_symmetry', 'Facial Symmetry'),
    ('skin_texture', 'Skin Texture'),
    ('lighting_consistency', 'Lighting Consistency'),
    ('background_coherence', 'Background Coherence'),
    ('anatomy_hands_eyes', 'Anatomy / Hands & Eyes'),
    ('context_objects', 'Context & Objects')
  ] %}
  {% set comp = bd[comp_key] %}
  {# 75 is the fallback score for an absent component; also used when the component has no score. #}
  {% set sc2 = (comp.score | default(75, true)) if comp else 75 %}
  {% set bar_cls = 'vlm-real' if sc2 >= 70 else ('vlm-warn' if sc2 >= 40 else 'vlm-fake') %}
  <tr>
    <td>{{ comp_label }}</td>
    <td><b>{{ sc2 }}</b>/100</td>
    <td>
      <span class="vlm-score-bar-wrap">
        <span class="vlm-score-bar {{ bar_cls }}" style="width:{{ sc2 }}%;display:block;"></span>
      </span>
    </td>
    <td class="muted">{{ comp.notes if comp else '' }}</td>
  </tr>
  {% endfor %}
</table>
{% endif %}

{% endif %}{# end image #}
|
| 219 |
+
|
| 220 |
+
{# ══════════ VIDEO ══════════ #}
{# Frame-level statistics table, shown for video analyses only. #}
{% if media_type == 'video' %}
<h2>Frame-Level Analysis</h2>
<table class="data">
  <tr><th>Metric</th><th>Value</th></tr>
  <tr><td>Frames sampled</td><td>{{ explainability.num_frames_sampled }}</td></tr>
  <tr><td>Frames with face</td><td>{{ explainability.num_face_frames }}</td></tr>
  <tr><td>Suspicious frames</td><td>{{ explainability.num_suspicious_frames }}</td></tr>
  {# default(0, true): multiplying a missing/None probability would abort the render #}
  <tr><td>Mean suspicious prob</td><td>{{ '%.1f' | format((explainability.mean_suspicious_prob | default(0, true)) * 100) }}%</td></tr>
  <tr><td>Max suspicious prob</td><td>{{ '%.1f' | format((explainability.max_suspicious_prob | default(0, true)) * 100) }}%</td></tr>
  <tr><td>Insufficient faces</td><td>{{ explainability.insufficient_faces }}</td></tr>
</table>
{% endif %}
|
| 233 |
+
|
| 234 |
+
{# ══════════ TEXT ══════════ #}
{# Text-only sections: language note, truth-override box, classification metrics,
   manipulation indicators and extracted keywords. #}
{% if media_type == 'text' %}

{# Language + truth-override #}
{% if explainability.detected_language and explainability.detected_language != 'en' %}
<h2>Language</h2>
<div class="muted">Detected: <b>{{ explainability.detected_language | upper }}</b> — analysed via multilingual model</div>
{% endif %}
{% if explainability.truth_override and explainability.truth_override.applied %}
{# default(0, true) on each number: arithmetic on a missing/None value would abort the render #}
<div class="truth-box">
  <b>Truth-override applied.</b>
  Corroborated by {{ explainability.truth_override.source_name }}
  ({{ '%.0f' | format((explainability.truth_override.similarity | default(0, true)) * 100) }}% similarity).
  Fake probability reduced from {{ '%.1f' | format((explainability.truth_override.fake_prob_before | default(0, true)) * 100) }}%
  to {{ '%.1f' | format((explainability.truth_override.fake_prob_after | default(0, true)) * 100) }}%.
</div>
{% endif %}

<h2>Text Classification</h2>
<table class="data">
  <tr><th>Metric</th><th>Value</th></tr>
  <tr><td>Fake probability</td><td>{{ '%.1f' | format((explainability.fake_probability | default(0, true)) * 100) }}%</td></tr>
  <tr><td>Top label</td><td>{{ explainability.top_label }}</td></tr>
  <tr><td>Sensationalism score</td><td>{{ explainability.sensationalism.score }}/100 ({{ explainability.sensationalism.level }})</td></tr>
  <tr><td>Exclamations</td><td>{{ explainability.sensationalism.exclamation_count }}</td></tr>
  <tr><td>ALL CAPS words</td><td>{{ explainability.sensationalism.caps_word_count }}</td></tr>
  <tr><td>Clickbait matches</td><td>{{ explainability.sensationalism.clickbait_matches }}</td></tr>
  <tr><td>Emotional words</td><td>{{ explainability.sensationalism.emotional_word_count }}</td></tr>
</table>

{% if explainability.manipulation_indicators %}
<h3>Manipulation Indicators ({{ explainability.manipulation_indicators | length }})</h3>
<table class="data">
  <tr><th>Pattern</th><th>Severity</th><th>Matched text</th></tr>
  {% for m in explainability.manipulation_indicators %}
  <tr>
    <td>{{ m.pattern_type }}</td>
    <td><span class="badge sev-{{ m.severity }}">{{ m.severity }}</span></td>
    <td>{{ m.matched_text }}</td>
  </tr>
  {% endfor %}
</table>
{% endif %}

{% if explainability.keywords %}
<h3>Extracted Keywords</h3>
<div>{% for kw in explainability.keywords %}<span class="keyword">{{ kw }}</span>{% endfor %}</div>
{% endif %}

{% endif %}{# end text #}
|
| 284 |
+
|
| 285 |
+
{# ══════════ SCREENSHOT ══════════ #}
{# Screenshot-only sections: language note, truth-override box, OCR text,
   summary metrics and the suspicious-phrase table. #}
{% if media_type == 'screenshot' %}

{% if explainability.detected_language and explainability.detected_language != 'en' %}
<div class="muted" style="margin-bottom:4pt;">Detected language: <b>{{ explainability.detected_language | upper }}</b></div>
{% endif %}
{% if explainability.truth_override and explainability.truth_override.applied %}
<div class="truth-box">
  <b>Truth-override applied.</b> {{ explainability.truth_override.source_name }}
  ({{ '%.0f' | format((explainability.truth_override.similarity | default(0, true)) * 100) }}% similarity)
</div>
{% endif %}

<h2>Extracted Text</h2>
{# "or []": the length filter fails on None, so a null list is counted as empty #}
<div class="muted">{{ (explainability.ocr_boxes or []) | length }} OCR regions detected</div>
<table class="data">
  <tr><td style="white-space:pre-wrap; font-size:8.5pt; padding:6pt;">{{ explainability.extracted_text }}</td></tr>
</table>

<h3>Analysis Summary</h3>
<table class="data">
  <tr><th>Metric</th><th>Value</th></tr>
  <tr><td>Fake probability</td><td>{{ '%.1f' | format((explainability.fake_probability | default(0, true)) * 100) }}%</td></tr>
  <tr><td>Sensationalism</td><td>{{ explainability.sensationalism.score }}/100 ({{ explainability.sensationalism.level }})</td></tr>
  <tr><td>Suspicious phrases</td><td>{{ (explainability.suspicious_phrases or []) | length }}</td></tr>
  <tr><td>Layout anomalies</td><td>{{ (explainability.layout_anomalies or []) | length }}</td></tr>
</table>

{% if explainability.suspicious_phrases %}
<h3>Suspicious Phrases</h3>
<table class="data">
  <tr><th>Text</th><th>Pattern</th><th>Severity</th></tr>
  {% for p in explainability.suspicious_phrases %}
  <tr>
    <td>{{ p.text }}</td>
    <td>{{ p.pattern_type }}</td>
    <td><span class="badge sev-{{ p.severity }}">{{ p.severity }}</span></td>
  </tr>
  {% endfor %}
</table>
{% endif %}

{% endif %}{# end screenshot #}
|
| 328 |
+
|
| 329 |
+
{# ══════════ SOURCES (all types) ══════════ #}
{# Cross-reference tables; each is omitted entirely when its list is empty. #}
{% if trusted_sources %}
<h2>Trusted Source Cross-Reference ({{ trusted_sources | length }})</h2>
<table class="data">
  <tr><th>Source</th><th>Title</th><th>Relevance</th></tr>
  {% for s in trusted_sources %}
  <tr>
    <td>{{ s.source_name }}</td>
    <td>{{ s.title }}</td>
    {# default(0, true): multiplying a missing/None score would abort the render #}
    <td>{{ '%.0f' | format((s.relevance_score | default(0, true)) * 100) }}%</td>
  </tr>
  {% endfor %}
</table>
{% endif %}

{% if contradicting_evidence %}
<h2 style="color:#B91C1C;">Contradicting Evidence ({{ contradicting_evidence | length }})</h2>
<table class="data">
  <tr><th>Source</th><th>Title</th><th>Type</th></tr>
  {% for c in contradicting_evidence %}
  <tr><td>{{ c.source_name }}</td><td>{{ c.title }}</td><td>{{ c.type }}</td></tr>
  {% endfor %}
</table>
{% endif %}
|
| 353 |
+
|
| 354 |
+
{# ══════════ PROCESSING ══════════ #}
{# Model name, total duration and the ordered list of completed pipeline stages. #}
<h2>Processing Summary</h2>
<div class="muted">Model: {{ processing_summary.model_used }} · Duration: {{ processing_summary.total_duration_ms }} ms</div>
<div style="font-size:8.5pt; margin-top:3pt;">{{ processing_summary.stages_completed | join(' → ') }}</div>

{# ══════════ FOOTER ══════════ #}
<div class="footer">
  <b>DeepShield Responsible-AI Notice.</b> {{ responsible_ai_notice }}<br />
  Generated {{ generated_at }}. Report expires in 1 hour.
  AI-assisted analysis — cross-check with trusted sources before sharing.
</div>

</body>
</html>
|
report_service.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import base64
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import time
|
| 7 |
+
import uuid
|
| 8 |
+
from datetime import datetime, timedelta, timezone
|
| 9 |
+
from io import BytesIO
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Any, Optional
|
| 12 |
+
|
| 13 |
+
from jinja2 import Environment, FileSystemLoader, select_autoescape
|
| 14 |
+
from loguru import logger
|
| 15 |
+
from xhtml2pdf import pisa # type: ignore
|
| 16 |
+
|
| 17 |
+
from config import settings
|
| 18 |
+
from db.models import AnalysisRecord, Report
|
| 19 |
+
|
| 20 |
+
# Directory the report templates are loaded from: a "templates" folder located
# two levels up from this file (i.e. next to this module's parent directory).
TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"

# Shared Jinja2 environment. Autoescaping is switched on for .html/.xml
# templates, so values interpolated into the report are escaped.
_env = Environment(
    loader=FileSystemLoader(str(TEMPLATES_DIR)),
    autoescape=select_autoescape(["html", "xml"]),
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _score_class(score: int) -> str:
|
| 29 |
+
if score >= 70:
|
| 30 |
+
return "real"
|
| 31 |
+
if score >= 40:
|
| 32 |
+
return "warn"
|
| 33 |
+
return "fake"
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _ensure_dir() -> Path:
    """Create the report output directory if needed and return it.

    The location comes from ``settings.REPORT_DIR``; missing parent
    directories are created and an existing directory is not an error.
    """
    report_dir = Path(settings.REPORT_DIR)
    report_dir.mkdir(parents=True, exist_ok=True)
    return report_dir
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _make_donut_chart(score: int, score_cls: str) -> str:
    """Render the authenticity score as a donut chart PNG.

    Args:
        score: Authenticity score; the filled wedge is clamped to 0-100 so an
            out-of-range value still produces a chart instead of failing.
        score_cls: One of "real" / "warn" / "fake"; selects the wedge colour
            (any other value falls back to grey).

    Returns:
        The PNG encoded as base64 text, or "" if the chart could not be
        produced (best-effort: the report is still rendered without it).
    """
    fig = None
    plt = None
    try:
        import matplotlib  # type: ignore
        matplotlib.use("Agg")  # headless backend: no display is required
        import matplotlib.pyplot as plt  # type: ignore

        color_map = {"real": "#43A047", "warn": "#FB8C00", "fake": "#E53935"}
        color = color_map.get(score_cls, "#6B7280")

        # pie() rejects negative wedge sizes, so bound the filled share.
        filled = max(0, min(100, score))

        fig, ax = plt.subplots(figsize=(2.2, 2.2), dpi=96)
        ax.pie(
            [filled, 100 - filled],
            colors=[color, "#F3F4F6"],
            startangle=90,
            wedgeprops=dict(width=0.42, edgecolor="white", linewidth=1),
        )
        ax.text(0, 0, str(score), ha="center", va="center",
                fontsize=20, fontweight="bold", color=color)
        ax.set_aspect("equal")
        plt.tight_layout(pad=0.05)

        buf = BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight", transparent=True)
        return base64.b64encode(buf.getvalue()).decode()
    except Exception as e:
        logger.debug(f"Donut chart skipped: {e}")
        return ""
    finally:
        # Always release the figure; pyplot keeps figures alive until they
        # are closed, so an error after subplots() used to leak one.
        if fig is not None and plt is not None:
            plt.close(fig)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _extract_llm_summary(analysis_json: dict) -> dict | None:
|
| 73 |
+
"""Extract llm_summary from either top-level or inside explainability (images)."""
|
| 74 |
+
top = analysis_json.get("llm_summary")
|
| 75 |
+
if top:
|
| 76 |
+
return top
|
| 77 |
+
return (analysis_json.get("explainability") or {}).get("llm_summary")
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def render_html(analysis_json: dict) -> str:
    """Render the ``report.html`` template for one analysis payload.

    Missing or null sections of ``analysis_json`` are replaced by empty
    containers so a partial payload still renders; a missing or null
    authenticity score is treated as 50.

    Args:
        analysis_json: Decoded analysis result.

    Returns:
        The rendered HTML document as a string.
    """
    # `or {}` also covers an explicit null, which `.get(key, {})` would pass
    # through and then fail on.
    verdict: dict[str, Any] = analysis_json.get("verdict") or {}
    score = verdict.get("authenticity_score")
    if score is None:
        score = 50

    sc = _score_class(score)
    donut_b64 = _make_donut_chart(score, sc)
    llm_summary = _extract_llm_summary(analysis_json)
    expl: dict[str, Any] = analysis_json.get("explainability") or {}

    tmpl = _env.get_template("report.html")
    return tmpl.render(
        analysis_id=analysis_json.get("analysis_id", ""),
        media_type=analysis_json.get("media_type", "unknown"),
        verdict=verdict,
        explainability=expl,
        trusted_sources=analysis_json.get("trusted_sources") or [],
        contradicting_evidence=analysis_json.get("contradicting_evidence") or [],
        processing_summary=analysis_json.get("processing_summary") or {},
        responsible_ai_notice=analysis_json.get(
            "responsible_ai_notice",
            "AI-based analysis may not be 100% accurate.",
        ),
        score_class=sc,
        generated_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC"),
        donut_b64=donut_b64,
        llm_summary=llm_summary,
    )
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def html_to_pdf(html: str, out_path: Path) -> None:
    """Convert an HTML document to a PDF file with xhtml2pdf.

    Args:
        html: HTML source to convert.
        out_path: Destination file; overwritten if it exists.

    Raises:
        RuntimeError: If xhtml2pdf reports conversion errors.

    On any failure the partially written output file is removed, so a broken
    PDF is never left behind for later code to pick up.
    """
    try:
        with open(out_path, "wb") as f:
            result = pisa.CreatePDF(html, dest=f)
        if result.err:
            raise RuntimeError(f"xhtml2pdf failed with {result.err} errors")
    except Exception:
        try:
            Path(out_path).unlink(missing_ok=True)
        except OSError:
            # Cleanup is best-effort; the original error is what matters.
            pass
        raise
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def generate_report(record: AnalysisRecord) -> Path:
    """Build the PDF report for a stored analysis and return its path.

    The file is written to the report directory as
    ``deepshield_<record.id>_<8 random hex chars>.pdf``; the content comes
    from ``record.result_json`` rendered through the HTML template.

    Raises:
        RuntimeError: Propagated from ``html_to_pdf`` when conversion fails.
    """
    out_path = _ensure_dir() / f"deepshield_{record.id}_{uuid.uuid4().hex[:8]}.pdf"

    payload = json.loads(record.result_json)
    html_to_pdf(render_html(payload), out_path)

    logger.info(f"Report generated id={record.id} path={out_path} size={out_path.stat().st_size}B")
    return out_path
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def create_report_row(analysis_id: int, path: Path) -> Report:
    """Build (but do not persist) the ``Report`` row for a generated PDF.

    ``expires_at`` is the current UTC time plus
    ``settings.REPORT_TTL_SECONDS``, stored as a naive datetime exactly as
    before; only the deprecated ``datetime.utcnow()`` call is replaced.
    """
    now_utc_naive = datetime.now(timezone.utc).replace(tzinfo=None)
    return Report(
        analysis_id=analysis_id,
        file_path=str(path),
        expires_at=now_utc_naive + timedelta(seconds=settings.REPORT_TTL_SECONDS),
    )
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def cleanup_expired(now: Optional[datetime] = None) -> int:
    """Delete expired PDFs from disk. Returns count deleted.

    A PDF in ``settings.REPORT_DIR`` is expired when its modification time
    is more than ``settings.REPORT_TTL_SECONDS`` before ``now``.

    Args:
        now: Reference time; defaults to the current UTC time. A naive value
            is interpreted as UTC (the convention the previous
            ``utcnow()``-based code used); an aware value is accepted too.

    Returns:
        Number of files removed. Files that cannot be inspected or removed
        are logged and skipped.
    """
    if now is None:
        now = datetime.now(timezone.utc)
    elif now.tzinfo is None:
        now = now.replace(tzinfo=timezone.utc)

    report_dir = Path(settings.REPORT_DIR)
    if not report_dir.exists():
        return 0

    ttl = timedelta(seconds=settings.REPORT_TTL_SECONDS)
    deleted = 0
    for pdf in report_dir.glob("*.pdf"):
        try:
            # Aware UTC timestamp, so the subtraction never mixes naive and
            # aware datetimes.
            mtime = datetime.fromtimestamp(pdf.stat().st_mtime, tz=timezone.utc)
            if now - mtime > ttl:
                pdf.unlink()
                deleted += 1
        except OSError as e:
            logger.warning(f"Cleanup failed for {pdf}: {e}")

    if deleted:
        logger.info(f"Cleaned up {deleted} expired reports")
    return deleted
|
requirements.txt
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.115.0
|
| 2 |
+
uvicorn[standard]==0.32.0
|
| 3 |
+
pydantic==2.9.2
|
| 4 |
+
pydantic-settings==2.6.0
|
| 5 |
+
python-multipart==0.0.12
|
| 6 |
+
python-dotenv==1.0.1
|
| 7 |
+
loguru==0.7.2
|
| 8 |
+
SQLAlchemy==2.0.35
|
| 9 |
+
psycopg2-binary==2.9.9
|
| 10 |
+
alembic==1.13.3
|
| 11 |
+
python-jose[cryptography]==3.3.0
|
| 12 |
+
bcrypt==4.2.0
|
| 13 |
+
|
| 14 |
+
# === Phase 1: Image Detection ===
|
| 15 |
+
# Install torch separately with CPU index first (see README): pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
|
| 16 |
+
torch==2.4.1
|
| 17 |
+
torchvision==0.19.1
|
| 18 |
+
transformers==4.44.2
|
| 19 |
+
Pillow>=10.4.0
|
| 20 |
+
numpy>=1.26,<3
|
| 21 |
+
opencv-python==4.10.0.84
|
| 22 |
+
grad-cam==1.5.4
|
| 23 |
+
mediapipe==0.10.14
|
| 24 |
+
|
| 25 |
+
# === Phase 12: Explainability v2 ===
|
| 26 |
+
exifread==3.0.0
|
| 27 |
+
google-generativeai>=0.3.0 # Gemini provider for LLM explainability
|
| 28 |
+
openai>=1.0.0 # OpenAI provider (alternative to Gemini)
|
| 29 |
+
|
| 30 |
+
# === Phase 14: PDF v2 donut chart ===
|
| 31 |
+
matplotlib>=3.9.0
|
| 32 |
+
|
| 33 |
+
# === Phase 13: Text Pipeline Hardening ===
|
| 34 |
+
# After installing, run: python -m spacy download en_core_web_sm
|
| 35 |
+
spacy>=3.7.0,<4.0.0
|
| 36 |
+
sentence-transformers>=2.7.0 # for truth-override cosine similarity (all-MiniLM-L6-v2)
|
| 37 |
+
langdetect==1.0.9 # lightweight language detection
|
| 38 |
+
|
| 39 |
+
# === Phase 3: Text / News ===
|
| 40 |
+
httpx==0.27.2
|
| 41 |
+
|
| 42 |
+
# === Phase 4: Screenshot / OCR ===
|
| 43 |
+
easyocr==1.7.2
|
| 44 |
+
|
| 45 |
+
# === Phase 7: PDF Reports ===
|
| 46 |
+
Jinja2==3.1.4
|
| 47 |
+
xhtml2pdf==0.2.16
|
| 48 |
+
|
| 49 |
+
# === Phase 8: Auth ===
|
| 50 |
+
email-validator==2.2.0
|
router.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter
|
| 2 |
+
|
| 3 |
+
from api.v1 import analyze, auth, health, history, report
|
| 4 |
+
|
| 5 |
+
# Aggregate router: each v1 endpoint module's router is mounted under /api/v1.
api_router = APIRouter(prefix="/api/v1")
api_router.include_router(health.router)
api_router.include_router(analyze.router)
api_router.include_router(report.router)
api_router.include_router(auth.router)
api_router.include_router(history.router)
|
scoring.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Tuple
|
| 4 |
+
|
| 5 |
+
# Verdict bands as (low, high, label, severity); both bounds are inclusive
# and the bands together cover the integers 0-100.
TRUST_SCALE = [
    (0, 20, "Very Likely Fake", "critical"),
    (21, 40, "Likely Fake", "danger"),
    (41, 60, "Possibly Manipulated", "warning"),
    (61, 80, "Likely Real", "positive"),
    (81, 100, "Very Likely Real", "safe"),
]
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def compute_authenticity_score(model_confidence: float, label: str) -> int:
    """Map (confidence, label) to 0-100 authenticity score.

    Real-ish labels give high score; fake-ish labels give low score.

    Args:
        model_confidence: Classifier confidence for ``label`` (0.0-1.0).
        label: Predicted class name; matched case-insensitively.

    Returns:
        ``confidence * 100`` for a real-ish label and ``(1 - confidence) * 100``
        for a fake-ish one, rounded and clamped to 0-100.
    """
    label_l = label.lower()

    # The long tokens are safe to match as substrings ("deepfake_v2",
    # "ai_generated"). "ai" is not: as a substring it also hits unrelated
    # labels such as "painting" or "plain", so it must be a whole word.
    fake_substrings = ("fake", "deepfake", "manipulated", "generated", "synthetic")
    words = "".join(ch if ch.isalpha() else " " for ch in label_l).split()
    is_fake = "ai" in words or any(tok in label_l for tok in fake_substrings)

    confidence = float(model_confidence)
    score = (1.0 - confidence) * 100.0 if is_fake else confidence * 100.0
    return int(round(max(0.0, min(100.0, score))))
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_verdict_label(score: int) -> Tuple[str, str]:
    """Return the ``(label, severity)`` pair for an authenticity score.

    The score is rounded and clamped to 0-100 before the lookup in
    ``TRUST_SCALE``, so fractional or out-of-range inputs land in the
    nearest band rather than falling through.

    Returns:
        The matching band's label and severity; ``("Unknown", "warning")``
        only if no band matches.
    """
    bounded = int(round(max(0, min(100, score))))
    for lo, hi, label, severity in TRUST_SCALE:
        if lo <= bounded <= hi:
            return label, severity
    return "Unknown", "warning"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def get_score_color(score: int) -> str:
    """Linear interpolate Red (#E53935) → Amber (#FFA726) → Green (#43A047).

    Args:
        score: Authenticity score; clamped to 0-100.

    Returns:
        Upper-case ``#RRGGBB`` colour: red at 0, amber at 50, green at 100.
    """
    red = (0xE5, 0x39, 0x35)
    amber = (0xFF, 0xA7, 0x26)
    green = (0x43, 0xA0, 0x47)

    def lerp(a: int, b: int, t: float) -> int:
        return int(round(a + (b - a) * t))

    score = max(0, min(100, score))
    if score <= 50:
        start, end, t = red, amber, score / 50.0
    else:
        start, end, t = amber, green, (score - 50) / 50.0

    r, g, b = (lerp(lo, hi, t) for lo, hi in zip(start, end))
    return f"#{r:02X}{g:02X}{b:02X}"
|
screenshot_service.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
from typing import List, Tuple
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
from loguru import logger
|
| 8 |
+
from PIL import Image
|
| 9 |
+
|
| 10 |
+
from models.model_loader import get_model_loader
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class OCRBox:
    """One text region returned by the OCR step."""

    text: str
    bbox: List[List[int]]  # 4 points [[x,y],...]
    confidence: float
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class SuspiciousPhrase:
    """A manipulation indicator paired with the bbox of an OCR region."""

    text: str
    bbox: List[List[int]]
    pattern_type: str
    severity: str
    description: str
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class LayoutAnomaly:
    """A heuristic layout irregularity found among OCR text boxes."""

    # Anomaly kind: misalignment / font_mismatch / uneven_spacing.
    type: str
    # Severity label for the anomaly.
    severity: str
    # Human-readable explanation of what was observed.
    description: str
    # Heuristic confidence attached to the anomaly.
    confidence: float
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def run_ocr(pil_img: Image.Image) -> List[OCRBox]:
    """Run the shared OCR engine on an image and return its text regions.

    The image is converted to RGB and handed to the engine as a NumPy array.
    Each ``(bbox, text, conf)`` result is normalised into an ``OCRBox`` with
    integer corner coordinates.

    NOTE(review): the ``readtext(..., detail=1, paragraph=False)`` call looks
    like the EasyOCR reader interface — confirm against the model loader.
    """
    engine = get_model_loader().load_ocr_engine()
    pixels = np.array(pil_img.convert("RGB"))
    detections = engine.readtext(pixels, detail=1, paragraph=False)

    out: List[OCRBox] = [
        OCRBox(
            text=str(text),
            bbox=[[int(point[0]), int(point[1])] for point in bbox],
            confidence=float(conf),
        )
        for bbox, text, conf in detections
    ]
    logger.info(f"OCR extracted {len(out)} text regions")
    return out
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def extract_full_text(boxes: List[OCRBox]) -> str:
    """Join the text of all OCR boxes with single spaces.

    Boxes whose text is empty or whitespace-only are skipped; the text of the
    remaining boxes is used as-is (it is not stripped).
    """
    pieces = []
    for box in boxes:
        if not box.text.strip():
            continue
        pieces.append(box.text)
    return " ".join(pieces)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def map_phrases_to_boxes(boxes: List[OCRBox], manipulation_indicators) -> List[SuspiciousPhrase]:
    """Attach each manipulation indicator to the first OCR box containing it.

    Matching is a case-insensitive substring test of the indicator's
    ``matched_text`` against each box's text, in box order. Indicators that
    appear in no box are dropped.
    """
    phrases: List[SuspiciousPhrase] = []
    for indicator in manipulation_indicators:
        target = indicator.matched_text.lower()
        # Only the first matching box is used for each indicator.
        host = next((box for box in boxes if target in box.text.lower()), None)
        if host is None:
            continue
        phrases.append(
            SuspiciousPhrase(
                text=indicator.matched_text,
                bbox=host.bbox,
                pattern_type=indicator.pattern_type,
                severity=indicator.severity,
                description=indicator.description,
            )
        )
    return phrases
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def detect_layout_anomalies(boxes: List[OCRBox]) -> List[LayoutAnomaly]:
    """Run heuristic layout checks on OCR bounding boxes.

    Three independent checks are applied (fewer than 3 boxes yields no result):

    * ``font_mismatch`` — coefficient of variation of box heights above 0.7.
    * ``misalignment`` — with more than 4 boxes, fewer than 40% of left edges
      lie within 20 px of the median left edge.
    * ``uneven_spacing`` — with at least 4 boxes and 3 positive gaps, the
      coefficient of variation of vertical gaps between box tops exceeds 1.5.

    Returns:
        The detected anomalies, in the order the checks are listed above.
    """
    out: List[LayoutAnomaly] = []
    if len(boxes) < 3:
        return out

    # Extract every per-box measurement in a single pass over the corner points.
    heights = []
    x_lefts = []
    tops = []
    for b in boxes:
        xs = [p[0] for p in b.bbox]
        ys = [p[1] for p in b.bbox]
        heights.append(max(ys) - min(ys))
        x_lefts.append(min(xs))
        tops.append(min(ys))

    h_arr = np.array(heights, dtype=float)
    if h_arr.mean() > 0:
        cv_h = float(h_arr.std() / h_arr.mean())
        if cv_h > 0.7:
            out.append(LayoutAnomaly(
                type="font_mismatch",
                severity="medium" if cv_h < 1.2 else "high",
                description=f"High variance in text heights (cv={cv_h:.2f}) — mixed fonts/sizes possible",
                confidence=min(cv_h / 1.5, 1.0),
            ))

    x_arr = np.array(x_lefts, dtype=float)
    if x_arr.std() > 0 and len(x_arr) > 4:
        # The median is loop-invariant: compute it once and compare all left
        # edges against it in one vectorised step.
        median_left = np.median(x_arr)
        clustered = int(np.count_nonzero(np.abs(x_arr - median_left) < 20))
        align_ratio = clustered / len(x_arr)
        if align_ratio < 0.4:
            out.append(LayoutAnomaly(
                type="misalignment",
                severity="low",
                description=f"Only {align_ratio*100:.0f}% of text blocks share left-alignment — unusual layout",
                confidence=1.0 - align_ratio,
            ))

    if len(boxes) >= 4:
        gaps = np.diff(sorted(tops))
        # Boxes sharing the same top (same text line) produce zero gaps; ignore them.
        gaps = gaps[gaps > 0]
        if len(gaps) >= 3 and gaps.mean() > 0:
            cv_g = float(gaps.std() / gaps.mean())
            if cv_g > 1.5:
                out.append(LayoutAnomaly(
                    type="uneven_spacing",
                    severity="low",
                    description=f"Irregular vertical spacing between text blocks (cv={cv_g:.2f})",
                    confidence=min(cv_g / 2.5, 1.0),
                ))

    return out
|
test_image_classify.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Phase 1.2 smoke test: download a sample image and run the ViT classifier.
|
| 2 |
+
|
| 3 |
+
Run from backend/:
|
| 4 |
+
.venv/Scripts/python.exe scripts/test_image_classify.py
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
import urllib.request
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
| 13 |
+
|
| 14 |
+
import base64
|
| 15 |
+
|
| 16 |
+
from models.heatmap_generator import generate_heatmap_base64
|
| 17 |
+
from services.artifact_detector import scan_artifacts
|
| 18 |
+
from services.image_service import preprocess_and_classify
|
| 19 |
+
from utils.scoring import compute_authenticity_score, get_verdict_label
|
| 20 |
+
|
| 21 |
+
SAMPLE_URL = "https://picsum.photos/seed/deepshield/512/512"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def main() -> int:
    """Smoke-test the image pipeline end to end on one downloaded sample.

    Downloads ``SAMPLE_URL``, classifies the image, derives the authenticity
    score and verdict, prints artifact indicators, and writes the heatmap
    (decoded from its data URL) to ``heatmap_smoketest.png`` in the parent
    directory of this script's folder.

    Returns:
        0, so the result can be passed straight to ``SystemExit``.
    """
    print(f"Fetching sample image: {SAMPLE_URL}")
    request = urllib.request.Request(SAMPLE_URL, headers={"User-Agent": "DeepShield/0.1"})
    with urllib.request.urlopen(request, timeout=30) as response:
        data = response.read()
    print(f" got {len(data)} bytes")

    print("Running classifier (first run will download model ~350MB)…")
    pil, result = preprocess_and_classify(data)
    print(f" image size: {pil.size}")
    print(f" label: {result.label}")
    print(f" confidence: {result.confidence:.4f}")
    print(f" all scores: {result.all_scores}")

    score = compute_authenticity_score(result.confidence, result.label)
    verdict_label, severity = get_verdict_label(score)
    print(f"\n authenticity_score: {score}")
    print(f" verdict: {verdict_label} ({severity})")

    print("\nScanning artifact indicators\u2026")
    for ind in scan_artifacts(pil, data):
        print(f" [{ind.severity.upper():6s}] {ind.type}: {ind.description} (conf {ind.confidence:.2f})")

    print("\nGenerating Grad-CAM heatmap\u2026")
    heatmap_url = generate_heatmap_base64(pil)
    # The data URL is "<header>,<base64 payload>"; only the payload is decoded.
    _header, encoded = heatmap_url.split(",", 1)
    out_path = Path(__file__).resolve().parent.parent / "heatmap_smoketest.png"
    out_path.write_bytes(base64.b64decode(encoded))
    print(f" saved: {out_path}")
    print(f" data URL length: {len(heatmap_url)} chars")
    return 0
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
if __name__ == "__main__":
|
| 58 |
+
raise SystemExit(main())
|
test_news_api.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Test script for the NewsData API integration."""
|
| 2 |
+
import asyncio
|
| 3 |
+
import sys
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
# Add backend directory to sys.path so we can import modules
|
| 7 |
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
| 8 |
+
|
| 9 |
+
from config import settings
|
| 10 |
+
from services.news_lookup import search_news_full
|
| 11 |
+
|
| 12 |
+
async def test_news():
    """Manually exercise the news lookup against the live API and print the results.

    Prints a masked prefix of the configured API key, then searches a fixed
    keyword list and lists trusted sources and fact-check articles. Any
    exception raised by the lookup or while printing is reported, not re-raised.
    """
    # Normalise an unset key to "" before slicing, so a missing key reaches
    # the explicit error message below instead of failing on the slice.
    api_key = settings.NEWS_API_KEY or ""
    print(f"Testing News API Integration with key: {api_key[:6]}... (masked)")

    if not api_key:
        print("ERROR: NEWS_API_KEY is empty in .env")
        return

    keywords = ["modi", "election", "bjp", "congress"]
    print(f"Searching for keywords: {keywords}")

    try:
        result = await search_news_full(keywords, limit=5)

        print("\n=== RAW RESULT ===")
        print(f"Total articles found: {result.total_articles}")

        print("\n=== TRUSTED SOURCES ===")
        for i, source in enumerate(result.trusted_sources, 1):
            date_str = str(source.published_at)[:10] if source.published_at else "Unknown date"
            print(f"{i}. [{source.relevance_score}] {source.source_name}: {source.title[:60]}... ({date_str})")

        print("\n=== CONTRADICTING EVIDENCE / FACT CHECKS ===")
        if not result.contradicting_evidence:
            print("No fact-check articles found for these keywords.")
        for i, ev in enumerate(result.contradicting_evidence, 1):
            print(f"{i}. {ev.source_name}: {ev.title[:60]}...")

    except Exception as e:
        # Deliberately broad: this is a diagnostic script, so report and exit cleanly.
        print(f"\nERROR running test: {e}")
|
| 41 |
+
|
| 42 |
+
if __name__ == "__main__":
|
| 43 |
+
asyncio.run(test_news())
|
test_phase5.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Phase 5 smoke: unit-test news_lookup classification + endpoint wiring."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import asyncio
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
| 9 |
+
|
| 10 |
+
from services.news_lookup import (
|
| 11 |
+
_domain_of, _is_factcheck, _relevance, search_news_full,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_domain():
    """Check that ``_domain_of`` returns the bare domain for sample article URLs."""
    expected = {
        "https://www.reuters.com/article/x": "reuters.com",
        "https://snopes.com/fact-check/abc": "snopes.com",
    }
    for url, domain in expected.items():
        assert _domain_of(url) == domain
    print("[OK] _domain_of")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_factcheck_detection():
    """Check ``_is_factcheck`` on fact-check URLs/titles and on an ordinary news item."""
    flagged = [
        ("https://snopes.com/x", "Claim about moon"),
        ("https://factly.in/x", ""),
        ("https://example.com/x", "FACT CHECK: viral video debunked"),
    ]
    for url, title in flagged:
        assert _is_factcheck(url, title)
    assert not _is_factcheck("https://bbc.com/news/world-123", "Election results")
    print("[OK] _is_factcheck")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_relevance():
    """Check the relevance weights ``_relevance`` assigns to three sample URLs."""
    weights = [
        ("https://reuters.com/x", 1.0),
        ("https://ndtv.com/x", 0.85),
        ("https://random-blog.xyz/x", 0.5),
    ]
    for url, weight in weights:
        assert _relevance(url) == weight
    print("[OK] _relevance weights")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
async def test_empty_key_returns_empty():
    """Expect ``search_news_full`` to return an empty result set.

    NOTE(review): per the function name this presumes no news API key is
    configured in the environment — confirm before running.
    """
    outcome = await search_news_full(["modi", "election"])
    assert outcome.trusted_sources == []
    assert outcome.contradicting_evidence == []
    assert outcome.total_articles == 0
    print(f"[OK] empty-key path -> {outcome}")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
async def test_endpoint_wiring():
    """POST a sensational sample to the local ``/analyze/text`` endpoint and check the response shape.

    Requires the backend to be listening on 127.0.0.1:8000.
    """
    import httpx

    payload = {"text": "BREAKING!!! You won't BELIEVE this SHOCKING miracle cure doctors don't want you to know!!! Click now!"}
    async with httpx.AsyncClient(timeout=180.0) as client:
        response = await client.post("http://127.0.0.1:8000/api/v1/analyze/text", json=payload)
        response.raise_for_status()
        j = response.json()

    assert j["media_type"] == "text"
    for key in ("trusted_sources", "contradicting_evidence"):
        assert key in j
    assert "news_lookup" in j["processing_summary"]["stages_completed"]
    print(f"[OK] /analyze/text -> verdict={j['verdict']['label']} "
          f"score={j['verdict']['authenticity_score']} "
          f"trusted={len(j['trusted_sources'])} contradictions={len(j['contradicting_evidence'])}")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
async def main():
    """Run every Phase 5 check in order: the synchronous unit checks, then the async ones."""
    for check in (test_domain, test_factcheck_detection, test_relevance):
        check()
    for async_check in (test_empty_key_returns_empty, test_endpoint_wiring):
        await async_check()
    print("\n=== Phase 5 smoke PASS ===")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
if __name__ == "__main__":
|
| 70 |
+
asyncio.run(main())
|
test_text_analysis.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Quick smoke test for sensationalism + manipulation detection."""
|
| 2 |
+
import sys
|
| 3 |
+
sys.path.insert(0, ".")
|
| 4 |
+
|
| 5 |
+
from services.text_service import score_sensationalism, detect_manipulation_indicators
|
| 6 |
+
|
| 7 |
+
# --- Sensationalism: a loud, clickbait-style headline should score high ---
loud_text = "BREAKING: You wont believe this SHOCKING truth! Experts confirm the most DEVASTATING scandal exposed!!!"
loud = score_sensationalism(loud_text)
print(f"Sensationalism: score={loud.score} level={loud.level}")
print(f" excl={loud.exclamation_count} caps={loud.caps_word_count} clickbait={loud.clickbait_matches} emotional={loud.emotional_word_count} superlative={loud.superlative_count}")
assert loud.score > 50, f"Expected high sensationalism, got {loud.score}"
assert loud.level in ("Medium", "High"), f"Expected Medium/High, got {loud.level}"
print(" PASS")

# --- Manipulation: vague attributions and appeals should be flagged ---
vague_text = "Sources say that experts confirm the shocking truth. Allegedly, everyone knows this is a proven fact."
found = detect_manipulation_indicators(vague_text)
print(f"\nManipulation indicators: {len(found)} found")
for indicator in found:
    print(f" [{indicator.severity}] {indicator.pattern_type}: \"{indicator.matched_text}\"")
assert len(found) >= 3, f"Expected >=3 indicators, got {len(found)}"
print(" PASS")

# --- Clean text: a neutral sentence should trigger neither detector ---
neutral_text = "The weather today is sunny with clear skies in New Delhi."
calm = score_sensationalism(neutral_text)
calm_hits = detect_manipulation_indicators(neutral_text)
print(f"\nClean text: sensationalism={calm.score} ({calm.level}), manipulation={len(calm_hits)}")
assert calm.score < 20, f"Expected low sensationalism for clean text, got {calm.score}"
assert len(calm_hits) == 0, f"Expected 0 manipulation indicators for clean text, got {len(calm_hits)}"
print(" PASS")

print("\nAll tests passed!")
|
text_service.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from dataclasses import dataclass, field
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
|
| 7 |
+
from loguru import logger
|
| 8 |
+
|
| 9 |
+
from models.model_loader import get_model_loader
|
| 10 |
+
|
| 11 |
+
FAKE_TOKENS = ("fake", "false", "unreliable", "misinformation")
|
| 12 |
+
|
| 13 |
+
# --- Sensationalism patterns ---
|
| 14 |
+
CLICKBAIT_PATTERNS = [
|
| 15 |
+
(r"\byou won'?t believe\b", "clickbait"),
|
| 16 |
+
(r"\bbreaking\s*:", "clickbait"),
|
| 17 |
+
(r"\bshocking\s*:", "clickbait"),
|
| 18 |
+
(r"\bexclusive\s*:", "clickbait"),
|
| 19 |
+
(r"\bjust\s+in\s*:", "clickbait"),
|
| 20 |
+
(r"\burgent\s*:", "clickbait"),
|
| 21 |
+
(r"\bwhat\s+happens\s+next\b", "clickbait"),
|
| 22 |
+
(r"\bthis\s+will\s+change\b", "clickbait"),
|
| 23 |
+
(r"\b(?:everyone|nobody)\s+(?:is|was)\s+talking\b", "clickbait"),
|
| 24 |
+
]
|
| 25 |
+
EMOTIONAL_WORDS = {
|
| 26 |
+
"outrage", "shocking", "horrifying", "disgusting", "amazing", "incredible",
|
| 27 |
+
"unbelievable", "devastating", "terrifying", "explosive", "bombshell",
|
| 28 |
+
"jaw-dropping", "heartbreaking", "furious", "scandal", "crisis",
|
| 29 |
+
"chaos", "destroyed", "slammed", "blasted", "exposed", "revealed",
|
| 30 |
+
}
|
| 31 |
+
SUPERLATIVES = {
|
| 32 |
+
"best", "worst", "greatest", "biggest", "most", "least",
|
| 33 |
+
"fastest", "deadliest", "largest", "smallest", "ultimate",
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
# --- Manipulation indicator patterns ---
|
| 37 |
+
# Each entry is (regex, pattern_type, severity, description). The regexes are
# applied case-insensitively by detect_manipulation_indicators.
MANIPULATION_PATTERNS = [
    # Unverified claims
    (r"\bsources?\s+(?:say|said|claim|report)\b", "unverified_claim", "medium",
     "Unverified source attribution without specific citation"),
    (r"\ballegedly\b", "unverified_claim", "low",
     "Hedging language suggests unverified information"),
    (r"\breports?\s+suggest\b", "unverified_claim", "medium",
     "Vague report attribution"),
    (r"\baccording\s+to\s+(?:some|many|several)\b", "unverified_claim", "medium",
     "Non-specific source attribution"),
    # Was r"\brunconfirmed\b": a stray "r" inside the raw string meant the
    # word "unconfirmed" could never match.
    (r"\bunconfirmed\b", "unverified_claim", "medium",
     "Explicitly unconfirmed information"),
    # Emotional manipulation
    (r"\boutrage\b", "emotional_manipulation", "medium",
     "Emotional trigger word designed to provoke reaction"),
    (r"\bshocking\s+truth\b", "emotional_manipulation", "high",
     "Sensationalist phrase designed to manipulate reader emotion"),
    (r"\bwake\s+up\b", "emotional_manipulation", "medium",
     "Call-to-action implying hidden knowledge"),
    (r"\bthey\s+don'?t\s+want\s+you\s+to\s+know\b", "emotional_manipulation", "high",
     "Conspiracy framing language"),
    (r"\bopen\s+your\s+eyes\b", "emotional_manipulation", "medium",
     "Implies audience ignorance"),
    # False authority
    (r"\bexperts?\s+(?:confirm|say|agree|warn)\b", "false_authority", "medium",
     "Unnamed expert citation without specific attribution"),
    (r"\bscientists?\s+(?:confirm|prove|say)\b", "false_authority", "medium",
     "Unnamed scientist citation"),
    (r"\bstudies?\s+(?:show|prove|confirm)\b", "false_authority", "low",
     "Vague study reference without citation"),
    (r"\beveryone\s+knows\b", "false_authority", "medium",
     "Appeal to common knowledge fallacy"),
    (r"\bit'?s\s+(?:a\s+)?(?:well-?known|proven)\s+fact\b", "false_authority", "medium",
     "Assertion of fact without evidence"),
]
|
| 72 |
+
|
| 73 |
+
# NER entity labels to prefer for keyword extraction
|
| 74 |
+
_NER_PREFERRED = {"PERSON", "ORG", "GPE", "EVENT", "PRODUCT", "NORP"}
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
@dataclass
class TextClassification:
    """Result of classifying a piece of text as fake or real."""

    # Highest-scoring label from the classifier.
    label: str
    # Score of that highest-scoring label.
    confidence: float
    # Probability attributed to the "fake" class.
    fake_prob: float
    # Score for every label the classifier returned.
    all_scores: dict[str, float]
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
@dataclass
class SensationalismResult:
    """Sensationalism score together with the raw signal counts behind it."""

    # Overall score, 0-100.
    score: int
    # Bucketed level: Low / Medium / High.
    level: str
    # Number of "!" characters in the text.
    exclamation_count: int
    # Number of all-caps words longer than two characters.
    caps_word_count: int
    # Number of clickbait patterns that matched.
    clickbait_matches: int
    # Number of words found in the emotional-word list.
    emotional_word_count: int
    # Number of words found in the superlative list.
    superlative_count: int
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@dataclass
class ManipulationIndicator:
    """One manipulation pattern match located in a text."""

    # Category: unverified_claim / emotional_manipulation / false_authority.
    pattern_type: str
    # The exact substring that matched.
    matched_text: str
    # Start offset of the match in the text.
    start_pos: int
    # End offset of the match in the text.
    end_pos: int
    # Severity: low / medium / high.
    severity: str
    # Human-readable explanation of the pattern.
    description: str
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def detect_language(text: str) -> str:
    """Return the primary language of ``text`` as reported by langdetect.

    Falls back to ``'en'`` when the stripped text is shorter than 10
    characters, when langdetect is not installed, or when detection raises.
    """
    fallback = "en"
    stripped = (text or "").strip()
    if len(stripped) < 10:
        # Too little text to attempt detection on.
        return fallback

    try:
        # Imported lazily so the module still loads without langdetect.
        from langdetect import detect  # type: ignore
        lang = detect(stripped)
        logger.info(f"Language detected: {lang}")
        return lang
    except ImportError:
        logger.debug("langdetect not installed — defaulting to 'en'")
    except Exception as e:
        logger.debug(f"Language detection failed: {e} — defaulting to 'en'")
    return fallback
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _scores_to_classification(items) -> TextClassification:
    """Build a ``TextClassification`` from a list of ``{"label", "score"}`` dicts.

    The fake probability is the ``LABEL_0`` score when that label is present;
    otherwise it is the highest score among labels whose lower-cased name
    contains one of ``FAKE_TOKENS`` (0.0 if there are none).
    """
    scores = {}
    for item in items:
        scores[item["label"]] = float(item["score"])

    top_label = max(scores, key=scores.get)
    top_conf = scores[top_label]

    if "LABEL_0" in scores:
        fake_prob = scores["LABEL_0"]
    else:
        fake_like = [
            prob
            for label, prob in scores.items()
            if any(token in label.lower() for token in FAKE_TOKENS)
        ]
        fake_prob = max(fake_like, default=0.0)

    return TextClassification(top_label, top_conf, fake_prob, scores)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def classify_text(text: str, language: Optional[str] = None) -> TextClassification:
    """Classify text as fake/real with the appropriate pipeline.

    The multilingual model is used when ``language`` is given and is not
    ``"en"``; otherwise the default text model is used. Only the first 2000
    characters of the stripped text are sent to the pipeline.

    Returns:
        The classification, or an ``"unknown"`` result with zero scores when
        the text is empty or whitespace-only.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return TextClassification("unknown", 0.0, 0.0, {})

    loader = get_model_loader()
    wants_multilang = bool(language) and language != "en"
    pipe = loader.load_multilang_text_model() if wants_multilang else loader.load_text_model()

    raw = pipe(cleaned[:2000], truncation=True, top_k=None)
    # The pipeline output may be nested one level deep; unwrap it if so.
    first = raw[0]
    items = first if isinstance(first, list) else raw

    clf = _scores_to_classification(items)
    logger.info(
        f"Text classify [{language or 'en'}] → {clf.label} @ {clf.confidence:.3f} "
        f"fake_p={clf.fake_prob:.3f}"
    )
    return clf
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def score_sensationalism(text: str) -> SensationalismResult:
    """Score how sensational a text reads on a 0-100 scale.

    Five capped components are added together: exclamation marks (max 25),
    share of all-caps words (max 25), clickbait pattern hits (max 25),
    emotional words (max 15) and superlatives (max 10). The level is "Low"
    below 30, "Medium" below 60, and "High" otherwise.
    """
    if not text:
        return SensationalismResult(0, "Low", 0, 0, 0, 0, 0)

    tokens = text.split()
    token_total = max(len(tokens), 1)
    # Lower-cased tokens with surrounding punctuation removed, for word-list lookups.
    bare_tokens = [tok.lower().strip(".,!?;:") for tok in tokens]

    excl = text.count("!")
    caps = sum(1 for tok in tokens if tok.isupper() and len(tok) > 2)
    clickbait = 0
    for pattern, _ in CLICKBAIT_PATTERNS:
        if re.search(pattern, text, re.IGNORECASE):
            clickbait += 1
    emotional = sum(1 for word in bare_tokens if word in EMOTIONAL_WORDS)
    superlative = sum(1 for word in bare_tokens if word in SUPERLATIVES)

    excl_part = min(excl * 8, 25)
    caps_part = min(caps / token_total * 200, 25)
    clickbait_part = min(clickbait * 12, 25)
    emotional_part = min(emotional * 6, 15)
    superlative_part = min(superlative * 5, 10)
    raw = excl_part + caps_part + clickbait_part + emotional_part + superlative_part

    score = int(min(100, max(0, raw)))
    if score < 30:
        level = "Low"
    elif score < 60:
        level = "Medium"
    else:
        level = "High"

    logger.info(f"Sensationalism → {score} ({level}) excl={excl} caps={caps} cb={clickbait} emo={emotional}")
    return SensationalismResult(score, level, excl, caps, clickbait, emotional, superlative)
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def detect_manipulation_indicators(text: str) -> List[ManipulationIndicator]:
    """Find every manipulation-pattern match in ``text``.

    Each pattern in ``MANIPULATION_PATTERNS`` is searched case-insensitively.

    Returns:
        One indicator per match, ordered by start offset; an empty list for
        empty input.
    """
    if not text:
        return []

    matches: List[ManipulationIndicator] = [
        ManipulationIndicator(
            pattern_type=ptype,
            matched_text=hit.group(),
            start_pos=hit.start(),
            end_pos=hit.end(),
            severity=severity,
            description=description,
        )
        for pattern, ptype, severity, description in MANIPULATION_PATTERNS
        for hit in re.finditer(pattern, text, re.IGNORECASE)
    ]
    # Stable sort: matches sharing a start offset keep their pattern order.
    indicators = sorted(matches, key=lambda ind: ind.start_pos)
    logger.info(f"Manipulation indicators → {len(indicators)} found")
    return indicators
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def extract_entities(text: str, max_k: int = 6) -> List[str]:
    """Return up to ``max_k`` keywords, preferring spaCy named entities.

    Entities whose label is in ``_NER_PREFERRED`` come first, then other
    entities, de-duplicated case-insensitively. With fewer than two entities
    the list is topped up with frequency keywords. Frequency extraction alone
    is used when the stripped text is under 20 characters, spaCy is
    unavailable, or the NER step raises.
    """
    if not text or len(text.strip()) < 20:
        return _extract_keywords_freq(text, max_k)

    nlp = get_model_loader().load_spacy_nlp()
    if nlp is None:
        # spaCy not available — use frequency fallback
        return _extract_keywords_freq(text, max_k)

    try:
        doc = nlp(text[:5000])  # cap for performance

        preferred: List[str] = []
        other: List[str] = []
        seen: set[str] = set()

        for ent in doc.ents:
            name = ent.text.strip()
            key = name.lower()
            if not name or key in seen or len(name) < 2:
                continue
            seen.add(key)
            bucket = preferred if ent.label_ in _NER_PREFERRED else other
            bucket.append(name)

        entities = preferred + other

        if len(entities) < 2:
            # Not enough entities — supplement with frequency keywords
            extras = [kw for kw in _extract_keywords_freq(text, max_k) if kw.lower() not in seen]
            return (entities + extras)[:max_k]

        logger.info(f"NER extracted {len(entities)} entities: {entities[:max_k]}")
        return entities[:max_k]

    except Exception as e:
        logger.warning(f"spaCy NER failed: {e} — falling back to frequency extraction")
        return _extract_keywords_freq(text, max_k)
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
def _extract_keywords_freq(text: str, max_k: int = 6) -> List[str]:
|
| 267 |
+
"""Frequency-based keyword extraction (original implementation, kept as fallback)."""
|
| 268 |
+
stop = {
|
| 269 |
+
"the","a","an","is","are","was","were","be","been","being","to","of","and","or","but",
|
| 270 |
+
"in","on","at","for","with","by","from","as","that","this","it","its","has","have","had",
|
| 271 |
+
"will","would","can","could","should","may","might","do","does","did","not","no","so",
|
| 272 |
+
"than","then","there","their","they","them","we","our","you","your","he","she","his","her",
|
| 273 |
+
}
|
| 274 |
+
words = re.findall(r"[A-Za-z][A-Za-z\-']{2,}", text or "")
|
| 275 |
+
freq: dict[str, int] = {}
|
| 276 |
+
for w in words:
|
| 277 |
+
wl = w.lower()
|
| 278 |
+
if wl in stop:
|
| 279 |
+
continue
|
| 280 |
+
freq[wl] = freq.get(wl, 0) + 1
|
| 281 |
+
return [w for w, _ in sorted(freq.items(), key=lambda kv: (-kv[1], kv[0]))[:max_k]]
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
# Back-compat alias: routes that still call extract_keywords get NER-first behaviour
|
| 285 |
+
extract_keywords = extract_entities
|
v1/__init__.py
ADDED
|
File without changes
|
v1/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (165 Bytes). View file
|
|
|
v1/__pycache__/analyze.cpython-311.pyc
ADDED
|
Binary file (21.6 kB). View file
|
|
|
v1/__pycache__/auth.cpython-311.pyc
ADDED
|
Binary file (3.82 kB). View file
|
|
|
v1/__pycache__/health.cpython-311.pyc
ADDED
|
Binary file (556 Bytes). View file
|
|
|
v1/__pycache__/history.cpython-311.pyc
ADDED
|
Binary file (5.19 kB). View file
|
|
|