# deepshield / config.py
# ar07xd · Sync from GitHub via hub-sync · 36529c1 (verified)
import json
import secrets
from urllib.parse import parse_qsl, urlencode
from typing import Any
from pydantic import field_validator, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
def _parse_list_env(value: Any, default: list[str]) -> list[str]:
"""Accept list env values as JSON, CSV, single-value string, or native list."""
if value is None:
return default
if isinstance(value, str):
raw = value.strip()
if not raw:
return default
if raw.startswith("["):
try:
parsed = json.loads(raw)
if isinstance(parsed, list):
cleaned = [str(i).strip() for i in parsed if str(i).strip()]
return cleaned or default
except json.JSONDecodeError:
# Fall back to CSV parsing if JSON is malformed.
pass
cleaned = [i.strip() for i in raw.split(",") if i.strip()]
return cleaned or default
if isinstance(value, list):
cleaned = [str(i).strip() for i in value if str(i).strip()]
return cleaned or default
return default
def _normalize_origin(origin: str) -> str:
"""Normalize CORS origin values to avoid strict mismatch (e.g. trailing slash)."""
cleaned = origin.strip()
if cleaned.startswith(("http://", "https://")):
cleaned = cleaned.rstrip("/")
return cleaned
def _fix_postgres_url(raw: str) -> str:
"""Normalize common Postgres URL mistakes from deployment envs.
- Converts postgres:// to postgresql://
- Encodes stray '@' in credentials (usually from unescaped passwords)
- Ensures sslmode=require for Supabase URLs when missing
"""
url = raw.strip()
if url.startswith("postgres://"):
url = "postgresql://" + url[len("postgres://") :]
if not url.startswith("postgresql://"):
return url
# Split scheme + authority/path safely without full URL parsing.
rest = url[len("postgresql://") :]
if "@" in rest:
userinfo, remainder = rest.rsplit("@", 1)
# Any '@' left in userinfo belongs to credentials and must be percent-encoded.
userinfo = userinfo.replace("@", "%40")
url = "postgresql://" + userinfo + "@" + remainder
if "supabase.co" in url:
if "?" in url:
base, query = url.split("?", 1)
params = dict(parse_qsl(query, keep_blank_values=True))
if "sslmode" not in params:
params["sslmode"] = "require"
url = base + "?" + urlencode(params)
else:
url = url + "?sslmode=require"
return url
class Settings(BaseSettings):
    """Application configuration read from environment variables and ``.env``.

    Blank string values are discarded before validation so the declared
    defaults apply, and the list-like and database-URL fields are normalised
    by the validators defined alongside them.
    """

    # Server
    APP_HOST: str = "0.0.0.0"
    APP_PORT: int = 8000
    DEBUG: bool = False
    CORS_ORIGINS: Any = ["http://localhost:5173"]

    @model_validator(mode="before")
    @classmethod
    def drop_blank_values(cls, data: Any) -> Any:
        """Treat blank env vars as unset so defaults apply instead of parse errors."""
        if isinstance(data, dict):
            return {
                k: v
                for k, v in data.items()
                if not (isinstance(v, str) and not v.strip())
            }
        return data

    @field_validator("CORS_ORIGINS", mode="before")
    @classmethod
    def assemble_cors_origins(cls, v: Any) -> list[str]:
        """Parse CORS_ORIGINS from string (JSON or comma-separated) into a list."""
        origins = _parse_list_env(v, default=["http://localhost:5173"])
        # Normalise each origin once, then drop any that end up empty.
        normalized = [origin for origin in map(_normalize_origin, origins) if origin]
        return normalized or ["http://localhost:5173"]

    # Database
    DATABASE_URL: str = "sqlite:///./deepshield.db"

    @field_validator("DATABASE_URL", mode="before")
    @classmethod
    def normalize_database_url(cls, v: Any) -> str:
        """Support common HF-style postgres URL aliases and blank values."""
        if v is None:
            return "sqlite:///./deepshield.db"
        if isinstance(v, str):
            raw = v.strip()
            if not raw:
                return "sqlite:///./deepshield.db"
            return _fix_postgres_url(raw)
        return str(v)

    # File Upload
    MAX_UPLOAD_SIZE_MB: int = 100
    UPLOAD_DIR: str = "/data/uploads"
    # Declared as Any (like CORS_ORIGINS) so a CSV or single-value env string
    # reaches the "before" validators below: pydantic-settings JSON-decodes env
    # values for fields annotated with a complex type such as list[str] before
    # field validators run. The validators always return list[str].
    ALLOWED_IMAGE_TYPES: Any = ["image/jpeg", "image/png", "image/webp"]
    ALLOWED_VIDEO_TYPES: Any = ["video/mp4", "video/avi", "video/mov", "video/webm"]
    FILE_RETENTION_SECONDS: int = 300

    # AI Models
    IMAGE_MODEL_ID: str = "prithivMLmods/Deep-Fake-Detector-v2-Model"
    GENERAL_IMAGE_MODEL_ID: str = "umm-maybe/AI-image-detector"
    # Phase C1/C2: second AI-image head specialised on diffusion/GAN output.
    # Ensembled with the general detector before feeding face-present fusion.
    # NOTE: a blank value coming from the environment is discarded by
    # drop_blank_values, so setting this to "" there does not override the
    # default. DIFFUSION_MODEL_ENABLED is presumably the switch for falling
    # back to the general detector only (confirm against the model loader).
    DIFFUSION_IMAGE_MODEL_ID: str = "haywoodsloan/ai-image-detector-deploy"
    DIFFUSION_MODEL_ENABLED: bool = True
    # Blend weights for the two-head general ensemble (must sum ≤ 1.0).
    # When only one head is available the available head gets full weight.
    GENERAL_AI_WEIGHT: float = 0.45
    DIFFUSION_AI_WEIGHT: float = 0.55
    # Temperature scaling for each detector head (> 1.0 = softer probabilities,
    # < 1.0 = sharper). 1.0 = no scaling. Tune after running run_image_eval.py.
    GENERAL_MODEL_TEMPERATURE: float = 1.0
    DIFFUSION_MODEL_TEMPERATURE: float = 1.0
    TEXT_MODEL_ID: str = "jy46604790/Fake-News-Bert-Detect"
    # Multilingual text model for non-English (Hindi etc.). Leave empty to fall back to TEXT_MODEL_ID.
    TEXT_MULTILANG_MODEL_ID: str = ""
    DEVICE: str = "cpu"
    PRELOAD_MODELS: bool = True  # preload models at startup
    # Phase 13: OCR language list (comma-separated ISO codes, e.g. "en,hi")
    OCR_LANGS: str = "en,hi"

    # News API
    NEWS_API_KEY: str = ""
    NEWS_API_BASE_URL: str = "https://newsdata.io/api/1/latest"
    NEWS_API_ARCHIVE_BASE_URL: str = "https://newsdata.io/api/1/archive"
    NEWS_API_LANGUAGES: str = "en,hi"
    NEWS_API_RECENT_TIMEFRAME: str = "1"
    NEWS_API_OLDER_DAYS: int = 7
    NEWS_API_PAGE_SIZE: int = 10
    NEWS_API_PRIMARY_COUNTRY: str = "in"

    # Reports
    REPORT_DIR: str = "/data/reports"
    REPORT_TTL_SECONDS: int = 3600  # 1h expiry
    PUBLIC_APP_URL: str = ""
    # Public backend origin or API base URL used for third-party OAuth callbacks.
    # Examples: http://localhost:8000, https://api.example.com, https://api.example.com/api/v1
    PUBLIC_API_URL: str = ""

    # Phase 19 — dedup cache + object storage
    CACHE_TTL_DAYS: int = 30
    MEDIA_ROOT: str = "/data/media"
    MEDIA_SIGNED_URL_TTL_SECONDS: int = 3600

    # LLM Explainability (Phase 12)
    LLM_PROVIDER: str = "gemini"  # "gemini" | "openai"
    LLM_API_KEY: str = ""
    LLM_MODEL: str = "gemini-2.0-flash"  # 2.0-flash: fastest response, no thinking overhead, best for real-time summaries.
    # LLM fallback — Groq (Llama 3.3 70B by default). Used automatically when the
    # primary provider returns 429/quota exceeded. Leave empty to disable fallback.
    GROQ_API_KEY: str = ""
    GROQ_MODEL: str = "llama-3.3-70b-versatile"

    # EfficientNet (ICPR2020 / DeepShield1 merge)
    EFFICIENTNET_MODEL: str = "EfficientNetAutoAttB4"
    EFFICIENTNET_TRAIN_DB: str = "DFDC"
    ENSEMBLE_MODE: bool = True  # run both ViT + EfficientNet and average scores

    # Phase 11.3: FFPP-fine-tuned ViT. Path is resolved relative to the repo root.
    # The checkpoint lives at <repo_root>/trained_models/ (the `trained_models/` dir
    # at the project root, alongside `backend/` and `frontend/`).
    FFPP_MODEL_PATH: str = "trained_models"
    # Optional: pull FFPP checkpoint from Hugging Face Hub when local checkpoint
    # is missing (keeps large model files out of GitHub source repo).
    FFPP_MODEL_REPO_ID: str = ""
    FFPP_MODEL_REVISION: str = "main"
    FFPP_BASE_PROCESSOR_ID: str = "google/vit-base-patch16-224-in21k"
    FFPP_ENABLED: bool = True

    # DenseNet121 face-GAN specialist (in-house trained on 140k Kaggle dataset).
    # Loaded from a TF-free PyTorch checkpoint converted via convert_densenet_keras_to_pt.py.
    DENSENET_ENABLED: bool = True
    # Path to .pt checkpoint, resolved relative to repo root (or absolute).
    DENSENET_MODEL_PATH: str = "backend/trained_models/densenet121_faces.pt"
    DENSENET_META_PATH: str = "backend/trained_models/densenet121_faces_meta.json"
    # HF Space fallback when local checkpoint is absent.
    DENSENET_HF_REPO_ID: str = "ar07xd/deepshield"
    DENSENET_HF_REVISION: str = "main"

    # Ensemble weights — DenseNet leads because it is trained on still-image GAN
    # faces (the dominant upload type). FFPP / EffNet are stronger on video frames.
    # Face-stack internal weights (sum = 1.0).
    DENSENET_WEIGHT_FACE: float = 0.45
    FFPP_WEIGHT_FACE: float = 0.25
    VIT_WEIGHT_FACE: float = 0.15
    EFFNET_WEIGHT_FACE: float = 0.15
    # Video-frame path: FFPP leads since FFPP is trained on FF++ video frames.
    DENSENET_VIDEO_WEIGHT: float = 0.10
    VIDEO_FFPP_WEIGHT_FACE: float = 0.50
    VIDEO_EFFNET_WEIGHT_FACE: float = 0.30
    VIDEO_VIT_WEIGHT_FACE: float = 0.10
    FFPP_WEIGHT_NOFACE: float = 0.35
    VIT_WEIGHT_NOFACE: float = 0.65

    # Face-present unified evidence weights (Phase A2/A3).
    # face_stack = composite of FFPP+ViT+EffNet (all face-swap models).
    # general = generic AI-image detector (diffusion/GAN whole-image cues).
    # forensics = artifact scanner output.
    # exif = camera-metadata trust signal.
    # vlm = VLM consistency breakdown (optional).
    FACE_STACK_WEIGHT_FACE: float = 0.40
    GENERAL_WEIGHT_FACE: float = 0.40
    FORENSICS_WEIGHT_FACE: float = 0.10
    EXIF_WEIGHT_FACE: float = 0.05
    VLM_WEIGHT_FACE: float = 0.05
    # No-face evidence weights (existing behavior preserved).
    NOFACE_GENERAL_WEIGHT: float = 0.60
    NOFACE_FORENSICS_WEIGHT: float = 0.20
    NOFACE_EXIF_WEIGHT: float = 0.10
    NOFACE_VLM_WEIGHT: float = 0.10

    # Hard gating thresholds (Phase A4). When the general detector is highly
    # confident the image is synthetic, or strong GAN artifacts are present,
    # the final fake probability is floored at GATING_FAKE_FLOOR (0.50 maps to
    # authenticity score 50, i.e. cannot land in "Likely Real" or above).
    GENERAL_FAKE_GATING_THRESHOLD: float = 0.80
    GAN_ARTIFACT_GATING_THRESHOLD: float = 0.70
    GATING_FAKE_FLOOR: float = 0.50

    # Synthetic still-image overrides. FaceForensics/DFDC models are trained for
    # manipulated video faces, so they should not veto a strong still-image AI
    # detector on generated portraits.
    SYNTHETIC_STILL_HIGH_THRESHOLD: float = 0.80
    SYNTHETIC_STILL_HIGH_FLOOR: float = 0.80
    SYNTHETIC_STILL_VERY_HIGH_THRESHOLD: float = 0.90
    SYNTHETIC_STILL_VERY_HIGH_FLOOR: float = 0.90

    # Video-frame weight overrides. When an image is detected as a low-res
    # video frame (face-swap deepfakes are extracted from video), the general
    # AI-image detector is unreliable (it's trained on synthesised stills, not
    # video face-swaps). We shift weight strongly toward the face-swap-trained
    # models (FFPP / EfficientNet) in that case.
    VIDEO_FRAME_FACE_STACK_WEIGHT: float = 0.55
    VIDEO_FRAME_GENERAL_WEIGHT: float = 0.30
    VIDEO_FRAME_FORENSICS_WEIGHT: float = 0.10
    VIDEO_FRAME_EXIF_WEIGHT: float = 0.05

    # Per-frame video detector blend. FFPP ViT is trained on FaceForensics++
    # face forgery frames, so it is the dominant signal for video analysis.
    VIDEO_FFPP_WEIGHT: float = 0.70
    VIDEO_EFFNET_WEIGHT: float = 0.30
    VIDEO_SAMPLE_FRAMES: int = 32  # frames to sample per video for inference
    EXIFTOOL_PATH: str = ""  # full path to ExifTool binary; empty = metadata write disabled

    # Auth
    JWT_SECRET_KEY: str = ""
    JWT_SECRET_KEY_GENERATED: bool = False
    JWT_ALGORITHM: str = "HS256"
    JWT_EXPIRATION_MINUTES: int = 1440
    GOOGLE_CLIENT_ID: str = ""
    GOOGLE_CLIENT_SECRET: str = ""
    GITHUB_CLIENT_ID: str = ""
    GITHUB_CLIENT_SECRET: str = ""

    @model_validator(mode="after")
    def ensure_jwt_secret(self):
        """Guarantee a signing key and record whether it was auto-generated.

        When JWT_SECRET_KEY is empty a fresh random key is generated, whatever
        the value of DEBUG. Such a key differs on every instantiation, so
        JWT_SECRET_KEY_GENERATED is set to let callers detect that case.
        """
        generated = not self.JWT_SECRET_KEY
        if generated:
            self.JWT_SECRET_KEY = secrets.token_urlsafe(48)
        self.JWT_SECRET_KEY_GENERATED = generated
        return self

    @field_validator("ALLOWED_IMAGE_TYPES", mode="before")
    @classmethod
    def assemble_allowed_image_types(cls, v: Any) -> list[str]:
        """Parse ALLOWED_IMAGE_TYPES from JSON, CSV, or a native list."""
        return _parse_list_env(v, default=["image/jpeg", "image/png", "image/webp"])

    @field_validator("ALLOWED_VIDEO_TYPES", mode="before")
    @classmethod
    def assemble_allowed_video_types(cls, v: Any) -> list[str]:
        """Parse ALLOWED_VIDEO_TYPES from JSON, CSV, or a native list."""
        return _parse_list_env(v, default=["video/mp4", "video/avi", "video/mov", "video/webm"])

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")
# Module-level settings instance, constructed once when this module is imported.
settings = Settings()