Spaces:

ar07xd
/

deepshield

Running

App Files Files Community

deepshield / config.py

ar07xd

Sync from GitHub via hub-sync

36529c1 verified 5 days ago

raw

history blame contribute delete

13.2 kB

	import json
	import secrets
	from urllib.parse import parse_qsl, urlencode
	from typing import Any
	from pydantic import field_validator, model_validator
	from pydantic_settings import BaseSettings, SettingsConfigDict


	def _parse_list_env(value: Any, default: list[str]) -> list[str]:
	"""Accept list env values as JSON, CSV, single-value string, or native list."""
	if value is None:
	return default

	if isinstance(value, str):
	raw = value.strip()
	if not raw:
	return default

	if raw.startswith("["):
	try:
	parsed = json.loads(raw)
	if isinstance(parsed, list):
	cleaned = [str(i).strip() for i in parsed if str(i).strip()]
	return cleaned or default
	except json.JSONDecodeError:
	# Fall back to CSV parsing if JSON is malformed.
	pass

	cleaned = [i.strip() for i in raw.split(",") if i.strip()]
	return cleaned or default

	if isinstance(value, list):
	cleaned = [str(i).strip() for i in value if str(i).strip()]
	return cleaned or default

	return default


	def _normalize_origin(origin: str) -> str:
	"""Normalize CORS origin values to avoid strict mismatch (e.g. trailing slash)."""
	cleaned = origin.strip()
	if cleaned.startswith(("http://", "https://")):
	cleaned = cleaned.rstrip("/")
	return cleaned


	def _fix_postgres_url(raw: str) -> str:
	"""Normalize common Postgres URL mistakes from deployment envs.

	- Converts postgres:// to postgresql://
	- Encodes stray '@' in credentials (usually from unescaped passwords)
	- Ensures sslmode=require for Supabase URLs when missing
	"""
	url = raw.strip()
	if url.startswith("postgres://"):
	url = "postgresql://" + url[len("postgres://") :]

	if not url.startswith("postgresql://"):
	return url

	# Split scheme + authority/path safely without full URL parsing.
	rest = url[len("postgresql://") :]
	if "@" in rest:
	userinfo, remainder = rest.rsplit("@", 1)
	# Any '@' left in userinfo belongs to credentials and must be percent-encoded.
	userinfo = userinfo.replace("@", "%40")
	url = "postgresql://" + userinfo + "@" + remainder

	if "supabase.co" in url:
	if "?" in url:
	base, query = url.split("?", 1)
	params = dict(parse_qsl(query, keep_blank_values=True))
	if "sslmode" not in params:
	params["sslmode"] = "require"
	url = base + "?" + urlencode(params)
	else:
	url = url + "?sslmode=require"

	return url


	class Settings(BaseSettings):
	    """Application configuration read through pydantic-settings.

	    Values come from the environment and from a ``.env`` file (see
	    ``model_config`` at the bottom of the class); keys that do not match a
	    field are ignored. List-valued fields that must accept CSV or
	    single-value strings are annotated ``Any`` and normalized by a
	    ``mode="before"`` validator, because pydantic-settings JSON-decodes
	    ``list``-annotated env values before field validators run.
	    """

	    # Server
	    APP_HOST: str = "0.0.0.0"
	    APP_PORT: int = 8000
	    DEBUG: bool = False
	    CORS_ORIGINS: Any = ["http://localhost:5173"]

	    @model_validator(mode="before")
	    @classmethod
	    def drop_blank_values(cls, data: Any) -> Any:
	        """Treat blank env vars as unset so defaults apply instead of parse errors."""
	        if isinstance(data, dict):
	            return {
	                k: v
	                for k, v in data.items()
	                if not (isinstance(v, str) and not v.strip())
	            }
	        return data

	    @field_validator("CORS_ORIGINS", mode="before")
	    @classmethod
	    def assemble_cors_origins(cls, v: Any) -> list[str]:
	        """Parse CORS_ORIGINS from string (JSON or comma-separated) into a list."""
	        origins = _parse_list_env(v, default=["http://localhost:5173"])
	        # Normalize each origin once, then drop any that end up empty.
	        normalized = [o for o in map(_normalize_origin, origins) if o]
	        return normalized or ["http://localhost:5173"]

	    # Database
	    DATABASE_URL: str = "sqlite:///./deepshield.db"

	    @field_validator("DATABASE_URL", mode="before")
	    @classmethod
	    def normalize_database_url(cls, v: Any) -> str:
	        """Support common HF-style postgres URL aliases and blank values.

	        ``None`` and blank strings fall back to the local SQLite URL; other
	        strings go through ``_fix_postgres_url``; non-strings are stringified.
	        """
	        if v is None:
	            return "sqlite:///./deepshield.db"
	        if isinstance(v, str):
	            raw = v.strip()
	            if not raw:
	                return "sqlite:///./deepshield.db"
	            return _fix_postgres_url(raw)
	        return str(v)

	    # File Upload
	    MAX_UPLOAD_SIZE_MB: int = 100
	    UPLOAD_DIR: str = "/data/uploads"
	    # Typed Any (like CORS_ORIGINS) so CSV / single-value env strings reach the
	    # "before" validators below instead of failing JSON decoding at the env
	    # source; the validators always return list[str].
	    ALLOWED_IMAGE_TYPES: Any = ["image/jpeg", "image/png", "image/webp"]
	    ALLOWED_VIDEO_TYPES: Any = ["video/mp4", "video/avi", "video/mov", "video/webm"]
	    FILE_RETENTION_SECONDS: int = 300

	    # AI Models
	    IMAGE_MODEL_ID: str = "prithivMLmods/Deep-Fake-Detector-v2-Model"
	    GENERAL_IMAGE_MODEL_ID: str = "umm-maybe/AI-image-detector"
	    # Phase C1/C2: second AI-image head specialised on diffusion/GAN output.
	    # Ensembled with the general detector before feeding face-present fusion.
	    # Set to "" to disable (falls back to general detector only).
	    DIFFUSION_IMAGE_MODEL_ID: str = "haywoodsloan/ai-image-detector-deploy"
	    DIFFUSION_MODEL_ENABLED: bool = True
	    # Blend weights for the two-head general ensemble (must sum ≤ 1.0).
	    # When only one head is available the available head gets full weight.
	    GENERAL_AI_WEIGHT: float = 0.45
	    DIFFUSION_AI_WEIGHT: float = 0.55
	    # Temperature scaling for each detector head (> 1.0 = softer probabilities,
	    # < 1.0 = sharper). 1.0 = no scaling. Tune after running run_image_eval.py.
	    GENERAL_MODEL_TEMPERATURE: float = 1.0
	    DIFFUSION_MODEL_TEMPERATURE: float = 1.0
	    TEXT_MODEL_ID: str = "jy46604790/Fake-News-Bert-Detect"
	    # Multilingual text model for non-English (Hindi etc.). Leave empty to fall back to TEXT_MODEL_ID.
	    TEXT_MULTILANG_MODEL_ID: str = ""
	    DEVICE: str = "cpu"
	    PRELOAD_MODELS: bool = True  # preload models at startup

	    # Phase 13: OCR language list (comma-separated ISO codes, e.g. "en,hi")
	    OCR_LANGS: str = "en,hi"

	    # News API
	    NEWS_API_KEY: str = ""
	    NEWS_API_BASE_URL: str = "https://newsdata.io/api/1/latest"
	    NEWS_API_ARCHIVE_BASE_URL: str = "https://newsdata.io/api/1/archive"
	    NEWS_API_LANGUAGES: str = "en,hi"
	    NEWS_API_RECENT_TIMEFRAME: str = "1"
	    NEWS_API_OLDER_DAYS: int = 7
	    NEWS_API_PAGE_SIZE: int = 10
	    NEWS_API_PRIMARY_COUNTRY: str = "in"

	    # Reports
	    REPORT_DIR: str = "/data/reports"
	    REPORT_TTL_SECONDS: int = 3600  # 1h expiry
	    PUBLIC_APP_URL: str = ""
	    # Public backend origin or API base URL used for third-party OAuth callbacks.
	    # Examples: http://localhost:8000, https://api.example.com, https://api.example.com/api/v1
	    PUBLIC_API_URL: str = ""

	    # Phase 19 — dedup cache + object storage
	    CACHE_TTL_DAYS: int = 30
	    MEDIA_ROOT: str = "/data/media"
	    MEDIA_SIGNED_URL_TTL_SECONDS: int = 3600

	    # LLM Explainability (Phase 12)
	    LLM_PROVIDER: str = "gemini"  # "gemini" | "openai"
	    LLM_API_KEY: str = ""
	    LLM_MODEL: str = "gemini-2.0-flash"  # 2.0-flash: fastest response, no thinking overhead, best for real-time summaries.

	    # LLM fallback — Groq (Llama 3.3 70B by default). Used automatically when the
	    # primary provider returns 429/quota exceeded. Leave empty to disable fallback.
	    GROQ_API_KEY: str = ""
	    GROQ_MODEL: str = "llama-3.3-70b-versatile"

	    # EfficientNet (ICPR2020 / DeepShield1 merge)
	    EFFICIENTNET_MODEL: str = "EfficientNetAutoAttB4"
	    EFFICIENTNET_TRAIN_DB: str = "DFDC"
	    ENSEMBLE_MODE: bool = True  # run both ViT + EfficientNet and average scores

	    # Phase 11.3: FFPP-fine-tuned ViT. Path is resolved relative to the repo root.
	    # The checkpoint lives at <repo_root>/trained_models/ (the `trained_models/` dir
	    # at the project root, alongside `backend/` and `frontend/`).
	    FFPP_MODEL_PATH: str = "trained_models"
	    # Optional: pull FFPP checkpoint from Hugging Face Hub when local checkpoint
	    # is missing (keeps large model files out of GitHub source repo).
	    FFPP_MODEL_REPO_ID: str = ""
	    FFPP_MODEL_REVISION: str = "main"
	    FFPP_BASE_PROCESSOR_ID: str = "google/vit-base-patch16-224-in21k"
	    FFPP_ENABLED: bool = True
	    # DenseNet121 face-GAN specialist (in-house trained on 140k Kaggle dataset).
	    # Loaded from a TF-free PyTorch checkpoint converted via convert_densenet_keras_to_pt.py.
	    DENSENET_ENABLED: bool = True
	    # Path to .pt checkpoint, resolved relative to repo root (or absolute).
	    DENSENET_MODEL_PATH: str = "backend/trained_models/densenet121_faces.pt"
	    DENSENET_META_PATH: str = "backend/trained_models/densenet121_faces_meta.json"
	    # HF Space fallback when local checkpoint is absent.
	    DENSENET_HF_REPO_ID: str = "ar07xd/deepshield"
	    DENSENET_HF_REVISION: str = "main"

	    # Ensemble weights — DenseNet leads because it is trained on still-image GAN
	    # faces (the dominant upload type). FFPP / EffNet are stronger on video frames.
	    # Face-stack internal weights (sum = 1.0).
	    DENSENET_WEIGHT_FACE: float = 0.45
	    FFPP_WEIGHT_FACE: float = 0.25
	    VIT_WEIGHT_FACE: float = 0.15
	    EFFNET_WEIGHT_FACE: float = 0.15
	    # Video-frame path: FFPP leads since FFPP is trained on FF++ video frames.
	    DENSENET_VIDEO_WEIGHT: float = 0.10
	    VIDEO_FFPP_WEIGHT_FACE: float = 0.50
	    VIDEO_EFFNET_WEIGHT_FACE: float = 0.30
	    VIDEO_VIT_WEIGHT_FACE: float = 0.10
	    FFPP_WEIGHT_NOFACE: float = 0.35
	    VIT_WEIGHT_NOFACE: float = 0.65

	    # Face-present unified evidence weights (Phase A2/A3).
	    # face_stack = composite of FFPP+ViT+EffNet (all face-swap models).
	    # general = generic AI-image detector (diffusion/GAN whole-image cues).
	    # forensics = artifact scanner output.
	    # exif = camera-metadata trust signal.
	    # vlm = VLM consistency breakdown (optional).
	    FACE_STACK_WEIGHT_FACE: float = 0.40
	    GENERAL_WEIGHT_FACE: float = 0.40
	    FORENSICS_WEIGHT_FACE: float = 0.10
	    EXIF_WEIGHT_FACE: float = 0.05
	    VLM_WEIGHT_FACE: float = 0.05

	    # No-face evidence weights (existing behavior preserved).
	    NOFACE_GENERAL_WEIGHT: float = 0.60
	    NOFACE_FORENSICS_WEIGHT: float = 0.20
	    NOFACE_EXIF_WEIGHT: float = 0.10
	    NOFACE_VLM_WEIGHT: float = 0.10

	    # Hard gating thresholds (Phase A4). When the general detector is highly
	    # confident the image is synthetic, or strong GAN artifacts are present,
	    # the final fake probability is floored at GATING_FAKE_FLOOR (0.50 maps to
	    # authenticity score 50, i.e. cannot land in "Likely Real" or above).
	    GENERAL_FAKE_GATING_THRESHOLD: float = 0.80
	    GAN_ARTIFACT_GATING_THRESHOLD: float = 0.70
	    GATING_FAKE_FLOOR: float = 0.50
	    # Synthetic still-image overrides. FaceForensics/DFDC models are trained for
	    # manipulated video faces, so they should not veto a strong still-image AI
	    # detector on generated portraits.
	    SYNTHETIC_STILL_HIGH_THRESHOLD: float = 0.80
	    SYNTHETIC_STILL_HIGH_FLOOR: float = 0.80
	    SYNTHETIC_STILL_VERY_HIGH_THRESHOLD: float = 0.90
	    SYNTHETIC_STILL_VERY_HIGH_FLOOR: float = 0.90

	    # Video-frame weight overrides. When an image is detected as a low-res
	    # video frame (face-swap deepfakes are extracted from video), the general
	    # AI-image detector is unreliable (it's trained on synthesised stills, not
	    # video face-swaps). We shift weight strongly toward the face-swap-trained
	    # models (FFPP / EfficientNet) in that case.
	    VIDEO_FRAME_FACE_STACK_WEIGHT: float = 0.55
	    VIDEO_FRAME_GENERAL_WEIGHT: float = 0.30
	    VIDEO_FRAME_FORENSICS_WEIGHT: float = 0.10
	    VIDEO_FRAME_EXIF_WEIGHT: float = 0.05
	    # Per-frame video detector blend. FFPP ViT is trained on FaceForensics++
	    # face forgery frames, so it is the dominant signal for video analysis.
	    VIDEO_FFPP_WEIGHT: float = 0.70
	    VIDEO_EFFNET_WEIGHT: float = 0.30
	    VIDEO_SAMPLE_FRAMES: int = 32  # frames to sample per video for inference
	    EXIFTOOL_PATH: str = ""  # full path to ExifTool binary; empty = metadata write disabled

	    # Auth
	    JWT_SECRET_KEY: str = ""
	    JWT_SECRET_KEY_GENERATED: bool = False
	    JWT_ALGORITHM: str = "HS256"
	    JWT_EXPIRATION_MINUTES: int = 1440
	    GOOGLE_CLIENT_ID: str = ""
	    GOOGLE_CLIENT_SECRET: str = ""
	    GITHUB_CLIENT_ID: str = ""
	    GITHUB_CLIENT_SECRET: str = ""

	    @model_validator(mode="after")
	    def ensure_jwt_secret(self):
	        """Make sure a JWT signing key exists, generating one when unset.

	        Sets ``JWT_SECRET_KEY_GENERATED`` to ``True`` when a random key was
	        generated here and to ``False`` when a configured key was kept.
	        """
	        if self.JWT_SECRET_KEY:
	            self.JWT_SECRET_KEY_GENERATED = False
	        else:
	            # The DEBUG and non-DEBUG paths were byte-identical, so they are
	            # collapsed into one.
	            # NOTE(review): the generated key exists only on this instance;
	            # presumably tokens signed with it stop validating after a
	            # restart — confirm whether non-DEBUG should fail fast instead.
	            self.JWT_SECRET_KEY = secrets.token_urlsafe(48)
	            self.JWT_SECRET_KEY_GENERATED = True
	        return self

	    @field_validator("ALLOWED_IMAGE_TYPES", mode="before")
	    @classmethod
	    def assemble_allowed_image_types(cls, v: Any) -> list[str]:
	        """Parse ALLOWED_IMAGE_TYPES from JSON, CSV, or a native list."""
	        return _parse_list_env(v, default=["image/jpeg", "image/png", "image/webp"])

	    @field_validator("ALLOWED_VIDEO_TYPES", mode="before")
	    @classmethod
	    def assemble_allowed_video_types(cls, v: Any) -> list[str]:
	        """Parse ALLOWED_VIDEO_TYPES from JSON, CSV, or a native list."""
	        return _parse_list_env(v, default=["video/mp4", "video/avi", "video/mov", "video/webm"])

	    model_config = SettingsConfigDict(env_file=".env", extra="ignore")


	# Module-level settings instance, constructed once when this module is imported.
	settings = Settings()