# Spaces: ar07xd / deepshield (Running)
# File size: 13,161 Bytes

import json
import secrets
from urllib.parse import parse_qsl, urlencode
from typing import Any
from pydantic import field_validator, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict


def _parse_list_env(value: Any, default: list[str]) -> list[str]:
    """Accept list env values as JSON, CSV, single-value string, or native list."""
    if value is None:
        return default

    if isinstance(value, str):
        raw = value.strip()
        if not raw:
            return default

        if raw.startswith("["):
            try:
                parsed = json.loads(raw)
                if isinstance(parsed, list):
                    cleaned = [str(i).strip() for i in parsed if str(i).strip()]
                    return cleaned or default
            except json.JSONDecodeError:
                # Fall back to CSV parsing if JSON is malformed.
                pass

        cleaned = [i.strip() for i in raw.split(",") if i.strip()]
        return cleaned or default

    if isinstance(value, list):
        cleaned = [str(i).strip() for i in value if str(i).strip()]
        return cleaned or default

    return default


def _normalize_origin(origin: str) -> str:
    """Normalize CORS origin values to avoid strict mismatch (e.g. trailing slash)."""
    cleaned = origin.strip()
    if cleaned.startswith(("http://", "https://")):
        cleaned = cleaned.rstrip("/")
    return cleaned


def _fix_postgres_url(raw: str) -> str:
    """Normalize common Postgres URL mistakes from deployment envs.

    - Converts postgres:// to postgresql://
    - Encodes stray '@' in credentials (usually from unescaped passwords)
    - Ensures sslmode=require for Supabase URLs when missing
    """
    url = raw.strip()
    if url.startswith("postgres://"):
        url = "postgresql://" + url[len("postgres://") :]

    if not url.startswith("postgresql://"):
        return url

    # Split scheme + authority/path safely without full URL parsing.
    rest = url[len("postgresql://") :]
    if "@" in rest:
        userinfo, remainder = rest.rsplit("@", 1)
        # Any '@' left in userinfo belongs to credentials and must be percent-encoded.
        userinfo = userinfo.replace("@", "%40")
        url = "postgresql://" + userinfo + "@" + remainder

    if "supabase.co" in url:
        if "?" in url:
            base, query = url.split("?", 1)
            params = dict(parse_qsl(query, keep_blank_values=True))
            if "sslmode" not in params:
                params["sslmode"] = "require"
            url = base + "?" + urlencode(params)
        else:
            url = url + "?sslmode=require"

    return url


class Settings(BaseSettings):
    """Application configuration populated through pydantic-settings.

    Values are read from the environment and from a ``.env`` file (see
    ``model_config`` at the bottom of the class); unknown keys are ignored.
    Blank string values are dropped before validation so the declared
    defaults apply.
    """

    # Server
    APP_HOST: str = "0.0.0.0"
    APP_PORT: int = 8000
    DEBUG: bool = False
    CORS_ORIGINS: Any = ["http://localhost:5173"]

    @model_validator(mode="before")
    @classmethod
    def drop_blank_values(cls, data: Any) -> Any:
        """Treat blank env vars as unset so defaults apply instead of parse errors."""
        if isinstance(data, dict):
            return {
                k: v
                for k, v in data.items()
                if not (isinstance(v, str) and not v.strip())
            }
        return data

    @field_validator("CORS_ORIGINS", mode="before")
    @classmethod
    def assemble_cors_origins(cls, v: Any) -> list[str]:
        """Parse CORS_ORIGINS from string (JSON or comma-separated) into a list.

        Each origin is normalized exactly once; entries that normalize to an
        empty string are dropped, and the localhost default is used when
        nothing remains.
        """
        origins = _parse_list_env(v, default=["http://localhost:5173"])
        normalized = [origin for origin in map(_normalize_origin, origins) if origin]
        return normalized or ["http://localhost:5173"]

    # Database
    DATABASE_URL: str = "sqlite:///./deepshield.db"

    @field_validator("DATABASE_URL", mode="before")
    @classmethod
    def normalize_database_url(cls, v: Any) -> str:
        """Support common HF-style postgres URL aliases and blank values."""
        if v is None:
            return "sqlite:///./deepshield.db"
        if isinstance(v, str):
            raw = v.strip()
            if not raw:
                return "sqlite:///./deepshield.db"
            return _fix_postgres_url(raw)
        return str(v)

    # File Upload
    MAX_UPLOAD_SIZE_MB: int = 100
    UPLOAD_DIR: str = "/data/uploads"
    ALLOWED_IMAGE_TYPES: list[str] = ["image/jpeg", "image/png", "image/webp"]
    ALLOWED_VIDEO_TYPES: list[str] = ["video/mp4", "video/avi", "video/mov", "video/webm"]
    FILE_RETENTION_SECONDS: int = 300

    # AI Models
    IMAGE_MODEL_ID: str = "prithivMLmods/Deep-Fake-Detector-v2-Model"
    GENERAL_IMAGE_MODEL_ID: str = "umm-maybe/AI-image-detector"
    # Phase C1/C2: second AI-image head specialised on diffusion/GAN output.
    # Ensembled with the general detector before feeding face-present fusion.
    # Set to "" to disable (falls back to general detector only).
    DIFFUSION_IMAGE_MODEL_ID: str = "haywoodsloan/ai-image-detector-deploy"
    DIFFUSION_MODEL_ENABLED: bool = True
    # Blend weights for the two-head general ensemble (must sum ≤ 1.0).
    # When only one head is available the available head gets full weight.
    GENERAL_AI_WEIGHT: float = 0.45
    DIFFUSION_AI_WEIGHT: float = 0.55
    # Temperature scaling for each detector head (> 1.0 = softer probabilities,
    # < 1.0 = sharper). 1.0 = no scaling. Tune after running run_image_eval.py.
    GENERAL_MODEL_TEMPERATURE: float = 1.0
    DIFFUSION_MODEL_TEMPERATURE: float = 1.0
    TEXT_MODEL_ID: str = "jy46604790/Fake-News-Bert-Detect"
    # Multilingual text model for non-English (Hindi etc.). Leave empty to fall back to TEXT_MODEL_ID.
    TEXT_MULTILANG_MODEL_ID: str = ""
    DEVICE: str = "cpu"
    PRELOAD_MODELS: bool = True  # preload models at startup

    # Phase 13: OCR language list (comma-separated ISO codes, e.g. "en,hi")
    OCR_LANGS: str = "en,hi"

    # News API
    NEWS_API_KEY: str = ""
    NEWS_API_BASE_URL: str = "https://newsdata.io/api/1/latest"
    NEWS_API_ARCHIVE_BASE_URL: str = "https://newsdata.io/api/1/archive"
    NEWS_API_LANGUAGES: str = "en,hi"
    NEWS_API_RECENT_TIMEFRAME: str = "1"
    NEWS_API_OLDER_DAYS: int = 7
    NEWS_API_PAGE_SIZE: int = 10
    NEWS_API_PRIMARY_COUNTRY: str = "in"

    # Reports
    REPORT_DIR: str = "/data/reports"
    REPORT_TTL_SECONDS: int = 3600  # 1h expiry
    PUBLIC_APP_URL: str = ""
    # Public backend origin or API base URL used for third-party OAuth callbacks.
    # Examples: http://localhost:8000, https://api.example.com, https://api.example.com/api/v1
    PUBLIC_API_URL: str = ""

    # Phase 19 — dedup cache + object storage
    CACHE_TTL_DAYS: int = 30
    MEDIA_ROOT: str = "/data/media"
    MEDIA_SIGNED_URL_TTL_SECONDS: int = 3600

    # LLM Explainability (Phase 12)
    LLM_PROVIDER: str = "gemini"  # "gemini" | "openai"
    LLM_API_KEY: str = ""
    LLM_MODEL: str = "gemini-2.0-flash"  # 2.0-flash: fastest response, no thinking overhead, best for real-time summaries.

    # LLM fallback — Groq (Llama 3.3 70B by default). Used automatically when the
    # primary provider returns 429/quota exceeded. Leave empty to disable fallback.
    GROQ_API_KEY: str = ""
    GROQ_MODEL: str = "llama-3.3-70b-versatile"

    # EfficientNet (ICPR2020 / DeepShield1 merge)
    EFFICIENTNET_MODEL: str = "EfficientNetAutoAttB4"
    EFFICIENTNET_TRAIN_DB: str = "DFDC"
    ENSEMBLE_MODE: bool = True  # run both ViT + EfficientNet and average scores

    # Phase 11.3: FFPP-fine-tuned ViT. Path is resolved relative to the repo root.
    # The checkpoint lives at <repo_root>/trained_models/ (the `trained_models/` dir
    # at the project root, alongside `backend/` and `frontend/`).
    FFPP_MODEL_PATH: str = "trained_models"
    # Optional: pull FFPP checkpoint from Hugging Face Hub when local checkpoint
    # is missing (keeps large model files out of GitHub source repo).
    FFPP_MODEL_REPO_ID: str = ""
    FFPP_MODEL_REVISION: str = "main"
    FFPP_BASE_PROCESSOR_ID: str = "google/vit-base-patch16-224-in21k"
    FFPP_ENABLED: bool = True
    # DenseNet121 face-GAN specialist (in-house trained on 140k Kaggle dataset).
    # Loaded from a TF-free PyTorch checkpoint converted via convert_densenet_keras_to_pt.py.
    DENSENET_ENABLED: bool = True
    # Path to .pt checkpoint, resolved relative to repo root (or absolute).
    DENSENET_MODEL_PATH: str = "backend/trained_models/densenet121_faces.pt"
    DENSENET_META_PATH:  str = "backend/trained_models/densenet121_faces_meta.json"
    # HF Space fallback when local checkpoint is absent.
    DENSENET_HF_REPO_ID:  str = "ar07xd/deepshield"
    DENSENET_HF_REVISION: str = "main"

    # Ensemble weights — DenseNet leads because it is trained on still-image GAN
    # faces (the dominant upload type). FFPP / EffNet are stronger on video frames.
    # Face-stack internal weights (sum = 1.0).
    DENSENET_WEIGHT_FACE: float = 0.45
    FFPP_WEIGHT_FACE:     float = 0.25
    VIT_WEIGHT_FACE:      float = 0.15
    EFFNET_WEIGHT_FACE:   float = 0.15
    # Video-frame path: FFPP leads since FFPP is trained on FF++ video frames.
    DENSENET_VIDEO_WEIGHT: float = 0.10
    VIDEO_FFPP_WEIGHT_FACE: float = 0.50
    VIDEO_EFFNET_WEIGHT_FACE: float = 0.30
    VIDEO_VIT_WEIGHT_FACE: float = 0.10
    FFPP_WEIGHT_NOFACE: float = 0.35
    VIT_WEIGHT_NOFACE:  float = 0.65

    # Face-present unified evidence weights (Phase A2/A3).
    # face_stack = composite of FFPP+ViT+EffNet (all face-swap models).
    # general   = generic AI-image detector (diffusion/GAN whole-image cues).
    # forensics = artifact scanner output.
    # exif      = camera-metadata trust signal.
    # vlm       = VLM consistency breakdown (optional).
    FACE_STACK_WEIGHT_FACE: float = 0.40
    GENERAL_WEIGHT_FACE: float = 0.40
    FORENSICS_WEIGHT_FACE: float = 0.10
    EXIF_WEIGHT_FACE: float = 0.05
    VLM_WEIGHT_FACE: float = 0.05

    # No-face evidence weights (existing behavior preserved).
    NOFACE_GENERAL_WEIGHT: float = 0.60
    NOFACE_FORENSICS_WEIGHT: float = 0.20
    NOFACE_EXIF_WEIGHT: float = 0.10
    NOFACE_VLM_WEIGHT: float = 0.10

    # Hard gating thresholds (Phase A4). When the general detector is highly
    # confident the image is synthetic, or strong GAN artifacts are present,
    # the final fake probability is floored at GATING_FAKE_FLOOR (0.50 maps to
    # authenticity score 50, i.e. cannot land in "Likely Real" or above).
    GENERAL_FAKE_GATING_THRESHOLD: float = 0.80
    GAN_ARTIFACT_GATING_THRESHOLD: float = 0.70
    GATING_FAKE_FLOOR: float = 0.50
    # Synthetic still-image overrides. FaceForensics/DFDC models are trained for
    # manipulated video faces, so they should not veto a strong still-image AI
    # detector on generated portraits.
    SYNTHETIC_STILL_HIGH_THRESHOLD: float = 0.80
    SYNTHETIC_STILL_HIGH_FLOOR: float = 0.80
    SYNTHETIC_STILL_VERY_HIGH_THRESHOLD: float = 0.90
    SYNTHETIC_STILL_VERY_HIGH_FLOOR: float = 0.90

    # Video-frame weight overrides. When an image is detected as a low-res
    # video frame (face-swap deepfakes are extracted from video), the general
    # AI-image detector is unreliable (it's trained on synthesised stills, not
    # video face-swaps). We shift weight strongly toward the face-swap-trained
    # models (FFPP / EfficientNet) in that case.
    VIDEO_FRAME_FACE_STACK_WEIGHT: float = 0.55
    VIDEO_FRAME_GENERAL_WEIGHT: float = 0.30
    VIDEO_FRAME_FORENSICS_WEIGHT: float = 0.10
    VIDEO_FRAME_EXIF_WEIGHT: float = 0.05
    # Per-frame video detector blend. FFPP ViT is trained on FaceForensics++
    # face forgery frames, so it is the dominant signal for video analysis.
    VIDEO_FFPP_WEIGHT: float = 0.70
    VIDEO_EFFNET_WEIGHT: float = 0.30
    VIDEO_SAMPLE_FRAMES: int = 32  # frames to sample per video for inference
    EXIFTOOL_PATH: str = ""  # full path to ExifTool binary; empty = metadata write disabled

    # Auth
    JWT_SECRET_KEY: str = ""
    JWT_SECRET_KEY_GENERATED: bool = False
    JWT_ALGORITHM: str = "HS256"
    JWT_EXPIRATION_MINUTES: int = 1440
    GOOGLE_CLIENT_ID: str = ""
    GOOGLE_CLIENT_SECRET: str = ""
    GITHUB_CLIENT_ID: str = ""
    GITHUB_CLIENT_SECRET: str = ""

    @model_validator(mode="after")
    def ensure_jwt_secret(self):
        """Guarantee a non-empty JWT secret and record whether it was generated.

        When no secret is configured, a random one is created with
        ``secrets.token_urlsafe(48)`` regardless of ``DEBUG`` and
        ``JWT_SECRET_KEY_GENERATED`` is set to True. A generated secret is new
        for every ``Settings`` instance, so it does not persist across
        restarts.
        """
        if self.JWT_SECRET_KEY:
            self.JWT_SECRET_KEY_GENERATED = False
        else:
            self.JWT_SECRET_KEY = secrets.token_urlsafe(48)
            self.JWT_SECRET_KEY_GENERATED = True
        return self

    @field_validator("ALLOWED_IMAGE_TYPES", mode="before")
    @classmethod
    def assemble_allowed_image_types(cls, v: Any) -> list[str]:
        """Parse ALLOWED_IMAGE_TYPES from JSON, CSV, or a list; fall back to the defaults."""
        return _parse_list_env(v, default=["image/jpeg", "image/png", "image/webp"])

    @field_validator("ALLOWED_VIDEO_TYPES", mode="before")
    @classmethod
    def assemble_allowed_video_types(cls, v: Any) -> list[str]:
        """Parse ALLOWED_VIDEO_TYPES from JSON, CSV, or a list; fall back to the defaults."""
        return _parse_list_env(v, default=["video/mp4", "video/avi", "video/mov", "video/webm"])

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")


# Module-level Settings instance, constructed once when this module is imported.
settings = Settings()