File size: 13,161 Bytes
7bf8093
26f3f24
6a4e518
3681f82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0853b44
 
6a4e518
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0853b44
 
 
 
 
3681f82
 
 
 
 
 
 
 
 
 
 
 
 
0853b44
7bf8093
 
 
 
3681f82
 
 
 
 
 
7b6ec69
3681f82
 
 
 
 
 
7b6ec69
 
 
3681f82
6a4e518
3681f82
0853b44
 
 
780a87a
0853b44
 
 
 
 
 
fba30db
711bdfc
 
 
 
 
 
 
 
 
 
 
 
 
0853b44
 
 
 
 
 
 
 
 
 
 
1b18758
 
 
 
 
 
 
0853b44
 
780a87a
0853b44
780a87a
1548c1f
 
 
0853b44
fba30db
 
780a87a
b1d2ce2
fba30db
0853b44
 
 
2a9ebf5
fba30db
 
 
 
 
0853b44
3909c31
 
 
 
fba30db
 
 
 
 
 
 
 
 
 
 
07ff735
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711bdfc
07ff735
711bdfc
 
 
 
 
 
 
 
 
 
 
 
 
 
fba30db
 
 
 
711bdfc
 
 
 
 
 
 
 
1b18758
 
 
 
 
 
 
711bdfc
 
 
 
 
 
07ff735
 
711bdfc
 
1b18758
 
 
 
36529c1
3909c31
 
0853b44
26f3f24
 
0853b44
 
780a87a
 
 
 
0853b44
26f3f24
 
 
 
 
 
 
 
 
 
 
 
 
3681f82
 
 
 
 
 
 
 
 
 
0853b44
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
import json
import secrets
from urllib.parse import parse_qsl, urlencode
from typing import Any
from pydantic import field_validator, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict


def _parse_list_env(value: Any, default: list[str]) -> list[str]:
    """Accept list env values as JSON, CSV, single-value string, or native list."""
    if value is None:
        return default

    if isinstance(value, str):
        raw = value.strip()
        if not raw:
            return default

        if raw.startswith("["):
            try:
                parsed = json.loads(raw)
                if isinstance(parsed, list):
                    cleaned = [str(i).strip() for i in parsed if str(i).strip()]
                    return cleaned or default
            except json.JSONDecodeError:
                # Fall back to CSV parsing if JSON is malformed.
                pass

        cleaned = [i.strip() for i in raw.split(",") if i.strip()]
        return cleaned or default

    if isinstance(value, list):
        cleaned = [str(i).strip() for i in value if str(i).strip()]
        return cleaned or default

    return default


def _normalize_origin(origin: str) -> str:
    """Normalize CORS origin values to avoid strict mismatch (e.g. trailing slash)."""
    cleaned = origin.strip()
    if cleaned.startswith(("http://", "https://")):
        cleaned = cleaned.rstrip("/")
    return cleaned


def _fix_postgres_url(raw: str) -> str:
    """Normalize common Postgres URL mistakes from deployment envs.

    - Converts postgres:// to postgresql://
    - Encodes stray '@' in credentials (usually from unescaped passwords)
    - Ensures sslmode=require for Supabase URLs when missing
    """
    url = raw.strip()
    if url.startswith("postgres://"):
        url = "postgresql://" + url[len("postgres://") :]

    if not url.startswith("postgresql://"):
        return url

    # Split scheme + authority/path safely without full URL parsing.
    rest = url[len("postgresql://") :]
    if "@" in rest:
        userinfo, remainder = rest.rsplit("@", 1)
        # Any '@' left in userinfo belongs to credentials and must be percent-encoded.
        userinfo = userinfo.replace("@", "%40")
        url = "postgresql://" + userinfo + "@" + remainder

    if "supabase.co" in url:
        if "?" in url:
            base, query = url.split("?", 1)
            params = dict(parse_qsl(query, keep_blank_values=True))
            if "sslmode" not in params:
                params["sslmode"] = "require"
            url = base + "?" + urlencode(params)
        else:
            url = url + "?sslmode=require"

    return url


class Settings(BaseSettings):
    """Application configuration loaded from the environment and `.env`.

    Field defaults below apply when a variable is unset or blank (blank
    strings are dropped by `drop_blank_values` before validation). Unknown
    variables are ignored (`extra="ignore"`).
    """

    # Server
    APP_HOST: str = "0.0.0.0"
    APP_PORT: int = 8000
    DEBUG: bool = False
    # Typed as Any so the raw env string reaches `assemble_cors_origins`,
    # which accepts JSON, CSV, or a native list.
    CORS_ORIGINS: Any = ["http://localhost:5173"]

    @model_validator(mode="before")
    @classmethod
    def drop_blank_values(cls, data: Any) -> Any:
        """Treat blank env vars as unset so defaults apply instead of parse errors."""
        if isinstance(data, dict):
            return {
                k: v
                for k, v in data.items()
                if not (isinstance(v, str) and not v.strip())
            }
        return data

    @field_validator("CORS_ORIGINS", mode="before")
    @classmethod
    def assemble_cors_origins(cls, v: Any) -> list[str]:
        """Parse CORS_ORIGINS from string (JSON or comma-separated) into a list.

        Each origin is normalized once; entries that normalize to an empty
        string are dropped. Falls back to the localhost dev origin when
        nothing usable remains.
        """
        origins = _parse_list_env(v, default=["http://localhost:5173"])
        normalized = [origin for origin in map(_normalize_origin, origins) if origin]
        return normalized or ["http://localhost:5173"]

    # Database
    DATABASE_URL: str = "sqlite:///./deepshield.db"

    @field_validator("DATABASE_URL", mode="before")
    @classmethod
    def normalize_database_url(cls, v: Any) -> str:
        """Support common HF-style postgres URL aliases and blank values.

        ``None`` and blank strings map to the SQLite default; other strings
        are passed through `_fix_postgres_url`; non-string values are
        stringified as-is.
        """
        if v is None:
            return "sqlite:///./deepshield.db"
        if isinstance(v, str):
            raw = v.strip()
            if not raw:
                return "sqlite:///./deepshield.db"
            return _fix_postgres_url(raw)
        return str(v)

    # File Upload
    MAX_UPLOAD_SIZE_MB: int = 100
    UPLOAD_DIR: str = "/data/uploads"
    ALLOWED_IMAGE_TYPES: list[str] = ["image/jpeg", "image/png", "image/webp"]
    ALLOWED_VIDEO_TYPES: list[str] = ["video/mp4", "video/avi", "video/mov", "video/webm"]
    FILE_RETENTION_SECONDS: int = 300

    # AI Models
    IMAGE_MODEL_ID: str = "prithivMLmods/Deep-Fake-Detector-v2-Model"
    GENERAL_IMAGE_MODEL_ID: str = "umm-maybe/AI-image-detector"
    # Phase C1/C2: second AI-image head specialised on diffusion/GAN output.
    # Ensembled with the general detector before feeding face-present fusion.
    # Set to "" to disable (falls back to general detector only).
    DIFFUSION_IMAGE_MODEL_ID: str = "haywoodsloan/ai-image-detector-deploy"
    DIFFUSION_MODEL_ENABLED: bool = True
    # Blend weights for the two-head general ensemble (must sum ≤ 1.0).
    # When only one head is available the available head gets full weight.
    GENERAL_AI_WEIGHT: float = 0.45
    DIFFUSION_AI_WEIGHT: float = 0.55
    # Temperature scaling for each detector head (> 1.0 = softer probabilities,
    # < 1.0 = sharper). 1.0 = no scaling. Tune after running run_image_eval.py.
    GENERAL_MODEL_TEMPERATURE: float = 1.0
    DIFFUSION_MODEL_TEMPERATURE: float = 1.0
    TEXT_MODEL_ID: str = "jy46604790/Fake-News-Bert-Detect"
    # Multilingual text model for non-English (Hindi etc.). Leave empty to fall back to TEXT_MODEL_ID.
    TEXT_MULTILANG_MODEL_ID: str = ""
    DEVICE: str = "cpu"
    PRELOAD_MODELS: bool = True  # preload models at startup

    # Phase 13: OCR language list (comma-separated ISO codes, e.g. "en,hi")
    OCR_LANGS: str = "en,hi"

    # News API
    NEWS_API_KEY: str = ""
    NEWS_API_BASE_URL: str = "https://newsdata.io/api/1/latest"
    NEWS_API_ARCHIVE_BASE_URL: str = "https://newsdata.io/api/1/archive"
    NEWS_API_LANGUAGES: str = "en,hi"
    NEWS_API_RECENT_TIMEFRAME: str = "1"
    NEWS_API_OLDER_DAYS: int = 7
    NEWS_API_PAGE_SIZE: int = 10
    NEWS_API_PRIMARY_COUNTRY: str = "in"

    # Reports
    REPORT_DIR: str = "/data/reports"
    REPORT_TTL_SECONDS: int = 3600  # 1h expiry
    PUBLIC_APP_URL: str = ""
    # Public backend origin or API base URL used for third-party OAuth callbacks.
    # Examples: http://localhost:8000, https://api.example.com, https://api.example.com/api/v1
    PUBLIC_API_URL: str = ""

    # Phase 19 — dedup cache + object storage
    CACHE_TTL_DAYS: int = 30
    MEDIA_ROOT: str = "/data/media"
    MEDIA_SIGNED_URL_TTL_SECONDS: int = 3600

    # LLM Explainability (Phase 12)
    LLM_PROVIDER: str = "gemini"  # "gemini" | "openai"
    LLM_API_KEY: str = ""
    LLM_MODEL: str = "gemini-2.0-flash"  # 2.0-flash: fastest response, no thinking overhead, best for real-time summaries.

    # LLM fallback — Groq (Llama 3.3 70B by default). Used automatically when the
    # primary provider returns 429/quota exceeded. Leave empty to disable fallback.
    GROQ_API_KEY: str = ""
    GROQ_MODEL: str = "llama-3.3-70b-versatile"

    # EfficientNet (ICPR2020 / DeepShield1 merge)
    EFFICIENTNET_MODEL: str = "EfficientNetAutoAttB4"
    EFFICIENTNET_TRAIN_DB: str = "DFDC"
    ENSEMBLE_MODE: bool = True  # run both ViT + EfficientNet and average scores

    # Phase 11.3: FFPP-fine-tuned ViT. Path is resolved relative to the repo root.
    # The checkpoint lives at <repo_root>/trained_models/ (the `trained_models/` dir
    # at the project root, alongside `backend/` and `frontend/`).
    FFPP_MODEL_PATH: str = "trained_models"
    # Optional: pull FFPP checkpoint from Hugging Face Hub when local checkpoint
    # is missing (keeps large model files out of GitHub source repo).
    FFPP_MODEL_REPO_ID: str = ""
    FFPP_MODEL_REVISION: str = "main"
    FFPP_BASE_PROCESSOR_ID: str = "google/vit-base-patch16-224-in21k"
    FFPP_ENABLED: bool = True
    # DenseNet121 face-GAN specialist (in-house trained on 140k Kaggle dataset).
    # Loaded from a TF-free PyTorch checkpoint converted via convert_densenet_keras_to_pt.py.
    DENSENET_ENABLED: bool = True
    # Path to .pt checkpoint, resolved relative to repo root (or absolute).
    DENSENET_MODEL_PATH: str = "backend/trained_models/densenet121_faces.pt"
    DENSENET_META_PATH:  str = "backend/trained_models/densenet121_faces_meta.json"
    # HF Space fallback when local checkpoint is absent.
    DENSENET_HF_REPO_ID:  str = "ar07xd/deepshield"
    DENSENET_HF_REVISION: str = "main"

    # Ensemble weights — DenseNet leads because it is trained on still-image GAN
    # faces (the dominant upload type). FFPP / EffNet are stronger on video frames.
    # Face-stack internal weights (sum = 1.0).
    DENSENET_WEIGHT_FACE: float = 0.45
    FFPP_WEIGHT_FACE:     float = 0.25
    VIT_WEIGHT_FACE:      float = 0.15
    EFFNET_WEIGHT_FACE:   float = 0.15
    # Video-frame path: FFPP leads since FFPP is trained on FF++ video frames.
    DENSENET_VIDEO_WEIGHT: float = 0.10
    VIDEO_FFPP_WEIGHT_FACE: float = 0.50
    VIDEO_EFFNET_WEIGHT_FACE: float = 0.30
    VIDEO_VIT_WEIGHT_FACE: float = 0.10
    FFPP_WEIGHT_NOFACE: float = 0.35
    VIT_WEIGHT_NOFACE:  float = 0.65

    # Face-present unified evidence weights (Phase A2/A3).
    # face_stack = composite of FFPP+ViT+EffNet (all face-swap models).
    # general   = generic AI-image detector (diffusion/GAN whole-image cues).
    # forensics = artifact scanner output.
    # exif      = camera-metadata trust signal.
    # vlm       = VLM consistency breakdown (optional).
    FACE_STACK_WEIGHT_FACE: float = 0.40
    GENERAL_WEIGHT_FACE: float = 0.40
    FORENSICS_WEIGHT_FACE: float = 0.10
    EXIF_WEIGHT_FACE: float = 0.05
    VLM_WEIGHT_FACE: float = 0.05

    # No-face evidence weights (existing behavior preserved).
    NOFACE_GENERAL_WEIGHT: float = 0.60
    NOFACE_FORENSICS_WEIGHT: float = 0.20
    NOFACE_EXIF_WEIGHT: float = 0.10
    NOFACE_VLM_WEIGHT: float = 0.10

    # Hard gating thresholds (Phase A4). When the general detector is highly
    # confident the image is synthetic, or strong GAN artifacts are present,
    # the final fake probability is floored at GATING_FAKE_FLOOR (0.50 maps to
    # authenticity score 50, i.e. cannot land in "Likely Real" or above).
    GENERAL_FAKE_GATING_THRESHOLD: float = 0.80
    GAN_ARTIFACT_GATING_THRESHOLD: float = 0.70
    GATING_FAKE_FLOOR: float = 0.50
    # Synthetic still-image overrides. FaceForensics/DFDC models are trained for
    # manipulated video faces, so they should not veto a strong still-image AI
    # detector on generated portraits.
    SYNTHETIC_STILL_HIGH_THRESHOLD: float = 0.80
    SYNTHETIC_STILL_HIGH_FLOOR: float = 0.80
    SYNTHETIC_STILL_VERY_HIGH_THRESHOLD: float = 0.90
    SYNTHETIC_STILL_VERY_HIGH_FLOOR: float = 0.90

    # Video-frame weight overrides. When an image is detected as a low-res
    # video frame (face-swap deepfakes are extracted from video), the general
    # AI-image detector is unreliable (it's trained on synthesised stills, not
    # video face-swaps). We shift weight strongly toward the face-swap-trained
    # models (FFPP / EfficientNet) in that case.
    VIDEO_FRAME_FACE_STACK_WEIGHT: float = 0.55
    VIDEO_FRAME_GENERAL_WEIGHT: float = 0.30
    VIDEO_FRAME_FORENSICS_WEIGHT: float = 0.10
    VIDEO_FRAME_EXIF_WEIGHT: float = 0.05
    # Per-frame video detector blend. FFPP ViT is trained on FaceForensics++
    # face forgery frames, so it is the dominant signal for video analysis.
    VIDEO_FFPP_WEIGHT: float = 0.70
    VIDEO_EFFNET_WEIGHT: float = 0.30
    VIDEO_SAMPLE_FRAMES: int = 32  # frames to sample per video for inference
    EXIFTOOL_PATH: str = ""  # full path to ExifTool binary; empty = metadata write disabled

    # Auth
    JWT_SECRET_KEY: str = ""
    # Set by `ensure_jwt_secret`: True when JWT_SECRET_KEY was auto-generated
    # because none was configured.
    JWT_SECRET_KEY_GENERATED: bool = False
    JWT_ALGORITHM: str = "HS256"
    JWT_EXPIRATION_MINUTES: int = 1440
    GOOGLE_CLIENT_ID: str = ""
    GOOGLE_CLIENT_SECRET: str = ""
    GITHUB_CLIENT_ID: str = ""
    GITHUB_CLIENT_SECRET: str = ""

    @model_validator(mode="after")
    def ensure_jwt_secret(self) -> "Settings":
        """Generate a random JWT secret when none is configured.

        JWT_SECRET_KEY_GENERATED is always overwritten to record whether the
        key in use was generated here (True) or supplied by configuration
        (False).
        """
        generated = not self.JWT_SECRET_KEY
        if generated:
            # NOTE(review): this previously branched on DEBUG, but both
            # branches generated a key identically. A generated key is fresh
            # for every Settings() instance (i.e. per process start); confirm
            # whether non-DEBUG deployments should fail fast instead.
            self.JWT_SECRET_KEY = secrets.token_urlsafe(48)
        self.JWT_SECRET_KEY_GENERATED = generated
        return self

    @field_validator("ALLOWED_IMAGE_TYPES", mode="before")
    @classmethod
    def assemble_allowed_image_types(cls, v: Any) -> list[str]:
        """Parse ALLOWED_IMAGE_TYPES from JSON, CSV, or a native list."""
        return _parse_list_env(v, default=["image/jpeg", "image/png", "image/webp"])

    @field_validator("ALLOWED_VIDEO_TYPES", mode="before")
    @classmethod
    def assemble_allowed_video_types(cls, v: Any) -> list[str]:
        """Parse ALLOWED_VIDEO_TYPES from JSON, CSV, or a native list."""
        return _parse_list_env(v, default=["video/mp4", "video/avi", "video/mov", "video/webm"])

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")


# Module-level settings instance, constructed once when this module is imported.
settings = Settings()