Aloukik21 committed on
Commit
e51097e
·
verified ·
1 Parent(s): 2290a07

Upload detector.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. detector.py +1048 -0
detector.py ADDED
@@ -0,0 +1,1048 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Production-Ready AI Content Detector (v3 - Enhanced Ensemble)
3
+ ==============================================================
4
+ Multi-modal detection: Image, Audio, Text
5
+
6
+ Uses trained meta-classifiers (LogReg) that combine multiple models + features
7
+ per modality for maximum accuracy. v3 adds:
8
+ - Bombek1 SigLIP2+DINOv2 image detector (0.9997 AUC, JPEG-robust)
9
+ - DF_Arena_1B audio model (Speech DF Arena, 8 training datasets)
10
+ - fakespot-ai RoBERTa text detector (Mozilla-backed, catches GPT technical)
11
+
12
+ Usage:
13
+ detector = AIContentDetector()
14
+ result = detector.detect_image("photo.jpg")
15
+ result = detector.detect_audio("voice.wav")
16
+ result = detector.detect_text("Some text to analyze...")
17
+ result = detector.detect_video("clip.mp4") # frames + audio analysis
18
+ results = detector.detect_images_batch(["img1.jpg", "img2.png"])
19
+ """
20
+
21
+ import sys, os
22
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
23
+ try:
24
+ import fix_torchcodec
25
+ except ImportError:
26
+ pass
27
+
28
+ import torch
29
+ import numpy as np
30
+ import soundfile as sf
31
+ from PIL import Image
32
+ from typing import Union, List, Dict, Optional
33
+ import io
34
+ import math
35
+ from collections import Counter
36
+ from torchvision import transforms as tv_transforms
37
+
38
+
39
+ # ─── Pre-trained meta-classifier weights ──────────────────────
40
+ # v5.1: 8 features, retrained on 204 images (90 AI + 114 real from COCO/Food101/CatsDogs/CUB/diverse)
41
+ # CV=96.6%, Bombek1 (#1 coef=+2.50) + SPAI (+1.24) + NYUAD (+0.65) + ai_vs_real (-1.11)
42
+ _IMG_SCALER_MEAN = [0.46721075337286583, 0.4332848905084707, 0.34848470501282125, 0.7513610315914312, -2.7428234702735845, 1.4757695660114816e-05, 0.47213903127932083, 0.5310949190042461]
43
+ _IMG_SCALER_SCALE = [0.4562829992667211, 0.4653274721438903, 0.2594560381028844, 0.2566914952700282, 0.31761878154208484, 1.745336794888413e-05, 0.4468171423032323, 0.4707389622737817]
44
+ _IMG_LR_COEF = [0.6488963010751596, 0.19470730198227582, 0.3669096091179738, -1.1058065882150858, -0.47635552888598026, -0.015401252102331365, 2.5029078795863406, 1.237011726618108]
45
+ _IMG_LR_INTERCEPT = -0.7403570533419102
46
+
47
+ # v5: 9 features (3 neural + 5 spectral + Arena). Arena (+1.09) adds strong signal.
48
+ # Feature order: [DavidCombei, Gustking, mo-thecreator, spec_flat, centroid_mean, centroid_std, zcr, rolloff, Arena]
49
+ _AUD_SCALER_MEAN = [0.5667607612050348, 0.2773010993612484, 0.23310774392822925, 0.03141037016224877, 1807.2398348786571, 897.18004887457, 0.12301036345108962, 6620.40736210088, 0.5433762406366287]
50
+ _AUD_SCALER_SCALE = [0.48680867334512096, 0.29197482864644153, 0.4211570130989059, 0.024618810573647662, 459.40344999868597, 394.8528855416117, 0.046570088698838365, 829.6553459300637, 0.4155082795685684]
51
+ _AUD_LR_COEF = [0.7845433297452213, -0.25601227158569434, 0.38715143588917217, 0.5305971113288093, 0.14191280089652655, 1.7648106776858394, -1.6174243839603224, -1.09787021389514, 1.092684667819162]
52
+ _AUD_LR_INTERCEPT = 0.39250921446958165
53
+
54
+ # v5: 8 features (Binoculars + RoBERTa + 5 stats + fakespot). fakespot is #1 feature (coef=1.23)
55
+ _TXT_SCALER_MEAN = [1.1353826005329457, 0.33250804246780497, -0.48164806951384675, 5.916446148470062, 0.6490103211442594, 0.5124573713819743, 5.220866125485708, 0.6364287314816944]
56
+ _TXT_SCALER_SCALE = [0.19535976595611237, 0.45007809250809544, 0.21119484430166974, 1.1937958293169302, 0.19352867829552858, 0.21389850106439456, 1.2135677101079925, 0.43094435530407293]
57
+ _TXT_LR_COEF = [-0.6243579398646565, 0.389259232075374, -0.5040499517552531, -0.21291399657541557, -0.08360375807827485, -0.014109874794709326, 0.22446151217916235, 1.2266905154327146]
58
+ _TXT_LR_INTERCEPT = 0.1964292008569683
59
+
60
+
61
def _logistic_predict(features, scaler_mean, scaler_scale, coef, intercept):
    """Apply StandardScaler + LogisticRegression prediction.

    Args:
        features: Raw feature values, one per classifier input.
        scaler_mean: Per-feature means subtracted before scaling.
        scaler_scale: Per-feature scales the centered features are divided by.
        coef: Logistic-regression coefficients, one per feature.
        intercept: Logistic-regression intercept (scalar).

    Returns:
        The positive-class probability as a Python float in [0, 1].
    """
    x = np.asarray(features, dtype=np.float64)
    x_scaled = (x - np.asarray(scaler_mean, dtype=np.float64)) / np.asarray(scaler_scale, dtype=np.float64)
    logit = float(np.dot(x_scaled, np.asarray(coef, dtype=np.float64)) + intercept)

    # Numerically stable sigmoid: only ever exponentiate a non-positive value,
    # so an extreme logit saturates to 0.0 / 1.0 instead of raising
    # OverflowError from math.exp.
    if logit >= 0.0:
        return 1.0 / (1.0 + math.exp(-logit))
    z = math.exp(logit)
    return z / (1.0 + z)
68
+
69
+
70
+ class AIContentDetector:
71
+ """Production-ready multi-modal AI content detector with stacking ensembles."""
72
+
73
def __init__(self, device: str = "auto", load_image=True, load_audio=True, load_text=True,
             quantize_text: bool = True, compile_models: bool = True):
    """
    Set up the detector and eagerly load the requested modalities.

    Args:
        device: "auto" picks "cuda" when torch reports CUDA is available,
            otherwise "cpu"; any other value is stored as given.
        load_image: Call the image model loader during construction.
        load_audio: Call the audio model loader during construction.
        load_text: Call the text model loader during construction.
        quantize_text: Stored for the text loader (INT8 loading of the
            Falcon-7B pair when True).
        compile_models: Stored for the loaders (torch.compile is attempted
            on models when True).
    """
    resolved_device = device
    if resolved_device == "auto":
        resolved_device = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = resolved_device

    self._quantize_text = quantize_text
    self._compile_models = compile_models

    # A modality stays None until its loader has run; the detect_* methods
    # use that to report "not loaded".
    self._image_models = self._audio_models = self._text_models = None

    if load_image:
        self._load_image_models()
    if load_audio:
        self._load_audio_models()
    if load_text:
        self._load_text_models()
103
+
104
+ # ─── IMAGE DETECTION ───────────────────────────────────────────
105
+
106
def _load_image_models(self):
    """Load every image detector consumed by the image meta-classifier.

    Sets:
        self._image_models: four Hugging Face image-classification pipelines.
        self._bombek_model: Bombek1 SigLIP2+DINOv2 detector, or None on failure.
        self._spai_model / self._spai_feat_batch: SPAI model and its feature
            batch size, or None when weights are missing or loading failed.
        self._spai_to_tensor: torchvision ToTensor transform used by SPAI.

    Bombek1 and SPAI are optional: any failure is reported with a warning
    and the corresponding attribute is left as None.
    """
    from transformers import pipeline as hf_pipeline
    from transformers import AutoModelForImageClassification
    print("Loading 4 ViT + SPAI + Bombek1 image detectors...")

    # Pipelines take an integer device index (-1 = CPU). Honour "cuda:N" as
    # well as plain "cuda" instead of silently falling back to CPU.
    dev = -1
    if str(self.device).startswith("cuda"):
        dev = torch.device(self.device).index or 0

    def _load_image_pipeline(model_id):
        """Load image-classification pipeline with transformers 5.x compatibility."""
        try:
            return hf_pipeline("image-classification", model=model_id, device=dev)
        except (ValueError, OSError):
            # Transformers 5.x: auto-detection fails for older models
            from transformers import ViTImageProcessor
            img_proc = ViTImageProcessor.from_pretrained(model_id)
            model = AutoModelForImageClassification.from_pretrained(model_id)
            return hf_pipeline("image-classification", model=model, image_processor=img_proc, device=dev)

    # Order matters: it fixes the first four meta-classifier features.
    self._image_models = [
        _load_image_pipeline("NYUAD-ComNets/NYUAD_AI-generated_images_detector"),
        _load_image_pipeline("Organika/sdxl-detector"),
        _load_image_pipeline("umm-maybe/AI-image-detector"),
        _load_image_pipeline("dima806/ai_vs_real_image_detection"),
    ]

    # Load Bombek1 SigLIP2+DINOv2 (0.9997 AUC, JPEG-robust, 25+ generators)
    self._bombek_model = None
    try:
        from huggingface_hub import hf_hub_download
        import importlib.util
        model_pt = hf_hub_download(
            repo_id="Bombek1/ai-image-detector-siglip-dinov2",
            filename="pytorch_model.pt"
        )
        model_py = hf_hub_download(
            repo_id="Bombek1/ai-image-detector-siglip-dinov2",
            filename="model.py"
        )
        # NOTE(security): this executes Python downloaded from the Hub.
        # Consider pinning a reviewed `revision=` in the downloads above.
        spec = importlib.util.spec_from_file_location("bombek_model", model_py)
        bombek_mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(bombek_mod)
        self._bombek_model = bombek_mod.AIImageDetector(model_pt, device=self.device)
        print(" Bombek1 SigLIP2+DINOv2 loaded (0.9997 AUC)")
    except Exception as e:
        print(f" Warning: Bombek1 failed to load: {e}")

    # Load SPAI (CVPR 2025) - spectral AI image detection
    self._spai_model = None
    self._spai_to_tensor = tv_transforms.ToTensor()
    spai_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "spai_repo")
    spai_weights = os.path.join(spai_dir, "weights", "spai.pth")
    if os.path.exists(spai_weights):
        try:
            # Avoid stacking duplicate entries when the loader runs again.
            if spai_dir not in sys.path:
                sys.path.insert(0, spai_dir)
            from spai.config import get_custom_config
            from spai.models.build import build_cls_model
            from spai.utils import load_pretrained
            import logging
            spai_logger = logging.getLogger("spai_load")
            spai_logger.setLevel(logging.WARNING)

            config = get_custom_config(os.path.join(spai_dir, "configs", "spai.yaml"))
            config.defrost()
            config.PRETRAINED = spai_weights
            config.freeze()

            self._spai_model = build_cls_model(config)
            # Follow the configured device; an unconditional .cuda() made
            # SPAI fail (and get silently disabled) on CPU-only hosts.
            self._spai_model.to(self.device)
            self._spai_model.eval()
            load_pretrained(config, self._spai_model, spai_logger)
            self._spai_feat_batch = config.MODEL.FEATURE_EXTRACTION_BATCH
            print(" SPAI model loaded (139.9M params, CVPR 2025)")
        except Exception as e:
            print(f" Warning: SPAI failed to load: {e}")
            self._spai_model = None
    else:
        print(f" SPAI weights not found at {spai_weights}, skipping")

    print("Image models loaded!")
184
+
185
def _extract_image_features(self, img: Image.Image) -> list:
    """Extract 4 model scores + 2 FFT features for meta-classifier.

    Args:
        img: Image to analyze (the caller converts it to RGB first).

    Returns:
        A 6-element list: one AI score per pipeline in self._image_models
        (0.0 when a pipeline reports none of the known AI labels), followed
        by the log-log slope of the radially averaged power spectrum and the
        share of radial power in the upper half of the radii.
    """
    ai_labels = ("sd", "dalle", "artificial", "fake", "ai")
    feats = []

    # 4 model AI-probability scores
    for clf in self._image_models:
        ai_score = 0.0
        for pred in clf(img):
            if pred["label"].lower() in ai_labels:
                ai_score = pred["score"]
                break
        feats.append(ai_score)

    # FFT spectral slope + HF ratio on a fixed 256x256 grayscale copy
    img_gray = np.array(img.convert('L').resize((256, 256)), dtype=np.float64)
    f_shift = np.fft.fftshift(np.fft.fft2(img_gray))
    power = np.abs(f_shift) ** 2
    h, w = power.shape
    cy, cx = h // 2, w // 2
    yy, xx = np.ogrid[:h, :w]
    radius = np.sqrt((xx - cx) ** 2 + (yy - cy) ** 2).astype(int)
    max_r = min(cx, cy)

    # Radial mean of the power spectrum in one pass: per-radius sum and
    # count via bincount, instead of one full-image mask per radius.
    flat_radius = radius.ravel()
    ring_sum = np.bincount(flat_radius, weights=power.ravel(), minlength=max_r)[:max_r]
    ring_count = np.bincount(flat_radius, minlength=max_r)[:max_r]
    radial_psd = np.zeros(max_r)
    populated = ring_count > 0
    radial_psd[populated] = ring_sum[populated] / ring_count[populated]

    log_psd = np.log(radial_psd + 1e-10)
    # Skip radius 0 (the DC term): log(0) is undefined on the frequency axis.
    freqs = np.arange(1, len(log_psd))
    slope, _ = np.polyfit(np.log(freqs), log_psd[1:], 1)
    mid = len(radial_psd) // 2
    hf_ratio = np.sum(radial_psd[mid:]) / (np.sum(radial_psd) + 1e-10)

    feats.append(slope)
    feats.append(hf_ratio)
    return feats
223
+
224
def _spai_score(self, img: Image.Image) -> float:
    """Get SPAI (CVPR 2025) AI probability score for an image.

    Args:
        img: Image to score.

    Returns:
        Sigmoid of the SPAI model output as a float, or -1.0 as a sentinel
        when SPAI is not loaded or inference fails for any reason.
    """
    if self._spai_model is None:
        return -1.0  # sentinel: not available
    try:
        # SPAI requires minimum 224px in each dimension for patch extraction
        width, height = img.size
        if width < 224 or height < 224:
            img = img.resize((max(224, width), max(224, height)))
        # Put the input wherever the model's weights live rather than
        # assuming CUDA, so CPU-hosted models work too.
        model_device = next(self._spai_model.parameters()).device
        t = self._spai_to_tensor(img).unsqueeze(0).to(model_device)
        with torch.no_grad():
            out = self._spai_model([t], self._spai_feat_batch)
        return float(torch.sigmoid(out).item())
    except Exception:
        # Best effort: SPAI is an optional feature of the ensemble.
        return -1.0
238
+
239
def _bombek_score(self, img: Image.Image) -> float:
    """Return the Bombek1 SigLIP2+DINOv2 AI probability for an image.

    Yields -1.0 when the model was not loaded or when prediction fails.
    """
    model = self._bombek_model
    score = -1.0
    if model is not None:
        try:
            score = float(model.predict(img)["probability"])
        except Exception:
            score = -1.0
    return score
248
+
249
def detect_image(self, image: Union[str, Image.Image]) -> Dict:
    """
    Detect if an image is AI-generated using stacking meta-classifier + SPAI + Bombek1.

    Args:
        image: File path (str or os.PathLike) or PIL Image. Provenance
            metadata is only checked when a path is given.

    Returns:
        {"is_ai": bool, "confidence": float, "ai_probability": float, "label": str, "details": dict}

    Raises:
        RuntimeError: If the image models were not loaded.
    """
    if self._image_models is None:
        raise RuntimeError("Image models not loaded. Initialize with load_image=True")

    # Check provenance metadata if file path provided
    provenance = None
    if isinstance(image, (str, os.PathLike)):
        image_path = os.fspath(image)
        provenance = self.check_provenance(image_path)
        # convert() yields an independent, fully loaded copy, so the file
        # handle can be released immediately.
        with Image.open(image_path) as opened:
            img = opened.convert("RGB")
    else:
        img = image.convert("RGB")

    feats6 = self._extract_image_features(img)

    # Get SPAI score (CVPR 2025 spectral detection)
    spai = self._spai_score(img)

    # Get Bombek1 score (SigLIP2+DINOv2, 0.9997 AUC)
    bombek = self._bombek_score(img)

    # v5: Bombek1 and SPAI are now meta-classifier features (not just overrides).
    # The -1.0 "unavailable" sentinel is clamped to 0.0 before it is fed in.
    feats = feats6 + [max(0.0, bombek), max(0.0, spai)]
    raw_prob = _logistic_predict(feats, _IMG_SCALER_MEAN, _IMG_SCALER_SCALE, _IMG_LR_COEF, _IMG_LR_INTERCEPT)

    model_scores = feats6[:4]
    n_ai_models = sum(1 for s in model_scores if s > 0.5)
    if spai >= 0 and spai > 0.5:
        n_ai_models += 1
    if bombek >= 0 and bombek > 0.5:
        n_ai_models += 1

    # v5: meta-classifier includes Bombek1+SPAI so minimal overrides needed
    ai_prob = raw_prob

    is_ai = ai_prob > 0.5
    confidence = abs(ai_prob - 0.5) * 2

    model_names = [
        "NYUAD_AI-generated_images_detector",
        "sdxl-detector",
        "AI-image-detector",
        "ai_vs_real_image_detection",
    ]
    details = {name: round(score, 4) for name, score in zip(model_names, model_scores)}
    details["fft_slope"] = round(feats[4], 4)
    details["fft_hf_ratio"] = round(feats[5], 8)
    if spai >= 0:
        details["SPAI"] = round(spai, 4)
    if bombek >= 0:
        details["Bombek1_SigLIP2_DINOv2"] = round(bombek, 4)
    details["models_agreeing_ai"] = n_ai_models

    # Include provenance data if available
    if provenance and provenance["has_provenance"]:
        details["provenance"] = {
            "source": provenance["source"],
            "ai_signals": provenance["ai_signals"],
            "camera_signals": provenance["camera_signals"],
        }
        # Strong provenance signals can override model predictions
        if provenance["ai_signals"]:
            # C2PA/metadata says AI-generated → boost probability
            ai_prob = max(ai_prob, 0.85)
            is_ai = True
        elif provenance["camera_signals"] and not provenance["ai_signals"]:
            # Camera EXIF with no AI signals → lower probability, unless
            # at least four detectors independently voted "AI".
            if ai_prob > 0.5 and n_ai_models < 4:
                details["provenance_override"] = f"Camera metadata found, reducing AI probability from {ai_prob:.4f}"
                ai_prob = min(ai_prob, 0.45)
                is_ai = False

        confidence = abs(ai_prob - 0.5) * 2

    return {
        "is_ai": is_ai,
        "confidence": round(confidence, 3),
        "ai_probability": round(ai_prob, 4),
        "label": "AI-Generated" if is_ai else "Real",
        "details": details,
    }
339
+
340
def detect_images_batch(self, images: List[Union[str, Image.Image]]) -> List[Dict]:
    """Run detect_image on each input in order and collect the results."""
    results = []
    for item in images:
        results.append(self.detect_image(item))
    return results
343
+
344
+ # ─── PROVENANCE / C2PA CHECKING ───────────────────────────────
345
+
346
+ @staticmethod
347
def check_provenance(image_path: str) -> Dict:
    """
    Check image provenance metadata for AI generation signals.

    Checks C2PA (if library available), EXIF, and XMP metadata for
    known AI tool signatures or real camera provenance. Each check is
    best effort: a failure in one source never prevents the others.

    Args:
        image_path: Path to image file

    Returns:
        {"has_provenance": bool, "source": str|None, "ai_signals": list,
         "camera_signals": list, "details": dict}
    """
    result = {"has_provenance": False, "source": None, "ai_signals": [], "camera_signals": [], "details": {}}

    # Known AI tool keywords in metadata
    ai_keywords = ["dall-e", "dalle", "chatgpt", "openai", "midjourney", "stable diffusion",
                   "firefly", "adobe firefly", "imagen", "gemini", "flux", "ideogram",
                   "leonardo", "playground", "nightcafe", "artbreeder"]

    def _as_text(value) -> str:
        """Normalize a metadata value (str, bytes, None, ...) to str."""
        if value is None:
            return ""
        if isinstance(value, bytes):
            return value.decode("utf-8", errors="ignore")
        return str(value)

    # Try C2PA first (if available)
    try:
        import c2pa
        import json
        reader = c2pa.Reader(image_path)
        manifest_data = json.loads(reader.json())
        result["has_provenance"] = True
        result["source"] = "c2pa"
        result["details"]["c2pa"] = manifest_data

        active = manifest_data.get("active_manifest", "")
        manifests = manifest_data.get("manifests", {})
        if active and active in manifests:
            m = manifests[active]
            gen = _as_text(m.get("claim_generator"))
            result["details"]["claim_generator"] = gen

            # Check for AI source type in assertions
            for assertion in m.get("assertions", []):
                if "c2pa.actions" in assertion.get("label", ""):
                    for action in assertion.get("data", {}).get("actions", []):
                        dst = _as_text(action.get("digitalSourceType"))
                        if "trainedAlgorithmicMedia" in dst:
                            result["ai_signals"].append("c2pa:trainedAlgorithmicMedia")
                        elif "digitalCapture" in dst:
                            result["camera_signals"].append("c2pa:digitalCapture")

            if any(kw in gen.lower() for kw in ai_keywords):
                result["ai_signals"].append(f"c2pa:generator={gen}")
    except Exception:
        # Deliberately silent: the c2pa library is optional, and files
        # without a readable manifest are the common case.
        pass

    # Check EXIF metadata
    try:
        # Context manager so the file handle is released right away.
        with Image.open(image_path) as img:
            exif = img.getexif()
            if exif:
                # Tag 305 = Software, 271 = Make, 272 = Model
                software = _as_text(exif.get(305, ""))
                make = _as_text(exif.get(271, ""))
                model = _as_text(exif.get(272, ""))

                if software or make or model:
                    result["has_provenance"] = True
                    result["details"]["exif_software"] = software
                    result["details"]["exif_make"] = make
                    result["details"]["exif_model"] = model

                    sw_lower = software.lower()
                    if any(kw in sw_lower for kw in ai_keywords):
                        result["ai_signals"].append(f"exif:software={software}")
                    if make and make.lower() not in ["", "unknown"]:
                        result["camera_signals"].append(f"exif:make={make}")
                    if model and model.lower() not in ["", "unknown"]:
                        result["camera_signals"].append(f"exif:model={model}")
    except Exception:
        # Deliberately silent: unreadable images or EXIF count as "no EXIF".
        pass

    # Check XMP metadata for AI tool signatures
    try:
        with open(image_path, 'rb') as f:
            data = f.read(65536)  # First 64KB
        # Look for XMP packet
        xmp_close = b'</x:xmpmeta>'
        xmp_start = data.find(b'<x:xmpmeta')
        if xmp_start >= 0:
            xmp_end = data.find(xmp_close, xmp_start)
            if xmp_end >= 0:
                # Slice exactly through the closing tag (no stray extra byte).
                xmp = data[xmp_start:xmp_end + len(xmp_close)].decode('utf-8', errors='ignore')
                result["details"]["has_xmp"] = True
                xmp_lower = xmp.lower()
                for kw in ai_keywords:
                    if kw in xmp_lower:
                        result["ai_signals"].append(f"xmp:contains={kw}")
                        result["has_provenance"] = True
                # Check for IPTC digitalsourcetype
                if "trainedalgorithmicmedia" in xmp_lower:
                    result["ai_signals"].append("xmp:trainedAlgorithmicMedia")
                    result["has_provenance"] = True
                if "digitalcapture" in xmp_lower:
                    result["camera_signals"].append("xmp:digitalCapture")
                    result["has_provenance"] = True
    except Exception:
        # Deliberately silent: a missing or unreadable file means "no XMP".
        pass

    # C2PA sets the source itself; otherwise derive it from what was found.
    if not result["source"]:
        if result["ai_signals"]:
            result["source"] = "metadata"
        elif result["camera_signals"]:
            result["source"] = "exif"

    return result
459
+
460
+ # ─── AUDIO DETECTION ───────────────────────────────────────────
461
+
462
def _load_audio_models(self):
    """Load the audio classifiers and the optional DF_Arena_1B pipeline.

    Fills self._audio_models with three slots, in feature order: the two
    wav2vec2 classifiers, then mo-thecreator (or None if it failed to load,
    so later feature positions stay stable). self._arena_pipe holds the
    DF_Arena_1B pipeline, or None when it could not be loaded.
    """
    from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
    print("Loading 3 audio detectors + DF_Arena_1B...")
    self._audio_models = []

    wav2vec_checkpoints = (
        ("DavidCombei/wav2vec2-xls-r-1b-DeepFake-AI4TRUST", "DavidCombei-1B"),
        ("Gustking/wav2vec2-large-xlsr-deepfake-audio-classification", "Gustking"),
    )
    for repo_id, display_name in wav2vec_checkpoints:
        extractor = AutoFeatureExtractor.from_pretrained(repo_id)
        classifier = AutoModelForAudioClassification.from_pretrained(repo_id).eval().to(self.device)
        if self._compile_models:
            # torch.compile is an optional speed-up; keep the eager model on failure.
            try:
                classifier = torch.compile(classifier)
            except Exception:
                pass
        self._audio_models.append(
            {"feat": extractor, "model": classifier, "fake_idx": 1, "name": display_name}
        )

    # mo-thecreator: complementary model — excels on In-the-Wild deepfakes (92% TPR)
    try:
        mo_repo = "mo-thecreator/Deepfake-audio-detection"
        mo_extractor = AutoFeatureExtractor.from_pretrained(mo_repo)
        mo_classifier = AutoModelForAudioClassification.from_pretrained(mo_repo).eval().to(self.device)
        # Use the first label that looks like the "fake" class; default to index 1.
        label_map = getattr(mo_classifier.config, 'id2label', {})
        fake_idx = next(
            (
                int(idx)
                for idx, label in label_map.items()
                if any(kw in str(label).lower() for kw in ['fake', 'spoof', 'deepfake', 'synthetic'])
            ),
            1,
        )
        self._audio_models.append(
            {"feat": mo_extractor, "model": mo_classifier, "fake_idx": fake_idx, "name": "mo-thecreator"}
        )
        print(" mo-thecreator Deepfake-audio-detection loaded (In-the-Wild specialist)")
    except Exception as e:
        print(f" Warning: mo-thecreator failed to load: {e}")
        self._audio_models.append(None)  # placeholder to keep feature indexing

    # Load DF_Arena_1B (Speech DF Arena 2025, 0.91% EER In-the-Wild)
    # Trained on 8 datasets: ASVspoof 2019/2024, Codecfake, LibriSeVoc, etc.
    self._arena_pipe = None
    try:
        from transformers import pipeline as hf_pipeline
        arena_pipe = hf_pipeline(
            "antispoofing",
            model="Speech-Arena-2025/DF_Arena_1B_V_1",
            trust_remote_code=True,
            device=self.device
        )
        self._arena_pipe = arena_pipe
        print(" DF_Arena_1B loaded (1B params, Speech DF Arena 2025)")
    except Exception as e:
        print(f" Warning: DF_Arena_1B failed to load: {e}")

    print("Audio models loaded!")
513
+
514
def _arena_score(self, audio_arr: np.ndarray) -> float:
    """Get DF_Arena_1B spoof probability score.

    Args:
        audio_arr: Audio samples passed straight to the pipeline.

    Returns:
        The "spoof" entry of the pipeline's "all_scores" mapping as a float,
        or -1.0 when the pipeline is not loaded, raises, or returns no
        "spoof" score.
    """
    if self._arena_pipe is None:
        return -1.0
    try:
        result = self._arena_pipe(audio_arr)
        # Some pipelines wrap a single result in a list; unwrap it.
        if isinstance(result, (list, tuple)) and result:
            result = result[0]
        scores = result.get("all_scores", {})
        # A missing score means "unavailable", not "certainly genuine":
        # return the sentinel instead of a misleading 0.0.
        if "spoof" not in scores:
            return -1.0
        return float(scores["spoof"])
    except Exception:
        # Best effort: this model is an optional member of the ensemble.
        return -1.0
523
+
524
def _extract_audio_features(self, audio_arr: np.ndarray, sr: int) -> list:
    """Build the neural + spectral feature vector for the audio meta-classifier.

    Output order: one fake-class probability per slot in self._audio_models
    (DavidCombei, Gustking, mo-thecreator), then spectral flatness mean,
    spectral centroid mean, spectral centroid std, zero-crossing-rate mean
    and 99% spectral rolloff mean.
    """
    import librosa

    # Neural scores, one per model slot.
    neural_scores = []
    for entry in self._audio_models:
        if entry is None:
            # neutral default if model failed to load
            neural_scores.append(0.5)
            continue
        encoded = entry["feat"](audio_arr, sampling_rate=sr, return_tensors="pt", padding=True)
        with torch.no_grad():
            on_device = {key: tensor.to(self.device) for key, tensor in encoded.items()}
            logits = entry["model"](**on_device).logits
            class_probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
        neural_scores.append(float(class_probs[entry["fake_idx"]]))

    # Hand-crafted spectral descriptors.
    flatness = librosa.feature.spectral_flatness(y=audio_arr, n_fft=2048, hop_length=512)
    centroid = librosa.feature.spectral_centroid(y=audio_arr, sr=sr)
    crossings = librosa.feature.zero_crossing_rate(audio_arr)
    rolloff = librosa.feature.spectral_rolloff(y=audio_arr, sr=sr, roll_percent=0.99)

    spectral = [
        float(np.mean(flatness)),
        float(np.mean(centroid)),
        float(np.std(centroid)),
        float(np.mean(crossings)),
        float(np.mean(rolloff)),
    ]
    return neural_scores + spectral
558
+
559
def detect_audio(self, audio: Union[str, np.ndarray], sr: int = 16000, max_duration: float = 4.0) -> Dict:
    """
    Detect if audio is AI-generated/deepfake using stacking meta-classifier.

    Args:
        audio: File path (str or os.PathLike) or array of samples. For
            multi-dimensional input only column 0 of axis 1 is analyzed.
        sr: Sample rate (if numpy array); ignored for files, whose own
            sample rate is used.
        max_duration: Max seconds to analyze (from the start of the clip)

    Returns:
        {"is_ai": bool, "confidence": float, "ai_probability": float, "label": str, "details": dict}

    Raises:
        RuntimeError: If the audio models were not loaded.
        ValueError: If the audio has no samples or max_duration leaves none.
    """
    if self._audio_models is None:
        raise RuntimeError("Audio models not loaded. Initialize with load_audio=True")

    if isinstance(audio, (str, os.PathLike)):
        audio_arr, sr = sf.read(os.fspath(audio))
        audio_arr = audio_arr.astype(np.float32)
    else:
        audio_arr = np.asarray(audio).astype(np.float32)

    if len(audio_arr.shape) > 1:
        audio_arr = audio_arr[:, 0]

    # Fail with a clear message instead of an opaque numpy reduction error.
    if audio_arr.size == 0:
        raise ValueError("Cannot analyze empty audio (no samples)")

    # Resample to 16kHz
    if sr != 16000:
        import librosa  # only needed when resampling
        audio_arr = librosa.resample(audio_arr, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Truncate. A non-positive sample budget would either empty the clip or,
    # for negative values, silently slice from the end.
    max_samples = int(max_duration * sr)
    if max_samples <= 0:
        raise ValueError("max_duration is too short to contain any samples")
    audio_arr = audio_arr[:max_samples]

    # Peak-normalize (all-zero audio is left untouched)
    peak = np.abs(audio_arr).max()
    if peak > 0:
        audio_arr = audio_arr / peak

    feats8 = self._extract_audio_features(audio_arr, sr)

    # Get DF_Arena_1B score (Speech DF Arena 2025, trained on 8 datasets)
    arena_score = self._arena_score(audio_arr)

    # v5: Arena is now a meta-classifier feature (not just override).
    # The -1.0 "unavailable" sentinel is clamped to 0.0 before it is fed in.
    feats = feats8 + [max(0.0, arena_score)]
    raw_prob = _logistic_predict(feats, _AUD_SCALER_MEAN, _AUD_SCALER_SCALE, _AUD_LR_COEF, _AUD_LR_INTERCEPT)

    # Feature indices: [0]=DavidCombei, [1]=Gustking, [2]=mo-thecreator,
    # [3]=spec_flat, [4]=centroid_mean, [5]=centroid_std, [6]=zcr, [7]=rolloff, [8]=Arena
    centroid_mean = feats[4]
    centroid_std = feats[5]
    spec_flat = feats[3]
    rolloff = feats[7]

    # Count how many spectral indicators suggest "real" audio.
    # Reported in details only; it does not change the decision.
    spectral_real_votes = 0
    if centroid_mean > 2000:
        spectral_real_votes += 1
    if centroid_std > 1000:
        spectral_real_votes += 1
    if spec_flat > 0.04:
        spectral_real_votes += 1
    if rolloff > 6500:
        spectral_real_votes += 1

    # v5: meta-classifier includes Arena, so minimal overrides needed
    ai_prob = raw_prob

    is_ai = ai_prob > 0.5
    confidence = abs(ai_prob - 0.5) * 2

    details = {
        "DavidCombei-1B": round(feats[0], 4),
        "Gustking": round(feats[1], 4),
        "mo-thecreator": round(feats[2], 4),
        "spectral_flatness": round(feats[3], 6),
        "centroid_mean": round(feats[4], 2),
        "centroid_std": round(feats[5], 2),
        "zcr": round(feats[6], 6),
        "rolloff_99": round(feats[7], 2),
        "spectral_real_votes": spectral_real_votes,
    }
    if arena_score >= 0:
        details["DF_Arena_1B"] = round(arena_score, 4)

    return {
        "is_ai": is_ai,
        "confidence": round(confidence, 3),
        "ai_probability": round(ai_prob, 4),
        "label": "AI-Generated" if is_ai else "Real",
        "details": details,
    }
652
+
653
def detect_audio_batch(self, audio_files: List[Union[str, np.ndarray]], sr: int = 16000,
                       max_duration: float = 4.0) -> List[Dict]:
    """Batch process multiple audio inputs.

    Args:
        audio_files: File paths or sample arrays, processed in order.
        sr: Sample rate forwarded to detect_audio (applies to array inputs).
        max_duration: Max seconds to analyze, forwarded to detect_audio.

    Returns:
        One detect_audio result dict per input, in the same order.
    """
    return [self.detect_audio(item, sr=sr, max_duration=max_duration) for item in audio_files]
656
+
657
+ # ─── TEXT DETECTION ────────────────────────────────────────────
658
+
659
def _load_text_models(self):
    """Load the text detectors: Falcon-7B pair, RoBERTa and optional fakespot.

    Sets self._tokenizer, self._observer, self._performer, self._roberta_clf
    and self._fakespot_clf (None if that model fails to load), then marks
    the modality ready via self._text_models = True.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline as hf_pipeline
    print("Loading text detectors (Binoculars + RoBERTa + fakespot)...")

    # Binoculars: Falcon-7B observer/performer pair
    observer_name = "tiiuae/falcon-7b"
    performer_name = "tiiuae/falcon-7b-instruct"

    tokenizer = AutoTokenizer.from_pretrained(observer_name)
    self._tokenizer = tokenizer
    if tokenizer.pad_token is None:
        # Fall back to EOS so padding=True works during tokenization.
        tokenizer.pad_token = tokenizer.eos_token

    def _load_pair(**load_kwargs):
        """Load observer then performer with identical options."""
        self._observer = AutoModelForCausalLM.from_pretrained(
            observer_name, device_map="auto", **load_kwargs
        )
        self._performer = AutoModelForCausalLM.from_pretrained(
            performer_name, device_map="auto", **load_kwargs
        )

    if not self._quantize_text:
        _load_pair(torch_dtype=torch.float16)
    else:
        # INT8 quantization: halves VRAM (26GB → ~13GB)
        print(" Using INT8 quantization for Falcon-7B")
        try:
            from transformers import BitsAndBytesConfig
            _load_pair(quantization_config=BitsAndBytesConfig(load_in_8bit=True))
        except (ImportError, TypeError):
            # Fallback for older transformers (<5.0)
            _load_pair(load_in_8bit=True)

    self._observer.eval()
    self._performer.eval()

    # RoBERTa ChatGPT detector (original)
    pipeline_device = 0 if self.device == "cuda" else -1
    self._roberta_clf = hf_pipeline(
        "text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta",
        device=pipeline_device, top_k=None
    )

    # fakespot-ai RoBERTa (Mozilla-backed, Apache 2.0, catches GPT technical)
    self._fakespot_clf = None
    try:
        fakespot = hf_pipeline(
            "text-classification", model="fakespot-ai/roberta-base-ai-text-detection-v1",
            device=pipeline_device, top_k=None
        )
        self._fakespot_clf = fakespot
        print(" fakespot-ai RoBERTa loaded (Mozilla-backed)")
    except Exception as e:
        print(f" Warning: fakespot-ai failed to load: {e}")

    self._text_models = True
    print("Text models loaded!")
720
+
721
def _binoculars_score(self, text: str) -> float:
    """Compute Binoculars score: lower = more likely AI

    The value returned is exp(mean log p_observer - mean log p_performer)
    over the predicted tokens (positions 1..n), i.e. a ratio of the two
    models' per-token likelihoods.
    NOTE(review): this differs from the published Binoculars statistic
    (log-perplexity / cross-perplexity); presumably the meta-classifier
    weights were fit on this exact feature, so the formula is left as is.

    Args:
        text: Text to score; tokenized with truncation at 512 tokens.

    Returns:
        The score as a float; 1.0 when the text yields no predicted token
        (a single-token input), instead of NaN from a division by zero.
    """
    inputs = self._tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    inputs = {k: v.to(self._observer.device) for k, v in inputs.items()}

    with torch.no_grad():
        obs_logits = self._observer(**inputs).logits
        # Align devices in case the two models were placed differently.
        per_logits = self._performer(**inputs).logits.to(obs_logits.device)

    # Position i predicts token i+1: drop the last logit and the first id.
    pobs = torch.log_softmax(obs_logits[:, :-1], dim=-1)
    pper = torch.log_softmax(per_logits[:, :-1], dim=-1)

    ids = inputs["input_ids"][:, 1:].to(obs_logits.device)
    log_obs = pobs.gather(-1, ids.unsqueeze(-1)).squeeze(-1)
    log_per = pper.gather(-1, ids.unsqueeze(-1)).squeeze(-1)

    mask = inputs.get("attention_mask", torch.ones_like(inputs["input_ids"]))[:, 1:].to(obs_logits.device)
    # Guard against zero predicted tokens, which would otherwise give 0/0.
    n_tokens = mask.sum().clamp(min=1)
    log_obs = (log_obs * mask).sum() / n_tokens
    log_per = (log_per * mask).sum() / n_tokens

    return float(torch.exp(log_obs - log_per))
742
+
743
+ def _roberta_ai_score(self, text: str) -> float:
744
+ """Get RoBERTa ChatGPT detector score."""
745
+ result = self._roberta_clf(text[:512])
746
+ # top_k=None returns [[{label, score}, ...]], flatten if nested
747
+ if result and isinstance(result[0], list):
748
+ result = result[0]
749
+ for r in result:
750
+ if r["label"].lower() in ["chatgpt", "fake", "ai", "1", "label_1"]:
751
+ return r["score"]
752
+ return 0.0
753
+
754
+ def _fakespot_ai_score(self, text: str) -> float:
755
+ """Get fakespot-ai RoBERTa AI score. Returns -1 if not loaded."""
756
+ if self._fakespot_clf is None:
757
+ return -1.0
758
+ try:
759
+ result = self._fakespot_clf(text[:512])
760
+ if result and isinstance(result[0], list):
761
+ result = result[0]
762
+ for r in result:
763
+ if r["label"].lower() in ["machine", "ai", "fake", "generated", "1", "label_1"]:
764
+ return r["score"]
765
+ return 0.0
766
+ except Exception:
767
+ return -1.0
768
+
769
+ @staticmethod
770
+ def _text_stats(text: str) -> list:
771
+ """Compute statistical text features: burstiness, entropy, ttr, hapax, avg_word_len."""
772
+ words = text.split()
773
+ sentences = [s.strip() for s in text.replace('!', '.').replace('?', '.').split('.') if len(s.strip()) > 5]
774
+ if len(words) < 10 or len(sentences) < 2:
775
+ return [0.0] * 5
776
+ sent_lens = [len(s.split()) for s in sentences]
777
+ mean_l, std_l = np.mean(sent_lens), np.std(sent_lens)
778
+ burstiness = (std_l - mean_l) / (std_l + mean_l) if (std_l + mean_l) > 0 else 0
779
+ freq = Counter(w.lower() for w in words)
780
+ entropy = -sum((c / len(words)) * math.log2(c / len(words)) for c in freq.values())
781
+ ttr = len(set(w.lower() for w in words)) / len(words)
782
+ hapax = sum(1 for c in freq.values() if c == 1) / len(words)
783
+ avg_word_len = np.mean([len(w) for w in words])
784
+ return [burstiness, entropy, ttr, hapax, avg_word_len]
785
+
786
+ def _extract_text_features(self, text: str) -> list:
787
+ """Extract Binoculars + RoBERTa + stats for meta-classifier."""
788
+ feats = []
789
+ feats.append(self._binoculars_score(text[:1000]))
790
+ feats.append(self._roberta_ai_score(text))
791
+ feats.extend(self._text_stats(text[:2000]))
792
+ return feats
793
+
794
def detect_text(self, text: str) -> Dict:
    """
    Detect if text is AI-generated using stacking meta-classifier + fakespot.

    Texts shorter than 50 characters are not scored. Texts with fewer than
    100 words are scored with a fixed weighted blend of the Binoculars,
    RoBERTa and (when available) fakespot scores; longer texts go through
    the logistic meta-classifier over all eight features.

    Args:
        text: Text to analyze (min ~100 chars for reliable results)

    Returns:
        {"is_ai": bool, "confidence": float, "ai_probability": float, "label": str, "details": dict}
        The too-short result additionally carries a "warning" string and
        an empty "details" dict. All values are native Python types.

    Raises:
        RuntimeError: If the text models have not been loaded.
    """
    if self._text_models is None:
        raise RuntimeError("Text models not loaded. Initialize with load_text=True")

    if len(text) < 50:
        # "details" is included (empty) so every result has the documented keys.
        return {"is_ai": False, "confidence": 0.0, "ai_probability": 0.0,
                "label": "Too short", "warning": "Text too short for reliable detection",
                "details": {}}

    feats7 = self._extract_text_features(text)
    word_count = len(text.split())
    short_text = word_count < 100

    # Get fakespot-ai score - now a meta-classifier feature (#1 by coefficient).
    # A negative score means "unavailable"; it is clamped to 0.0 in the vector.
    fakespot = self._fakespot_ai_score(text)
    feats = feats7 + [max(0.0, fakespot)]

    # For short texts (<100 words), TTR and hapax_ratio are naturally inflated
    # because words don't repeat. Fall back to Binoculars + RoBERTa + fakespot.
    if short_text:
        bino = feats[0]
        roberta = feats[1]
        # Map the Binoculars score linearly onto [0, 1]: 1.10 -> 0, 0.95 -> 1.
        bino_ai = max(0.0, min(1.0, (1.10 - bino) / 0.15))
        if fakespot >= 0:
            ai_prob = bino_ai * 0.50 + roberta * 0.25 + fakespot * 0.25
        else:
            ai_prob = bino_ai * 0.65 + roberta * 0.35
        ai_prob = max(0.0, min(1.0, ai_prob))
    else:
        # v5: fakespot is now part of the meta-classifier feature vector
        ai_prob = _logistic_predict(feats, _TXT_SCALER_MEAN, _TXT_SCALER_SCALE, _TXT_LR_COEF, _TXT_LR_INTERCEPT)

    # Coerce to native types so the result never carries NumPy scalars.
    ai_prob = float(ai_prob)
    is_ai = ai_prob > 0.5
    confidence = abs(ai_prob - 0.5) * 2

    feature_names = (
        "binoculars_score", "roberta_ai_score", "burstiness", "entropy",
        "ttr", "hapax_ratio", "avg_word_len",
    )
    details = {name: round(float(value), 4) for name, value in zip(feature_names, feats)}
    if fakespot >= 0:
        details["fakespot_ai_score"] = round(float(fakespot), 4)
    if short_text:
        details["short_text_mode"] = True

    return {
        "is_ai": is_ai,
        "confidence": round(confidence, 3),
        "ai_probability": round(ai_prob, 4),
        "label": "AI-Generated" if is_ai else "Human-Written",
        "details": details,
    }
857
+
858
def detect_text_batch(self, texts: List[str]) -> List[Dict]:
    """Run ``detect_text`` on each text in order and return the results as a list."""
    return list(map(self.detect_text, texts))
861
+
862
+ # ─── VIDEO DETECTION ───────────────────────────────────────────
863
+
864
def detect_video(self, video: str, num_frames: int = 8, analyze_audio: bool = True) -> Dict:
    """
    Detect if a video is AI-generated by analyzing frames + audio track.

    Combines image detection on sampled frames with audio detection on
    the extracted audio track (via ffmpeg). Returns separate results for
    video (frames) and audio, plus a combined probability.

    Args:
        video: Path to video file (mp4, avi, webm, etc.)
        num_frames: Number of frames to sample (default 8); must be >= 1
        analyze_audio: Also extract and analyze audio track (default True)

    Returns:
        {"is_ai": bool, "ai_probability": float, "confidence": float, "label": str,
         "video": {...frames analysis...},
         "audio": {...audio analysis or None...},
         "combined_ai_probability": float}

    Raises:
        RuntimeError: If the image models have not been loaded.
        ValueError: If ``num_frames`` is below 1, the video cannot be opened,
            its frame count cannot be read, or no frame could be decoded.
    """
    if self._image_models is None:
        raise RuntimeError("Image models not loaded. Initialize with load_image=True")
    if num_frames < 1:
        raise ValueError(f"num_frames must be at least 1, got {num_frames}")

    import cv2

    # -- Frame analysis --
    cap = cv2.VideoCapture(video)
    frame_results = []
    try:
        if not cap.isOpened():
            raise ValueError(f"Cannot open video: {video}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            raise ValueError(f"Cannot read frame count: {video}")

        # Sample evenly-spaced frame indices (skip first/last 5%)
        start = int(total_frames * 0.05)
        end = int(total_frames * 0.95)
        if end <= start:
            # Very short clip: the trimmed window is empty, use every frame.
            start, end = 0, total_frames
        indices = np.linspace(start, end - 1, num_frames, dtype=int)

        for idx in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                continue
            # OpenCV decodes to BGR; the image detector is given an RGB PIL image.
            pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frame_results.append(self.detect_image(pil_img))
    finally:
        # Always release the capture handle, including on every error path.
        cap.release()

    if not frame_results:
        raise ValueError(f"Could not read any frames from: {video}")

    ai_count = sum(1 for r in frame_results if r["is_ai"])
    video_prob = float(np.mean([r["ai_probability"] for r in frame_results]))
    # Frame-level verdict is a strict majority vote over the analyzed frames.
    video_is_ai = ai_count > len(frame_results) / 2

    video_result = {
        "is_ai": video_is_ai,
        "ai_probability": round(video_prob, 4),
        "frames_analyzed": len(frame_results),
        "frames_ai": ai_count,
        "label": "AI-Generated" if video_is_ai else "Real",
        "details": {f"frame_{i}": round(r["ai_probability"], 4) for i, r in enumerate(frame_results)},
    }

    # -- Audio analysis --
    audio_result = None
    if analyze_audio and self._audio_models is not None:
        audio_result = self._extract_and_analyze_audio(video)

    # -- Combined result --
    # Equal weight: both modalities contribute equally
    if audio_result is not None:
        audio_prob = float(audio_result["ai_probability"])
        combined_prob = 0.5 * video_prob + 0.5 * audio_prob
    else:
        combined_prob = video_prob

    is_ai = bool(combined_prob > 0.5)
    confidence = abs(combined_prob - 0.5) * 2

    return {
        "is_ai": is_ai,
        "ai_probability": round(combined_prob, 4),
        "confidence": round(confidence, 3),
        "label": "AI-Generated" if is_ai else "Real",
        "video": video_result,
        "audio": audio_result,
        "combined_ai_probability": round(combined_prob, 4),
    }
957
+
958
+ def _extract_and_analyze_audio(self, video_path: str) -> Optional[Dict]:
959
+ """Extract audio track from video via ffmpeg and run audio detection."""
960
+ import subprocess
961
+ import tempfile
962
+
963
+ tmp_wav = None
964
+ try:
965
+ tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
966
+ tmp_wav.close()
967
+
968
+ # Extract audio with ffmpeg (mono, 16kHz for our models)
969
+ result = subprocess.run(
970
+ ["ffmpeg", "-y", "-i", video_path, "-vn", "-ac", "1", "-ar", "16000", "-f", "wav", tmp_wav.name],
971
+ capture_output=True, timeout=30,
972
+ )
973
+ if result.returncode != 0:
974
+ return None # No audio track or ffmpeg error
975
+
976
+ # Check if output file has actual audio data (not just WAV header)
977
+ if os.path.getsize(tmp_wav.name) < 1000:
978
+ return None
979
+
980
+ return self.detect_audio(tmp_wav.name)
981
+ except Exception:
982
+ return None
983
+ finally:
984
+ if tmp_wav and os.path.exists(tmp_wav.name):
985
+ os.unlink(tmp_wav.name)
986
+
987
def detect_video_batch(self, video_files: List[str], num_frames: int = 8,
                       analyze_audio: bool = True) -> List[Dict]:
    """Batch process multiple videos.

    Args:
        video_files: Paths of the videos to analyze, processed in order.
        num_frames: Number of frames to sample per video (default 8).
        analyze_audio: Forwarded to ``detect_video``; defaults to True,
            which is also ``detect_video``'s own default.

    Returns:
        One ``detect_video`` result per input path, in input order.
    """
    return [
        self.detect_video(path, num_frames=num_frames, analyze_audio=analyze_audio)
        for path in video_files
    ]
990
+
991
+ # ─── CLEANUP ───────────────────────────────────────────────────
992
+
993
def unload(self, modality: str = "all"):
    """Free GPU memory for a modality: 'image', 'audio', 'text', or 'all'.

    A modality is only touched when its model container is currently
    truthy. Image and audio containers (and their auxiliary model/pipe)
    are reset to None; the text observer, performer and RoBERTa classifier
    attributes are deleted outright. The CUDA cache is emptied at the end.
    """
    def _selected(kind):
        return modality in (kind, "all")

    def _reset(attr):
        # Drop the current reference, then leave a None placeholder behind.
        delattr(self, attr)
        setattr(self, attr, None)

    if _selected("image") and self._image_models:
        _reset("_image_models")
        if self._bombek_model is not None:
            _reset("_bombek_model")

    if _selected("audio") and self._audio_models:
        for entry in self._audio_models:
            del entry["model"]
        self._audio_models = None
        if self._arena_pipe is not None:
            _reset("_arena_pipe")

    if _selected("text") and self._text_models:
        for attr in ("_observer", "_performer", "_roberta_clf"):
            delattr(self, attr)
        if self._fakespot_clf is not None:
            _reset("_fakespot_clf")
        self._text_models = None

    torch.cuda.empty_cache()
1015
+
1016
+
1017
+ # ─── Quick test ────────────────────────────────────────────────
1018
if __name__ == "__main__":
    # Manual smoke test: exercises image and audio detection against local
    # sample directories (skipped when absent) and five CIFAR-10 test images.
    banner = "=" * 60
    print(banner)
    print("AI Content Detector v2 - Stacking Ensemble Validation")
    print(banner)

    detector = AIContentDetector(load_text=False)

    # Test image: first PNG found in the AI-generated sample directory.
    ai_dir = "/home/jupyter/ai-detection/image/ai_generated"
    if os.path.exists(ai_dir):
        files = [name for name in os.listdir(ai_dir) if name.endswith(".png")]
        if files:
            result = detector.detect_image(os.path.join(ai_dir, files[0]))
            print(f"\nImage test (AI-generated): {result['label']} (prob={result['ai_probability']}, conf={result['confidence']})")

    # Test batch images: real photos, upscaled to 512x512 before detection.
    from datasets import load_dataset
    ds = load_dataset("uoft-cs/cifar10", split="test[:5]")
    upscaled = [sample["img"].resize((512, 512)) for sample in ds]
    results = detector.detect_images_batch(upscaled)
    real_count = sum(1 for r in results if not r["is_ai"])
    print(f"Image batch (5 real CIFAR-10): {real_count}/5 correctly identified as Real")

    # Test audio: first .wav (sorted) whose name contains neither
    # "synth" nor "real_speech_".
    audio_dir = "/home/jupyter/ai-detection/audio/test_audio"
    if os.path.exists(audio_dir):
        wav_files = []
        for name in sorted(os.listdir(audio_dir)):
            if not name.endswith(".wav"):
                continue
            if "synth" in name or "real_speech_" in name:
                continue
            wav_files.append(name)
        if wav_files:
            result = detector.detect_audio(os.path.join(audio_dir, wav_files[0]))
            print(f"\nAudio test ({wav_files[0]}): {result['label']} (prob={result['ai_probability']})")

    print("\nDone! Import with: from detector import AIContentDetector")