bahaeddinmselmi commited on
Commit
be2cb51
·
0 Parent(s):

Deploy VeriVid Backend to Hugging Face Spaces

Browse files
.dockerignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ __pycache__
2
+ *.pyc
3
+ .git
4
+ .env
5
+ temp/
6
+ cache/
.env.example ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Backend Environment Variables
2
+ # For production, set these in Render dashboard
3
+
4
+ HF_TOKEN=your_huggingface_token_here
5
+ SIGHTENGINE_API_USER=your_sightengine_user
6
+ SIGHTENGINE_API_SECRET=your_sightengine_secret
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .env
2
+ __pycache__/
3
+ *.pyc
4
+ temp/
5
+ cache/
6
+ *.log
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use Python 3.9
FROM python:3.9

# Install FFmpeg (Required for VeriVid pipeline)
# apt list cleanup keeps the image layer small.
RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*

# Set up a new user named "user" with user ID 1000
# (Hugging Face Spaces runs containers as a non-root uid-1000 user.)
RUN useradd -m -u 1000 user

# Switch to the "user" user
USER user

# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

# Install requirements
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Create temp directory for processing
RUN mkdir -p $HOME/app/temp

# Run the application on port 7860
# (7860 is the port Hugging Face Spaces expects the app to listen on.)
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
app/api/routes.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\backend\app\api\routes.py
2
+ from fastapi import APIRouter, BackgroundTasks, UploadFile, File, Form, Request
3
+ from fastapi.responses import JSONResponse
4
+ from slowapi import Limiter
5
+ from slowapi.util import get_remote_address
6
+ from app.services.pipeline import run_analysis_pipeline
7
+ import uuid
8
+ import os
9
+ import shutil
10
+
11
# FastAPI router mounted by the main application.
router = APIRouter()
# Rate limiter keyed by the client's remote IP address.
limiter = Limiter(key_func=get_remote_address)

# In-memory job store: job_id -> status/result dict.
# NOTE(review): never evicted, so it grows for the process lifetime;
# also lost on restart — confirm that is acceptable for this deployment.
JOBS = {}
# Scratch directory for uploaded files (../../temp relative to this module).
TEMP_DIR = os.path.join(os.path.dirname(__file__), '..', '..', 'temp')
16
+
17
@router.post("/analyze")
@limiter.limit("10/hour")  # 10 requests per hour per IP
async def start_analysis(
    request: Request,
    background_tasks: BackgroundTasks,
    url: str = Form(None),
    file: UploadFile = File(None)
):
    """
    Start video analysis.

    Accepts either a remote video URL (form field ``url``) or an uploaded
    file (multipart field ``file``). On success returns a ``job_id`` that
    can be polled via ``GET /analyze/{job_id}``.
    Rate limited to 10 requests per hour per IP.
    """
    # Validate input: at least one source must be supplied.
    if not url and not (file and file.filename):
        return JSONResponse(
            status_code=400,
            content={"error": "Please provide either a URL or upload a file"}
        )

    # Basic URL validation
    if url:
        if len(url) > 500:
            return JSONResponse(status_code=400, content={"error": "URL too long"})
        if not url.startswith(("http://", "https://")):
            return JSONResponse(status_code=400, content={"error": "Invalid URL format"})

    job_id = str(uuid.uuid4())

    # If file uploaded, save it with size limit
    file_path = None
    if file and file.filename:
        # 100MB limit
        MAX_SIZE = 100 * 1024 * 1024
        contents = await file.read()
        if len(contents) > MAX_SIZE:
            return JSONResponse(status_code=400, content={"error": "File too large (max 100MB)"})

        os.makedirs(TEMP_DIR, exist_ok=True)
        file_path = os.path.join(TEMP_DIR, f"{job_id}_upload.mp4")
        with open(file_path, 'wb') as f:
            f.write(contents)

    # Register the job only after all validation has passed. The original
    # registered it before the file-size check, which left a permanently
    # "queued" ghost entry in JOBS whenever an oversized upload was rejected.
    JOBS[job_id] = {"status": "queued"}

    # Start async processing
    background_tasks.add_task(
        run_analysis_pipeline,
        job_id,
        url,
        file_path,
        JOBS
    )

    return {"job_id": job_id, "status": "queued"}
70
+
71
@router.get("/analyze/{job_id}")
@limiter.limit("60/minute")  # 60 result checks per minute
async def get_result(request: Request, job_id: str):
    """Get analysis result by job ID"""
    # Reject anything that is not a well-formed UUID before touching the store.
    try:
        uuid.UUID(job_id)
    except ValueError:
        return JSONResponse(status_code=400, content={"error": "Invalid job ID"})

    job = JOBS.get(job_id)
    if not job:
        return JSONResponse(status_code=404, content={"status": "not_found"})
    return job
86
+
87
@router.get("/health")
async def health_check():
    """Lightweight liveness probe; also reports the in-memory job count."""
    return {
        "status": "ok",
        "jobs_in_memory": len(JOBS),
    }
app/core/config.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings
2
+ import os
3
+
4
class Settings(BaseSettings):
    """
    Settings resolve from environment variables.
    Secrets must be set in the deployment environment (Render, Hugging Face, etc).
    """

    # Hugging Face Inference API token; empty string disables HF calls downstream.
    HF_TOKEN: str = os.getenv("HF_TOKEN", "")
    # SightEngine API credentials used for the primary visual analysis.
    SIGHTENGINE_API_USER: str = os.getenv("SIGHTENGINE_API_USER", "")
    SIGHTENGINE_API_SECRET: str = os.getenv("SIGHTENGINE_API_SECRET", "")

    class Config:
        # During local development, values may also come from a .env file.
        env_file = ".env"
        env_file_encoding = "utf-8"


# Module-level singleton imported across the app (e.g. app.services.hf_inference).
settings = Settings()
app/core/scoring.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\backend\app\core\scoring.py
2
+
3
def calculate_risk(signals: dict):
    """
    Combine per-channel detection signals into a final risk verdict.

    Expects ``signals`` with optional keys 'visual', 'audio', 'metadata'
    and 'heuristics'. Returns a tuple ``(score, confidence, recommendation)``
    where score is an integer 0-100, confidence is HIGH/MEDIUM/LOW and
    recommendation is one of HIGH RISK / REVIEW / SAFE. Weights and
    thresholds are calibrated around SightEngine's visual detector.
    """
    visual = signals.get('visual', {})
    v_avg = visual.get('avg_prob', 0)
    v_max = visual.get('max_prob', 0)
    frame_count = visual.get('frame_count', 1)

    audio_prob = signals.get('audio', {}).get('spoof_prob', 0)
    meta_prob = signals.get('metadata', {}).get('risk_score', 0)
    heur_prob = signals.get('heuristics', {}).get('risk_score', 0)

    # Blend the average with a slightly discounted max so a single strongly
    # AI-looking frame (localized manipulation) still raises the score.
    visual_prob = max(v_avg, v_max * 0.9)

    # SightEngine is the most reliable channel, so visual carries most weight:
    # visual 0.60, audio 0.10, metadata 0.20, heuristics 0.10.
    final_score = (
        visual_prob * 100 * 0.60 +
        audio_prob * 100 * 0.10 +
        meta_prob * 100 * 0.20 +
        heur_prob * 100 * 0.10
    )

    # Confidence reflects how much visual evidence backs the score.
    if frame_count >= 3 and visual_prob > 0.5:
        confidence = "HIGH"
    elif frame_count >= 2 or visual_prob > 0.3:
        confidence = "MEDIUM"
    else:
        confidence = "LOW"

    # Recommendation thresholds: >=50 flag, >=25 manual review, else safe.
    if final_score >= 50:
        recommendation = "HIGH RISK"
    elif final_score >= 25:
        recommendation = "REVIEW"
    else:
        recommendation = "SAFE"

    return round(final_score), confidence, recommendation
app/services/downloader.py ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\backend\app\services\downloader.py
2
+ """
3
+ Smart Scraper Pipeline with Cobalt API
4
+ =======================================
5
+ This module uses a 3-layer strategy for video extraction:
6
+ 1. Cobalt API (external, avoids IP blocks)
7
+ 2. yt-dlp streaming (local, zero storage)
8
+ 3. yt-dlp download (fallback, uses temp storage)
9
+
10
+ Storage usage: ~500KB per analysis (5 frames) instead of 50-100MB per video.
11
+ """
12
+
13
+ import subprocess
14
+ import os
15
+ import glob
16
+ import requests
17
+ from urllib.parse import urlparse, urlunparse
18
+ import yt_dlp
19
+
20
# Scratch directory for extracted frames/audio (../../temp relative to this module).
TEMP_DIR = os.path.join(os.path.dirname(__file__), '..', '..', 'temp')

# Cobalt API endpoints (public instances)
# Tried in order by get_cobalt_url(); first working instance wins.
COBALT_ENDPOINTS = [
    "https://api.cobalt.tools",
    "https://co.wuk.sh",  # Backup instance
]
27
+
28
+
29
def ensure_temp_dir():
    """Create TEMP_DIR if it does not already exist (idempotent)."""
    os.makedirs(TEMP_DIR, exist_ok=True)
31
+
32
+
33
def clean_temp(job_id: str):
    """
    Best-effort removal of every temp file belonging to *job_id*.

    Files that vanish concurrently or cannot be deleted (e.g. still held
    open on Windows) are skipped silently; cleanup must never crash the
    pipeline.
    """
    pattern = os.path.join(TEMP_DIR, f"{job_id}*")
    for path in glob.glob(pattern):
        try:
            os.remove(path)
        except OSError:
            # Narrowed from a bare `except:` so programming errors and
            # KeyboardInterrupt are no longer swallowed here.
            pass
41
+
42
+
43
def normalize_url(url: str) -> str:
    """Clean common tracking params from YouTube Shorts to improve yt-dlp success."""
    try:
        parts = urlparse(url)
        # Only Shorts links get their query string stripped; every other
        # URL is passed through untouched.
        if "youtube.com" in parts.netloc and "/shorts/" in parts.path:
            return urlunparse(parts._replace(query=""))
    except Exception:
        # Unparseable input: hand it back unchanged and let yt-dlp decide.
        pass
    return url
53
+
54
+
55
+ # ============================================================
56
+ # LAYER 1: COBALT API (Best - External service, avoids IP blocks)
57
+ # ============================================================
58
+
59
def get_cobalt_url(url: str) -> tuple[str, str]:
    """
    Query Cobalt API to get direct stream URL.
    Returns (video_url, audio_url) or (None, None) on failure.

    Cobalt handles TikTok, Instagram, YouTube, Twitter, etc. perfectly
    without our server IP getting blocked.
    """
    url = normalize_url(url)

    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
    }

    payload = {
        "url": url,
        "vCodec": "h264",
        "vQuality": "720",
        "aFormat": "mp3",
        "isNoTTWatermark": True,  # TikTok without watermark
        "isAudioOnly": False,
    }

    # Try each public instance in order; the first usable answer wins.
    for endpoint in COBALT_ENDPOINTS:
        try:
            print(f"[Cobalt] Trying {endpoint}...")
            response = requests.post(
                f"{endpoint}/api/json",
                json=payload,
                headers=headers,
                timeout=15
            )

            if response.status_code != 200:
                print(f"[Cobalt] {endpoint} returned {response.status_code}")
                continue

            data = response.json()
            status = data.get("status")

            if status == "stream" or status == "redirect":
                # Direct stream URL available
                stream_url = data.get("url")
                print(f"[Cobalt] Success! Got stream URL from {endpoint}")
                return stream_url, None

            elif status == "picker":
                # Multiple options (e.g., video + audio separate):
                # keep the first video and the first audio entry seen.
                picker = data.get("picker", [])
                video_url = None
                audio_url = None
                for item in picker:
                    if item.get("type") == "video" and not video_url:
                        video_url = item.get("url")
                    elif item.get("type") == "audio" and not audio_url:
                        audio_url = item.get("url")
                if video_url:
                    print(f"[Cobalt] Success! Got picker URLs from {endpoint}")
                    return video_url, audio_url
                # NOTE(review): a picker with audio but no video falls
                # through to the next endpoint — confirm that is intended.

            elif status == "error":
                print(f"[Cobalt] Error from {endpoint}: {data.get('text', 'Unknown error')}")
                continue

        except requests.Timeout:
            print(f"[Cobalt] Timeout from {endpoint}")
            continue
        except Exception as e:
            # Connection errors / invalid JSON: move on to the next instance.
            print(f"[Cobalt] Exception from {endpoint}: {e}")
            continue

    print("[Cobalt] All endpoints failed, falling back to yt-dlp")
    return None, None
133
+
134
+
135
def smart_get_stream_url(url: str) -> tuple[str, str, str]:
    """
    Resolve a playable stream URL via the 3-layer strategy.

    Layer 1: Cobalt API (external, fast, avoids IP blocks)
    Layer 2: yt-dlp direct URL (local, may get blocked)
    Layer 3: give up — caller should use the download fallback

    Returns (video_url, audio_url, source) where source is 'cobalt',
    'ytdlp', or None when both resolvers failed.
    """
    # Resolvers in priority order, paired with their source tag.
    resolvers = (
        (get_cobalt_url, "cobalt"),
        (get_direct_url, "ytdlp"),
    )
    for resolve, source in resolvers:
        video_url, audio_url = resolve(url)
        if video_url:
            return video_url, audio_url, source

    # Layer 3: signal the caller to fall back to a full download.
    return None, None, None
157
+
158
+
159
def get_video_info(url: str):
    """Extract metadata without downloading.

    Returns a dict of basic title/format fields, or None on any
    extraction failure.
    """
    url = normalize_url(url)
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'skip_download': True,  # metadata only — never fetch media
        'noplaylist': True,
        'geo_bypass': True,
        'extractor_args': {
            'youtube': {
                # NOTE(review): android player client presumably chosen to
                # dodge web-client bot checks — verify it still works with
                # current yt-dlp versions.
                'player_client': ['android'],
                'geo_bypass_country': ['US']
            }
        },
        'http_headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        },
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
            # Project only the fields the pipeline consumes downstream.
            return {
                "title": info.get('title'),
                "thumbnail": info.get('thumbnail'),
                "duration": info.get('duration'),
                "uploader": info.get('uploader'),
                "view_count": info.get('view_count'),
                "fps": info.get('fps'),
                "vcodec": info.get('vcodec'),
                "acodec": info.get('acodec'),
                "width": info.get('width'),
                "height": info.get('height'),
            }
    except Exception as e:
        print(f"yt-dlp info error: {e}")
        return None
197
+
198
+
199
def get_direct_url(url: str) -> tuple[str, str]:
    """
    Get direct video and audio URLs without downloading.
    Returns (video_url, audio_url) - audio_url may be None.
    """
    url = normalize_url(url)
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'skip_download': True,
        'noplaylist': True,
        'geo_bypass': True,
        # Cap at 720p mp4 to keep downstream ffmpeg work cheap.
        'format': 'bestvideo[ext=mp4][height<=720]/best[ext=mp4]/best',
        'extractor_args': {
            'youtube': {
                'player_client': ['android'],
                'geo_bypass_country': ['US']
            }
        },
        'http_headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        },
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
            video_url = info.get('url')

            # Try to get audio URL separately for platforms with separated streams
            # (an entry with audio codec but no video codec is audio-only).
            audio_url = None
            if info.get('requested_formats'):
                for fmt in info['requested_formats']:
                    if fmt.get('acodec') != 'none' and fmt.get('vcodec') == 'none':
                        audio_url = fmt.get('url')
                        break

            return video_url, audio_url
    except Exception as e:
        print(f"yt-dlp URL extraction error: {e}")
        return None, None
240
+
241
+
242
def stream_extract_frames(url: str, job_id: str, max_frames: int = 5, duration: int = 30) -> list:
    """
    SMART STREAMING: Uses Cobalt API first, then yt-dlp fallback.

    Pipeline: URL → Cobalt/yt-dlp → ffmpeg → frame JPEGs

    Only the small frame images (~100KB each) are saved to disk.
    Returns a sorted list of frame paths (possibly empty on failure).
    """
    ensure_temp_dir()
    frame_pattern = os.path.join(TEMP_DIR, f"{job_id}_frame_%03d.jpg")

    # Get streaming URL using smart 3-layer strategy
    video_url, _, source = smart_get_stream_url(url)
    if not video_url:
        print(f"[{job_id}] Could not get stream URL from any source")
        return []

    print(f"[{job_id}] Using stream URL from: {source}")

    try:
        # Calculate fps to get max_frames from duration seconds
        # e.g., 5 frames from 30 seconds = 1 frame every 6 seconds = 0.167 fps
        fps = max_frames / duration

        # FFmpeg reads directly from the URL (no disk write for video)
        cmd = [
            'ffmpeg',
            '-t', str(duration),           # Only process first N seconds
            '-i', video_url,               # Input directly from URL
            '-vf', f'fps={fps}',           # Extract at calculated fps
            '-frames:v', str(max_frames),  # Max frames
            '-q:v', '2',                   # High quality JPEG
            '-y',                          # Overwrite
            frame_pattern
        ]

        result = subprocess.run(
            cmd,
            capture_output=True,
            timeout=90,
            # NOTE(review): FFMPEG_HTTP_SEEKABLE is not a documented ffmpeg
            # environment variable — confirm it has the intended effect.
            env={**os.environ, 'FFMPEG_HTTP_SEEKABLE': '1'}
        )

        if result.returncode != 0:
            # Deliberately non-fatal: partial frames may still have been
            # written before ffmpeg bailed out, so we continue to the glob.
            print(f"FFmpeg stderr: {result.stderr.decode()[:500]}")

        # Find extracted frames
        frames = sorted(glob.glob(os.path.join(TEMP_DIR, f"{job_id}_frame_*.jpg")))
        return frames

    except subprocess.TimeoutExpired:
        print("FFmpeg streaming timeout")
        return []
    except Exception as e:
        print(f"FFmpeg streaming error: {e}")
        return []
298
+
299
+
300
def stream_extract_audio(url: str, job_id: str, duration: int = 30) -> str:
    """
    Extract audio using smart 3-layer strategy (Cobalt first, then yt-dlp).
    Output is a small WAV file (~500KB for 30s mono 16kHz).
    Returns the WAV path, or None when no source/stream could be read.
    """
    ensure_temp_dir()
    audio_path = os.path.join(TEMP_DIR, f"{job_id}_audio.wav")

    # Get URL using smart 3-layer strategy; prefer a dedicated audio
    # stream when the resolver provided one.
    video_url, audio_url, source = smart_get_stream_url(url)
    source_url = audio_url or video_url

    if not source_url:
        print(f"[{job_id}] Could not get URL for audio extraction")
        return None

    print(f"[{job_id}] Using audio source from: {source}")

    try:
        cmd = [
            'ffmpeg',
            '-t', str(duration),     # Only first N seconds
            '-i', source_url,        # Input from URL
            '-vn',                   # No video
            '-acodec', 'pcm_s16le',  # PCM 16-bit
            '-ar', '16000',          # 16kHz (speech model compatible)
            '-ac', '1',              # Mono
            '-y',
            audio_path
        ]

        result = subprocess.run(cmd, capture_output=True, timeout=60)

        if result.returncode != 0:
            # Non-fatal: a partially written file may still be usable;
            # the final existence check decides.
            print(f"Audio extraction stderr: {result.stderr.decode()[:300]}")

        return audio_path if os.path.exists(audio_path) else None

    except Exception as e:
        print(f"Audio streaming error: {e}")
        return None
341
+
342
+
343
+ # ============================================================
344
+ # LEGACY FUNCTIONS (kept for backward compatibility / fallback)
345
+ # ============================================================
346
+
347
def download_video(url: str, job_id: str) -> str:
    """
    LEGACY: Download video to temp directory.
    Use stream_extract_frames() instead to avoid disk usage.

    Returns the local mp4 path, or None on failure.
    """
    url = normalize_url(url)
    ensure_temp_dir()
    output_path = os.path.join(TEMP_DIR, f"{job_id}.mp4")

    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'outtmpl': output_path,
        'noplaylist': True,
        'geo_bypass': True,
        'retries': 3,
        'fragment_retries': 3,
        'extractor_args': {
            'youtube': {
                'player_client': ['android'],
                'geo_bypass_country': ['US']
            }
        },
        # Only the first 30 seconds are needed for analysis.
        'download_ranges': lambda info, ydl: [{'start_time': 0, 'end_time': 30}],
        'force_keyframes_at_cuts': True,
        # Cap at 720p mp4; merge separated video+audio back into mp4.
        'format': 'bestvideo[ext=mp4][height<=720]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        'merge_output_format': 'mp4',
        'http_headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        },
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        # Existence check doubles as the success signal.
        return output_path if os.path.exists(output_path) else None
    except Exception as e:
        print(f"yt-dlp download error: {e}")
        return None
386
+
387
+
388
def extract_frames(video_path: str, job_id: str, fps: float = 0.5, max_frames: int = 5) -> list:
    """
    LEGACY: Extract frames from a local video file.
    Use stream_extract_frames() instead to avoid disk usage.

    Returns a sorted list of JPEG frame paths (possibly empty on failure).
    """
    ensure_temp_dir()
    frame_pattern = os.path.join(TEMP_DIR, f"{job_id}_frame_%03d.jpg")

    try:
        # '-y' must precede the output file: ffmpeg ignores options that
        # trail the last output ("Trailing option(s) found..."), so the
        # original command could stall on an overwrite prompt. This now
        # matches the option order used by stream_extract_frames().
        cmd = [
            'ffmpeg', '-i', video_path,
            '-vf', f'fps={fps}',
            '-frames:v', str(max_frames),
            '-q:v', '2',
            '-y',
            frame_pattern
        ]

        subprocess.run(cmd, capture_output=True, timeout=60)
        frames = sorted(glob.glob(os.path.join(TEMP_DIR, f"{job_id}_frame_*.jpg")))
        return frames

    except subprocess.TimeoutExpired:
        print("FFmpeg timeout")
        return []
    except Exception as e:
        print(f"FFmpeg error: {e}")
        return []
416
+
417
+
418
def extract_audio(video_path: str, job_id: str) -> str:
    """
    LEGACY: Extract audio from a local video file.
    Use stream_extract_audio() instead.

    Returns the 16kHz mono WAV path, or None on failure.
    """
    ensure_temp_dir()
    audio_path = os.path.join(TEMP_DIR, f"{job_id}_audio.wav")

    try:
        # '-y' must precede the output file: ffmpeg ignores trailing
        # options after the last output, so the original command could
        # stall on an overwrite prompt. Now matches stream_extract_audio().
        cmd = [
            'ffmpeg', '-i', video_path,
            '-vn',                   # No video
            '-acodec', 'pcm_s16le',  # PCM 16-bit
            '-ar', '16000',          # 16kHz (speech model compatible)
            '-ac', '1',              # Mono
            '-y',
            audio_path
        ]

        subprocess.run(cmd, capture_output=True, timeout=60)
        return audio_path if os.path.exists(audio_path) else None

    except Exception as e:
        print(f"Audio extraction error: {e}")
        return None
app/services/hf_inference.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\backend\app\services\hf_inference.py
2
+ import os
3
+ import requests
4
+ from app.core.config import settings
5
+
6
# Fallback HuggingFace models
# Each entry: (model repo id, label substrings that indicate an AI-generated image).
MODELS = [("Organika/sdxl-detector", ["artificial", "ai", "synthetic"])]
8
+
9
def call_hf_model(model_name: str, image_bytes: bytes, ai_labels: list) -> float:
    """
    Call a HuggingFace image-classification model as a fallback detector.

    Returns the probability that the image is AI-generated, 0 when no
    matching label is found, or None when no token is configured or the
    call fails.
    """
    if not settings.HF_TOKEN:
        return None

    headers = {
        "Authorization": f"Bearer {settings.HF_TOKEN}",
        "Content-Type": "image/jpeg",
    }

    try:
        url = f"https://router.huggingface.co/hf-inference/models/{model_name}"
        response = requests.post(url, headers=headers, data=image_bytes, timeout=45)

        if response.status_code != 200:
            return None
        # The router sometimes answers with an HTML error page instead of JSON.
        if response.text.startswith('<!doctype'):
            return None

        result = response.json()

        for item in result:
            label = str(item.get('label', '')).lower()
            score = float(item.get('score', 0))

            # "artificial"/"ai"/"synthetic"-style labels: score is the AI prob.
            for ai_label in ai_labels:
                if ai_label in label:
                    return score
            # "human"/"real" labels: invert to get the AI probability.
            if 'human' in label or 'real' in label:
                return 1 - score
        return 0
    except (requests.RequestException, ValueError, TypeError, AttributeError):
        # Narrowed from a bare `except:`; network failures and malformed
        # responses are treated as "no signal" while real bugs surface.
        return None
42
+
43
def analyze_visual_fallback(frame_paths: list) -> dict:
    """
    Fallback visual analysis using HuggingFace when SightEngine is unavailable.

    Scores up to the first 3 frames and aggregates average/max probabilities.
    Returns a dict with avg_prob, max_prob, frame_count and details.
    """
    scores = []

    for path in frame_paths[:3]:  # Only 3 frames for fallback
        try:
            with open(path, 'rb') as f:
                img_bytes = f.read()
        except OSError:
            # Frame unreadable or missing: skip it rather than abort the batch.
            # (Narrowed from a bare `except:` around the whole body;
            # call_hf_model already handles its own failures by returning None.)
            continue

        for model_name, ai_labels in MODELS:
            score = call_hf_model(model_name, img_bytes, ai_labels)
            if score is not None:
                scores.append(score)
                break

    if scores:
        return {
            "avg_prob": sum(scores) / len(scores),
            "max_prob": max(scores),
            "frame_count": len(scores),
            "details": f"HuggingFace fallback: {len(scores)} frames analyzed"
        }
    return {"avg_prob": 0, "max_prob": 0, "frame_count": 0, "details": "Fallback failed"}
68
+
69
def analyze_audio_ai(file_path: str, audio_path: str = None):
    """
    Real audio analysis for deepfake/synthetic speech detection.
    Uses HuggingFace audio classification models.

    Returns a dict with at least spoof_prob, details and confidence.
    NOTE(review): ``file_path`` is never read in this function — confirm
    whether it can be dropped from callers.
    """
    if not audio_path or not os.path.exists(audio_path):
        return {"spoof_prob": 0, "details": "No audio track.", "confidence": "high"}

    audio_size = os.path.getsize(audio_path)
    # Files under ~1KB carry no usable signal.
    if audio_size < 1000:
        return {"spoof_prob": 0.1, "details": "Silent or very short audio.", "confidence": "low"}

    if not settings.HF_TOKEN:
        # Fallback to heuristic analysis without API
        return _analyze_audio_heuristic(audio_path, audio_size)

    # Try HuggingFace audio deepfake detection model
    try:
        with open(audio_path, 'rb') as f:
            audio_bytes = f.read()

        # Use speech/audio classification model for deepfake detection
        # Model: microsoft/wavlm-base-plus-sv (speaker verification - good for detecting synthetic)
        # Alternative: facebook/wav2vec2-base for general audio analysis
        headers = {
            "Authorization": f"Bearer {settings.HF_TOKEN}",
            "Content-Type": "audio/wav",
        }

        # Try audio classification
        model_url = "https://router.huggingface.co/hf-inference/models/facebook/wav2vec2-base-960h"
        response = requests.post(model_url, headers=headers, data=audio_bytes, timeout=30)

        # The router may reply with an HTML error page; treat that as failure.
        if response.status_code == 200 and not response.text.startswith('<!doctype'):
            result = response.json()
            # Analyze response for synthetic speech indicators
            return _parse_audio_result(result, audio_size)
        else:
            # API failed, use heuristic
            return _analyze_audio_heuristic(audio_path, audio_size)

    except Exception as e:
        print(f"Audio HF inference error: {e}")
        return _analyze_audio_heuristic(audio_path, audio_size)
113
+
114
+
115
+ def _analyze_audio_heuristic(audio_path: str, audio_size: int) -> dict:
116
+ """
117
+ Heuristic audio analysis when API is unavailable.
118
+ Analyzes file characteristics as proxy signals.
119
+ """
120
+ import wave
121
+
122
+ try:
123
+ with wave.open(audio_path, 'rb') as wav:
124
+ framerate = wav.getframerate()
125
+ nchannels = wav.getnchannels()
126
+ nframes = wav.getnframes()
127
+ duration = nframes / framerate if framerate > 0 else 0
128
+
129
+ signals = []
130
+ spoof_prob = 0.0
131
+
132
+ # Check for TTS-typical characteristics
133
+ # Many TTS systems output at exactly 16kHz or 22050Hz
134
+ if framerate in [16000, 22050, 24000]:
135
+ spoof_prob += 0.15
136
+ signals.append(f"TTS-common sample rate ({framerate}Hz)")
137
+
138
+ # Very short duration with speech suggests clip
139
+ if 0 < duration < 3:
140
+ spoof_prob += 0.1
141
+ signals.append("Very short audio clip")
142
+
143
+ # Mono audio is common in TTS
144
+ if nchannels == 1:
145
+ spoof_prob += 0.05
146
+ signals.append("Mono audio (common in TTS)")
147
+
148
+ # File size vs duration ratio (synthetic often has consistent bitrate)
149
+ if duration > 0:
150
+ kb_per_second = (audio_size / 1024) / duration
151
+ if 28 < kb_per_second < 35: # Very consistent 256kbps range
152
+ spoof_prob += 0.1
153
+ signals.append("Uniform bitrate pattern")
154
+
155
+ # Cap probability
156
+ spoof_prob = min(spoof_prob, 0.5)
157
+
158
+ details = "; ".join(signals) if signals else f"Audio analyzed ({audio_size // 1024}KB, {duration:.1f}s)"
159
+ confidence = "medium" if signals else "low"
160
+
161
+ return {
162
+ "spoof_prob": round(spoof_prob, 2),
163
+ "details": details,
164
+ "confidence": confidence,
165
+ "duration_s": round(duration, 1),
166
+ "sample_rate": framerate
167
+ }
168
+
169
+ except Exception as e:
170
+ # Can't parse as WAV, return minimal info
171
+ return {
172
+ "spoof_prob": 0.15,
173
+ "details": f"Audio present but unparseable ({audio_size // 1024}KB)",
174
+ "confidence": "low"
175
+ }
176
+
177
+
178
+ def _parse_audio_result(result, audio_size: int) -> dict:
179
+ """Parse HuggingFace audio model response."""
180
+ # wav2vec2 returns transcription, not classification
181
+ # We use transcription quality as a proxy signal
182
+
183
+ if isinstance(result, dict) and 'text' in result:
184
+ text = result.get('text', '').strip()
185
+
186
+ if not text:
187
+ return {
188
+ "spoof_prob": 0.1,
189
+ "details": "No speech detected in audio.",
190
+ "confidence": "medium"
191
+ }
192
+
193
+ # Very clean transcription can indicate TTS (natural speech has more disfluencies)
194
+ word_count = len(text.split())
195
+
196
+ # Short, clean phrases are more likely TTS
197
+ if word_count < 5 and len(text) > 10:
198
+ return {
199
+ "spoof_prob": 0.25,
200
+ "details": f"Short clear speech detected: '{text[:50]}...'",
201
+ "confidence": "medium",
202
+ "transcript_preview": text[:100]
203
+ }
204
+
205
+ return {
206
+ "spoof_prob": 0.15,
207
+ "details": f"Speech detected ({word_count} words).",
208
+ "confidence": "medium",
209
+ "transcript_preview": text[:100]
210
+ }
211
+
212
+ return {
213
+ "spoof_prob": 0.15,
214
+ "details": f"Audio analyzed ({audio_size // 1024}KB).",
215
+ "confidence": "low"
216
+ }
app/services/local_signals.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\backend\app\services\local_signals.py
2
+ import subprocess
3
+ import json
4
+ import os
5
+
6
def get_video_metadata(file_path: str) -> dict:
    """
    Extract container/stream metadata with ffprobe.

    Returns the parsed JSON dict, or {} when the file is missing, ffprobe
    fails/times out/is not installed, or its output is not valid JSON.
    """
    if not file_path or not os.path.exists(file_path):
        return {}

    try:
        cmd = [
            'ffprobe', '-v', 'quiet',
            '-print_format', 'json',
            '-show_format', '-show_streams',
            file_path
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        return json.loads(result.stdout) if result.stdout else {}
    except (OSError, subprocess.SubprocessError, json.JSONDecodeError):
        # Narrowed from a bare `except:`: missing ffprobe binary, timeout,
        # and malformed JSON all degrade to "no metadata" without masking bugs.
        return {}
22
+
23
+ def analyze_metadata(file_path: str, video_info: dict = None):
24
+ """
25
+ Step 2: Local Signals (Metadata Analysis)
26
+ """
27
+ risk_score = 0
28
+ flags = []
29
+
30
+ # If we have yt-dlp info
31
+ if video_info:
32
+ # Check for unusual resolution
33
+ width = video_info.get('width', 0)
34
+ height = video_info.get('height', 0)
35
+
36
+ if width and height:
37
+ # 9:16 ratio with very high res might be suspicious for UGC
38
+ if width > 1080 and height > 1920:
39
+ risk_score += 0.1
40
+ flags.append("Unusually high resolution for short-form content")
41
+
42
+ # Check FPS
43
+ fps = video_info.get('fps', 0)
44
+ if fps and fps > 30:
45
+ risk_score += 0.05
46
+ flags.append(f"High framerate ({fps}fps) uncommon for UGC")
47
+
48
+ # Check codec
49
+ vcodec = video_info.get('vcodec', '')
50
+ if vcodec and 'av1' in vcodec.lower():
51
+ risk_score += 0.1
52
+ flags.append("AV1 codec often used by AI rendering tools")
53
+
54
+ # If we have the actual file, do deeper analysis
55
+ if file_path and os.path.exists(file_path):
56
+ meta = get_video_metadata(file_path)
57
+
58
+ if meta:
59
+ fmt = meta.get('format', {})
60
+
61
+ # Check for missing encoder info (stripped metadata)
62
+ tags = fmt.get('tags', {})
63
+ if not tags.get('encoder') and not tags.get('creation_time'):
64
+ risk_score += 0.15
65
+ flags.append("Missing encoder/creation metadata (possibly stripped)")
66
+
67
+ # Check bitrate
68
+ bitrate = int(fmt.get('bit_rate', 0))
69
+ if bitrate > 0:
70
+ # Very low bitrate for resolution = re-encoding
71
+ duration = float(fmt.get('duration', 0))
72
+ size = int(fmt.get('size', 0))
73
+ if duration > 0 and size > 0:
74
+ expected_bitrate = size * 8 / duration
75
+ if expected_bitrate < 500000: # Less than 500kbps
76
+ risk_score += 0.1
77
+ flags.append("Low bitrate suggests heavy re-encoding")
78
+
79
+ # Default if no issues found
80
+ if not flags:
81
+ flags.append("No metadata anomalies detected")
82
+
83
+ return {
84
+ "risk_score": min(risk_score, 1.0), # Cap at 1.0
85
+ "flags": flags,
86
+ "details": "; ".join(flags)
87
+ }
88
+
89
def analyze_heuristics(file_path: str, meta: dict, video_info: dict = None):
    """
    Step 5: Heuristic Analysis

    Platform-vs-production-quality plausibility checks, layered on the
    metadata result in *meta*. ``file_path`` is accepted for interface
    parity but not read here.
    """
    score = 0
    flags = []

    if video_info:
        width = video_info.get('width', 0)
        # 4K+ uploads read as studio output, unusual for short-form UGC.
        if width and width >= 2160:
            score += 0.2
            flags.append("Studio-quality resolution unusual for platform")

        duration = video_info.get('duration', 0)
        if duration and duration < 15 and width and width > 1080:
            score += 0.1
            flags.append("Short clip with high production quality")

    # Heavy compression flagged upstream can mask manipulation artifacts.
    if meta and 'flags' in meta and 'Low bitrate' in str(meta.get('flags', [])):
        score += 0.05
        flags.append("Compression artifacts may hide manipulation")

    if not flags:
        flags.append("No heuristic red flags")

    return {
        "risk_score": min(score, 1.0),
        "flags": flags,
        "details": "; ".join(flags)
    }
app/services/pipeline.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\backend\app\services\pipeline.py
2
+ """
3
+ Analysis Pipeline with Zero-Storage Streaming
4
+ ==============================================
5
+ For URL-based analysis: Uses streaming to avoid saving full video files.
6
+ For uploaded files: Uses traditional file-based processing.
7
+ """
8
+
9
+ import os
10
+ import json
11
+ import hashlib
12
+ from datetime import datetime
13
+
14
+ from app.services.downloader import (
15
+ get_video_info,
16
+ clean_temp,
17
+ # Streaming functions (zero storage)
18
+ stream_extract_frames,
19
+ stream_extract_audio,
20
+ # Legacy functions (for uploaded files)
21
+ extract_frames,
22
+ extract_audio
23
+ )
24
+ from app.services.local_signals import analyze_metadata, analyze_heuristics
25
+ from app.services.sightengine import analyze_frames_with_sightengine
26
+ from app.services.hf_inference import analyze_visual_fallback, analyze_audio_ai
27
+ from app.core.scoring import calculate_risk
28
+
29
+ # Cache
30
+ CACHE_DIR = os.path.join(os.path.dirname(__file__), '..', '..', 'cache')
31
+
32
def get_cache_key(url: str) -> str:
    """Derive a stable cache filename stem from *url* (md5 hex digest; non-cryptographic use)."""
    digest = hashlib.md5(url.encode())
    return digest.hexdigest()
34
+
35
def get_cached_result(url: str):
    """
    Return a previously cached analysis result for *url*, or None.

    Cache entries are JSON files under CACHE_DIR, keyed by md5 of the URL,
    and expire after 24 hours (86400 s). A missing, corrupt or unreadable
    entry is treated as a cache miss, never raised to the caller.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    cache_file = os.path.join(CACHE_DIR, f"{get_cache_key(url)}.json")
    if os.path.exists(cache_file):
        try:
            with open(cache_file, 'r') as f:
                data = json.load(f)
            # Fallback date guarantees an expired entry when 'cached_at' is absent.
            cached_time = datetime.fromisoformat(data.get('cached_at', '2000-01-01'))
            if (datetime.now() - cached_time).total_seconds() < 86400:
                return data.get('result')
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and a malformed
            # fromisoformat string. Narrowed from a bare `except:` that also
            # swallowed KeyboardInterrupt/SystemExit.
            pass
    return None
48
+
49
def save_to_cache(url: str, result: dict):
    """
    Persist *result* for *url* as a timestamped JSON file under CACHE_DIR.

    Caching is best-effort: serialization or filesystem failures are
    deliberately ignored so a full analysis job never fails just because
    its result could not be cached.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    cache_file = os.path.join(CACHE_DIR, f"{get_cache_key(url)}.json")
    try:
        with open(cache_file, 'w') as f:
            json.dump({'cached_at': datetime.now().isoformat(), 'url': url, 'result': result}, f)
    except (OSError, TypeError, ValueError):
        # TypeError/ValueError: result contains a non-JSON-serializable value.
        # Narrowed from a bare `except:` that also swallowed
        # KeyboardInterrupt/SystemExit.
        pass
57
+
58
+
59
async def run_analysis_pipeline(job_id: str, url: str, uploaded_file_path: str, jobs_db: dict):
    """
    Main analysis pipeline with ZERO-STORAGE streaming for URL analysis.

    For URLs: Streams video directly from platform -> ffmpeg -> frames (no video saved to disk)
    For uploads: Uses traditional file-based processing

    Args:
        job_id: Key into jobs_db; also used to name temp files for cleanup.
        url: Video URL to analyze. Mutually exclusive with uploaded_file_path
            (if both are set, the uploaded file is NOT preferred: the URL
            streaming branch runs only when uploaded_file_path is falsy).
        uploaded_file_path: Path to an already-uploaded local video file.
        jobs_db: In-memory job store. This coroutine communicates ONLY by
            mutating jobs_db[job_id] to {"status": "processing"/"completed"/
            "failed", plus "result" or "error"}; it returns None.
    """
    print(f"[{job_id}] Starting analysis for URL: {url}")
    jobs_db[job_id]["status"] = "processing"

    try:
        # Check cache: a URL analyzed within the last 24h is returned as-is
        # (only the job id is rewritten to the current job).
        if url:
            cached = get_cached_result(url)
            if cached:
                print(f"[{job_id}] Cache hit!")
                cached['id'] = job_id
                jobs_db[job_id] = {"status": "completed", "result": cached}
                return

        # Get video info (does not download). Falls back to a stub dict so
        # the result-building code below can always call video_info.get(...).
        video_info = None
        if url:
            print(f"[{job_id}] Fetching video info...")
            video_info = get_video_info(url)
        if not video_info:
            video_info = {"thumbnail": None, "title": "Unknown"}

        frame_paths = []
        audio_path = None
        video_path = None  # Only set for uploaded files (or the download fallback)

        # ============================================
        # PATH A: URL-based analysis (try streaming first, fallback to download)
        # ============================================
        if url and not uploaded_file_path:
            print(f"[{job_id}] STREAMING MODE: Attempting to extract frames directly from URL...")
            frame_paths = stream_extract_frames(url, job_id, max_frames=5, duration=30)

            # If streaming failed, fallback to traditional download
            if not frame_paths:
                print(f"[{job_id}] Streaming failed, falling back to traditional download...")
                # Imported lazily: only needed on the fallback path.
                from app.services.downloader import download_video
                video_path = download_video(url, job_id)

                if video_path and os.path.exists(video_path):
                    print(f"[{job_id}] Downloaded video, extracting frames...")
                    frame_paths = extract_frames(video_path, job_id, fps=0.5, max_frames=5)

                    if frame_paths:
                        print(f"[{job_id}] Extracted {len(frame_paths)} frames via fallback")
                        audio_path = extract_audio(video_path, job_id)
                    else:
                        jobs_db[job_id] = {"status": "failed", "error": "Could not extract frames from video"}
                        print(f"[{job_id}] Failed: fallback extraction also failed")
                        return
                else:
                    jobs_db[job_id] = {"status": "failed", "error": "Could not download video from URL"}
                    print(f"[{job_id}] Failed: download failed")
                    return
            else:
                print(f"[{job_id}] Streaming success! Extracted {len(frame_paths)} frames")
                print(f"[{job_id}] Extracting audio via streaming...")
                audio_path = stream_extract_audio(url, job_id, duration=30)

        # ============================================
        # PATH B: Uploaded file (traditional processing)
        # ============================================
        elif uploaded_file_path and os.path.exists(uploaded_file_path):
            print(f"[{job_id}] FILE MODE: Processing uploaded file...")
            video_path = uploaded_file_path

            print(f"[{job_id}] Extracting frames from file...")
            frame_paths = extract_frames(video_path, job_id, fps=0.5, max_frames=5)

            if not frame_paths:
                jobs_db[job_id] = {"status": "failed", "error": "No frames extracted from uploaded file"}
                print(f"[{job_id}] Failed: 0 frames extracted from upload")
                return

            print(f"[{job_id}] Extracted {len(frame_paths)} frames from file")

            print(f"[{job_id}] Extracting audio from file...")
            audio_path = extract_audio(video_path, job_id)

        else:
            jobs_db[job_id] = {"status": "failed", "error": "No URL or file provided"}
            print(f"[{job_id}] Failed: no input provided")
            return

        # ============================================
        # ANALYSIS (same for both paths)
        # ============================================

        # PRIMARY: SightEngine Analysis. se_configured is diagnostic only;
        # the actual fallback decision is made on avg_score below.
        from app.core.config import settings
        se_configured = bool(settings.SIGHTENGINE_API_USER and settings.SIGHTENGINE_API_SECRET)
        print(f"[{job_id}] Running SightEngine analysis... configured={se_configured}")
        sightengine_result = analyze_frames_with_sightengine(frame_paths)

        # Build visual result. avg_score is None whenever every SightEngine
        # call failed or the API was not configured.
        if sightengine_result.get("avg_score") is not None:
            visual = {
                "avg_prob": sightengine_result["avg_score"],
                "max_prob": sightengine_result["max_score"],
                "frame_count": sightengine_result["frame_count"],
                "frame_scores": sightengine_result["frame_scores"],
                "details": sightengine_result["details"],
                "source": "SightEngine"
            }
        else:
            # FALLBACK: HuggingFace
            print(f"[{job_id}] SightEngine failed or not configured, using HuggingFace fallback...")
            fallback = analyze_visual_fallback(frame_paths)
            visual = {
                "avg_prob": fallback["avg_prob"],
                "max_prob": fallback["max_prob"],
                "frame_count": fallback["frame_count"],
                "frame_scores": [],
                "details": fallback["details"],
                "source": "HuggingFace (fallback)"
            }

        print(f"[{job_id}] Running audio analysis...")
        # NOTE: video_path is None in streaming mode; analyze_audio_ai is
        # expected to work from audio_path alone in that case.
        audio = analyze_audio_ai(video_path, audio_path=audio_path)

        print(f"[{job_id}] Running metadata analysis...")
        # For streaming mode, we don't have a video file, so use video_info
        meta = analyze_metadata(video_path, video_info=video_info)

        print(f"[{job_id}] Running heuristics...")
        heuristics = analyze_heuristics(video_path, meta, video_info=video_info)

        # Calculate score: combine all four signal groups into a 0-100 risk.
        signals = {"visual": visual, "audio": audio, "metadata": meta, "heuristics": heuristics}
        score, confidence, rec = calculate_risk(signals)

        # Build result
        result = {
            "score": score,
            "confidence": confidence,
            "recommendation": rec,
            "signals": signals,
            "video_info": {
                "title": video_info.get("title"),
                "duration": video_info.get("duration"),
                "resolution": f"{video_info.get('width', '?')}x{video_info.get('height', '?')}",
                "frames_analyzed": len(frame_paths)
            },
            "explanation": f"Analyzed {len(frame_paths)} frames using {visual.get('source', 'AI')}. Risk score: {score}/100 ({rec}). {confidence} confidence.",
            "disclaimer": "This assessment estimates the likelihood of AI generation. It does not guarantee absolute authenticity."
        }

        # Cache (URL analyses only) and always clean up this job's temp files.
        if url:
            save_to_cache(url, result)
        clean_temp(job_id)

        result['id'] = job_id
        jobs_db[job_id] = {"status": "completed", "result": result}
        print(f"[{job_id}] Completed: {score}/100 ({rec})")

    except Exception as e:
        # Any unexpected failure marks the job failed and still cleans temp files.
        print(f"[{job_id}] Failed: {e}")
        import traceback
        traceback.print_exc()
        jobs_db[job_id] = {"status": "failed", "error": str(e)}
        clean_temp(job_id)
app/services/sightengine.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\backend\app\services\sightengine.py
2
+ import requests
3
+ from app.core.config import settings
4
+
5
+ SIGHTENGINE_CHECK_URL = "https://api.sightengine.com/1.0/check.json"
6
+
7
def analyze_with_sightengine(image_url: str = None, image_bytes: bytes = None) -> dict:
    """
    Run SightEngine's professional GenAI detector on a single image.

    Accepts either a public image URL or raw JPEG bytes and returns
    {"ai_score": 0-1 float or None, "details": str, "raw": dict/str/None}.
    ``ai_score`` is None whenever the check could not be performed
    (unconfigured credentials, no input, HTTP/API error, or exception).
    """
    if not settings.SIGHTENGINE_API_USER or not settings.SIGHTENGINE_API_SECRET:
        return {"ai_score": None, "details": "SightEngine not configured", "raw": None}

    try:
        # Credentials + model selection are common to both request shapes.
        payload = {
            "models": "genai",
            "api_user": settings.SIGHTENGINE_API_USER,
            "api_secret": settings.SIGHTENGINE_API_SECRET,
        }

        if image_url:
            # URL-based check
            payload["url"] = image_url
            response = requests.post(SIGHTENGINE_CHECK_URL, data=payload, timeout=30)
        elif image_bytes:
            # File-based check (multipart upload)
            response = requests.post(
                SIGHTENGINE_CHECK_URL,
                data=payload,
                files={"media": ("image.jpg", image_bytes, "image/jpeg")},
                timeout=30,
            )
        else:
            return {"ai_score": None, "details": "No image provided", "raw": None}

        if response.status_code != 200:
            return {"ai_score": None, "details": f"API error: {response.status_code}", "raw": response.text[:200]}

        data = response.json()

        # SightEngine returns: {"type": {"ai_generated": 0.95, ...}}
        if data.get("status") != "success":
            return {"ai_score": None, "details": f"API error: {data.get('error', {}).get('message', 'Unknown')}", "raw": data}

        ai_score = data.get("type", {}).get("ai_generated", 0)
        return {
            "ai_score": ai_score,
            "details": f"SightEngine AI detection: {round(ai_score * 100)}% AI probability",
            "raw": data,
        }

    except Exception as e:
        return {"ai_score": None, "details": f"Exception: {str(e)}", "raw": None}
63
+
64
def analyze_frames_with_sightengine(frame_paths: list) -> dict:
    """
    Analyze multiple frames with SightEngine and aggregate their scores.

    Returns a dict with ``avg_score``/``max_score`` (None when no frame
    could be scored), ``frame_count``, ``frame_scores`` (rounded to 3 dp)
    and a human-readable ``details`` string.
    """
    scores = []
    notes = []

    # Cap at 5 frames per video to conserve API quota.
    for frame_path in frame_paths[:5]:
        try:
            with open(frame_path, 'rb') as fh:
                payload = fh.read()

            verdict = analyze_with_sightengine(image_bytes=payload)

            if verdict["ai_score"] is not None:
                scores.append(verdict["ai_score"])
                notes.append(f"Frame: {round(verdict['ai_score'] * 100)}%")
        except Exception as e:
            # Unreadable frame or transport failure; keep going with the rest.
            notes.append(f"Error: {str(e)[:50]}")

    if not scores:
        return {
            "avg_score": None,
            "max_score": None,
            "frame_count": 0,
            "frame_scores": [],
            "details": "SightEngine analysis failed: " + "; ".join(notes),
        }

    avg_score = sum(scores) / len(scores)
    max_score = max(scores)
    return {
        "avg_score": avg_score,
        "max_score": max_score,
        "frame_count": len(scores),
        "frame_scores": [round(s, 3) for s in scores],
        "details": f"SightEngine analyzed {len(scores)} frames. Avg: {round(avg_score*100)}%, Max: {round(max_score*100)}%",
    }
+ }
main.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\backend\main.py
"""VeriVid Risk Engine — FastAPI entry point.

Wires together rate limiting (slowapi), CORS, security headers and the
/api/v1 analysis routes, plus two health-check endpoints.
"""
import re

from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from app.api import routes

# Rate limiter (per-client-IP; applied by decorators in the route modules)
limiter = Limiter(key_func=get_remote_address)

app = FastAPI(
    title="VeriVid Risk Engine",
    version="1.0.0",
    description="AI Video Authenticity Detection API",
    # Was `"/docs" if True else None` — a dead conditional. Set to None
    # here to disable Swagger UI in production.
    docs_url="/docs",
    redoc_url=None
)

# Add rate limiter
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

# CORS - explicitly allowed production/development origins
ALLOWED_ORIGINS = [
    "http://localhost:3000",
    "http://127.0.0.1:3000",
    "https://verivid.ai",
    "https://www.verivid.ai",
    "https://verivid-ai.vercel.app",
    "https://verivid-ai-final.vercel.app",
    "https://verivid-ai-final.onrender.com",
    "https://verivid-ai-final-1.onrender.com",
]

# Matches any Vercel deployment (covers all preview URLs). Anchored with `$`
# so suffix-attack origins like https://x.vercel.app.evil.com are rejected.
VERCEL_ORIGIN_REGEX = r"https://.*\.vercel\.app$"

def is_allowed_origin(origin: str) -> bool:
    """Return True if *origin* is explicitly allowed or is any Vercel deployment.

    NOTE(review): not used by the CORS middleware below (which relies on
    allow_origin_regex); kept as a helper for other modules/tests.
    """
    if origin in ALLOWED_ORIGINS:
        return True
    # fullmatch + anchored pattern: the whole origin must be a vercel.app host.
    return re.fullmatch(VERCEL_ORIGIN_REGEX, origin) is not None

app.add_middleware(
    CORSMiddleware,
    allow_origins=ALLOWED_ORIGINS,
    # Allow any vercel.app subdomain (covers preview deployments).
    allow_origin_regex=VERCEL_ORIGIN_REGEX,
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["*"],
)

# Security headers middleware: applied to every response.
@app.middleware("http")
async def add_security_headers(request: Request, call_next):
    response = await call_next(request)
    response.headers["X-Content-Type-Options"] = "nosniff"
    response.headers["X-Frame-Options"] = "DENY"
    response.headers["X-XSS-Protection"] = "1; mode=block"
    response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
    return response

# Include routes
app.include_router(routes.router, prefix="/api/v1")

@app.get("/")
def health():
    """Root endpoint doubling as a basic service banner / liveness probe."""
    return {"status": "ok", "service": "VeriVid Engine", "version": "1.0.0"}

@app.get("/health")
def health_check():
    """Dedicated health-check endpoint for deployment platforms."""
    return {"status": "healthy"}
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.115.0
2
+ uvicorn[standard]>=0.32.0
3
+ python-dotenv>=1.0.0
4
+ pydantic-settings>=2.6.0
5
+ requests>=2.32.0
6
+ yt-dlp>=2024.12.0
7
+ slowapi>=0.1.9
8
+ python-multipart>=0.0.9