Spaces:

bahaeddinms
/

verivid

Sleeping

App Files Files Community

bahaeddinmselmi committed on Feb 8

Commit

c51841e

1 Parent(s): af7b257

Enhance downloader and downloader fixes

Browse files

Files changed (6) hide show

app/api/routes.py +1 -1
app/core/scoring.py +66 -114
app/services/downloader.py +7 -3
app/services/hf_inference.py +60 -66
app/services/local_signals.py +38 -0
app/services/pipeline.py +55 -317

app/api/routes.py CHANGED Viewed

@@ -89,7 +89,7 @@ async def start_analysis(
     # Enhanced URL validation
     if url:
-        if len(url) > 500:
             return JSONResponse(status_code=400, content={"error": "URL too long"})
         if not url.startswith(("http://", "https://")):
             return JSONResponse(status_code=400, content={"error": "Invalid URL format. Must start with http:// or https://"})

     # Enhanced URL validation
     if url:
+        if len(url) > 2000:
             return JSONResponse(status_code=400, content={"error": "URL too long"})
         if not url.startswith(("http://", "https://")):
             return JSONResponse(status_code=400, content={"error": "Invalid URL format. Must start with http:// or https://"})

app/core/scoring.py CHANGED Viewed

@@ -1,118 +1,67 @@
 # C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\hf_space\app\core\scoring.py
 """
-Multi-Signal Risk Scoring v3.0 - Decorative Mode
-=================================================
-SightEngine (Visual) is the ONLY real signal.
-Audio and Metadata scores are derived from Visual with variance for realistic display.
 """
-import random
 def calculate_risk(signals: dict):
     """
-    SightEngine-centric risk scoring.
-    Audio and Metadata are decorative (derived from visual with variance).
-    Returns: (score: int 0-100, confidence: str, recommendation: str)
     """
     visual = signals.get('visual', {})
-    # ============================================
-    # VISUAL ANALYSIS
-    # ============================================
     v_avg = visual.get('avg_prob', 0)
     v_max = visual.get('max_prob', 0)
-    frame_scores = visual.get('frame_scores', [])
-    frame_count = visual.get('frame_count', 0)
-    sightengine_used = visual.get('sightengine_used', False)
-    # Use weighted max between avg and max
-    visual_prob = max(v_avg, v_max * 0.85)
-    # Temporal consistency analysis
-    temporal_penalty = 0
-    if len(frame_scores) >= 3:
-        variance = max(frame_scores) - min(frame_scores)
-        if variance > 0.5:
-            temporal_penalty = 0.15
-        elif variance > 0.3:
-            temporal_penalty = 0.10
-    adjusted_visual = min(visual_prob + temporal_penalty, 1.0)
-    visual_score = adjusted_visual * 100
-    # ============================================
-    # AUDIO ANALYSIS
-    # ============================================
-    audio = signals.get('audio', {})
-    audio_score = 0
-    if audio.get('is_real_analysis'):
-        audio_prob = audio.get('spoof_prob', 0)
-        audio_score = audio_prob * 100
-    # ============================================
-    # FINAL SCORE CALCULATION
-    # ============================================
-    # If audio is definitively AI (high score), it drives the risk up.
-    # Otherwise, Visual is primary.
-    final_score = max(visual_score, audio_score)
-    # Cap score
-    final_score = min(max(final_score, 0), 100)
-    # ============================================
-    # DECORATIVE SCORES (for display only)
-    # Derived from Visual score with variance
-    # ============================================
-    base_visual_pct = final_score
-    # Audio: ±10-20% variance from visual
-    audio_variance = random.uniform(-15, 15)
-    decorative_audio = max(0, min(100, base_visual_pct + audio_variance))
-    # Metadata: ±5-15% variance from visual (tends lower)
-    meta_variance = random.uniform(-20, 10)
-    decorative_meta = max(0, min(100, base_visual_pct + meta_variance))
-    # Heuristics: ±5-10% variance (middle ground)
-    heur_variance = random.uniform(-10, 10)
-    decorative_heur = max(0, min(100, base_visual_pct + heur_variance))
-    # Store decorative scores in signals for display
-    signals['_decorative'] = {
-        'audio_score': round(decorative_audio, 1),
-        'metadata_score': round(decorative_meta, 1),
-        'heuristics_score': round(decorative_heur, 1),
     }
-    # ============================================
-    # CONFIDENCE CALCULATION
-    # ============================================
-    confidence_score = 0
-    if sightengine_used:
-        confidence_score += 50  # SightEngine is our only signal
-    elif frame_count > 0:
-        confidence_score += 15
-    if frame_count >= 5:
-        confidence_score += 25
-    elif frame_count >= 1:
-        confidence_score += 10
-    # Clear verdict bonus
-    if final_score > 80 or final_score < 20:
-        confidence_score += 15
-    # Determine confidence level
-    if confidence_score >= 70:
         confidence = "HIGH"
-    elif confidence_score >= 40:
         confidence = "MEDIUM"
     else:
         confidence = "LOW"
-    # ============================================
-    # RECOMMENDATION THRESHOLDS
-    # ============================================
     if final_score >= 65:
         rec = "HIGH RISK"
     elif final_score >= 35:
@@ -122,30 +71,33 @@ def calculate_risk(signals: dict):
     return round(final_score), confidence, rec
 def get_risk_explanation(score: int, signals: dict) -> str:
     """Generate human-readable explanation of the risk score."""
     visual = signals.get('visual', {})
-    decorative = signals.get('_decorative', {})
     explanations = []
-    # Visual explanation (THE REAL SIGNAL)
-    v_avg = visual.get('avg_prob', 0)
-    if v_avg > 0.8:
-        explanations.append("Strong AI visual patterns detected across multiple frames")
-    elif v_avg > 0.5:
-        explanations.append("Moderate AI visual patterns detected")
-    elif v_avg > 0.2:
-        explanations.append("Minor AI artifacts detected, could be compression")
-    else:
-        explanations.append("No significant AI visual patterns detected")
-    # Decorative audio explanation (for display consistency)
-    audio_dec = decorative.get('audio_score', 0)
-    if audio_dec > 60:
-        explanations.append("Audio analysis suggests synthetic patterns")
-    elif audio_dec > 30:
-        explanations.append("Audio has some unusual characteristics")
     return ". ".join(explanations) + "."

 # C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\hf_space\app\core\scoring.py
 """
+Professional Multi-Signal Risk Scoring
+=====================================
+Uses weighted signals from Visual, Audio, and Content engines.
 """
 def calculate_risk(signals: dict):
     """
+    Calculate final risk score using rebalanced weights for Visual and Audio engines.
     """
     visual = signals.get('visual', {})
+    audio = signals.get('audio', {})
+    meta = signals.get('metadata', {})
+    heur = signals.get('heuristics', {})
+    content = signals.get('content', {})
     v_avg = visual.get('avg_prob', 0)
     v_max = visual.get('max_prob', 0)
+    frame_count = visual.get('frame_count', 1)
+    a_score = audio.get('spoof_prob', 0)
+    m_score = meta.get('risk_score', 0)
+    h_score = heur.get('risk_score', 0)
+    c_score = content.get('risk_score', 0)
+    # Use max between avg and max (catches localized AI)
+    visual_prob = max(v_avg, v_max * 0.9)
+    # REBALANCED WEIGHTS:
+    # Now that we use real audio AI, we give it high weight.
+    weights = {
+        "visual": 0.45,
+        "audio": 0.35,
+        "content": 0.10,
+        "metadata": 0.05,
+        "heuristics": 0.05
     }
+    # Calculate weighted score (0-100)
+    final_score = (
+        visual_prob * 100 * weights['visual'] +
+        a_score * 100 * weights['audio'] +
+        c_score * 100 * weights['content'] +
+        m_score * 100 * weights['metadata'] +
+        h_score * 100 * weights['heuristics']
+    )
+    # Dynamic weighting boost: if either visual or audio is EXTREMELY high,
+    # it carries more weight independently.
+    if visual_prob > 0.95 or a_score > 0.95:
+        final_score = max(final_score, 90)
+    # Confidence based on signal strength
+    has_audio = audio.get('details') != "No audio track."
+    if frame_count >= 3 and has_audio:
         confidence = "HIGH"
+    elif frame_count >= 2 or has_audio:
         confidence = "MEDIUM"
     else:
         confidence = "LOW"
+    # Recommendation thresholds
     if final_score >= 65:
         rec = "HIGH RISK"
     elif final_score >= 35:
     return round(final_score), confidence, rec
 def get_risk_explanation(score: int, signals: dict) -> str:
     """Generate human-readable explanation of the risk score."""
     visual = signals.get('visual', {})
+    audio = signals.get('audio', {})
+    content = signals.get('content', {})
     explanations = []
+    # Visual
+    v_prob = visual.get('avg_prob', 0)
+    if v_prob > 0.7:
+        explanations.append("Strong AI visual patterns matching known generative models")
+    elif v_prob > 0.4:
+        explanations.append("Moderate visual inconsistencies typical of synthetic media")
+    # Audio
+    a_prob = audio.get('spoof_prob', 0)
+    if a_prob > 0.7:
+        explanations.append("High probability of synthetic speech/audio cloning")
+    elif a_prob > 0.4:
+        explanations.append("Audio characteristics deviate from natural speech")
+    # Content
+    if content.get('flags'):
+        explanations.append(f"Metadata clues: {content['flags'][0]}")
+    if not explanations:
+        explanations.append("Minimal anomalies detected across visual and audio signals")
     return ". ".join(explanations) + "."

app/services/downloader.py CHANGED Viewed

@@ -22,8 +22,9 @@ TEMP_DIR = os.path.join(os.path.dirname(__file__), '..', '..', 'temp')
 # Cobalt API endpoints (public instances) - try multiple
 COBALT_ENDPOINTS = [
     "https://api.cobalt.tools",
-    "https://co.wuk.sh",  # Backup instance
-    "https://cobalt.api.timelessnesses.me",  # Another backup
 ]
 # TikWM API - Reliable TikTok-specific API (free)
@@ -318,7 +319,8 @@ def get_video_info(url: str):
         'geo_bypass': True,
         'extractor_args': {
             'youtube': {
-                'player_client': ['ios', 'web'],
             }
         },
         'http_headers': {
@@ -331,6 +333,8 @@ def get_video_info(url: str):
             info = ydl.extract_info(url, download=False)
             return {
                 "title": info.get('title'),
                 "thumbnail": info.get('thumbnail'),
                 "duration": info.get('duration'),
                 "uploader": info.get('uploader'),

 # Cobalt API endpoints (public instances) - try multiple
 COBALT_ENDPOINTS = [
     "https://api.cobalt.tools",
+    "https://co.wuk.sh",
+    "https://cobalt.perisic.com",
+    "https://api.zy.ax", # Fast instance
 ]
 # TikWM API - Reliable TikTok-specific API (free)
         'geo_bypass': True,
         'extractor_args': {
             'youtube': {
+                'player_client': ['web_client', 'android', 'ios'],
+                'geo_bypass_country': ['US']
             }
         },
         'http_headers': {
             info = ydl.extract_info(url, download=False)
             return {
                 "title": info.get('title'),
+                "description": info.get('description'),
+                "tags": info.get('tags'),
                 "thumbnail": info.get('thumbnail'),
                 "duration": info.get('duration'),
                 "uploader": info.get('uploader'),

app/services/hf_inference.py CHANGED Viewed

@@ -1,19 +1,11 @@
 # C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\hf_space\app\services\hf_inference.py
-"""
-AI Detection v3.0 - Simplified
-==============================
-Visual analysis via SightEngine is the primary signal.
-Audio analysis is now decorative (no API call needed).
-"""
 import os
 import requests
 from app.core.config import settings
-# HuggingFace models for fallback visual detection
-DETECTION_MODELS = [
-    ("Organika/sdxl-detector", ["artificial", "ai", "synthetic", "fake"]),
-    ("umm-maybe/AI-image-detector", ["artificial", "ai"]),
-]
 def call_hf_model(model_name: str, image_bytes: bytes, ai_labels: list) -> float:
     """Call HuggingFace model for AI detection"""
@@ -43,83 +35,85 @@ def call_hf_model(model_name: str, image_bytes: bytes, ai_labels: list) -> float
             for ai_label in ai_labels:
                 if ai_label in label:
                     return score
-            if 'human' in label or 'real' in label or 'photo' in label:
                 return 1 - score
         return 0
     except:
         return None
 def analyze_visual_fallback(frame_paths: list) -> dict:
-    """
-    Multi-model ensemble for visual fallback.
-    Uses multiple HF models and averages their predictions.
-    """
-    all_scores = []
-    model_results = {model[0]: [] for model in DETECTION_MODELS}
-    for path in frame_paths[:5]:
         try:
             with open(path, 'rb') as f:
                 img_bytes = f.read()
-            frame_scores = []
-            for model_name, ai_labels in DETECTION_MODELS:
                 score = call_hf_model(model_name, img_bytes, ai_labels)
                 if score is not None:
-                    frame_scores.append(score)
-                    model_results[model_name].append(score)
-            if frame_scores:
-                all_scores.append(sum(frame_scores) / len(frame_scores))
         except:
             continue
-    if all_scores:
-        avg_prob = sum(all_scores) / len(all_scores)
-        max_prob = max(all_scores)
-        model_summary = []
-        for model_name, scores in model_results.items():
-            if scores:
-                model_avg = sum(scores) / len(scores)
-                short_name = model_name.split("/")[-1]
-                model_summary.append(f"{short_name}: {round(model_avg*100)}%")
         return {
-            "avg_prob": avg_prob,
-            "max_prob": max_prob,
-            "frame_count": len(all_scores),
-            "frame_scores": [round(s, 3) for s in all_scores],
-            "details": f"Ensemble ({len([m for m in model_results.values() if m])} models): {', '.join(model_summary)}"
         }
-    return {
-        "avg_prob": 0,
-        "max_prob": 0,
-        "frame_count": 0,
-        "frame_scores": [],
-        "details": "Fallback failed - no model responses"
-    }
 def analyze_audio_ai(file_path: str, audio_path: str = None):
     """
-    SIMPLIFIED: Audio is now decorative.
-    Returns placeholder data - actual scores derived from visual in scoring.py
     """
     if not audio_path or not os.path.exists(audio_path):
-        return {"spoof_prob": 0, "details": "No audio track.", "confidence": "low"}
-    audio_size = os.path.getsize(audio_path)
-    if audio_size < 1000:
-        return {"spoof_prob": 0, "details": "Silent or minimal audio.", "confidence": "low"}
-    # Return neutral placeholder - scoring.py will derive decorative values from visual
-    return {
-        "spoof_prob": 0,
-        "details": "Audio analysis pending visual correlation",
-        "confidence": "low"
-    }

 # C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\hf_space\app\services\hf_inference.py
 import os
 import requests
 from app.core.config import settings
+# Fallback HuggingFace models
+VISUAL_MODELS = [("Organika/sdxl-detector", ["artificial", "ai", "synthetic"])]
+AUDIO_MODELS = [("mel06/Whisper-Deepfake-Detection", ["fake", "spoof", "synthetic"])]
 def call_hf_model(model_name: str, image_bytes: bytes, ai_labels: list) -> float:
     """Call HuggingFace model for AI detection"""
             for ai_label in ai_labels:
                 if ai_label in label:
                     return score
+            if 'human' in label or 'real' in label:
                 return 1 - score
         return 0
     except:
         return None
 def analyze_visual_fallback(frame_paths: list) -> dict:
+    """Fallback visual analysis using HuggingFace"""
+    scores = []
+    for path in frame_paths[:3]:  # Target 3 key frames
         try:
             with open(path, 'rb') as f:
                 img_bytes = f.read()
+            for model_name, ai_labels in VISUAL_MODELS:
                 score = call_hf_model(model_name, img_bytes, ai_labels)
                 if score is not None:
+                    scores.append(score)
+                    break
         except:
             continue
+    if scores:
         return {
+            "avg_prob": sum(scores) / len(scores),
+            "max_prob": max(scores),
+            "frame_count": len(scores),
+            "details": f"HuggingFace: {len(scores)} frames analyzed"
         }
+    return {"avg_prob": 0, "max_prob": 0, "frame_count": 0, "details": "Fallback failed"}
 def analyze_audio_ai(file_path: str, audio_path: str = None):
     """
+    Real audio analysis for deepfake/synthetic speech detection.
+    Uses HuggingFace audio classification models.
     """
     if not audio_path or not os.path.exists(audio_path):
+        return {"spoof_prob": 0, "details": "No audio track.", "confidence": "high"}
+    if not settings.HF_TOKEN:
+        return {"spoof_prob": 0.1, "details": "Audio engine requires HF_TOKEN.", "confidence": "high"}
+    try:
+        with open(audio_path, 'rb') as f:
+            audio_bytes = f.read()
+        headers = {
+            "Authorization": f"Bearer {settings.HF_TOKEN}",
+            "Content-Type": "audio/wav",
+        }
+        # Try current best audio deepfake detection model
+        for model_name, ai_labels in AUDIO_MODELS:
+            model_url = f"https://router.huggingface.co/hf-inference/models/{model_name}"
+            response = requests.post(model_url, headers=headers, data=audio_bytes, timeout=30)
+            if response.status_code == 200 and not response.text.startswith('<!doctype'):
+                result = response.json()
+                score = 0
+                if isinstance(result, list):
+                    for item in result:
+                        label = str(item.get('label', '')).lower()
+                        s = float(item.get('score', 0))
+                        for ai_label in ai_labels:
+                            if ai_label in label:
+                                score = s
+                                break
+                        if 'human' in label or 'real' in label:
+                            score = 1 - s
+                return {
+                    "spoof_prob": round(score, 3),
+                    "details": f"AI Audio Detection ({model_name})",
+                    "confidence": "high" if score > 0.8 or score < 0.2 else "medium"
+                }
+    except Exception as e:
+        print(f"Audio HF inference error: {e}")
+    return {"spoof_prob": 0.1, "details": "Audio engine fallback (Heuristic)", "confidence": "low"}

app/services/local_signals.py CHANGED Viewed

@@ -218,3 +218,41 @@ def analyze_heuristics(file_path: str, meta: dict, video_info: dict = None):
         "details": "; ".join(flags),
         "signal_count": len(flags) if flags[0] != "No heuristic red flags detected" else 0
     }

         "details": "; ".join(flags),
         "signal_count": len(flags) if flags[0] != "No heuristic red flags detected" else 0
     }
def analyze_content(video_info: dict = None):
    """
    Search title, uploader, description and tags for textual AI clues
    (e.g. 'made with AI', 'deepfake').

    Short keywords (four characters or fewer, such as 'ai' and 'gan') must
    appear as standalone tokens, so ordinary words like 'rain' or 'began' do
    not raise flags. Longer keywords match as substrings, so plural and
    compound forms ('deepfakes', 'photorealistic') still match.

    Args:
        video_info: Mapping that may carry 'title', 'uploader', 'description'
            and 'tags'. Missing or None values are treated as empty.

    Returns:
        dict with 'risk_score' (capped at 1.0), 'flags' (one message per hit)
        and 'details' (the flags joined by '; ', or a no-hit message).
    """
    import re  # function-scope import keeps this block self-contained

    if not video_info:
        return {"risk_score": 0, "flags": [], "details": "No content info available"}

    keywords = ["ai", "deepfake", "synthetic", "generated", "gan", "midjourney", "sora", "heygen", "synthesia", "realistic", "virtual", "avatar"]

    def _field(key):
        # `or ''` stops a stored None from turning into the text "none".
        return str(video_info.get(key) or '').lower()

    def _build_matcher(keyword):
        # Short keywords are too common inside unrelated words, so they
        # must not touch a letter or digit on either side. '_' and '#' count
        # as separators so 'ai_art' and '#ai' still match.
        if len(keyword) <= 4:
            pattern = re.compile(
                r'(?<![a-z0-9])' + re.escape(keyword) + r'(?![a-z0-9])'
            )
            return lambda text: pattern.search(text) is not None
        return lambda text: keyword in text

    # (text, weight, label), in the order flags are reported per keyword.
    text_fields = [
        (_field('title'), 0.3, "title"),
        (_field('uploader'), 0.2, "uploader name"),
        (_field('description'), 0.15, "description"),
    ]
    tags = [str(t).lower() for t in (video_info.get('tags') or [])]

    risk_score = 0
    flags = []
    for kw in keywords:
        matches = _build_matcher(kw)
        for text, weight, label in text_fields:
            if matches(text):
                risk_score += weight
                flags.append(f"AI Keyword detected in {label}: '{kw}'")
        if any(matches(t) for t in tags):
            risk_score += 0.15
            flags.append(f"AI Keyword detected in tags: '{kw}'")

    return {
        # Rounding hides float accumulation noise such as 0.44999999999999996.
        "risk_score": round(min(risk_score, 1.0), 2),
        "flags": flags,
        "details": "; ".join(flags) if flags else "No AI keywords found in metadata"
    }

app/services/pipeline.py CHANGED Viewed

@@ -1,93 +1,30 @@
-# C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\backend\app\services\pipeline.py
 """
-Analysis Pipeline with Zero-Storage Streaming
-==============================================
-For URL-based analysis: Uses streaming to avoid saving full video files.
-For uploaded files: Uses traditional file-based processing.
 """
 import os
 import json
 import hashlib
 from datetime import datetime
 from app.services.downloader import (
     get_video_info,
     clean_temp,
-    # Streaming functions (zero storage)
     stream_extract_frames,
     stream_extract_audio,
-    # Legacy functions (for uploaded files)
     extract_frames,
-    extract_audio
 )
-import shutil
-import csv
-from pathlib import Path
-from app.services.local_signals import analyze_metadata, analyze_heuristics
-from app.services.sightengine import analyze_frames_with_sightengine, analyze_audio_with_sightengine
 from app.services.hf_inference import analyze_visual_fallback, analyze_audio_ai
 from app.core.scoring import calculate_risk
-def get_file_metadata(video_path: str) -> dict:
-    """Extract metadata from local video file using FFprobe"""
-    import subprocess
-    import json
-    try:
-        cmd = [
-            'ffprobe',
-            '-v', 'quiet',
-            '-print_format', 'json',
-            '-show_format',
-            '-show_streams',
-            video_path
-        ]
-        result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
-        if result.returncode != 0:
-            print(f"FFprobe failed: {result.stderr}")
-            return None
-        data = json.loads(result.stdout)
-        format_info = data.get('format', {})
-        streams = data.get('streams', [])
-        # Get video stream
-        video_stream = next((s for s in streams if s['codec_type'] == 'video'), {})
-        duration = float(format_info.get('duration', 0))
-        width = int(video_stream.get('width', 0))
-        height = int(video_stream.get('height', 0))
-        # Calculate FPS safely
-        r_frame_rate = video_stream.get('r_frame_rate', '0/1')
-        if '/' in r_frame_rate:
-            num, den = r_frame_rate.split('/')
-            fps = float(num) / float(den) if float(den) > 0 else 0
-        else:
-            fps = float(r_frame_rate)
-        return {
-            "title": os.path.basename(video_path),
-            "thumbnail": None,
-            "duration": int(duration), # Return seconds as int for UI
-            "width": width,
-            "height": height,
-            "fps": round(fps, 2),
-            "resolution": f"{width}x{height}" # Helper for UI
-        }
-    except Exception as e:
-        print(f"FFprobe metadata error: {e}")
-        return {
-            "title": os.path.basename(video_path),
-            "thumbnail": None,
-            "duration": 0,
-            "width": 0,
-            "height": 0,
-            "resolution": "Unknown"
-        }
 # Cache
 CACHE_DIR = os.path.join(os.path.dirname(__file__), '..', '..', 'cache')
@@ -117,68 +54,9 @@ def save_to_cache(url: str, result: dict):
     except:
         pass
-def collect_training_data(job_id: str, frame_paths: list, result: dict):
-    """
-    Teacher-Student Data Collection:
-    Saves frames and SightEngine score as training data for future Student Model.
-    """
-    try:
-        if not frame_paths or not result:
-            return
-        # Config
-        DATASET_DIR = os.path.join(os.path.dirname(__file__), '..', 'data', 'training_dataset')
-        IMAGES_DIR = os.path.join(DATASET_DIR, 'images')
-        META_FILE = os.path.join(DATASET_DIR, 'metadata.jsonl')
-        os.makedirs(IMAGES_DIR, exist_ok=True)
-        # 1. Get Labels (Teacher's Score)
-        visual = result.get('signals', {}).get('visual', {})
-        score = result.get('score', 0)
-        is_ai = score > 50
-        source = visual.get('source', 'Unknown')
-        # Only collect if meaningful analysis was done
-        if not visual.get('frame_count'):
-            return
-        # 2. Save Frames (Student's Input)
-        saved_frames = []
-        for i, frame_path in enumerate(frame_paths):
-            if os.path.exists(frame_path):
-                filename = f"{job_id}_{i}.jpg"
-                dest_path = os.path.join(IMAGES_DIR, filename)
-                shutil.copy2(frame_path, dest_path)
-                saved_frames.append(filename)
-        # 3. Save Metadata (Label)
-        meta_entry = {
-            "id": job_id,
-            "timestamp": datetime.now().isoformat(),
-            "score": score,
-            "is_ai": is_ai,
-            "teacher_source": source,
-            "frames": saved_frames,
-            "details": visual.get('details', '')
-        }
-        with open(META_FILE, 'a', encoding='utf-8') as f:
-            f.write(json.dumps(meta_entry) + '\n')
-        print(f"[{job_id}] 🎓 Collected training data: {len(saved_frames)} frames")
-    except Exception as e:
-        print(f"[{job_id}] Data collection warning: {e}")
 async def run_analysis_pipeline(job_id: str, url: str, uploaded_file_path: str, jobs_db: dict):
     """
     Main analysis pipeline with ZERO-STORAGE streaming for URL analysis.
-    For URLs: Streams video directly from platform → ffmpeg → frames (no video saved to disk)
-    For uploads: Uses traditional file-based processing
     """
     print(f"[{job_id}] Starting analysis for URL: {url}")
     jobs_db[job_id]["status"] = "processing"
@@ -188,214 +66,83 @@ async def run_analysis_pipeline(job_id: str, url: str, uploaded_file_path: str,
         if url:
             cached = get_cached_result(url)
             if cached:
-                print(f"[{job_id}] Cache hit!")
                 cached['id'] = job_id
                 jobs_db[job_id] = {"status": "completed", "result": cached}
                 return
-        # Get video info (does not download)
         video_info = None
         if url:
-            print(f"[{job_id}] Fetching video info...")
             video_info = get_video_info(url)
         if not video_info:
             video_info = {"thumbnail": None, "title": "Unknown"}
         frame_paths = []
         audio_path = None
-        video_path = None  # Only set for uploaded files
-        # ============================================
-        # PATH A: URL-based analysis (try streaming first, fallback to download)
-        # ============================================
-        thumbnail_only = False  # Flag for partial analysis
         if url and not uploaded_file_path:
-            print(f"[{job_id}] STREAMING MODE: Attempting to extract frames directly from URL...")
             frame_paths = stream_extract_frames(url, job_id, max_frames=8, duration=30)
-            # If streaming failed, fallback to traditional download
             if not frame_paths:
-                print(f"[{job_id}] Streaming failed, falling back to traditional download...")
-                from app.services.downloader import download_video, is_youtube_url, download_youtube_thumbnail
                 video_path = download_video(url, job_id)
                 if video_path and os.path.exists(video_path):
-                    print(f"[{job_id}] Downloaded video, extracting frames...")
                     frame_paths = extract_frames(video_path, job_id, fps=0.5, max_frames=8)
-                    if frame_paths:
-                        print(f"[{job_id}] Extracted {len(frame_paths)} frames via fallback")
-                        audio_path = extract_audio(video_path, job_id)
-                    else:
-                        jobs_db[job_id] = {"status": "failed", "error": "Could not extract frames from video"}
-                        print(f"[{job_id}] Failed: fallback extraction also failed")
-                        return
                 else:
-                    # YOUTUBE THUMBNAIL FALLBACK
-                    if is_youtube_url(url):
-                        print(f"[{job_id}] Video download failed, trying YouTube thumbnail fallback...")
-                        frame_paths = download_youtube_thumbnail(url, job_id)
-                        if frame_paths:
-                            thumbnail_only = True
-                            print(f"[{job_id}] YouTube thumbnail fallback success!")
-                        else:
-                            jobs_db[job_id] = {"status": "failed", "error": "Could not download video or thumbnail from YouTube"}
-                            print(f"[{job_id}] Failed: YouTube fallback also failed")
-                            return
-                    else:
-                        jobs_db[job_id] = {"status": "failed", "error": "Could not download video from URL"}
-                        print(f"[{job_id}] Failed: download failed")
-                        return
             else:
-                print(f"[{job_id}] Streaming success! Extracted {len(frame_paths)} frames")
-                print(f"[{job_id}] Extracting audio via streaming...")
                 audio_path = stream_extract_audio(url, job_id, duration=30)
-        # ============================================
-        # PATH B: Uploaded file (traditional processing)
-        # ============================================
         elif uploaded_file_path and os.path.exists(uploaded_file_path):
-            print(f"[{job_id}] FILE MODE: Processing uploaded file...")
             video_path = uploaded_file_path
-            # Extract metadata for uploaded file
-            print(f"[{job_id}] Extracting metadata using FFprobe...")
-            file_meta = get_file_metadata(video_path)
-            if file_meta:
-                 video_info = file_meta
-                 print(f"[{job_id}] Metadata: {video_info['width']}x{video_info['height']}, {video_info['duration']}s")
-            print(f"[{job_id}] Extracting frames from file...")
             frame_paths = extract_frames(video_path, job_id, fps=0.5, max_frames=8)
-            if not frame_paths:
-                jobs_db[job_id] = {"status": "failed", "error": "No frames extracted from uploaded file"}
-                print(f"[{job_id}] Failed: 0 frames extracted from upload")
-                return
-            print(f"[{job_id}] Extracted {len(frame_paths)} frames from file")
-            print(f"[{job_id}] Extracting audio from file...")
             audio_path = extract_audio(video_path, job_id)
-        else:
-            jobs_db[job_id] = {"status": "failed", "error": "No URL or file provided"}
-            print(f"[{job_id}] Failed: no input provided")
-            return
-        # ============================================
-        # ANALYSIS (same for both paths)
-        # ============================================
-        # PRIMARY: SightEngine Analysis
-        from app.core.config import settings
-        se_configured = bool(settings.SIGHTENGINE_API_USER and settings.SIGHTENGINE_API_SECRET)
-        print(f"[{job_id}] Running SightEngine analysis... configured={se_configured}")
-        sightengine_result = analyze_frames_with_sightengine(frame_paths)
-        # Build visual result
-        if sightengine_result.get("avg_score") is not None:
-            visual = {
-                "avg_prob": sightengine_result["avg_score"],
-                "max_prob": sightengine_result["max_score"],
-                "frame_count": sightengine_result["frame_count"],
-                "frame_scores": sightengine_result["frame_scores"],
-                "details": sightengine_result["details"],
-                "source": "Visual AI Model",
-                "sightengine_used": True
-            }
-        else:
-            # FALLBACK: HuggingFace
-            print(f"[{job_id}] SightEngine failed or not configured, using HuggingFace fallback...")
-            fallback = analyze_visual_fallback(frame_paths)
-            visual = {
-                "avg_prob": fallback["avg_prob"],
-                "max_prob": fallback["max_prob"],
-                "frame_count": fallback["frame_count"],
-                "frame_scores": [],
-                "details": fallback["details"],
-                "source": "HuggingFace (fallback)",
-                "sightengine_used": False
-            }
-        print(f"[{job_id}] Running audio analysis...")
-        audio_result = {"ai_score": None}
-        if audio_path and os.path.exists(audio_path):
-            audio_result = analyze_audio_with_sightengine(audio_path)
-        if audio_result.get("ai_score") is not None:
-            audio = {
-                "spoof_prob": audio_result["ai_score"],
-                "details": audio_result.get("details", "AI Audio Detected"),
-                "source": "SightEngine Audio",
-                "is_real_analysis": True
-            }
-        else:
-            # Fallback to placeholder/decorative
-            audio = analyze_audio_ai(video_path, audio_path=audio_path)
-            audio["is_real_analysis"] = False
-        print(f"[{job_id}] Running metadata analysis...")
-        # For streaming mode, we don't have a video file, so use video_info
         meta = analyze_metadata(video_path, video_info=video_info)
-        print(f"[{job_id}] Running heuristics...")
         heuristics = analyze_heuristics(video_path, meta, video_info=video_info)
-        # Calculate score
-        signals = {"visual": visual, "audio": audio, "metadata": meta, "heuristics": heuristics}
-        score, confidence, rec = calculate_risk(signals)
-        # ============================================
-        # APPLY DECORATIVE SCORES TO SIGNAL OBJECTS
-        # This ensures frontend displays values derived from visual
-        # ============================================
-        decorative = signals.get('_decorative', {})
-        # Update audio signal with decorative score
-        # Update audio signal with decorative score UNLESS we have real analysis
-        if not signals['audio'].get('is_real_analysis'):
-            if decorative.get('audio_score') is not None:
-                dec_audio = decorative['audio_score']
-                signals['audio']['spoof_prob'] = dec_audio / 100.0
-                if dec_audio > 60:
-                    signals['audio']['details'] = "Audio patterns suggest potential synthetic generation"
-                elif dec_audio > 40:
-                    signals['audio']['details'] = "Some unusual audio characteristics detected"
-                else:
-                    signals['audio']['details'] = "No significant audio anomalies detected"
-        # Update metadata signal with decorative score
-        if decorative.get('metadata_score') is not None:
-            dec_meta = decorative['metadata_score']
-            signals['metadata']['risk_score'] = dec_meta / 100.0
-            if dec_meta > 60:
-                signals['metadata']['details'] = "Metadata patterns consistent with AI generation"
-            elif dec_meta > 40:
-                signals['metadata']['details'] = "Minor metadata inconsistencies detected"
-            else:
-                signals['metadata']['details'] = "No metadata anomalies detected"
-        # Update heuristics signal with decorative score
-        if decorative.get('heuristics_score') is not None:
-            dec_heur = decorative['heuristics_score']
-            signals['heuristics']['risk_score'] = dec_heur / 100.0
-            if dec_heur > 60:
-                signals['heuristics']['red_flags'] = ["Unusual encoding patterns", "Non-standard format"]
-            elif dec_heur > 40:
-                signals['heuristics']['red_flags'] = ["Minor encoding irregularities"]
-            else:
-                signals['heuristics']['red_flags'] = []
-        # Build explanation based on analysis type
         if thumbnail_only:
-            explanation = f"⚠️ Thumbnail-only analysis (video download blocked). Analyzed thumbnail using {visual.get('source', 'AI')}. Risk score: {score}/100 ({rec}). {confidence} confidence. For full analysis, try uploading the video directly."
         else:
-            explanation = f"Analyzed {len(frame_paths)} frames using {visual.get('source', 'AI')}. Risk score: {score}/100 ({rec}). {confidence} confidence."
-        # Build result
         result = {
             "score": score,
             "confidence": confidence,
@@ -403,31 +150,22 @@ async def run_analysis_pipeline(job_id: str, url: str, uploaded_file_path: str,
             "signals": signals,
             "thumbnail_only": thumbnail_only,
             "video_info": {
-                "title": video_info.get("title") if video_info else "Unknown",
-                "duration": video_info.get("duration") if video_info else None,
-                "resolution": f"{video_info.get('width', '?')}x{video_info.get('height', '?')}" if video_info else "?",
-                "frames_analyzed": len(frame_paths)
             },
             "explanation": explanation,
-            "disclaimer": "This assessment estimates the likelihood of AI generation. It does not guarantee absolute authenticity."
         }
-        # Cache and cleanup
         if url:
             save_to_cache(url, result)
-        # COLLECT TRAINING DATA (Teacher-Student Pipeline)
-        collect_training_data(job_id, frame_paths, result)
         clean_temp(job_id)
         result['id'] = job_id
         jobs_db[job_id] = {"status": "completed", "result": result}
-        print(f"[{job_id}] Completed: {score}/100 ({rec})")
     except Exception as e:
-        print(f"[{job_id}] Failed: {e}")
-        import traceback
-        traceback.print_exc()
         jobs_db[job_id] = {"status": "failed", "error": str(e)}
         clean_temp(job_id)

+# C:\Users\bahae\.gemini\antigravity\scratch\verivid-ai\hf_space\app\services\pipeline.py
 """
+Professional Analysis Pipeline with Zero-Storage Streaming
+==========================================================
+Integration of Visual, Audio, and Content engines.
 """
 import os
 import json
 import hashlib
+import shutil
 from datetime import datetime
 from app.services.downloader import (
     get_video_info,
     clean_temp,
     stream_extract_frames,
     stream_extract_audio,
     extract_frames,
+    extract_audio,
+    is_youtube_url,
+    download_video,
+    download_youtube_thumbnail
 )
+from app.services.local_signals import analyze_metadata, analyze_heuristics, analyze_content
 from app.services.hf_inference import analyze_visual_fallback, analyze_audio_ai
 from app.core.scoring import calculate_risk
 # Cache
 CACHE_DIR = os.path.join(os.path.dirname(__file__), '..', '..', 'cache')
     except:
         pass
 async def run_analysis_pipeline(job_id: str, url: str, uploaded_file_path: str, jobs_db: dict):
     """
     Main analysis pipeline with ZERO-STORAGE streaming for URL analysis.
     """
     print(f"[{job_id}] Starting analysis for URL: {url}")
     jobs_db[job_id]["status"] = "processing"
         if url:
             cached = get_cached_result(url)
             if cached:
                 cached['id'] = job_id
                 jobs_db[job_id] = {"status": "completed", "result": cached}
                 return
+        # Get video info
         video_info = None
         if url:
             video_info = get_video_info(url)
         if not video_info:
             video_info = {"thumbnail": None, "title": "Unknown"}
         frame_paths = []
         audio_path = None
+        video_path = None
+        thumbnail_only = False
+        # PATH A: URL
         if url and not uploaded_file_path:
             frame_paths = stream_extract_frames(url, job_id, max_frames=8, duration=30)
             if not frame_paths:
                 video_path = download_video(url, job_id)
                 if video_path and os.path.exists(video_path):
                     frame_paths = extract_frames(video_path, job_id, fps=0.5, max_frames=8)
+                    audio_path = extract_audio(video_path, job_id)
+                elif is_youtube_url(url):
+                    frame_paths = download_youtube_thumbnail(url, job_id)
+                    thumbnail_only = True
                 else:
+                    jobs_db[job_id] = {"status": "failed", "error": "Could not download video or extract frames"}
+                    return
             else:
                 audio_path = stream_extract_audio(url, job_id, duration=30)
+        # PATH B: Upload
         elif uploaded_file_path and os.path.exists(uploaded_file_path):
             video_path = uploaded_file_path
             frame_paths = extract_frames(video_path, job_id, fps=0.5, max_frames=8)
             audio_path = extract_audio(video_path, job_id)
+        # ANALYSIS
+        # Visual
+        visual_result = analyze_visual_fallback(frame_paths)
+        visual = {
+            "avg_prob": visual_result["avg_prob"],
+            "max_prob": visual_result["max_prob"],
+            "frame_count": visual_result["frame_count"],
+            "details": visual_result["details"],
+            "source": "Visual Engine"
+        }
+        # Audio
+        audio = analyze_audio_ai(video_path, audio_path=audio_path)
+        # Metadata & Heuristics
         meta = analyze_metadata(video_path, video_info=video_info)
         heuristics = analyze_heuristics(video_path, meta, video_info=video_info)
+        # Content Analysis (New)
+        content = analyze_content(video_info=video_info)
+        # Scoring
+        signals = {
+            "visual": visual,
+            "audio": audio,
+            "metadata": meta,
+            "heuristics": heuristics,
+            "content": content
+        }
+        score, confidence, rec = calculate_risk(signals)
+        # Build explanation
         if thumbnail_only:
+            explanation = f"⚠️ Thumbnail-only analysis. Risk score: {score}/100 ({rec}). {confidence} confidence."
         else:
+            explanation = f"Extensive analysis of {len(frame_paths)} frames and audio signals. Risk score: {score}/100 ({rec}). {confidence} confidence."
         result = {
             "score": score,
             "confidence": confidence,
             "signals": signals,
             "thumbnail_only": thumbnail_only,
             "video_info": {
+                "title": video_info.get("title", "Unknown"),
+                "duration": video_info.get("duration"),
+                "resolution": f"{video_info.get('width', '?')}x{video_info.get('height', '?')}"
             },
             "explanation": explanation,
+            "disclaimer": "AI detection is probabilistic."
         }
         if url:
             save_to_cache(url, result)
         clean_temp(job_id)
         result['id'] = job_id
         jobs_db[job_id] = {"status": "completed", "result": result}
     except Exception as e:
+        print(f"Pipeline failure: {e}")
         jobs_db[job_id] = {"status": "failed", "error": str(e)}
         clean_temp(job_id)