Spaces:

UAJK-Practice
/

Voice_Detection

Sleeping

App Files Files Community

AbdulWahab14 committed on about 1 month ago

Commit

a1de353

verified ·

1 Parent(s): 4307d15

Update app.py

Browse files

Files changed (1) hide show

app.py +495 -226

app.py CHANGED Viewed

@@ -1,129 +1,443 @@
-# app.py — This is your main Hugging Face Spaces app
 import gradio as gr
 import torch
 import numpy as np
 import librosa
 import cv2
 import os
-import matplotlib.pyplot as plt
 import matplotlib
 matplotlib.use('Agg')
 from PIL import Image
-import io
-import base64
 import warnings
-warnings.filterwarnings('ignore')
-# ─── Paste your model classes here (or import them) ──────────────────────────
-# Copy the AudioDeepfakeDetector and VideoDeepfakeDetector classes from above
-# OR use import statements if you structured it as a package:
-# from model.audio_detector import AudioDeepfakeDetector, extract_audio_features, predict_audio
-# from model.video_detector import VideoDeepfakeDetector, predict_video
-# For Spaces, we'll use lightweight pretrained models from HuggingFace Hub
-# as fallback if custom models aren't trained yet:
-from transformers import pipeline
-# Load pre-trained audio classifier (for demo)
-# For production, replace with your trained model weights
-AUDIO_MODEL_PATH = "audio_model.pth"
-VIDEO_MODEL_PATH = "video_model.pth"
-USE_TRAINED_MODEL = os.path.exists(AUDIO_MODEL_PATH)
-# ─── Fallback: Use transformers pipeline ─────────────────────────────────────
-def analyze_audio_transformers(audio_path):
-    """
-    Uses a HuggingFace pipeline for audio classification.
-    Replace this with your trained model for better accuracy.
-    """
-    try:
-        classifier = pipeline(
-            "audio-classification",
-            model="facebook/wav2vec2-base",
-            # For real deepfake detection use:
-            # model="mo-aqrabi/deepfake-audio-detection"
-        )
-        # This is a placeholder — replace with actual deepfake model
-        # For now returns heuristic based on spectral analysis
-        y, sr = librosa.load(audio_path, sr=16000, duration=5)
-        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
-        zcr = librosa.feature.zero_crossing_rate(y)
-        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
-        # Heuristic: AI audio tends to have very regular ZCR and centroid
-        zcr_regularity = 1 - (np.std(zcr) / (np.mean(zcr) + 1e-6))
-        spectral_regularity = 1 - (np.std(spectral_centroid) / (np.mean(spectral_centroid) + 1e-6))
-        # Combine into a rough AI score
-        ai_score = np.clip((zcr_regularity + spectral_regularity) / 2, 0, 1)
-        return float(ai_score)
-    except Exception as e:
-        print(f"Fallback audio analysis error: {e}")
-        return 0.5  # Uncertain
-# ─── Main Detection Functions ─────────────────────────────────────────────────
 def detect_audio(audio_file):
-    """Full audio analysis pipeline."""
     if audio_file is None:
         return None, "❌ No audio file provided", None
     try:
-        if USE_TRAINED_MODEL:
-            score = predict_audio(audio_file, AUDIO_MODEL_PATH)
         else:
-            score = analyze_audio_transformers(audio_file)
         percentage = score * 100
         verdict = "🤖 AI GENERATED" if score > 0.5 else "✅ REAL / HUMAN"
         confidence = max(score, 1 - score) * 100
-        # Generate waveform + spectrogram plot
-        y, sr = librosa.load(audio_file, sr=16000, duration=10)
-        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 6))
-        fig.patch.set_facecolor('#0d0d0d')
-        # Waveform
-        ax1.set_facecolor('#1a1a2e')
-        librosa.display.waveshow(y, sr=sr, ax=ax1, color='#00d4ff')
-        ax1.set_title('Audio Waveform', color='white', fontsize=12)
-        ax1.tick_params(colors='white')
-        # Mel Spectrogram
-        ax2.set_facecolor('#1a1a2e')
-        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
-        mel_db = librosa.power_to_db(mel, ref=np.max)
-        img = librosa.display.specshow(mel_db, sr=sr, ax=ax2,
-                                        x_axis='time', y_axis='mel', cmap='magma')
-        plt.colorbar(img, ax=ax2, format='%+2.0f dB')
-        ax2.set_title('Mel Spectrogram', color='white', fontsize=12)
-        ax2.tick_params(colors='white')
-        ax2.yaxis.label.set_color('white')
-        ax2.xaxis.label.set_color('white')
-        plt.tight_layout()
         plot_path = '/tmp/audio_analysis.png'
-        plt.savefig(plot_path, facecolor='#0d0d0d', bbox_inches='tight')
-        plt.close()
         result_text = f"""
-## 🔊 Audio Analysis Result
 | Metric | Value |
 |--------|-------|
 | **AI Probability** | {percentage:.1f}% |
 | **Verdict** | {verdict} |
 | **Confidence** | {confidence:.1f}% |
-| **Status** | {'⚠️ HIGH RISK' if percentage > 75 else '🟡 SUSPICIOUS' if percentage > 50 else '🟢 LIKELY REAL'} |
 ### Interpretation
-- **0-30%**: Very likely genuine/human-created
-- **30-50%**: Possibly human, some AI characteristics
-- **50-70%**: Likely AI-generated, needs verification
-- **70-100%**: Almost certainly AI-generated
 """
         return plot_path, result_text, f"{percentage:.1f}%"
@@ -132,72 +446,46 @@ def detect_audio(audio_file):
 def detect_video(video_file):
-    """Full video analysis pipeline."""
     if video_file is None:
         return None, "❌ No video file provided", None
     try:
-        if USE_TRAINED_MODEL and os.path.exists(VIDEO_MODEL_PATH):
-            result = predict_video(video_file, VIDEO_MODEL_PATH)
-            score = result['final_score']
-            frame_scores = result['per_frame_scores']
-            temporal = result['temporal_score']
-        else:
-            # Fallback: basic frame analysis
-            score, frame_scores, temporal = analyze_video_basic(video_file)
         percentage = score * 100
         verdict = "🤖 AI GENERATED" if score > 0.5 else "✅ REAL / HUMAN"
-        # Generate frame score visualization
-        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
-        fig.patch.set_facecolor('#0d0d0d')
-        # Frame scores over time
-        ax1.set_facecolor('#1a1a2e')
-        frames_x = range(len(frame_scores))
-        ax1.plot(frames_x, [f * 100 for f in frame_scores], 'cyan', linewidth=2)
-        ax1.fill_between(frames_x, [f * 100 for f in frame_scores], alpha=0.3, color='cyan')
-        ax1.axhline(y=50, color='red', linestyle='--', alpha=0.7, label='Threshold (50%)')
-        ax1.set_facecolor('#1a1a2e')
-        ax1.set_xlabel('Frame', color='white')
-        ax1.set_ylabel('AI Score (%)', color='white')
-        ax1.set_title('Per-Frame AI Score', color='white')
-        ax1.tick_params(colors='white')
-        ax1.legend(facecolor='#1a1a2e', labelcolor='white')
-        ax1.set_ylim(0, 100)
-        # Score breakdown pie
-        ax2.set_facecolor('#1a1a2e')
-        sizes = [score * 100, (1 - score) * 100]
-        colors_pie = ['#FF4444', '#00CC44']
-        wedges, texts, autotexts = ax2.pie(sizes, labels=['AI Generated', 'Real/Human'],
-                                            colors=colors_pie, autopct='%1.1f%%',
-                                            textprops={'color': 'white', 'fontsize': 12},
-                                            startangle=90)
-        ax2.set_title(f'Final Verdict: {verdict}', color='white', fontsize=13)
-        plt.tight_layout()
         plot_path = '/tmp/video_analysis.png'
-        plt.savefig(plot_path, facecolor='#0d0d0d', bbox_inches='tight')
-        plt.close()
         result_text = f"""
-## 🎬 Video Analysis Result
 | Metric | Value |
 |--------|-------|
 | **AI Probability** | {percentage:.1f}% |
 | **Verdict** | {verdict} |
 | **Frames Analyzed** | {len(frame_scores)} |
-| **Temporal Inconsistency** | {temporal*100:.1f}% |
-| **Status** | {'⚠️ HIGH RISK' if percentage > 75 else '🟡 SUSPICIOUS' if percentage > 50 else '🟢 LIKELY REAL'} |
-### What we checked:
-- ✓ Face region analysis per frame
-- ✓ Temporal consistency between frames
-- ✓ Artifact patterns typical of AI generation
-- ✓ Natural motion flow analysis
 """
         return plot_path, result_text, f"{percentage:.1f}%"
@@ -205,41 +493,9 @@ def detect_video(video_file):
         return None, f"❌ Error analyzing video: {str(e)}", "N/A"
-def analyze_video_basic(video_path):
-    """Basic video analysis fallback without trained model."""
-    cap = cv2.VideoCapture(video_path)
-    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    indices = np.linspace(0, total - 1, 20, dtype=int)
-    frame_scores = []
-    prev_gray = None
-    for idx in indices:
-        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
-        ret, frame = cap.read()
-        if not ret:
-            continue
-        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-        if prev_gray is not None:
-            # Check for unnatural sharpness transitions (common in deepfakes)
-            diff = cv2.absdiff(gray, prev_gray)
-            score = float(np.std(diff)) / 50.0
-            score = np.clip(score, 0, 1)
-            frame_scores.append(score)
-        prev_gray = gray
-    cap.release()
-    if not frame_scores:
-        return 0.5, [0.5], 0.5
-    avg_score = np.mean(frame_scores)
-    temporal = float(np.std(frame_scores))
-    return avg_score, frame_scores, min(temporal, 1.0)
-# ─── Gradio UI ────────────────────────────────────────────────────────────────
 def build_ui():
     with gr.Blocks(
@@ -249,19 +505,22 @@ def build_ui():
             neutral_hue="slate",
         ),
         css="""
-        .gradio-container { max-width: 1100px; margin: auto; }
         .result-box { border-radius: 12px; padding: 16px; }
-        h1 { text-align: center; }
         .score-display { font-size: 48px; font-weight: bold; text-align: center; }
         """
     ) as demo:
         gr.HTML("""
-        <div style="text-align:center; padding: 20px 0;">
-            <h1 style="font-size: 2.5em; font-weight: 800;">🔍 DeepFake AI Detector</h1>
-            <p style="font-size: 1.1em; color: #888;">
-                Upload audio or video to detect AI generation.<br>
-                Get a precise percentage score of how much AI was used.
             </p>
         </div>
         """)
@@ -269,19 +528,23 @@ def build_ui():
         with gr.Tabs():
             # ── Audio Tab ──────────────────────────────────────────────────────
-            with gr.TabItem("🔊 Audio Detection"):
-                gr.Markdown("### Upload an audio file to check if it's AI-generated")
                 with gr.Row():
                     with gr.Column(scale=1):
                         audio_input = gr.Audio(
-                            label="Upload Audio (MP3, WAV, M4A)",
                             type="filepath"
                         )
                         audio_btn = gr.Button("🔍 Analyze Audio", variant="primary", size="lg")
                     with gr.Column(scale=2):
                         audio_score = gr.Label(label="🎯 AI Score", num_top_classes=1)
-                        audio_plot = gr.Image(label="📊 Audio Analysis")
                         audio_result = gr.Markdown(label="📋 Detailed Report")
                 audio_btn.click(
@@ -290,21 +553,19 @@ def build_ui():
                     outputs=[audio_plot, audio_result, audio_score]
                 )
-                gr.Examples(
-                    examples=[],
-                    inputs=audio_input,
-                    label="Example files (add your own samples)"
-                )
             # ── Video Tab ──────────────────────────────────────────────────────
-            with gr.TabItem("🎬 Video Detection"):
-                gr.Markdown("### Upload a video file to check if it's a deepfake")
                 with gr.Row():
                     with gr.Column(scale=1):
                         video_input = gr.Video(
                             label="Upload Video (MP4, AVI, MOV)",
                         )
                         video_btn = gr.Button("🔍 Analyze Video", variant="primary", size="lg")
                     with gr.Column(scale=2):
                         video_score = gr.Label(label="🎯 AI Score", num_top_classes=1)
@@ -318,38 +579,50 @@ def build_ui():
                 )
             # ── About Tab ─────────────────────────────────────────────────────
-            with gr.TabItem("ℹ️ How It Works"):
                 gr.Markdown("""
-## 🧠 Detection Methodology
-### Audio Analysis
-| Feature | What it detects |
-|---------|----------------|
-| MFCC (40 coefficients) | Unnatural vocal tract patterns |
-| Mel Spectrogram | Frequency distribution anomalies |
-| Zero Crossing Rate | Overly smooth AI transitions |
-| Spectral Centroid | Frequency center shifts |
-| Tonnetz | Harmonic content irregularities |
-### Video Analysis
-| Method | What it detects |
-|--------|----------------|
-| EfficientNet-B4 CNN | Per-frame visual artifacts |
-| Optical Flow | Temporal inconsistencies |
-| Face Detection | Blending boundary anomalies |
-| Ensemble Scoring | Combined confidence score |
 ### Score Interpretation
-- **0-30%**: 🟢 Very likely genuine
-- **30-50%**: 🟡 Some AI characteristics, inconclusive
-- **50-70%**: 🟠 Likely AI-generated
-- **70-100%**: 🔴 Almost certainly AI-generated
 ### ⚠️ Limitations
-- No detector is 100% accurate
-- Newer AI models may evade detection
-- Low quality media reduces accuracy
-- Always combine with human judgment
                 """)
     return demo
@@ -357,8 +630,4 @@ def build_ui():
 if __name__ == "__main__":
     demo = build_ui()
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True  # Creates public URL for testing
-    )

+# ═══════════════════════════════════════════════════════════════════════════════
+# 🔍 DeepFake AI Detector — AI Voice & Video Forensics System v5
+# Hugging Face Spaces • Gradio • CPU-Optimized
+# ═══════════════════════════════════════════════════════════════════════════════
 import gradio as gr
 import torch
 import numpy as np
 import librosa
 import cv2
 import os
 import matplotlib
 matplotlib.use('Agg')
+import matplotlib.pyplot as plt
 from PIL import Image
 import warnings
+import soundfile as sf
+from scipy import stats
+from scipy.signal import hilbert
+warnings.filterwarnings('ignore')
+# ───────────────────────────────────────────────────────────────────────────────
+# 🧠 CUSTOM MODEL INTEGRATION (Paste your Colab model here)
+# ───────────────────────────────────────────────────────────────────────────────
+USE_CUSTOM_MODEL = False          # ← Set True when you add your own .pth
+CUSTOM_MODEL_PATH = "audio_model.pth"
+def load_custom_model(model_path: str):
+    """Load a user-supplied trained model (placeholder).
+
+    This is a stub: until the TODO below is filled in, it ignores
+    ``model_path`` and always returns ``None``.
+
+    Args:
+        model_path: Path to the checkpoint file to load (unused by the stub).
+
+    Returns:
+        ``None`` in the current stub. The intended contract is a callable
+        model object once the loading code has been pasted in.
+    """
+    # TODO: Paste your Colab model loading code here
+    # Example:
+    # checkpoint = torch.load(model_path, map_location='cpu')
+    # model = YourModelClass(...)
+    # model.load_state_dict(checkpoint)
+    # model.eval()
+    # return model
+    return None
+def predict_audio_custom(audio_path: str) -> float:
+    """Run the user-supplied inference pipeline on one audio file (placeholder).
+
+    Args:
+        audio_path: Path to the audio file to classify.
+
+    Returns:
+        Intended contract: AI probability in [0, 1] (0.0 = real, 1.0 = fake).
+
+    Raises:
+        NotImplementedError: Always, until the TODO steps below are implemented.
+    """
+    # TODO: Paste your Colab prediction code here
+    # 1. Load audio (librosa/soundfile)
+    # 2. Extract features exactly as in training
+    # 3. Forward pass
+    # 4. Return AI probability (0.0 = real, 1.0 = fake)
+    raise NotImplementedError("Paste your model code or disable USE_CUSTOM_MODEL")
+# ───────────────────────────────────────────────────────────────────────────────
+# 🔊 AUDIO FORENSICS ENSEMBLE (Spectral + Statistical)
+# ───────────────────────────────────────────────────────────────────────────────
+class AudioForensicsEnsemble:
+    """Heuristic (untrained) audio scorer built on hand-crafted features.
+
+    The audio is cut into overlapping frames; each frame is reduced to a
+    feature dict and mapped to a score in [0, 1] by fixed, hand-tuned
+    weights. Frame scores are then combined with their own spread into one
+    clip-level "AI likelihood". No learned model is involved.
+    """
+    def __init__(self, sr: int = 16000):
+        self.sr = sr                  # sample rate the audio is resampled to on load
+        self.frame_dur = 0.5          # seconds per analysis frame
+        self.hop_dur = 0.25           # seconds hop length
+    def _extract_frame_features(self, y: np.ndarray) -> dict:
+        """Extract forensic features from a single audio frame.
+
+        Args:
+            y: Samples of one analysis frame (``detect`` passes slices of
+                the mono signal loaded at ``self.sr``).
+
+        Returns:
+            Dict of summary statistics. The ``mfcc_*`` entries are arrays
+            with one value per MFCC coefficient; every other entry is a
+            scalar.
+        """
+        sr = self.sr
+        feats = {}
+        # 1. MFCC + derivatives (timbre / vocal tract)
+        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40, n_fft=512, hop_length=256)
+        feats['mfcc_mean'] = np.mean(mfcc, axis=1)
+        feats['mfcc_std'] = np.std(mfcc, axis=1)
+        delta = librosa.feature.delta(mfcc)
+        feats['mfcc_delta_std'] = np.std(delta, axis=1)
+        # 2. Zero-crossing rate (temporal crispness)
+        zcr = librosa.feature.zero_crossing_rate(y, hop_length=256)[0]
+        feats['zcr_mean'] = np.mean(zcr)
+        feats['zcr_std'] = np.std(zcr)
+        # 3. Spectral moments
+        spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=256)[0]
+        spec_band = librosa.feature.spectral_bandwidth(y=y, sr=sr, hop_length=256)[0]
+        spec_roll = librosa.feature.spectral_rolloff(y=y, sr=sr, hop_length=256)[0]
+        spec_flat = librosa.feature.spectral_flatness(y=y, hop_length=256)[0]
+        feats['centroid_mean'] = np.mean(spec_cent)
+        feats['centroid_std'] = np.std(spec_cent)
+        feats['bandwidth_mean'] = np.mean(spec_band)
+        feats['rolloff_mean'] = np.mean(spec_roll)
+        feats['flatness_mean'] = np.mean(spec_flat)
+        feats['flatness_std'] = np.std(spec_flat)
+        # 4. RMS energy dynamics
+        rms = librosa.feature.rms(y=y, hop_length=256)[0]
+        feats['rms_mean'] = np.mean(rms)
+        feats['rms_std'] = np.std(rms)
+        # 5. Chroma (harmonic content)
+        chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=256)
+        feats['chroma_std'] = np.std(chroma, axis=1).mean()
+        # 6. Spectral contrast (periodicity vs noise)
+        contrast = librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=256)
+        feats['contrast_std'] = np.std(contrast, axis=1).mean()
+        # 7. Tonnetz (harmonic network)
+        tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
+        feats['tonnetz_std'] = np.std(tonnetz, axis=1).mean()
+        # 8. Phase coherence via analytic signal
+        analytic = hilbert(y)
+        phase = np.unwrap(np.angle(analytic))
+        feats['phase_std'] = np.std(np.diff(phase))
+        return feats
+    def _score_frame(self, feats: dict) -> float:
+        """Map one frame's feature dict to a heuristic score in [0, 1].
+
+        Higher means "more regular", which this heuristic treats as more
+        likely AI-generated. Five sub-scores are weighted 0.35 / 0.15 /
+        0.20 / 0.15 / 0.15 (summing to 1.0) and the total is clipped.
+
+        Args:
+            feats: Dict as produced by ``_extract_frame_features``. Only the
+                keys read below are required.
+
+        Returns:
+            The clipped weighted sum as a plain Python ``float``.
+        """
+        scores = []
+        # A. Regularity: each term is 1 - (spread / level), floored at 0.
+        #    The 1e-6 keeps the ratio finite when the level is zero.
+        regularity = 0.0
+        regularity += 1.0 - min(feats['zcr_std'] / (feats['zcr_mean'] + 1e-6), 1.0)
+        regularity += 1.0 - min(feats['centroid_std'] / (feats['centroid_mean'] + 1e-6), 1.0)
+        regularity += 1.0 - min(feats['mfcc_delta_std'].mean() / (np.abs(feats['mfcc_mean']).mean() + 1e-6), 1.0)
+        regularity /= 3.0
+        scores.append(regularity * 0.35)
+        # B. Spectral flatness: steep logistic centred on 0.15
+        flatness_sigmoid = 1.0 / (1.0 + np.exp(-20 * (feats['flatness_mean'] - 0.15)))
+        scores.append(flatness_sigmoid * 0.15)
+        # C. Phase coherence: lower phase-increment spread scores higher
+        phase_score = 1.0 - min(feats['phase_std'] / 5.0, 1.0)
+        scores.append(phase_score * 0.20)
+        # D. Harmonic regularity: chroma + tonnetz uniformity
+        harmonic_reg = feats['chroma_std'] + feats['tonnetz_std']
+        harmonic_score = 1.0 - min(harmonic_reg / 0.3, 1.0)
+        scores.append(harmonic_score * 0.15)
+        # E. Energy dynamics: low RMS variation relative to level scores higher
+        dynamic_score = 1.0 - min(feats['rms_std'] / (feats['rms_mean'] + 1e-6), 1.0)
+        scores.append(dynamic_score * 0.15)
+        # Cast so callers get a built-in float rather than a NumPy scalar.
+        return float(np.clip(sum(scores), 0.0, 1.0))
+    def detect(self, audio_path: str):
+        """Run the full frame-wise analysis on an audio file.
+
+        Only the first 60 seconds are loaded. Clips shorter than 2 seconds
+        are zero-padded to 2 seconds before framing.
+
+        Args:
+            audio_path: Path to an audio file readable by ``librosa.load``.
+
+        Returns:
+            Tuple ``(ai_likelihood, frame_scores, temporal_score, y)``:
+            the clip-level score in [0, 1], the list of per-frame scores,
+            the scaled spread of those scores in [0, 1], and the (possibly
+            padded) waveform that was analysed.
+        """
+        y, sr = librosa.load(audio_path, sr=self.sr, mono=True, duration=60)
+        min_len = self.sr * 2
+        if len(y) < min_len:
+            # Pad short clips.
+            # NOTE(review): all-zero features drive every _score_frame term to
+            # its maximum, so padded silence presumably biases short clips
+            # towards "AI" - confirm and consider skipping padded frames.
+            y = np.pad(y, (0, min_len - len(y)))
+        frame_len = int(self.frame_dur * sr)
+        hop_len = int(self.hop_dur * sr)
+        # "+ 1" keeps the last full frame when it ends exactly at the end of
+        # the signal (start == len(y) - frame_len).
+        frame_scores = [
+            self._score_frame(self._extract_frame_features(y[start:start + frame_len]))
+            for start in range(0, len(y) - frame_len + 1, hop_len)
+        ]
+        if not frame_scores:
+            # Hand back the waveform (not None) so callers can still plot it.
+            return 0.5, [0.5], 0.5, y
+        # Spread of the frame scores, scaled so a std of 0.4 saturates at 1.0
+        temporal_std = np.std(frame_scores)
+        temporal_score = float(np.clip(temporal_std * 2.5, 0.0, 1.0))
+        avg_score = np.mean(frame_scores)
+        # High temporal variance → likely real (humans are irregular)
+        # Low temporal variance + high frame score → likely AI
+        ai_likelihood = avg_score * 0.7 + (1.0 - temporal_score) * 0.3
+        return float(np.clip(ai_likelihood, 0.0, 1.0)), frame_scores, temporal_score, y
+# Global ensemble instance
+AUDIO_ENSEMBLE = AudioForensicsEnsemble()
+# ───────────────────────────────────────────────────────────────────────────────
+# 🎬 VIDEO FORENSICS (Optical Flow + Frame Artifact Detection)
+# ───────────────────────────────────────────────────────────────────────────────
+class VideoForensics:
+    """Heuristic (untrained) video scorer based on OpenCV primitives.
+
+    Samples up to ``n_frames`` evenly spaced frames and scores each one from
+    three fixed-threshold cues: dense optical-flow spread against the
+    previously sampled frame, Laplacian sharpness inside Haar-detected face
+    boxes, and whole-frame Laplacian variance.
+    """
+    def __init__(self, n_frames: int = 24):
+        self.n_frames = n_frames      # upper bound on frames sampled per video
+    def detect(self, video_path: str):
+        """Score a video file.
+
+        Args:
+            video_path: Path to a video readable by ``cv2.VideoCapture``.
+
+        Returns:
+            Tuple ``(final_score, frame_scores, temporal)``: the clip-level
+            score in [0, 1], the list of per-frame scores, and the standard
+            deviation of those scores clipped to [0, 1]. When the frame
+            count is not positive or no frame can be read, the neutral
+            ``(0.5, [0.5], 0.5)`` is returned.
+        """
+        cap = cv2.VideoCapture(video_path)
+        frame_scores = []
+        # try/finally: release the capture on every exit path, including the
+        # early return below and any OpenCV error raised mid-loop.
+        try:
+            total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+            if total <= 0:
+                return 0.5, [0.5], 0.5
+            indices = np.linspace(0, total - 1, min(self.n_frames, total), dtype=int)
+            face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
+            prev_gray = None
+            prev_faces = None
+            for idx in indices:
+                cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
+                ret, frame = cap.read()
+                if not ret:
+                    continue
+                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+                # 1. Temporal inconsistency via optical flow magnitude
+                flow_score = 0.0
+                if prev_gray is not None:
+                    flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None,
+                                                         pyr_scale=0.5, levels=3,
+                                                         winsize=15, iterations=3,
+                                                         poly_n=5, poly_sigma=1.2,
+                                                         flags=0)
+                    mag, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
+                    flow_std = np.std(mag)
+                    # Unnatural smoothness or extreme jitter
+                    if flow_std < 0.5:
+                        flow_score = 0.6  # Too static
+                    elif flow_std > 5.0:
+                        flow_score = 0.4  # Too jittery
+                    else:
+                        flow_score = 0.2
+                # 2. Face boundary artifacts
+                face_score = 0.0
+                faces = face_cascade.detectMultiScale(gray, 1.1, 4)
+                for (x, y, fw, fh) in faces:
+                    face_roi = gray[y:y+fh, x:x+fw]
+                    # Sharpness inside the detected face box
+                    laplacian_var = cv2.Laplacian(face_roi, cv2.CV_64F).var()
+                    if laplacian_var > 1000:
+                        face_score = max(face_score, 0.3)  # Over-sharpened
+                    # NOTE(review): prev_faces is assigned on every processed
+                    # frame, so this is true for every frame with a face
+                    # after the first one - it acts as a constant 0.2 floor,
+                    # not a face-swap check. Kept as-is; confirm the intent.
+                    if prev_faces is not None:
+                        face_score = max(face_score, 0.2)
+                # 3. Noise pattern analysis
+                noise = cv2.Laplacian(gray, cv2.CV_64F).var()
+                noise_score = 0.0
+                if noise < 50:
+                    noise_score = 0.4  # Too clean = suspicious
+                # Maximum possible sum is 0.6 + 0.3 + 0.4; dividing by 1.2 and
+                # clipping keeps the frame score in [0, 1].
+                combined = np.clip((flow_score + face_score + noise_score) / 1.2, 0.0, 1.0)
+                frame_scores.append(float(combined))
+                prev_gray = gray
+                prev_faces = faces
+        finally:
+            cap.release()
+        if not frame_scores:
+            return 0.5, [0.5], 0.5
+        avg_score = np.mean(frame_scores)
+        temporal = float(np.std(frame_scores))
+        # Low spread across frames raises the final score.
+        temporal_penalty = 1.0 - min(temporal * 2.0, 1.0)
+        final = avg_score * 0.6 + temporal_penalty * 0.4
+        return float(np.clip(final, 0.0, 1.0)), frame_scores, float(np.clip(temporal, 0.0, 1.0))
+VIDEO_ENSEMBLE = VideoForensics()
+# ───────────────────────────────────────────────────────────────────────────────
+# 🖼️ VISUALIZATION HELPERS
+# ───────────────────────────────────────────────────────────────────────────────
+def _apply_dark_theme(ax, title, xlabel, ylabel, title_size=13):
+    """Apply the shared dark styling (background, white text, dim spines) to *ax*."""
+    ax.set_facecolor('#1a1a2e')
+    ax.set_title(title, color='white', fontsize=title_size, fontweight='bold')
+    ax.set_xlabel(xlabel, color='white')
+    ax.set_ylabel(ylabel, color='white')
+    ax.tick_params(colors='white')
+    for spine in ax.spines.values():
+        spine.set_color('#333')
+def plot_audio_analysis(y, sr, frame_scores, hop_dur, save_path: str):
+    """Render the four-panel audio report image and save it to disk.
+
+    Panels: waveform, mel spectrogram, per-frame score over time, and a
+    histogram of the per-frame scores.
+
+    Args:
+        y: Waveform samples to draw.
+        sr: Sample rate of ``y`` in Hz.
+        frame_scores: Per-frame scores in [0, 1]; plotted as percentages.
+        hop_dur: Seconds between consecutive frame scores (x-axis spacing).
+        save_path: Destination path of the image file.
+
+    Returns:
+        ``save_path``, unchanged.
+    """
+    # Explicit submodule import: on older librosa releases a bare
+    # `import librosa` does not make `librosa.display` available.
+    import librosa.display
+    fig = plt.figure(figsize=(12, 8), facecolor='#0d0d0d')
+    # Close this specific figure on every exit path so a failed render does
+    # not leak figures in a long-running server process.
+    try:
+        gs = fig.add_gridspec(3, 2, hspace=0.35, wspace=0.25)
+        # Waveform
+        ax1 = fig.add_subplot(gs[0, :])
+        librosa.display.waveshow(y, sr=sr, ax=ax1, color='#00d4ff', alpha=0.9)
+        _apply_dark_theme(ax1, 'Audio Waveform', 'Time (s)', 'Amplitude')
+        # Mel Spectrogram
+        ax2 = fig.add_subplot(gs[1, :])
+        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, n_fft=2048, hop_length=512)
+        mel_db = librosa.power_to_db(mel, ref=np.max)
+        img = librosa.display.specshow(mel_db, sr=sr, ax=ax2, x_axis='time', y_axis='mel', cmap='magma')
+        cbar = fig.colorbar(img, ax=ax2, format='%+2.0f dB', fraction=0.046)
+        cbar.ax.yaxis.set_tick_params(color='white')
+        plt.setp(cbar.ax.get_yticklabels(), color='white')
+        _apply_dark_theme(ax2, 'Mel Spectrogram', 'Time (s)', 'Mel Frequency')
+        # Frame scores over time
+        percents = [f * 100 for f in frame_scores]
+        times = np.arange(len(frame_scores)) * hop_dur
+        ax3 = fig.add_subplot(gs[2, 0])
+        ax3.plot(times, percents, color='#ff6b6b', linewidth=2, marker='o', markersize=3)
+        ax3.axhline(y=50, color='#ffd93d', linestyle='--', alpha=0.7, label='Threshold')
+        ax3.fill_between(times, percents, alpha=0.2, color='#ff6b6b')
+        ax3.set_ylim(0, 100)
+        _apply_dark_theme(ax3, 'Per-Frame AI Probability', 'Time (s)', 'AI Score (%)', title_size=12)
+        ax3.legend(facecolor='#1a1a2e', labelcolor='white')
+        # Score distribution
+        ax4 = fig.add_subplot(gs[2, 1])
+        ax4.hist(percents, bins=12, color='#4ecdc4', edgecolor='white', alpha=0.8)
+        ax4.axvline(x=50, color='#ffd93d', linestyle='--', alpha=0.7)
+        _apply_dark_theme(ax4, 'Score Distribution', 'AI Score (%)', 'Frame Count', title_size=12)
+        fig.savefig(save_path, facecolor='#0d0d0d', bbox_inches='tight', dpi=120)
+    finally:
+        plt.close(fig)
+    return save_path
+def plot_video_analysis(frame_scores, save_path: str):
+    """Render the two-panel video report image and save it to disk.
+
+    Left panel: per-frame score (as a percentage) against frame index.
+    Right panel: pie chart of the mean frame score with a verdict title.
+
+    NOTE(review): the verdict drawn here is derived from the plain mean of
+    ``frame_scores``, whereas ``VideoForensics.detect`` returns a final score
+    that also mixes in a temporal term, so the two can disagree near 50%.
+
+    Args:
+        frame_scores: Non-empty sequence of per-frame scores in [0, 1].
+        save_path: Destination path of the image file.
+
+    Returns:
+        ``save_path``, unchanged.
+    """
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), facecolor='#0d0d0d')
+    # Close this specific figure on every exit path so a failed render does
+    # not leak figures in a long-running server process.
+    try:
+        # Frame scores
+        percents = [f * 100 for f in frame_scores]
+        frames_x = range(len(frame_scores))
+        ax1.set_facecolor('#1a1a2e')
+        ax1.plot(frames_x, percents, color='#00d4ff', linewidth=2.5)
+        ax1.fill_between(frames_x, percents, alpha=0.25, color='#00d4ff')
+        ax1.axhline(y=50, color='#ff6b6b', linestyle='--', alpha=0.7, label='Threshold (50%)')
+        ax1.set_xlabel('Frame Index', color='white', fontsize=11)
+        ax1.set_ylabel('AI Score (%)', color='white', fontsize=11)
+        ax1.set_title('Per-Frame Deepfake Score', color='white', fontsize=13, fontweight='bold')
+        ax1.tick_params(colors='white')
+        ax1.legend(facecolor='#1a1a2e', labelcolor='white')
+        ax1.set_ylim(0, 100)
+        for spine in ax1.spines.values():
+            spine.set_color('#333')
+        # Pie chart of the mean frame score
+        ax2.set_facecolor('#1a1a2e')
+        avg = np.mean(frame_scores)
+        ax2.pie(
+            [avg * 100, (1 - avg) * 100],
+            labels=['AI Generated', 'Real / Human'],
+            colors=['#ff6b6b', '#4ecdc4'], autopct='%1.1f%%',
+            textprops={'color': 'white', 'fontsize': 12},
+            startangle=90, explode=(0.02, 0.02)
+        )
+        verdict = "🤖 AI GENERATED" if avg > 0.5 else "✅ REAL / HUMAN"
+        ax2.set_title(f'Verdict: {verdict}', color='white', fontsize=13, fontweight='bold')
+        for spine in ax2.spines.values():
+            spine.set_color('#333')
+        fig.tight_layout()
+        fig.savefig(save_path, facecolor='#0d0d0d', bbox_inches='tight', dpi=120)
+    finally:
+        plt.close(fig)
+    return save_path
+# ───────────────────────────────────────────────────────────────────────────────
+# 🚀 MAIN DETECTION PIPELINES
+# ───────────────────────────────────────────────────────────────────────────────
 def detect_audio(audio_file):
     if audio_file is None:
         return None, "❌ No audio file provided", None
     try:
+        # Route to custom model if enabled and available
+        if USE_CUSTOM_MODEL and os.path.exists(CUSTOM_MODEL_PATH):
+            score = predict_audio_custom(audio_file)
+            y, sr = librosa.load(audio_file, sr=16000, duration=10)
+            frame_scores = [score]  # Single score for custom models
+            temporal = 0.5
         else:
+            score, frame_scores, temporal, y = AUDIO_ENSEMBLE.detect(audio_file)
+            sr = AUDIO_ENSEMBLE.sr
         percentage = score * 100
         verdict = "🤖 AI GENERATED" if score > 0.5 else "✅ REAL / HUMAN"
         confidence = max(score, 1 - score) * 100
         plot_path = '/tmp/audio_analysis.png'
+        plot_audio_analysis(y, sr, frame_scores, AUDIO_ENSEMBLE.hop_dur, plot_path)
+        status = '🔴 HIGH RISK' if percentage > 75 else '🟠 SUSPICIOUS' if percentage > 50 else '🟢 LIKELY REAL'
         result_text = f"""
+## 🔊 Audio Forensics Report
 | Metric | Value |
 |--------|-------|
 | **AI Probability** | {percentage:.1f}% |
 | **Verdict** | {verdict} |
 | **Confidence** | {confidence:.1f}% |
+| **Temporal Regularity** | {temporal*100:.1f}% |
+| **Status** | {status} |
+### Methodology
+| Feature | What it detects |
+|---------|----------------|
+| MFCC (40-dim) | Vocal tract anomalies |
+| Spectral Centroid | Frequency center shifts |
+| Zero Crossing Rate | Overly smooth transitions |
+| Phase Coherence | Synthetic phase patterns |
+| Spectral Flatness | Unnatural noise floor |
+| Chroma / Tonnetz | Harmonic irregularities |
 ### Interpretation
+- **0–30%**: Very likely genuine human speech
+- **30–50%**: Some synthetic characteristics, inconclusive
+- **50–75%**: Likely AI-generated, manual review recommended
+- **75–100%**: Strong indicators of synthetic audio
 """
         return plot_path, result_text, f"{percentage:.1f}%"
 def detect_video(video_file):
     if video_file is None:
         return None, "❌ No video file provided", None
     try:
+        score, frame_scores, temporal = VIDEO_ENSEMBLE.detect(video_file)
         percentage = score * 100
         verdict = "🤖 AI GENERATED" if score > 0.5 else "�� REAL / HUMAN"
+        confidence = max(score, 1 - score) * 100
         plot_path = '/tmp/video_analysis.png'
+        plot_video_analysis(frame_scores, plot_path)
+        status = '🔴 HIGH RISK' if percentage > 75 else '🟠 SUSPICIOUS' if percentage > 50 else '🟢 LIKELY REAL'
         result_text = f"""
+## 🎬 Video Forensics Report
 | Metric | Value |
 |--------|-------|
 | **AI Probability** | {percentage:.1f}% |
 | **Verdict** | {verdict} |
+| **Confidence** | {confidence:.1f}% |
 | **Frames Analyzed** | {len(frame_scores)} |
+| **Temporal Variance** | {temporal*100:.1f}% |
+| **Status** | {status} |
+### Detection Methods
+| Method | What it detects |
+|--------|----------------|
+| Optical Flow | Unnatural motion between frames |
+| Face Detection | Blending boundary artifacts |
+| Laplacian Variance | Over-smoothing / over-sharpening |
+| Temporal Consistency | Frame-to-frame irregularities |
+### Interpretation
+- **0–30%**: 🟢 Very likely genuine
+- **30–50%**: 🟡 Some AI characteristics
+- **50–75%**: 🟠 Likely deepfake
+- **75–100%**: 🔴 Strong deepfake indicators
 """
         return plot_path, result_text, f"{percentage:.1f}%"
         return None, f"❌ Error analyzing video: {str(e)}", "N/A"
+# ───────────────────────────────────────────────────────────────────────────────
+# 🎨 GRADIO UI
+# ───────────────────────────────────────────────────────────────────────────────
 # Build the Gradio Blocks interface and return it without launching it.
 #
 # The visible layout is a centered HTML header followed by three tabs:
 # "Audio Detection" (id=0), "Video Detection" (id=1) and "How It Works"
 # (id=2). Each detection tab has an upload widget and an analyze button in
 # the left column and the result widgets in the right column.
 #
 # Returns:
 #     The gr.Blocks object bound to `demo`.
 def build_ui():
     with gr.Blocks(
     # NOTE(review): the keyword and constructor that open the argument
     # containing neutral_hue (presumably a theme) are not visible in this
     # excerpt -- confirm against the full app.py.
             neutral_hue="slate",
         ),
         css="""
+        .gradio-container { max-width: 1200px; margin: auto; }
         .result-box { border-radius: 12px; padding: 16px; }
+        h1 { text-align: center; letter-spacing: -0.5px; }
         .score-display { font-size: 48px; font-weight: bold; text-align: center; }
+        .tab-button { font-weight: 600; }
         """
     ) as demo:
         # Page header: title and one-paragraph description.
         gr.HTML("""
+        <div style="text-align:center; padding: 24px 0 12px 0;">
+            <h1 style="font-size: 2.8em; font-weight: 800; margin-bottom: 8px;">
+                🔍 DeepFake AI Detector
+            </h1>
+            <p style="font-size: 1.15em; color: #888; max-width: 600px; margin: auto;">
+                Upload audio or video to detect AI generation via spectral forensics
+                and temporal artifact analysis.
             </p>
         </div>
         """)
         with gr.Tabs():
             # ── Audio Tab ──────────────────────────────────────────────────────
+            with gr.TabItem("🔊 Audio Detection", id=0):
+                gr.Markdown("### Upload an audio file to analyze for synthetic speech")
                 with gr.Row():
                     with gr.Column(scale=1):
                         # type="filepath" asks Gradio to pass the upload as a path.
                         audio_input = gr.Audio(
+                            label="Upload Audio (MP3, WAV, M4A, FLAC)",
                             type="filepath"
                         )
                         audio_btn = gr.Button("🔍 Analyze Audio", variant="primary", size="lg")
+                        gr.Markdown("""
+                        **Supported formats:** WAV, MP3, M4A, FLAC
+                        **Max duration:** 60 seconds (auto-trimmed)
+                        """)
                     with gr.Column(scale=2):
                         audio_score = gr.Label(label="🎯 AI Score", num_top_classes=1)
+                        audio_plot = gr.Image(label="📊 Forensic Visualization")
                         audio_result = gr.Markdown(label="📋 Detailed Report")
                 # Output order is plot, report, score label.
                 # NOTE(review): the callback and input arguments of this click
                 # call are not visible in this excerpt -- confirm they are
                 # detect_audio and audio_input.
                 audio_btn.click(
                     outputs=[audio_plot, audio_result, audio_score]
                 )
             # ── Video Tab ──────────────────────────────────────────────────────
+            with gr.TabItem("🎬 Video Detection", id=1):
+                gr.Markdown("### Upload a video file to check for deepfake artifacts")
                 with gr.Row():
                     with gr.Column(scale=1):
                         video_input = gr.Video(
                             label="Upload Video (MP4, AVI, MOV)",
                         )
                         video_btn = gr.Button("🔍 Analyze Video", variant="primary", size="lg")
+                        gr.Markdown("""
+                        **Supported formats:** MP4, AVI, MOV
+                        **Analyzes:** 24 uniformly sampled frames
+                        """)
                     with gr.Column(scale=2):
                         video_score = gr.Label(label="🎯 AI Score", num_top_classes=1)
                 # NOTE(review): the remaining video result widgets and the
                 # opening of the call closed by the parenthesis below
                 # (presumably video_btn.click) are not visible in this
                 # excerpt -- confirm against the full app.py.
                 )
             # ── About Tab ─────────────────────────────────────────────────────
+            with gr.TabItem("ℹ️ How It Works", id=2):
                 gr.Markdown("""
+## 🧠 Detection Architecture
+### Audio Forensics Ensemble
+This system uses a **multi-feature spectral ensemble** that analyzes:
+| Feature Class | Specific Metrics | Synthetic Indicator |
+|---------------|------------------|---------------------|
+| **Timbre** | 40-dim MFCC + Δ + Δ² | Unnatural vocal tract patterns |
+| **Temporal** | ZCR mean/std | Overly smooth frame transitions |
+| **Spectral** | Centroid, bandwidth, rolloff | Frequency distribution anomalies |
+| **Harmonic** | Chroma, Tonnetz | Artificial harmonic structure |
+| **Phase** | Analytic signal phase std | Reduced phase coherence |
+| **Dynamics** | RMS micro-dynamics | Compressed natural variation |
+**Scoring**: Each 0.5-second frame is scored independently. The final verdict blends
+mean frame probability with temporal variance (real speech is more irregular).
+### Video Forensics
+| Method | Artifact Detected |
+|--------|-------------------|
+| Optical Flow Farneback | Unnatural motion smoothness |
+| Haar Face Detection | Boundary blending errors |
+| Laplacian Variance | Over-sharpening / smoothing |
+| Frame-to-frame StdDev | Temporal inconsistency |
 ### Score Interpretation
+- **0–30%**: 🟢 Very likely genuine
+- **30–50%**: 🟡 Some AI characteristics, inconclusive
+- **50–75%**: 🟠 Likely AI-generated, needs verification
+- **75–100%**: 🔴 Almost certainly AI-generated
 ### ⚠️ Limitations
+- No detector is 100% accurate against adversarial or novel generative models
+- Performance degrades on heavily compressed or low-bitrate media
+- Always combine automated scores with human expert review
+- Maximum audio analysis length: 60 seconds
+### 🔌 Custom Model Integration
+To use your own trained model:
+1. Set `USE_CUSTOM_MODEL = True` in `app.py`
+2. Implement `load_custom_model()` and `predict_audio_custom()` with your Colab code
+3. Upload your `.pth` weights to the Space repository root
                 """)
     # Hand the assembled app back to the caller, which decides how to launch it.
     return demo
 # Script entry point: build the interface and serve it only when this file is
 # run directly, not when it is imported.
 if __name__ == "__main__":
     demo = build_ui()
     # Listen on all network interfaces on port 7860 (presumably what the
     # Hugging Face Spaces container expects -- confirm).
+    demo.launch(server_name="0.0.0.0", server_port=7860)