AbdulWahab14 committed on
Commit
a1de353
·
verified ·
1 Parent(s): 4307d15

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +495 -226
app.py CHANGED
@@ -1,129 +1,443 @@
1
- # app.py β€” This is your main Hugging Face Spaces app
 
 
 
 
2
  import gradio as gr
3
  import torch
4
  import numpy as np
5
  import librosa
6
  import cv2
7
  import os
8
- import matplotlib.pyplot as plt
9
  import matplotlib
10
  matplotlib.use('Agg')
 
11
  from PIL import Image
12
- import io
13
- import base64
14
  import warnings
15
- warnings.filterwarnings('ignore')
16
-
17
- # ─── Paste your model classes here (or import them) ──────────────────────────
18
- # Copy the AudioDeepfakeDetector and VideoDeepfakeDetector classes from above
19
- # OR use import statements if you structured it as a package:
20
- # from model.audio_detector import AudioDeepfakeDetector, extract_audio_features, predict_audio
21
- # from model.video_detector import VideoDeepfakeDetector, predict_video
22
-
23
- # For Spaces, we'll use lightweight pretrained models from HuggingFace Hub
24
- # as fallback if custom models aren't trained yet:
25
-
26
- from transformers import pipeline
27
-
28
- # Load pre-trained audio classifier (for demo)
29
- # For production, replace with your trained model weights
30
- AUDIO_MODEL_PATH = "audio_model.pth"
31
- VIDEO_MODEL_PATH = "video_model.pth"
32
-
33
- USE_TRAINED_MODEL = os.path.exists(AUDIO_MODEL_PATH)
34
-
35
- # ─── Fallback: Use transformers pipeline ─────────────────────────────────────
36
- def analyze_audio_transformers(audio_path):
37
- """
38
- Uses a HuggingFace pipeline for audio classification.
39
- Replace this with your trained model for better accuracy.
40
- """
41
- try:
42
- classifier = pipeline(
43
- "audio-classification",
44
- model="facebook/wav2vec2-base",
45
- # For real deepfake detection use:
46
- # model="mo-aqrabi/deepfake-audio-detection"
47
- )
48
- # This is a placeholder β€” replace with actual deepfake model
49
- # For now returns heuristic based on spectral analysis
50
- y, sr = librosa.load(audio_path, sr=16000, duration=5)
51
- mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
52
- zcr = librosa.feature.zero_crossing_rate(y)
53
- spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
54
-
55
- # Heuristic: AI audio tends to have very regular ZCR and centroid
56
- zcr_regularity = 1 - (np.std(zcr) / (np.mean(zcr) + 1e-6))
57
- spectral_regularity = 1 - (np.std(spectral_centroid) / (np.mean(spectral_centroid) + 1e-6))
58
-
59
- # Combine into a rough AI score
60
- ai_score = np.clip((zcr_regularity + spectral_regularity) / 2, 0, 1)
61
- return float(ai_score)
62
- except Exception as e:
63
- print(f"Fallback audio analysis error: {e}")
64
- return 0.5 # Uncertain
65
 
 
66
 
67
- # ─── Main Detection Functions ─────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  def detect_audio(audio_file):
70
- """Full audio analysis pipeline."""
71
  if audio_file is None:
72
  return None, "❌ No audio file provided", None
73
 
74
  try:
75
- if USE_TRAINED_MODEL:
76
- score = predict_audio(audio_file, AUDIO_MODEL_PATH)
 
 
 
 
77
  else:
78
- score = analyze_audio_transformers(audio_file)
 
79
 
80
  percentage = score * 100
81
  verdict = "πŸ€– AI GENERATED" if score > 0.5 else "βœ… REAL / HUMAN"
82
  confidence = max(score, 1 - score) * 100
83
 
84
- # Generate waveform + spectrogram plot
85
- y, sr = librosa.load(audio_file, sr=16000, duration=10)
86
- fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 6))
87
- fig.patch.set_facecolor('#0d0d0d')
88
-
89
- # Waveform
90
- ax1.set_facecolor('#1a1a2e')
91
- librosa.display.waveshow(y, sr=sr, ax=ax1, color='#00d4ff')
92
- ax1.set_title('Audio Waveform', color='white', fontsize=12)
93
- ax1.tick_params(colors='white')
94
-
95
- # Mel Spectrogram
96
- ax2.set_facecolor('#1a1a2e')
97
- mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
98
- mel_db = librosa.power_to_db(mel, ref=np.max)
99
- img = librosa.display.specshow(mel_db, sr=sr, ax=ax2,
100
- x_axis='time', y_axis='mel', cmap='magma')
101
- plt.colorbar(img, ax=ax2, format='%+2.0f dB')
102
- ax2.set_title('Mel Spectrogram', color='white', fontsize=12)
103
- ax2.tick_params(colors='white')
104
- ax2.yaxis.label.set_color('white')
105
- ax2.xaxis.label.set_color('white')
106
-
107
- plt.tight_layout()
108
  plot_path = '/tmp/audio_analysis.png'
109
- plt.savefig(plot_path, facecolor='#0d0d0d', bbox_inches='tight')
110
- plt.close()
 
111
 
112
  result_text = f"""
113
- ## πŸ”Š Audio Analysis Result
114
 
115
  | Metric | Value |
116
  |--------|-------|
117
  | **AI Probability** | {percentage:.1f}% |
118
  | **Verdict** | {verdict} |
119
  | **Confidence** | {confidence:.1f}% |
120
- | **Status** | {'⚠️ HIGH RISK' if percentage > 75 else '🟑 SUSPICIOUS' if percentage > 50 else '🟒 LIKELY REAL'} |
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  ### Interpretation
123
- - **0-30%**: Very likely genuine/human-created
124
- - **30-50%**: Possibly human, some AI characteristics
125
- - **50-70%**: Likely AI-generated, needs verification
126
- - **70-100%**: Almost certainly AI-generated
127
  """
128
  return plot_path, result_text, f"{percentage:.1f}%"
129
 
@@ -132,72 +446,46 @@ def detect_audio(audio_file):
132
 
133
 
134
  def detect_video(video_file):
135
- """Full video analysis pipeline."""
136
  if video_file is None:
137
  return None, "❌ No video file provided", None
138
 
139
  try:
140
- if USE_TRAINED_MODEL and os.path.exists(VIDEO_MODEL_PATH):
141
- result = predict_video(video_file, VIDEO_MODEL_PATH)
142
- score = result['final_score']
143
- frame_scores = result['per_frame_scores']
144
- temporal = result['temporal_score']
145
- else:
146
- # Fallback: basic frame analysis
147
- score, frame_scores, temporal = analyze_video_basic(video_file)
148
 
149
  percentage = score * 100
150
  verdict = "πŸ€– AI GENERATED" if score > 0.5 else "οΏ½οΏ½ REAL / HUMAN"
 
151
 
152
- # Generate frame score visualization
153
- fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
154
- fig.patch.set_facecolor('#0d0d0d')
155
-
156
- # Frame scores over time
157
- ax1.set_facecolor('#1a1a2e')
158
- frames_x = range(len(frame_scores))
159
- ax1.plot(frames_x, [f * 100 for f in frame_scores], 'cyan', linewidth=2)
160
- ax1.fill_between(frames_x, [f * 100 for f in frame_scores], alpha=0.3, color='cyan')
161
- ax1.axhline(y=50, color='red', linestyle='--', alpha=0.7, label='Threshold (50%)')
162
- ax1.set_facecolor('#1a1a2e')
163
- ax1.set_xlabel('Frame', color='white')
164
- ax1.set_ylabel('AI Score (%)', color='white')
165
- ax1.set_title('Per-Frame AI Score', color='white')
166
- ax1.tick_params(colors='white')
167
- ax1.legend(facecolor='#1a1a2e', labelcolor='white')
168
- ax1.set_ylim(0, 100)
169
-
170
- # Score breakdown pie
171
- ax2.set_facecolor('#1a1a2e')
172
- sizes = [score * 100, (1 - score) * 100]
173
- colors_pie = ['#FF4444', '#00CC44']
174
- wedges, texts, autotexts = ax2.pie(sizes, labels=['AI Generated', 'Real/Human'],
175
- colors=colors_pie, autopct='%1.1f%%',
176
- textprops={'color': 'white', 'fontsize': 12},
177
- startangle=90)
178
- ax2.set_title(f'Final Verdict: {verdict}', color='white', fontsize=13)
179
-
180
- plt.tight_layout()
181
  plot_path = '/tmp/video_analysis.png'
182
- plt.savefig(plot_path, facecolor='#0d0d0d', bbox_inches='tight')
183
- plt.close()
 
184
 
185
  result_text = f"""
186
- ## 🎬 Video Analysis Result
187
 
188
  | Metric | Value |
189
  |--------|-------|
190
  | **AI Probability** | {percentage:.1f}% |
191
  | **Verdict** | {verdict} |
 
192
  | **Frames Analyzed** | {len(frame_scores)} |
193
- | **Temporal Inconsistency** | {temporal*100:.1f}% |
194
- | **Status** | {'⚠️ HIGH RISK' if percentage > 75 else '🟑 SUSPICIOUS' if percentage > 50 else '🟒 LIKELY REAL'} |
195
-
196
- ### What we checked:
197
- - βœ“ Face region analysis per frame
198
- - βœ“ Temporal consistency between frames
199
- - βœ“ Artifact patterns typical of AI generation
200
- - βœ“ Natural motion flow analysis
 
 
 
 
 
 
 
 
201
  """
202
  return plot_path, result_text, f"{percentage:.1f}%"
203
 
@@ -205,41 +493,9 @@ def detect_video(video_file):
205
  return None, f"❌ Error analyzing video: {str(e)}", "N/A"
206
 
207
 
208
- def analyze_video_basic(video_path):
209
- """Basic video analysis fallback without trained model."""
210
- cap = cv2.VideoCapture(video_path)
211
- total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
212
- indices = np.linspace(0, total - 1, 20, dtype=int)
213
-
214
- frame_scores = []
215
- prev_gray = None
216
-
217
- for idx in indices:
218
- cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
219
- ret, frame = cap.read()
220
- if not ret:
221
- continue
222
- gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
223
-
224
- if prev_gray is not None:
225
- # Check for unnatural sharpness transitions (common in deepfakes)
226
- diff = cv2.absdiff(gray, prev_gray)
227
- score = float(np.std(diff)) / 50.0
228
- score = np.clip(score, 0, 1)
229
- frame_scores.append(score)
230
- prev_gray = gray
231
-
232
- cap.release()
233
-
234
- if not frame_scores:
235
- return 0.5, [0.5], 0.5
236
-
237
- avg_score = np.mean(frame_scores)
238
- temporal = float(np.std(frame_scores))
239
- return avg_score, frame_scores, min(temporal, 1.0)
240
-
241
-
242
- # ─── Gradio UI ────────────────────────────────────────────────────────────────
243
 
244
  def build_ui():
245
  with gr.Blocks(
@@ -249,19 +505,22 @@ def build_ui():
249
  neutral_hue="slate",
250
  ),
251
  css="""
252
- .gradio-container { max-width: 1100px; margin: auto; }
253
  .result-box { border-radius: 12px; padding: 16px; }
254
- h1 { text-align: center; }
255
  .score-display { font-size: 48px; font-weight: bold; text-align: center; }
 
256
  """
257
  ) as demo:
258
 
259
  gr.HTML("""
260
- <div style="text-align:center; padding: 20px 0;">
261
- <h1 style="font-size: 2.5em; font-weight: 800;">πŸ” DeepFake AI Detector</h1>
262
- <p style="font-size: 1.1em; color: #888;">
263
- Upload audio or video to detect AI generation.<br>
264
- Get a precise percentage score of how much AI was used.
 
 
265
  </p>
266
  </div>
267
  """)
@@ -269,19 +528,23 @@ def build_ui():
269
  with gr.Tabs():
270
 
271
  # ── Audio Tab ──────────────────────────────────────────────────────
272
- with gr.TabItem("πŸ”Š Audio Detection"):
273
- gr.Markdown("### Upload an audio file to check if it's AI-generated")
274
  with gr.Row():
275
  with gr.Column(scale=1):
276
  audio_input = gr.Audio(
277
- label="Upload Audio (MP3, WAV, M4A)",
278
  type="filepath"
279
  )
280
  audio_btn = gr.Button("πŸ” Analyze Audio", variant="primary", size="lg")
 
 
 
 
281
 
282
  with gr.Column(scale=2):
283
  audio_score = gr.Label(label="🎯 AI Score", num_top_classes=1)
284
- audio_plot = gr.Image(label="πŸ“Š Audio Analysis")
285
  audio_result = gr.Markdown(label="πŸ“‹ Detailed Report")
286
 
287
  audio_btn.click(
@@ -290,21 +553,19 @@ def build_ui():
290
  outputs=[audio_plot, audio_result, audio_score]
291
  )
292
 
293
- gr.Examples(
294
- examples=[],
295
- inputs=audio_input,
296
- label="Example files (add your own samples)"
297
- )
298
-
299
  # ── Video Tab ──────────────────────────────────────────────────────
300
- with gr.TabItem("🎬 Video Detection"):
301
- gr.Markdown("### Upload a video file to check if it's a deepfake")
302
  with gr.Row():
303
  with gr.Column(scale=1):
304
  video_input = gr.Video(
305
  label="Upload Video (MP4, AVI, MOV)",
306
  )
307
  video_btn = gr.Button("πŸ” Analyze Video", variant="primary", size="lg")
 
 
 
 
308
 
309
  with gr.Column(scale=2):
310
  video_score = gr.Label(label="🎯 AI Score", num_top_classes=1)
@@ -318,38 +579,50 @@ def build_ui():
318
  )
319
 
320
  # ── About Tab ─────────────────────────────────────────────────────
321
- with gr.TabItem("ℹ️ How It Works"):
322
  gr.Markdown("""
323
- ## 🧠 Detection Methodology
324
-
325
- ### Audio Analysis
326
- | Feature | What it detects |
327
- |---------|----------------|
328
- | MFCC (40 coefficients) | Unnatural vocal tract patterns |
329
- | Mel Spectrogram | Frequency distribution anomalies |
330
- | Zero Crossing Rate | Overly smooth AI transitions |
331
- | Spectral Centroid | Frequency center shifts |
332
- | Tonnetz | Harmonic content irregularities |
333
-
334
- ### Video Analysis
335
- | Method | What it detects |
336
- |--------|----------------|
337
- | EfficientNet-B4 CNN | Per-frame visual artifacts |
338
- | Optical Flow | Temporal inconsistencies |
339
- | Face Detection | Blending boundary anomalies |
340
- | Ensemble Scoring | Combined confidence score |
 
 
 
 
 
 
341
 
342
  ### Score Interpretation
343
- - **0-30%**: 🟒 Very likely genuine
344
- - **30-50%**: 🟑 Some AI characteristics, inconclusive
345
- - **50-70%**: 🟠 Likely AI-generated
346
- - **70-100%**: πŸ”΄ Almost certainly AI-generated
347
 
348
  ### ⚠️ Limitations
349
- - No detector is 100% accurate
350
- - Newer AI models may evade detection
351
- - Low quality media reduces accuracy
352
- - Always combine with human judgment
 
 
 
 
 
 
353
  """)
354
 
355
  return demo
@@ -357,8 +630,4 @@ def build_ui():
357
 
358
  if __name__ == "__main__":
359
  demo = build_ui()
360
- demo.launch(
361
- server_name="0.0.0.0",
362
- server_port=7860,
363
- share=True # Creates public URL for testing
364
- )
 
1
+ # ═══════════════════════════════════════════════════════════════════════════════
2
+ # 🔍 DeepFake AI Detector — AI Voice & Video Forensics System v5
3
+ # Hugging Face Spaces • Gradio • CPU-Optimized
4
+ # ═══════════════════════════════════════════════════════════════════════════════
5
+
6
  import gradio as gr
7
  import torch
8
  import numpy as np
9
  import librosa
10
  import cv2
11
  import os
 
12
  import matplotlib
13
  matplotlib.use('Agg')
14
+ import matplotlib.pyplot as plt
15
  from PIL import Image
 
 
16
  import warnings
17
+ import soundfile as sf
18
+ from scipy import stats
19
+ from scipy.signal import hilbert
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
+ warnings.filterwarnings('ignore')
22
 
23
+ # ───────────────────────────────────────────────────────────────────────────────
24
+ # 🧠 CUSTOM MODEL INTEGRATION (Paste your Colab model here)
25
+ # ───────────────────────────────────────────────────────────────────────────────
26
+
27
# Feature flag for the custom-model path: detect_audio only calls
# predict_audio_custom when this is True AND CUSTOM_MODEL_PATH exists on disk.
USE_CUSTOM_MODEL = False # ← Set True when you add your own .pth
# Checkpoint file whose existence is checked before the custom path is taken.
CUSTOM_MODEL_PATH = "audio_model.pth"
29
+
30
def load_custom_model(model_path: str):
    """Placeholder loader for a user-trained checkpoint.

    No model is wired in yet: *model_path* is currently ignored and the
    function always returns ``None``.  Replace the body with real loading
    code once a checkpoint exists, for example::

        checkpoint = torch.load(model_path, map_location='cpu')
        model = YourModelClass(...)
        model.load_state_dict(checkpoint)
        model.eval()
        return model

    Returns:
        A callable model, or ``None`` while no model is implemented.
    """
    # TODO: Paste your Colab model loading code here
    model = None
    return model
40
+
41
def predict_audio_custom(audio_path: str) -> float:
    """Inference hook for a user-supplied model (not implemented yet).

    A real implementation is expected to:

    1. load the audio (librosa / soundfile),
    2. extract features exactly as during training,
    3. run the forward pass,
    4. return the AI probability as a float in [0, 1]
       (0.0 = real, 1.0 = fake).

    Raises:
        NotImplementedError: always, until the Colab prediction code is
            pasted in.
    """
    # TODO: Paste your Colab prediction code here
    message = "Paste your model code or disable USE_CUSTOM_MODEL"
    raise NotImplementedError(message)
49
+
50
+ # ───────────────────────────────────────────────────────────────────────────────
51
+ # 🔊 AUDIO FORENSICS ENSEMBLE (Spectral + Statistical)
52
+ # ───────────────────────────────────────────────────────────────────────────────
53
+
54
class AudioForensicsEnsemble:
    """Training-free, heuristic detector for synthetic speech.

    The audio is cut into short overlapping frames; each frame is reduced to
    a set of spectral / statistical features and mapped to a score in [0, 1]
    (higher = more likely AI-generated).  Frame scores are then blended with
    their own variability to give a clip-level likelihood.

    Attributes:
        sr: Sample rate (Hz) the audio is resampled to on load.
        frame_dur: Length of one analysis frame in seconds.
        hop_dur: Step between consecutive frame starts in seconds.
    """

    def __init__(self, sr: int = 16000):
        self.sr = sr
        self.frame_dur = 0.5   # seconds per analysis frame
        self.hop_dur = 0.25    # seconds hop length

    def _extract_frame_features(self, y: np.ndarray) -> dict:
        """Extract forensic features from a single audio frame.

        Args:
            y: Mono samples of one frame at ``self.sr``.

        Returns:
            Dict of summary statistics consumed by :meth:`_score_frame`.
        """
        sr = self.sr
        hop = 256
        feats = {}

        # 1. MFCC + derivatives (timbre / vocal tract)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40, n_fft=512, hop_length=hop)
        feats['mfcc_mean'] = np.mean(mfcc, axis=1)
        feats['mfcc_std'] = np.std(mfcc, axis=1)
        delta = librosa.feature.delta(mfcc)
        feats['mfcc_delta_std'] = np.std(delta, axis=1)

        # 2. Zero-crossing rate (temporal crispness)
        zcr = librosa.feature.zero_crossing_rate(y, hop_length=hop)[0]
        feats['zcr_mean'] = np.mean(zcr)
        feats['zcr_std'] = np.std(zcr)

        # 3. Spectral moments
        spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=hop)[0]
        spec_band = librosa.feature.spectral_bandwidth(y=y, sr=sr, hop_length=hop)[0]
        spec_roll = librosa.feature.spectral_rolloff(y=y, sr=sr, hop_length=hop)[0]
        spec_flat = librosa.feature.spectral_flatness(y=y, hop_length=hop)[0]

        feats['centroid_mean'] = np.mean(spec_cent)
        feats['centroid_std'] = np.std(spec_cent)
        feats['bandwidth_mean'] = np.mean(spec_band)
        feats['rolloff_mean'] = np.mean(spec_roll)
        feats['flatness_mean'] = np.mean(spec_flat)
        feats['flatness_std'] = np.std(spec_flat)

        # 4. RMS energy dynamics
        rms = librosa.feature.rms(y=y, hop_length=hop)[0]
        feats['rms_mean'] = np.mean(rms)
        feats['rms_std'] = np.std(rms)

        # 5. Chroma (harmonic content)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop)
        feats['chroma_std'] = np.std(chroma, axis=1).mean()

        # 6. Spectral contrast (periodicity vs noise)
        contrast = librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=hop)
        feats['contrast_std'] = np.std(contrast, axis=1).mean()

        # 7. Tonnetz (harmonic network)
        tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
        feats['tonnetz_std'] = np.std(tonnetz, axis=1).mean()

        # 8. Phase coherence via analytic signal
        analytic = hilbert(y)
        phase = np.unwrap(np.angle(analytic))
        feats['phase_std'] = np.std(np.diff(phase))

        return feats

    def _score_frame(self, feats: dict) -> float:
        """Map one frame's features to an AI-likelihood score in [0, 1].

        Higher score -> more likely AI-generated.  The score is a weighted
        sum of five heuristics (weights 0.35 / 0.15 / 0.20 / 0.15 / 0.15),
        each of which rewards *lack of variation* or spectral flatness.

        Args:
            feats: Output of :meth:`_extract_frame_features`.

        Returns:
            Score clipped to [0, 1].
        """
        eps = 1e-6

        # A. Regularity penalty: AI audio is often too consistent
        regularity = (
            (1.0 - min(feats['zcr_std'] / (feats['zcr_mean'] + eps), 1.0))
            + (1.0 - min(feats['centroid_std'] / (feats['centroid_mean'] + eps), 1.0))
            + (1.0 - min(feats['mfcc_delta_std'].mean()
                         / (np.abs(feats['mfcc_mean']).mean() + eps), 1.0))
        ) / 3.0

        # B. Spectral flatness: AI can exhibit unnatural flatness
        flatness_sigmoid = 1.0 / (1.0 + np.exp(-20 * (feats['flatness_mean'] - 0.15)))

        # C. Phase coherence: synthetic audio often has lower phase variance
        phase_score = 1.0 - min(feats['phase_std'] / 5.0, 1.0)

        # D. Harmonic regularity: chroma + tonnetz uniformity
        harmonic_reg = feats['chroma_std'] + feats['tonnetz_std']
        harmonic_score = 1.0 - min(harmonic_reg / 0.3, 1.0)

        # E. Energy dynamics: AI sometimes lacks natural micro-dynamics
        dynamic_score = 1.0 - min(feats['rms_std'] / (feats['rms_mean'] + eps), 1.0)

        total = (
            regularity * 0.35
            + flatness_sigmoid * 0.15
            + phase_score * 0.20
            + harmonic_score * 0.15
            + dynamic_score * 0.15
        )
        return float(np.clip(total, 0.0, 1.0))

    @staticmethod
    def _frame_starts(n_samples: int, frame_len: int, hop_len: int) -> list:
        """Return the start index of every full frame inside *n_samples*.

        The last full frame (start == n_samples - frame_len) is included.
        A non-empty signal shorter than one frame yields a single frame at 0
        (the caller supplies zero-padding for it); an empty signal yields
        no frames.
        """
        if n_samples <= 0 or frame_len <= 0:
            return []
        if n_samples < frame_len:
            return [0]
        return list(range(0, n_samples - frame_len + 1, max(1, hop_len)))

    @staticmethod
    def _combine_scores(frame_scores) -> tuple:
        """Blend per-frame scores into ``(ai_likelihood, temporal_score)``.

        ``temporal_score`` is the std of the frame scores scaled by 2.5 and
        clipped to [0, 1].  High temporal variance is treated as evidence of
        a real speaker, so the likelihood is
        ``0.7 * mean + 0.3 * (1 - temporal_score)``.
        """
        temporal_score = float(np.clip(np.std(frame_scores) * 2.5, 0.0, 1.0))
        avg_score = float(np.mean(frame_scores))
        ai_likelihood = avg_score * 0.7 + (1.0 - temporal_score) * 0.3
        return float(np.clip(ai_likelihood, 0.0, 1.0)), temporal_score

    def detect(self, audio_path: str):
        """Run full forensic analysis on an audio file.

        Args:
            audio_path: Path to an audio file readable by ``librosa.load``.
                Only the first 60 s are analysed.

        Returns:
            ``(ai_likelihood, frame_scores, temporal_score, y)`` where ``y``
            is the mono waveform at ``self.sr`` (zero-padded to at least 2 s).
            If no frame could be analysed the neutral result
            ``(0.5, [0.5], 0.5, y)`` is returned.
        """
        y, sr = librosa.load(audio_path, sr=self.sr, mono=True, duration=60)

        # Remember how much of the signal is real audio before padding.
        n_real = len(y)
        min_len = self.sr * 2
        if n_real < min_len:
            # Pad short clips so the returned waveform has a minimum length.
            y = np.pad(y, (0, min_len - n_real))

        frame_len = int(self.frame_dur * sr)
        hop_len = int(self.hop_dur * sr)

        # Score only frames that lie inside the real audio.  Zero-padded
        # frames have no variation at all, so every regularity term in
        # _score_frame saturates and they would drag short clips towards
        # "AI-generated".
        frame_scores = [
            self._score_frame(self._extract_frame_features(y[start:start + frame_len]))
            for start in self._frame_starts(n_real, frame_len, hop_len)
        ]

        if not frame_scores:
            # Return the (padded) waveform rather than None so callers that
            # plot it keep working.
            return 0.5, [0.5], 0.5, y

        ai_likelihood, temporal_score = self._combine_scores(frame_scores)
        return ai_likelihood, frame_scores, temporal_score, y
183
+
184
+
185
# Global ensemble instance
# Built once at import time with the default 16 kHz sample rate; detect_audio
# reuses it for every request (it reads .detect(), .sr and .hop_dur).
AUDIO_ENSEMBLE = AudioForensicsEnsemble()
187
+
188
+ # ───────────────────────────────────────────────────────────────────────────────
189
+ # 🎬 VIDEO FORENSICS (Optical Flow + Frame Artifact Detection)
190
+ # ───────────────────────────────────────────────────────────────────────────────
191
+
192
class VideoForensics:
    """Heuristic deepfake scorer based on sampled video frames.

    Up to ``n_frames`` evenly spaced frames are read and each one is scored
    from three cues: optical-flow spread against the previously sampled
    frame, Haar-cascade face regions (Laplacian sharpness), and whole-frame
    Laplacian variance.  Frame scores are then blended with their own
    variability into a clip-level score.

    Attributes:
        n_frames: Maximum number of frames sampled from the video.
    """

    def __init__(self, n_frames: int = 24):
        self.n_frames = n_frames

    @staticmethod
    def _aggregate(frame_scores):
        """Blend frame scores into ``(final, frame_scores, temporal)``.

        ``temporal`` is the std of the frame scores.  Low variability is
        penalised (treated as more suspicious):
        ``final = 0.6 * mean + 0.4 * (1 - min(2 * temporal, 1))``.
        Both ``final`` and ``temporal`` are clipped to [0, 1].
        """
        avg_score = float(np.mean(frame_scores))
        temporal = float(np.std(frame_scores))
        temporal_penalty = 1.0 - min(temporal * 2.0, 1.0)
        final = avg_score * 0.6 + temporal_penalty * 0.4
        return (
            float(np.clip(final, 0.0, 1.0)),
            frame_scores,
            float(np.clip(temporal, 0.0, 1.0)),
        )

    def _score_frames(self, cap, total: int) -> list:
        """Read the sampled frames from *cap* and return one score per frame."""
        indices = np.linspace(0, total - 1, min(self.n_frames, total), dtype=int)
        face_cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )

        frame_scores = []
        prev_gray = None
        prev_faces = None

        for idx in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                # Unreadable frame: skip it, keep the previous reference frame.
                continue

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # 1. Temporal inconsistency via optical flow magnitude
            flow_score = 0.0
            if prev_gray is not None:
                flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None,
                                                    pyr_scale=0.5, levels=3,
                                                    winsize=15, iterations=3,
                                                    poly_n=5, poly_sigma=1.2,
                                                    flags=0)
                mag, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
                flow_std = np.std(mag)
                # Unnatural smoothness or extreme jitter
                if flow_std < 0.5:
                    flow_score = 0.6  # Too static
                elif flow_std > 5.0:
                    flow_score = 0.4  # Too jittery
                else:
                    flow_score = 0.2

            # 2. Face boundary artifacts
            face_score = 0.0
            faces = face_cascade.detectMultiScale(gray, 1.1, 4)
            if len(faces) > 0:
                for (x, y, fw, fh) in faces:
                    face_roi = gray[y:y + fh, x:x + fw]
                    # Blending boundary check: sharpness variance at edges
                    laplacian_var = cv2.Laplacian(face_roi, cv2.CV_64F).var()
                    if laplacian_var > 1000:
                        face_score = max(face_score, 0.3)  # Over-sharpened
                    # NOTE(review): placeholder for a face-swap check. No
                    # comparison with prev_faces is actually made; any face
                    # found after the first sampled frame gets a 0.2 floor.
                    # Nesting reconstructed from a flattened source — confirm.
                    if prev_faces is not None:
                        face_score = max(face_score, 0.2)

            # 3. Noise pattern analysis
            noise = cv2.Laplacian(gray, cv2.CV_64F).var()
            noise_score = 0.4 if noise < 50 else 0.0  # Too clean = suspicious

            combined = np.clip((flow_score + face_score + noise_score) / 1.2, 0.0, 1.0)
            frame_scores.append(combined)

            prev_gray = gray
            prev_faces = faces

        return frame_scores

    def detect(self, video_path: str):
        """Score a video file.

        Args:
            video_path: Path to a video readable by ``cv2.VideoCapture``.

        Returns:
            ``(final_score, frame_scores, temporal)``.  When the frame count
            is not positive or no frame could be read, the neutral result
            ``(0.5, [0.5], 0.5)`` is returned.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total <= 0:
                return 0.5, [0.5], 0.5
            frame_scores = self._score_frames(cap, total)
        finally:
            # Always free the capture handle, including on the early return
            # above and when frame processing raises.
            cap.release()

        if not frame_scores:
            return 0.5, [0.5], 0.5

        return self._aggregate(frame_scores)
275
+
276
+
277
# Shared video analyser built once at import time (default n_frames=24);
# detect_video calls its .detect() for every uploaded file.
VIDEO_ENSEMBLE = VideoForensics()
278
+
279
+ # ───────────────────────────────────────────────────────────────────────────────
280
+ # 🖼️ VISUALIZATION HELPERS
281
+ # ───────────────────────────────────────────────────────────────────────────────
282
+
283
def plot_audio_analysis(y, sr, frame_scores, hop_dur, save_path: str):
    """Render the four-panel audio report and save it as an image.

    Panels: waveform, mel spectrogram, per-frame AI score over time, and a
    histogram of the frame scores.

    Args:
        y: Mono waveform samples.
        sr: Sample rate of *y* in Hz.
        frame_scores: Per-frame scores in [0, 1]; plotted as percentages.
        hop_dur: Seconds between consecutive frame scores (x-axis spacing).
        save_path: Destination image path.

    Returns:
        *save_path*, for convenience.
    """
    # `import librosa` alone does not expose `librosa.display` on releases
    # without lazy submodule loading, so import it explicitly here.
    import librosa.display

    def _style(ax, title, xlabel, ylabel, title_size):
        """Apply the shared dark theme to one axes."""
        ax.set_title(title, color='white', fontsize=title_size, fontweight='bold')
        ax.tick_params(colors='white')
        ax.set_xlabel(xlabel, color='white')
        ax.set_ylabel(ylabel, color='white')
        for spine in ax.spines.values():
            spine.set_color('#333')

    percent_scores = [f * 100 for f in frame_scores]

    fig = plt.figure(figsize=(12, 8), facecolor='#0d0d0d')
    try:
        gs = fig.add_gridspec(3, 2, hspace=0.35, wspace=0.25)

        # Waveform
        ax1 = fig.add_subplot(gs[0, :])
        ax1.set_facecolor('#1a1a2e')
        librosa.display.waveshow(y, sr=sr, ax=ax1, color='#00d4ff', alpha=0.9)
        _style(ax1, 'Audio Waveform', 'Time (s)', 'Amplitude', 13)

        # Mel Spectrogram
        ax2 = fig.add_subplot(gs[1, :])
        ax2.set_facecolor('#1a1a2e')
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, n_fft=2048, hop_length=512)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        img = librosa.display.specshow(mel_db, sr=sr, ax=ax2, x_axis='time', y_axis='mel', cmap='magma')
        cbar = fig.colorbar(img, ax=ax2, format='%+2.0f dB', fraction=0.046)
        cbar.ax.yaxis.set_tick_params(color='white')
        plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color='white')
        _style(ax2, 'Mel Spectrogram', 'Time (s)', 'Mel Frequency', 13)

        # Frame scores over time
        ax3 = fig.add_subplot(gs[2, 0])
        ax3.set_facecolor('#1a1a2e')
        times = np.arange(len(frame_scores)) * hop_dur
        ax3.plot(times, percent_scores, color='#ff6b6b', linewidth=2, marker='o', markersize=3)
        ax3.axhline(y=50, color='#ffd93d', linestyle='--', alpha=0.7, label='Threshold')
        ax3.fill_between(times, percent_scores, alpha=0.2, color='#ff6b6b')
        _style(ax3, 'Per-Frame AI Probability', 'Time (s)', 'AI Score (%)', 12)
        ax3.set_ylim(0, 100)
        ax3.legend(facecolor='#1a1a2e', labelcolor='white')

        # Feature distribution
        ax4 = fig.add_subplot(gs[2, 1])
        ax4.set_facecolor('#1a1a2e')
        ax4.hist(percent_scores, bins=12, color='#4ecdc4', edgecolor='white', alpha=0.8)
        ax4.axvline(x=50, color='#ffd93d', linestyle='--', alpha=0.7)
        _style(ax4, 'Score Distribution', 'AI Score (%)', 'Frame Count', 12)

        fig.savefig(save_path, facecolor='#0d0d0d', bbox_inches='tight', dpi=120)
    finally:
        # Close this specific figure even if plotting or saving failed, so
        # repeated requests do not accumulate open figures.
        plt.close(fig)
    return save_path
345
+
346
+
347
def plot_video_analysis(frame_scores, save_path: str, final_score=None):
    """Render the two-panel video report and save it as an image.

    Left panel: per-frame score (as a percentage) with the 50 % threshold.
    Right panel: pie chart splitting "AI Generated" vs "Real / Human", with
    the verdict in the title.

    Args:
        frame_scores: Per-frame scores in [0, 1].
        save_path: Destination image path.
        final_score: Optional clip-level score in [0, 1] to drive the pie
            chart and verdict.  When omitted, the mean of *frame_scores* is
            used (the previous behaviour).  Pass the detector's final score
            so the chart cannot disagree with the verdict computed from it.

    Returns:
        *save_path*, for convenience.
    """
    percent_scores = [f * 100 for f in frame_scores]
    headline = np.mean(frame_scores) if final_score is None else final_score

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), facecolor='#0d0d0d')
    try:
        # Frame scores
        ax1.set_facecolor('#1a1a2e')
        frames_x = range(len(frame_scores))
        ax1.plot(frames_x, percent_scores, color='#00d4ff', linewidth=2.5)
        ax1.fill_between(frames_x, percent_scores, alpha=0.25, color='#00d4ff')
        ax1.axhline(y=50, color='#ff6b6b', linestyle='--', alpha=0.7, label='Threshold (50%)')
        ax1.set_xlabel('Frame Index', color='white', fontsize=11)
        ax1.set_ylabel('AI Score (%)', color='white', fontsize=11)
        ax1.set_title('Per-Frame Deepfake Score', color='white', fontsize=13, fontweight='bold')
        ax1.tick_params(colors='white')
        ax1.legend(facecolor='#1a1a2e', labelcolor='white')
        ax1.set_ylim(0, 100)
        for spine in ax1.spines.values():
            spine.set_color('#333')

        # Pie chart
        ax2.set_facecolor('#1a1a2e')
        sizes = [headline * 100, (1 - headline) * 100]
        colors_pie = ['#ff6b6b', '#4ecdc4']
        ax2.pie(
            sizes, labels=['AI Generated', 'Real / Human'],
            colors=colors_pie, autopct='%1.1f%%',
            textprops={'color': 'white', 'fontsize': 12},
            startangle=90, explode=(0.02, 0.02),
        )
        verdict = "🤖 AI GENERATED" if headline > 0.5 else "✅ REAL / HUMAN"
        ax2.set_title(f'Verdict: {verdict}', color='white', fontsize=13, fontweight='bold')
        for spine in ax2.spines.values():
            spine.set_color('#333')

        fig.tight_layout()
        fig.savefig(save_path, facecolor='#0d0d0d', bbox_inches='tight', dpi=120)
    finally:
        # Close this specific figure even if plotting or saving failed.
        plt.close(fig)
    return save_path
385
+
386
+
387
+ # ───────────────────────────────────────────────────────────────────────────────
388
+ # 🚀 MAIN DETECTION PIPELINES
389
+ # ───────────────────────────────────────────────────────────────────────────────
390
 
391
  def detect_audio(audio_file):
 
392
  if audio_file is None:
393
  return None, "❌ No audio file provided", None
394
 
395
  try:
396
+ # Route to custom model if enabled and available
397
+ if USE_CUSTOM_MODEL and os.path.exists(CUSTOM_MODEL_PATH):
398
+ score = predict_audio_custom(audio_file)
399
+ y, sr = librosa.load(audio_file, sr=16000, duration=10)
400
+ frame_scores = [score] # Single score for custom models
401
+ temporal = 0.5
402
  else:
403
+ score, frame_scores, temporal, y = AUDIO_ENSEMBLE.detect(audio_file)
404
+ sr = AUDIO_ENSEMBLE.sr
405
 
406
  percentage = score * 100
407
  verdict = "πŸ€– AI GENERATED" if score > 0.5 else "βœ… REAL / HUMAN"
408
  confidence = max(score, 1 - score) * 100
409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  plot_path = '/tmp/audio_analysis.png'
411
+ plot_audio_analysis(y, sr, frame_scores, AUDIO_ENSEMBLE.hop_dur, plot_path)
412
+
413
+ status = 'πŸ”΄ HIGH RISK' if percentage > 75 else '🟠 SUSPICIOUS' if percentage > 50 else '🟒 LIKELY REAL'
414
 
415
  result_text = f"""
416
+ ## πŸ”Š Audio Forensics Report
417
 
418
  | Metric | Value |
419
  |--------|-------|
420
  | **AI Probability** | {percentage:.1f}% |
421
  | **Verdict** | {verdict} |
422
  | **Confidence** | {confidence:.1f}% |
423
+ | **Temporal Regularity** | {temporal*100:.1f}% |
424
+ | **Status** | {status} |
425
+
426
+ ### Methodology
427
+ | Feature | What it detects |
428
+ |---------|----------------|
429
+ | MFCC (40-dim) | Vocal tract anomalies |
430
+ | Spectral Centroid | Frequency center shifts |
431
+ | Zero Crossing Rate | Overly smooth transitions |
432
+ | Phase Coherence | Synthetic phase patterns |
433
+ | Spectral Flatness | Unnatural noise floor |
434
+ | Chroma / Tonnetz | Harmonic irregularities |
435
 
436
  ### Interpretation
437
+ - **0–30%**: Very likely genuine human speech
438
+ - **30–50%**: Some synthetic characteristics, inconclusive
439
+ - **50–75%**: Likely AI-generated, manual review recommended
440
+ - **75–100%**: Strong indicators of synthetic audio
441
  """
442
  return plot_path, result_text, f"{percentage:.1f}%"
443
 
 
446
 
447
 
448
  def detect_video(video_file):
 
449
  if video_file is None:
450
  return None, "❌ No video file provided", None
451
 
452
  try:
453
+ score, frame_scores, temporal = VIDEO_ENSEMBLE.detect(video_file)
 
 
 
 
 
 
 
454
 
455
  percentage = score * 100
456
  verdict = "🤖 AI GENERATED" if score > 0.5 else "✅ REAL / HUMAN"
457
+ confidence = max(score, 1 - score) * 100
458
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  plot_path = '/tmp/video_analysis.png'
460
+ plot_video_analysis(frame_scores, plot_path)
461
+
462
+ status = '🔴 HIGH RISK' if percentage > 75 else '🟠 SUSPICIOUS' if percentage > 50 else '🟢 LIKELY REAL'
463
 
464
  result_text = f"""
465
+ ## 🎬 Video Forensics Report
466
 
467
  | Metric | Value |
468
  |--------|-------|
469
  | **AI Probability** | {percentage:.1f}% |
470
  | **Verdict** | {verdict} |
471
+ | **Confidence** | {confidence:.1f}% |
472
  | **Frames Analyzed** | {len(frame_scores)} |
473
+ | **Temporal Variance** | {temporal*100:.1f}% |
474
+ | **Status** | {status} |
475
+
476
+ ### Detection Methods
477
+ | Method | What it detects |
478
+ |--------|----------------|
479
+ | Optical Flow | Unnatural motion between frames |
480
+ | Face Detection | Blending boundary artifacts |
481
+ | Laplacian Variance | Over-smoothing / over-sharpening |
482
+ | Temporal Consistency | Frame-to-frame irregularities |
483
+
484
+ ### Interpretation
485
+ - **0–30%**: 🟢 Very likely genuine
486
+ - **30–50%**: 🟡 Some AI characteristics
487
+ - **50–75%**: 🟠 Likely deepfake
488
+ - **75–100%**: 🔴 Strong deepfake indicators
489
  """
490
  return plot_path, result_text, f"{percentage:.1f}%"
491
 
 
493
  return None, f"❌ Error analyzing video: {str(e)}", "N/A"
494
 
495
 
496
+ # ───────────────────────────────────────────────────────────────────────────────
497
+ # 🎨 GRADIO UI
498
+ # ───────────────────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
499
 
500
  def build_ui():
501
  with gr.Blocks(
 
505
  neutral_hue="slate",
506
  ),
507
  css="""
508
+ .gradio-container { max-width: 1200px; margin: auto; }
509
  .result-box { border-radius: 12px; padding: 16px; }
510
+ h1 { text-align: center; letter-spacing: -0.5px; }
511
  .score-display { font-size: 48px; font-weight: bold; text-align: center; }
512
+ .tab-button { font-weight: 600; }
513
  """
514
  ) as demo:
515
 
516
  gr.HTML("""
517
+ <div style="text-align:center; padding: 24px 0 12px 0;">
518
+ <h1 style="font-size: 2.8em; font-weight: 800; margin-bottom: 8px;">
519
+ 🔍 DeepFake AI Detector
520
+ </h1>
521
+ <p style="font-size: 1.15em; color: #888; max-width: 600px; margin: auto;">
522
+ Upload audio or video to detect AI generation via spectral forensics
523
+ and temporal artifact analysis.
524
  </p>
525
  </div>
526
  """)
 
528
  with gr.Tabs():
529
 
530
  # ── Audio Tab ──────────────────────────────────────────────────────
531
+ with gr.TabItem("🔊 Audio Detection", id=0):
532
+ gr.Markdown("### Upload an audio file to analyze for synthetic speech")
533
  with gr.Row():
534
  with gr.Column(scale=1):
535
  audio_input = gr.Audio(
536
+ label="Upload Audio (MP3, WAV, M4A, FLAC)",
537
  type="filepath"
538
  )
539
  audio_btn = gr.Button("🔍 Analyze Audio", variant="primary", size="lg")
540
+ gr.Markdown("""
541
+ **Supported formats:** WAV, MP3, M4A, FLAC
542
+ **Max duration:** 60 seconds (auto-trimmed)
543
+ """)
544
 
545
  with gr.Column(scale=2):
546
  audio_score = gr.Label(label="🎯 AI Score", num_top_classes=1)
547
+ audio_plot = gr.Image(label="📊 Forensic Visualization")
548
  audio_result = gr.Markdown(label="📋 Detailed Report")
549
 
550
  audio_btn.click(
 
553
  outputs=[audio_plot, audio_result, audio_score]
554
  )
555
 
 
 
 
 
 
 
556
  # ── Video Tab ──────────────────────────────────────────────────────
557
+ with gr.TabItem("🎬 Video Detection", id=1):
558
+ gr.Markdown("### Upload a video file to check for deepfake artifacts")
559
  with gr.Row():
560
  with gr.Column(scale=1):
561
  video_input = gr.Video(
562
  label="Upload Video (MP4, AVI, MOV)",
563
  )
564
  video_btn = gr.Button("🔍 Analyze Video", variant="primary", size="lg")
565
+ gr.Markdown("""
566
+ **Supported formats:** MP4, AVI, MOV
567
+ **Analyzes:** 24 uniformly sampled frames
568
+ """)
569
 
570
  with gr.Column(scale=2):
571
  video_score = gr.Label(label="🎯 AI Score", num_top_classes=1)
 
579
  )
580
 
581
  # ── About Tab ─────────────────────────────────────────────────────
582
+ with gr.TabItem("ℹ️ How It Works", id=2):
583
  gr.Markdown("""
584
+ ## 🧠 Detection Architecture
585
+
586
+ ### Audio Forensics Ensemble
587
+ This system uses a **multi-feature spectral ensemble** that analyzes:
588
+
589
+ | Feature Class | Specific Metrics | Synthetic Indicator |
590
+ |---------------|------------------|---------------------|
591
+ | **Timbre** | 40-dim MFCC + Δ + Δ² | Unnatural vocal tract patterns |
592
+ | **Temporal** | ZCR mean/std | Overly smooth frame transitions |
593
+ | **Spectral** | Centroid, bandwidth, rolloff | Frequency distribution anomalies |
594
+ | **Harmonic** | Chroma, Tonnetz | Artificial harmonic structure |
595
+ | **Phase** | Analytic signal phase std | Reduced phase coherence |
596
+ | **Dynamics** | RMS micro-dynamics | Compressed natural variation |
597
+
598
+ **Scoring**: Each 0.5-second frame is scored independently. The final verdict blends
599
+ mean frame probability with temporal variance (real speech is more irregular).
600
+
601
+ ### Video Forensics
602
+ | Method | Artifact Detected |
603
+ |--------|-------------------|
604
+ | Optical Flow Farneback | Unnatural motion smoothness |
605
+ | Haar Face Detection | Boundary blending errors |
606
+ | Laplacian Variance | Over-sharpening / smoothing |
607
+ | Frame-to-frame StdDev | Temporal inconsistency |
608
 
609
  ### Score Interpretation
610
+ - **0–30%**: 🟢 Very likely genuine
611
+ - **30–50%**: 🟡 Some AI characteristics, inconclusive
612
+ - **50–75%**: 🟠 Likely AI-generated, needs verification
613
+ - **75–100%**: 🔴 Almost certainly AI-generated
614
 
615
  ### ⚠️ Limitations
616
+ - No detector is 100% accurate against adversarial or novel generative models
617
+ - Performance degrades on heavily compressed or low-bitrate media
618
+ - Always combine automated scores with human expert review
619
+ - Maximum audio analysis length: 60 seconds
620
+
621
+ ### 🔌 Custom Model Integration
622
+ To use your own trained model:
623
+ 1. Set `USE_CUSTOM_MODEL = True` in `app.py`
624
+ 2. Implement `load_custom_model()` and `predict_audio_custom()` with your Colab code
625
+ 3. Upload your `.pth` weights to the Space repository root
626
  """)
627
 
628
  return demo
 
630
 
631
  if __name__ == "__main__":
632
  demo = build_ui()
633
+ demo.launch(server_name="0.0.0.0", server_port=7860)