Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,129 +1,443 @@
|
|
| 1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
import torch
|
| 4 |
import numpy as np
|
| 5 |
import librosa
|
| 6 |
import cv2
|
| 7 |
import os
|
| 8 |
-
import matplotlib.pyplot as plt
|
| 9 |
import matplotlib
|
| 10 |
matplotlib.use('Agg')
|
|
|
|
| 11 |
from PIL import Image
|
| 12 |
-
import io
|
| 13 |
-
import base64
|
| 14 |
import warnings
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
# Copy the AudioDeepfakeDetector and VideoDeepfakeDetector classes from above
|
| 19 |
-
# OR use import statements if you structured it as a package:
|
| 20 |
-
# from model.audio_detector import AudioDeepfakeDetector, extract_audio_features, predict_audio
|
| 21 |
-
# from model.video_detector import VideoDeepfakeDetector, predict_video
|
| 22 |
-
|
| 23 |
-
# For Spaces, we'll use lightweight pretrained models from HuggingFace Hub
|
| 24 |
-
# as fallback if custom models aren't trained yet:
|
| 25 |
-
|
| 26 |
-
from transformers import pipeline
|
| 27 |
-
|
| 28 |
-
# Load pre-trained audio classifier (for demo)
|
| 29 |
-
# For production, replace with your trained model weights
|
| 30 |
-
AUDIO_MODEL_PATH = "audio_model.pth"
|
| 31 |
-
VIDEO_MODEL_PATH = "video_model.pth"
|
| 32 |
-
|
| 33 |
-
USE_TRAINED_MODEL = os.path.exists(AUDIO_MODEL_PATH)
|
| 34 |
-
|
| 35 |
-
# βββ Fallback: Use transformers pipeline βββββββββββββββββββββββββββββββββββββ
|
| 36 |
-
def analyze_audio_transformers(audio_path):
|
| 37 |
-
"""
|
| 38 |
-
Uses a HuggingFace pipeline for audio classification.
|
| 39 |
-
Replace this with your trained model for better accuracy.
|
| 40 |
-
"""
|
| 41 |
-
try:
|
| 42 |
-
classifier = pipeline(
|
| 43 |
-
"audio-classification",
|
| 44 |
-
model="facebook/wav2vec2-base",
|
| 45 |
-
# For real deepfake detection use:
|
| 46 |
-
# model="mo-aqrabi/deepfake-audio-detection"
|
| 47 |
-
)
|
| 48 |
-
# This is a placeholder β replace with actual deepfake model
|
| 49 |
-
# For now returns heuristic based on spectral analysis
|
| 50 |
-
y, sr = librosa.load(audio_path, sr=16000, duration=5)
|
| 51 |
-
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
|
| 52 |
-
zcr = librosa.feature.zero_crossing_rate(y)
|
| 53 |
-
spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
|
| 54 |
-
|
| 55 |
-
# Heuristic: AI audio tends to have very regular ZCR and centroid
|
| 56 |
-
zcr_regularity = 1 - (np.std(zcr) / (np.mean(zcr) + 1e-6))
|
| 57 |
-
spectral_regularity = 1 - (np.std(spectral_centroid) / (np.mean(spectral_centroid) + 1e-6))
|
| 58 |
-
|
| 59 |
-
# Combine into a rough AI score
|
| 60 |
-
ai_score = np.clip((zcr_regularity + spectral_regularity) / 2, 0, 1)
|
| 61 |
-
return float(ai_score)
|
| 62 |
-
except Exception as e:
|
| 63 |
-
print(f"Fallback audio analysis error: {e}")
|
| 64 |
-
return 0.5 # Uncertain
|
| 65 |
|
|
|
|
| 66 |
|
| 67 |
-
# βββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
def detect_audio(audio_file):
|
| 70 |
-
"""Full audio analysis pipeline."""
|
| 71 |
if audio_file is None:
|
| 72 |
return None, "β No audio file provided", None
|
| 73 |
|
| 74 |
try:
|
| 75 |
-
if
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
else:
|
| 78 |
-
score =
|
|
|
|
| 79 |
|
| 80 |
percentage = score * 100
|
| 81 |
verdict = "π€ AI GENERATED" if score > 0.5 else "β
REAL / HUMAN"
|
| 82 |
confidence = max(score, 1 - score) * 100
|
| 83 |
|
| 84 |
-
# Generate waveform + spectrogram plot
|
| 85 |
-
y, sr = librosa.load(audio_file, sr=16000, duration=10)
|
| 86 |
-
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 6))
|
| 87 |
-
fig.patch.set_facecolor('#0d0d0d')
|
| 88 |
-
|
| 89 |
-
# Waveform
|
| 90 |
-
ax1.set_facecolor('#1a1a2e')
|
| 91 |
-
librosa.display.waveshow(y, sr=sr, ax=ax1, color='#00d4ff')
|
| 92 |
-
ax1.set_title('Audio Waveform', color='white', fontsize=12)
|
| 93 |
-
ax1.tick_params(colors='white')
|
| 94 |
-
|
| 95 |
-
# Mel Spectrogram
|
| 96 |
-
ax2.set_facecolor('#1a1a2e')
|
| 97 |
-
mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
|
| 98 |
-
mel_db = librosa.power_to_db(mel, ref=np.max)
|
| 99 |
-
img = librosa.display.specshow(mel_db, sr=sr, ax=ax2,
|
| 100 |
-
x_axis='time', y_axis='mel', cmap='magma')
|
| 101 |
-
plt.colorbar(img, ax=ax2, format='%+2.0f dB')
|
| 102 |
-
ax2.set_title('Mel Spectrogram', color='white', fontsize=12)
|
| 103 |
-
ax2.tick_params(colors='white')
|
| 104 |
-
ax2.yaxis.label.set_color('white')
|
| 105 |
-
ax2.xaxis.label.set_color('white')
|
| 106 |
-
|
| 107 |
-
plt.tight_layout()
|
| 108 |
plot_path = '/tmp/audio_analysis.png'
|
| 109 |
-
|
| 110 |
-
|
|
|
|
| 111 |
|
| 112 |
result_text = f"""
|
| 113 |
-
## π Audio
|
| 114 |
|
| 115 |
| Metric | Value |
|
| 116 |
|--------|-------|
|
| 117 |
| **AI Probability** | {percentage:.1f}% |
|
| 118 |
| **Verdict** | {verdict} |
|
| 119 |
| **Confidence** | {confidence:.1f}% |
|
| 120 |
-
| **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
### Interpretation
|
| 123 |
-
- **0
|
| 124 |
-
- **30
|
| 125 |
-
- **50
|
| 126 |
-
- **
|
| 127 |
"""
|
| 128 |
return plot_path, result_text, f"{percentage:.1f}%"
|
| 129 |
|
|
@@ -132,72 +446,46 @@ def detect_audio(audio_file):
|
|
| 132 |
|
| 133 |
|
| 134 |
def detect_video(video_file):
|
| 135 |
-
"""Full video analysis pipeline."""
|
| 136 |
if video_file is None:
|
| 137 |
return None, "β No video file provided", None
|
| 138 |
|
| 139 |
try:
|
| 140 |
-
|
| 141 |
-
result = predict_video(video_file, VIDEO_MODEL_PATH)
|
| 142 |
-
score = result['final_score']
|
| 143 |
-
frame_scores = result['per_frame_scores']
|
| 144 |
-
temporal = result['temporal_score']
|
| 145 |
-
else:
|
| 146 |
-
# Fallback: basic frame analysis
|
| 147 |
-
score, frame_scores, temporal = analyze_video_basic(video_file)
|
| 148 |
|
| 149 |
percentage = score * 100
|
| 150 |
verdict = "π€ AI GENERATED" if score > 0.5 else "οΏ½οΏ½ REAL / HUMAN"
|
|
|
|
| 151 |
|
| 152 |
-
# Generate frame score visualization
|
| 153 |
-
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
|
| 154 |
-
fig.patch.set_facecolor('#0d0d0d')
|
| 155 |
-
|
| 156 |
-
# Frame scores over time
|
| 157 |
-
ax1.set_facecolor('#1a1a2e')
|
| 158 |
-
frames_x = range(len(frame_scores))
|
| 159 |
-
ax1.plot(frames_x, [f * 100 for f in frame_scores], 'cyan', linewidth=2)
|
| 160 |
-
ax1.fill_between(frames_x, [f * 100 for f in frame_scores], alpha=0.3, color='cyan')
|
| 161 |
-
ax1.axhline(y=50, color='red', linestyle='--', alpha=0.7, label='Threshold (50%)')
|
| 162 |
-
ax1.set_facecolor('#1a1a2e')
|
| 163 |
-
ax1.set_xlabel('Frame', color='white')
|
| 164 |
-
ax1.set_ylabel('AI Score (%)', color='white')
|
| 165 |
-
ax1.set_title('Per-Frame AI Score', color='white')
|
| 166 |
-
ax1.tick_params(colors='white')
|
| 167 |
-
ax1.legend(facecolor='#1a1a2e', labelcolor='white')
|
| 168 |
-
ax1.set_ylim(0, 100)
|
| 169 |
-
|
| 170 |
-
# Score breakdown pie
|
| 171 |
-
ax2.set_facecolor('#1a1a2e')
|
| 172 |
-
sizes = [score * 100, (1 - score) * 100]
|
| 173 |
-
colors_pie = ['#FF4444', '#00CC44']
|
| 174 |
-
wedges, texts, autotexts = ax2.pie(sizes, labels=['AI Generated', 'Real/Human'],
|
| 175 |
-
colors=colors_pie, autopct='%1.1f%%',
|
| 176 |
-
textprops={'color': 'white', 'fontsize': 12},
|
| 177 |
-
startangle=90)
|
| 178 |
-
ax2.set_title(f'Final Verdict: {verdict}', color='white', fontsize=13)
|
| 179 |
-
|
| 180 |
-
plt.tight_layout()
|
| 181 |
plot_path = '/tmp/video_analysis.png'
|
| 182 |
-
|
| 183 |
-
|
|
|
|
| 184 |
|
| 185 |
result_text = f"""
|
| 186 |
-
## π¬ Video
|
| 187 |
|
| 188 |
| Metric | Value |
|
| 189 |
|--------|-------|
|
| 190 |
| **AI Probability** | {percentage:.1f}% |
|
| 191 |
| **Verdict** | {verdict} |
|
|
|
|
| 192 |
| **Frames Analyzed** | {len(frame_scores)} |
|
| 193 |
-
| **Temporal
|
| 194 |
-
| **Status** | {
|
| 195 |
-
|
| 196 |
-
###
|
| 197 |
-
|
| 198 |
-
-
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
"""
|
| 202 |
return plot_path, result_text, f"{percentage:.1f}%"
|
| 203 |
|
|
@@ -205,41 +493,9 @@ def detect_video(video_file):
|
|
| 205 |
return None, f"β Error analyzing video: {str(e)}", "N/A"
|
| 206 |
|
| 207 |
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 212 |
-
indices = np.linspace(0, total - 1, 20, dtype=int)
|
| 213 |
-
|
| 214 |
-
frame_scores = []
|
| 215 |
-
prev_gray = None
|
| 216 |
-
|
| 217 |
-
for idx in indices:
|
| 218 |
-
cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
|
| 219 |
-
ret, frame = cap.read()
|
| 220 |
-
if not ret:
|
| 221 |
-
continue
|
| 222 |
-
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
|
| 223 |
-
|
| 224 |
-
if prev_gray is not None:
|
| 225 |
-
# Check for unnatural sharpness transitions (common in deepfakes)
|
| 226 |
-
diff = cv2.absdiff(gray, prev_gray)
|
| 227 |
-
score = float(np.std(diff)) / 50.0
|
| 228 |
-
score = np.clip(score, 0, 1)
|
| 229 |
-
frame_scores.append(score)
|
| 230 |
-
prev_gray = gray
|
| 231 |
-
|
| 232 |
-
cap.release()
|
| 233 |
-
|
| 234 |
-
if not frame_scores:
|
| 235 |
-
return 0.5, [0.5], 0.5
|
| 236 |
-
|
| 237 |
-
avg_score = np.mean(frame_scores)
|
| 238 |
-
temporal = float(np.std(frame_scores))
|
| 239 |
-
return avg_score, frame_scores, min(temporal, 1.0)
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
# βββ Gradio UI ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 243 |
|
| 244 |
def build_ui():
|
| 245 |
with gr.Blocks(
|
|
@@ -249,19 +505,22 @@ def build_ui():
|
|
| 249 |
neutral_hue="slate",
|
| 250 |
),
|
| 251 |
css="""
|
| 252 |
-
.gradio-container { max-width:
|
| 253 |
.result-box { border-radius: 12px; padding: 16px; }
|
| 254 |
-
h1 { text-align: center; }
|
| 255 |
.score-display { font-size: 48px; font-weight: bold; text-align: center; }
|
|
|
|
| 256 |
"""
|
| 257 |
) as demo:
|
| 258 |
|
| 259 |
gr.HTML("""
|
| 260 |
-
<div style="text-align:center; padding:
|
| 261 |
-
<h1 style="font-size: 2.
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
|
|
|
| 265 |
</p>
|
| 266 |
</div>
|
| 267 |
""")
|
|
@@ -269,19 +528,23 @@ def build_ui():
|
|
| 269 |
with gr.Tabs():
|
| 270 |
|
| 271 |
# ββ Audio Tab ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 272 |
-
with gr.TabItem("π Audio Detection"):
|
| 273 |
-
gr.Markdown("### Upload an audio file to
|
| 274 |
with gr.Row():
|
| 275 |
with gr.Column(scale=1):
|
| 276 |
audio_input = gr.Audio(
|
| 277 |
-
label="Upload Audio (MP3, WAV, M4A)",
|
| 278 |
type="filepath"
|
| 279 |
)
|
| 280 |
audio_btn = gr.Button("π Analyze Audio", variant="primary", size="lg")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
with gr.Column(scale=2):
|
| 283 |
audio_score = gr.Label(label="π― AI Score", num_top_classes=1)
|
| 284 |
-
audio_plot = gr.Image(label="π
|
| 285 |
audio_result = gr.Markdown(label="π Detailed Report")
|
| 286 |
|
| 287 |
audio_btn.click(
|
|
@@ -290,21 +553,19 @@ def build_ui():
|
|
| 290 |
outputs=[audio_plot, audio_result, audio_score]
|
| 291 |
)
|
| 292 |
|
| 293 |
-
gr.Examples(
|
| 294 |
-
examples=[],
|
| 295 |
-
inputs=audio_input,
|
| 296 |
-
label="Example files (add your own samples)"
|
| 297 |
-
)
|
| 298 |
-
|
| 299 |
# ββ Video Tab ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 300 |
-
with gr.TabItem("π¬ Video Detection"):
|
| 301 |
-
gr.Markdown("### Upload a video file to check
|
| 302 |
with gr.Row():
|
| 303 |
with gr.Column(scale=1):
|
| 304 |
video_input = gr.Video(
|
| 305 |
label="Upload Video (MP4, AVI, MOV)",
|
| 306 |
)
|
| 307 |
video_btn = gr.Button("π Analyze Video", variant="primary", size="lg")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
|
| 309 |
with gr.Column(scale=2):
|
| 310 |
video_score = gr.Label(label="π― AI Score", num_top_classes=1)
|
|
@@ -318,38 +579,50 @@ def build_ui():
|
|
| 318 |
)
|
| 319 |
|
| 320 |
# ββ About Tab βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 321 |
-
with gr.TabItem("βΉοΈ How It Works"):
|
| 322 |
gr.Markdown("""
|
| 323 |
-
## π§ Detection
|
| 324 |
-
|
| 325 |
-
### Audio
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
|
| 329 |
-
|
|
| 330 |
-
|
|
| 331 |
-
|
|
| 332 |
-
|
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
|
| 342 |
### Score Interpretation
|
| 343 |
-
- **0
|
| 344 |
-
- **30
|
| 345 |
-
- **50
|
| 346 |
-
- **
|
| 347 |
|
| 348 |
### β οΈ Limitations
|
| 349 |
-
- No detector is 100% accurate
|
| 350 |
-
-
|
| 351 |
-
-
|
| 352 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
""")
|
| 354 |
|
| 355 |
return demo
|
|
@@ -357,8 +630,4 @@ def build_ui():
|
|
| 357 |
|
| 358 |
if __name__ == "__main__":
|
| 359 |
demo = build_ui()
|
| 360 |
-
demo.launch(
|
| 361 |
-
server_name="0.0.0.0",
|
| 362 |
-
server_port=7860,
|
| 363 |
-
share=True # Creates public URL for testing
|
| 364 |
-
)
|
|
|
|
| 1 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2 |
+
# DeepFake AI Detector — AI Voice & Video Forensics System v5
|
| 3 |
+
# Hugging Face Spaces • Gradio • CPU-Optimized
|
| 4 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 5 |
+
|
| 6 |
import gradio as gr
|
| 7 |
import torch
|
| 8 |
import numpy as np
|
| 9 |
import librosa
|
| 10 |
import cv2
|
| 11 |
import os
|
|
|
|
| 12 |
import matplotlib
|
| 13 |
matplotlib.use('Agg')
|
| 14 |
+
import matplotlib.pyplot as plt
|
| 15 |
from PIL import Image
|
|
|
|
|
|
|
| 16 |
import warnings
|
| 17 |
+
import soundfile as sf
|
| 18 |
+
from scipy import stats
|
| 19 |
+
from scipy.signal import hilbert
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
+
# NOTE(review): with no category/module filter this silences every Python
# warning for the whole process, not only the noisy library ones.
warnings.filterwarnings('ignore')
|
| 22 |
|
| 23 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 24 |
+
# π§ CUSTOM MODEL INTEGRATION (Paste your Colab model here)
|
| 25 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
|
| 27 |
+
USE_CUSTOM_MODEL = False  # <- Set True when you add your own .pth
|
| 28 |
+
CUSTOM_MODEL_PATH = "audio_model.pth"
|
| 29 |
+
|
| 30 |
+
def load_custom_model(model_path: str):
    """Placeholder hook for loading a user-trained model.

    Nothing is loaded yet: ``model_path`` is currently ignored and the
    function always returns ``None``. Replace the body with real loading
    code and return a callable model.

    Args:
        model_path: Path to the checkpoint file (unused by this stub).

    Returns:
        ``None`` until a real loader is pasted in.
    """
    # TODO: Paste your Colab model loading code here, for example:
    #   checkpoint = torch.load(model_path, map_location='cpu')
    #   model = YourModelClass(...)
    #   model.load_state_dict(checkpoint)
    #   model.eval()
    #   return model
    model = None
    return model
|
| 40 |
+
|
| 41 |
+
def predict_audio_custom(audio_path: str) -> float:
    """Placeholder for the user's own inference pipeline.

    The intended contract is to return the AI probability as a float in
    [0, 1] (0.0 = real, 1.0 = fake). This stub has no implementation and
    always raises.

    Args:
        audio_path: Path to the audio file to classify (unused by this stub).

    Raises:
        NotImplementedError: Always, until real prediction code is added.
    """
    # TODO: Paste your Colab prediction code here:
    #   1. Load audio (librosa/soundfile)
    #   2. Extract features exactly as in training
    #   3. Forward pass
    #   4. Return AI probability (0.0 = real, 1.0 = fake)
    message = "Paste your model code or disable USE_CUSTOM_MODEL"
    raise NotImplementedError(message)
|
| 49 |
+
|
| 50 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 51 |
+
# π AUDIO FORENSICS ENSEMBLE (Spectral + Statistical)
|
| 52 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 53 |
+
|
| 54 |
+
class AudioForensicsEnsemble:
    """Heuristic audio scorer built from hand-crafted spectral statistics.

    The clip is cut into overlapping frames, a dictionary of librosa/scipy
    features is extracted per frame, and each frame is mapped to a score in
    [0, 1] where higher means "more likely AI-generated" according to the
    weighted rules in ``_score_frame``. No trained model is involved.
    """

    def __init__(self, sr: int = 16000):
        self.sr = sr
        self.frame_dur = 0.5   # seconds per analysis frame
        self.hop_dur = 0.25    # seconds hop length

    @staticmethod
    def _frame_starts(n_samples: int, frame_len: int, hop_len: int) -> range:
        """Return the start offsets of every full frame inside ``n_samples``.

        The stop value is ``n_samples - frame_len + 1`` so that a frame ending
        exactly on the last sample is included. An input shorter than one
        frame yields an empty range.
        """
        return range(0, n_samples - frame_len + 1, hop_len)

    def _extract_frame_features(self, y: np.ndarray) -> dict:
        """Extract forensic features from a single audio frame.

        Returns a dict whose ``mfcc_mean``, ``mfcc_std`` and ``mfcc_delta_std``
        entries are per-coefficient arrays (40 MFCCs); every other entry is a
        scalar statistic.
        """
        sr = self.sr
        feats = {}

        # 1. MFCC + derivatives (timbre / vocal tract)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40, n_fft=512, hop_length=256)
        feats['mfcc_mean'] = np.mean(mfcc, axis=1)
        feats['mfcc_std'] = np.std(mfcc, axis=1)
        delta = librosa.feature.delta(mfcc)
        feats['mfcc_delta_std'] = np.std(delta, axis=1)

        # 2. Zero-crossing rate (temporal crispness)
        zcr = librosa.feature.zero_crossing_rate(y, hop_length=256)[0]
        feats['zcr_mean'] = np.mean(zcr)
        feats['zcr_std'] = np.std(zcr)

        # 3. Spectral moments
        spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=256)[0]
        spec_band = librosa.feature.spectral_bandwidth(y=y, sr=sr, hop_length=256)[0]
        spec_roll = librosa.feature.spectral_rolloff(y=y, sr=sr, hop_length=256)[0]
        spec_flat = librosa.feature.spectral_flatness(y=y, hop_length=256)[0]

        feats['centroid_mean'] = np.mean(spec_cent)
        feats['centroid_std'] = np.std(spec_cent)
        feats['bandwidth_mean'] = np.mean(spec_band)
        feats['rolloff_mean'] = np.mean(spec_roll)
        feats['flatness_mean'] = np.mean(spec_flat)
        feats['flatness_std'] = np.std(spec_flat)

        # 4. RMS energy dynamics
        rms = librosa.feature.rms(y=y, hop_length=256)[0]
        feats['rms_mean'] = np.mean(rms)
        feats['rms_std'] = np.std(rms)

        # 5. Chroma (harmonic content)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=256)
        feats['chroma_std'] = np.std(chroma, axis=1).mean()

        # 6. Spectral contrast (periodicity vs noise)
        contrast = librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=256)
        feats['contrast_std'] = np.std(contrast, axis=1).mean()

        # 7. Tonnetz (harmonic network)
        tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
        feats['tonnetz_std'] = np.std(tonnetz, axis=1).mean()

        # 8. Phase coherence via analytic signal
        analytic = hilbert(y)
        phase = np.unwrap(np.angle(analytic))
        feats['phase_std'] = np.std(np.diff(phase))

        return feats

    def _score_frame(self, feats: dict) -> float:
        """Map one frame's features to an AI-likelihood score in [0, 1].

        Five sub-scores are combined with fixed weights that sum to 1.0:
        regularity (0.35), spectral flatness (0.15), phase (0.20), harmonic
        uniformity (0.15) and energy dynamics (0.15). Higher means more
        likely AI-generated.

        Args:
            feats: Feature dict as produced by ``_extract_frame_features``.
                The MFCC entries may be arrays or plain sequences.

        Returns:
            The weighted sum, clipped to [0, 1], as a built-in ``float``.
        """
        eps = 1e-6

        # A. Regularity penalty: AI audio is often too consistent.
        # Each term is 1 minus a coefficient of variation capped at 1.
        mfcc_delta_spread = np.mean(feats['mfcc_delta_std'])
        mfcc_level = np.mean(np.abs(feats['mfcc_mean']))
        regularity_terms = (
            1.0 - min(feats['zcr_std'] / (feats['zcr_mean'] + eps), 1.0),
            1.0 - min(feats['centroid_std'] / (feats['centroid_mean'] + eps), 1.0),
            1.0 - min(mfcc_delta_spread / (mfcc_level + eps), 1.0),
        )
        regularity = sum(regularity_terms) / 3.0

        # B. Spectral flatness: logistic curve centred on 0.15.
        flatness_sigmoid = 1.0 / (1.0 + np.exp(-20 * (feats['flatness_mean'] - 0.15)))

        # C. Phase coherence: lower phase-increment variance scores higher.
        phase_score = 1.0 - min(feats['phase_std'] / 5.0, 1.0)

        # D. Harmonic regularity: chroma + tonnetz uniformity.
        harmonic_reg = feats['chroma_std'] + feats['tonnetz_std']
        harmonic_score = 1.0 - min(harmonic_reg / 0.3, 1.0)

        # E. Energy dynamics: low relative RMS variation scores higher.
        dynamic_score = 1.0 - min(feats['rms_std'] / (feats['rms_mean'] + eps), 1.0)

        weighted = (
            regularity * 0.35,
            flatness_sigmoid * 0.15,
            phase_score * 0.20,
            harmonic_score * 0.15,
            dynamic_score * 0.15,
        )
        return float(np.clip(sum(weighted), 0.0, 1.0))

    def detect(self, audio_path: str):
        """Run the full forensic analysis on an audio file.

        At most the first 60 seconds are loaded, resampled to ``self.sr`` and
        mixed to mono. Clips shorter than two seconds are zero-padded to two
        seconds.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            A 4-tuple ``(ai_likelihood, frame_scores, temporal_score, y)``:
            the overall score in [0, 1], the list of per-frame scores, the
            frame-to-frame variability score in [0, 1], and the loaded
            (possibly padded) waveform. If no frame could be scored the
            neutral fallback ``(0.5, [0.5], 0.5, None)`` is returned.
        """
        y, sr = librosa.load(audio_path, sr=self.sr, mono=True, duration=60)

        min_len = self.sr * 2
        if len(y) < min_len:
            # Pad short clips.
            # NOTE(review): the padded silence is scored like any other frame;
            # confirm whether those frames should be excluded.
            y = np.pad(y, (0, min_len - len(y)))

        # Guard against a zero step/length for extreme sr or duration settings.
        frame_len = max(1, int(self.frame_dur * sr))
        hop_len = max(1, int(self.hop_dur * sr))

        frame_scores = [
            self._score_frame(self._extract_frame_features(y[start:start + frame_len]))
            for start in self._frame_starts(len(y), frame_len, hop_len)
        ]

        if not frame_scores:
            return 0.5, [0.5], 0.5, None

        # Temporal inconsistency: real humans vary more frame-to-frame.
        temporal_score = float(np.clip(np.std(frame_scores) * 2.5, 0.0, 1.0))

        # Overall: 70% mean frame score, 30% "lack of temporal variation".
        # High temporal variance lowers the result; low variance raises it.
        avg_score = float(np.mean(frame_scores))
        ai_likelihood = avg_score * 0.7 + (1.0 - temporal_score) * 0.3

        return float(np.clip(ai_likelihood, 0.0, 1.0)), frame_scores, temporal_score, y
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# Global ensemble instance
|
| 186 |
+
# Module-level instance, constructed with the default sample rate (sr=16000).
AUDIO_ENSEMBLE = AudioForensicsEnsemble()
|
| 187 |
+
|
| 188 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 189 |
+
# π¬ VIDEO FORENSICS (Optical Flow + Frame Artifact Detection)
|
| 190 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 191 |
+
|
| 192 |
+
class VideoForensics:
    """Heuristic video scorer based on optical flow, face sharpness and noise.

    Up to ``n_frames`` frames are sampled evenly across the video. Each one
    receives a rule-based score in [0, 1], and the per-frame scores are then
    blended with their own variability into a final score. No trained model
    is involved.
    """

    def __init__(self, n_frames: int = 24):
        self.n_frames = n_frames

    @staticmethod
    def _aggregate(frame_scores):
        """Blend per-frame scores into ``(final, frame_scores, temporal)``.

        ``final`` is 60% mean frame score plus 40% of a term that shrinks as
        the standard deviation of the frame scores grows. An empty input
        yields the neutral fallback ``(0.5, [0.5], 0.5)``.
        """
        if not frame_scores:
            return 0.5, [0.5], 0.5

        avg_score = float(np.mean(frame_scores))
        temporal = float(np.std(frame_scores))
        temporal_penalty = 1.0 - min(temporal * 2.0, 1.0)

        final = avg_score * 0.6 + temporal_penalty * 0.4
        return (
            float(np.clip(final, 0.0, 1.0)),
            frame_scores,
            float(np.clip(temporal, 0.0, 1.0)),
        )

    def _score_frames(self, cap) -> list:
        """Score evenly spaced frames read from an opened ``cv2.VideoCapture``."""
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        n_samples = min(self.n_frames, total)
        if n_samples <= 0:
            # Unreadable/empty video, or a non-positive n_frames setting.
            return []

        indices = np.linspace(0, total - 1, n_samples, dtype=int)
        face_cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )

        frame_scores = []
        prev_gray = None
        prev_faces = None

        for idx in indices:
            # Cast: hand OpenCV a plain int rather than a numpy integer.
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                continue

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # 1. Temporal inconsistency via optical flow magnitude.
            # NOTE(review): flow is computed between *sampled* frames, which
            # can be far apart in time — confirm the thresholds suit that.
            flow_score = 0.0
            if prev_gray is not None:
                flow = cv2.calcOpticalFlowFarneback(
                    prev_gray, gray, None,
                    pyr_scale=0.5, levels=3,
                    winsize=15, iterations=3,
                    poly_n=5, poly_sigma=1.2,
                    flags=0,
                )
                mag, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
                flow_std = np.std(mag)
                # Unnatural smoothness or extreme jitter
                if flow_std < 0.5:
                    flow_score = 0.6  # Too static
                elif flow_std > 5.0:
                    flow_score = 0.4  # Too jittery
                else:
                    flow_score = 0.2

            # 2. Face boundary artifacts
            face_score = 0.0
            faces = face_cascade.detectMultiScale(gray, 1.1, 4)
            for (x, y, fw, fh) in faces:
                face_roi = gray[y:y + fh, x:x + fw]
                # Sharpness of the face region (variance of the Laplacian).
                laplacian_var = cv2.Laplacian(face_roi, cv2.CV_64F).var()
                if laplacian_var > 1000:
                    face_score = max(face_score, 0.3)  # Over-sharpened
                # NOTE(review): placeholder for a face-swap check. This adds
                # 0.2 for any detected face on every frame after the first
                # and never actually compares against prev_faces.
                if prev_faces is not None:
                    face_score = max(face_score, 0.2)

            # 3. Noise pattern analysis
            noise = cv2.Laplacian(gray, cv2.CV_64F).var()
            noise_score = 0.4 if noise < 50 else 0.0  # Too clean = suspicious

            combined = np.clip((flow_score + face_score + noise_score) / 1.2, 0.0, 1.0)
            frame_scores.append(float(combined))

            prev_gray = gray
            prev_faces = faces

        return frame_scores

    def detect(self, video_path: str):
        """Analyse a video file and return its heuristic deepfake score.

        Args:
            video_path: Path of the video file to analyse.

        Returns:
            A 3-tuple ``(final_score, frame_scores, temporal)``: the overall
            score in [0, 1], the list of per-frame scores, and the standard
            deviation of those scores clipped to [0, 1]. When no frame can be
            read the neutral fallback ``(0.5, [0.5], 0.5)`` is returned.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            frame_scores = self._score_frames(cap)
        finally:
            # Always free the capture handle, including on the early-exit
            # and exception paths.
            cap.release()

        return self._aggregate(frame_scores)
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
# Module-level instance, constructed with the default n_frames=24.
VIDEO_ENSEMBLE = VideoForensics()
|
| 278 |
+
|
| 279 |
+
# βββββββββββββββββββββοΏ½οΏ½οΏ½βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 280 |
+
# πΌοΈ VISUALIZATION HELPERS
|
| 281 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 282 |
+
|
| 283 |
+
def plot_audio_analysis(y, sr, frame_scores, hop_dur, save_path: str):
    """Render the four-panel audio report and save it as an image.

    Panels: waveform (top), mel spectrogram (middle), per-frame AI score over
    time (bottom left) and a histogram of the frame scores (bottom right).

    Args:
        y: Audio samples to plot.
        sr: Sample rate of ``y`` in Hz.
        frame_scores: Per-frame scores in [0, 1]; plotted as percentages.
        hop_dur: Seconds between consecutive frame scores (x-axis spacing).
        save_path: Destination path of the image file.

    Returns:
        ``save_path``, unchanged.
    """
    # Import the submodule explicitly: older librosa releases do not expose
    # ``librosa.display`` after a bare ``import librosa``.
    import librosa.display

    def _style(ax):
        """Apply the shared dark theme to one axes."""
        ax.set_facecolor('#1a1a2e')
        ax.tick_params(colors='white')
        for spine in ax.spines.values():
            spine.set_color('#333')

    score_pct = [f * 100 for f in frame_scores]

    fig = plt.figure(figsize=(12, 8), facecolor='#0d0d0d')
    try:
        gs = fig.add_gridspec(3, 2, hspace=0.35, wspace=0.25)

        # Waveform
        ax1 = fig.add_subplot(gs[0, :])
        _style(ax1)
        librosa.display.waveshow(y, sr=sr, ax=ax1, color='#00d4ff', alpha=0.9)
        ax1.set_title('Audio Waveform', color='white', fontsize=13, fontweight='bold')
        ax1.set_xlabel('Time (s)', color='white')
        ax1.set_ylabel('Amplitude', color='white')

        # Mel Spectrogram
        ax2 = fig.add_subplot(gs[1, :])
        _style(ax2)
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, n_fft=2048, hop_length=512)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        img = librosa.display.specshow(mel_db, sr=sr, ax=ax2, x_axis='time', y_axis='mel', cmap='magma')
        cbar = fig.colorbar(img, ax=ax2, format='%+2.0f dB', fraction=0.046)
        cbar.ax.yaxis.set_tick_params(color='white')
        plt.setp(cbar.ax.get_yticklabels(), color='white')
        ax2.set_title('Mel Spectrogram', color='white', fontsize=13, fontweight='bold')
        ax2.set_xlabel('Time (s)', color='white')
        ax2.set_ylabel('Mel Frequency', color='white')

        # Frame scores over time
        ax3 = fig.add_subplot(gs[2, 0])
        _style(ax3)
        times = np.arange(len(frame_scores)) * hop_dur
        ax3.plot(times, score_pct, color='#ff6b6b', linewidth=2, marker='o', markersize=3)
        ax3.axhline(y=50, color='#ffd93d', linestyle='--', alpha=0.7, label='Threshold')
        ax3.fill_between(times, score_pct, alpha=0.2, color='#ff6b6b')
        ax3.set_xlabel('Time (s)', color='white')
        ax3.set_ylabel('AI Score (%)', color='white')
        ax3.set_title('Per-Frame AI Probability', color='white', fontsize=12, fontweight='bold')
        ax3.set_ylim(0, 100)
        ax3.legend(facecolor='#1a1a2e', labelcolor='white')

        # Score distribution
        ax4 = fig.add_subplot(gs[2, 1])
        _style(ax4)
        ax4.hist(score_pct, bins=12, color='#4ecdc4', edgecolor='white', alpha=0.8)
        ax4.axvline(x=50, color='#ffd93d', linestyle='--', alpha=0.7)
        ax4.set_xlabel('AI Score (%)', color='white')
        ax4.set_ylabel('Frame Count', color='white')
        ax4.set_title('Score Distribution', color='white', fontsize=12, fontweight='bold')

        # Save this specific figure rather than pyplot's "current" one.
        fig.savefig(save_path, facecolor='#0d0d0d', bbox_inches='tight', dpi=120)
    finally:
        # Close even when plotting fails so figures do not pile up in memory.
        plt.close(fig)

    return save_path
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
def plot_video_analysis(frame_scores, save_path: str):
    """Render the two-panel video forensics figure and save it as an image.

    The left panel plots every frame score as a percentage with a dashed
    50% threshold line; the right panel is a pie chart that splits the mean
    score into "AI Generated" and "Real / Human" shares and carries the
    verdict in its title.

    Args:
        frame_scores: Non-empty sequence of per-frame scores. They are
            multiplied by 100 for display, so they are presumably
            probabilities in [0, 1] - confirm against the detector.
        save_path: Destination path of the rendered image.

    Returns:
        ``save_path``, so the call can be used directly as an output value.

    Raises:
        ValueError: If ``frame_scores`` is empty (a mean cannot be computed
            and the pie chart would receive NaN wedge sizes).
    """
    scores = np.asarray(frame_scores, dtype=float).ravel()
    if scores.size == 0:
        raise ValueError("frame_scores must contain at least one score")

    percentages = scores * 100  # computed once, shared by line and fill
    frames_x = np.arange(scores.size)
    avg = float(scores.mean())
    # Pie wedges must be non-negative; clamp in case a score falls outside
    # the assumed [0, 1] range.
    ai_share = min(max(avg, 0.0), 1.0)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), facecolor='#0d0d0d')
    try:
        # Frame scores
        ax1.set_facecolor('#1a1a2e')
        ax1.plot(frames_x, percentages, color='#00d4ff', linewidth=2.5)
        ax1.fill_between(frames_x, percentages, alpha=0.25, color='#00d4ff')
        ax1.axhline(y=50, color='#ff6b6b', linestyle='--', alpha=0.7, label='Threshold (50%)')
        ax1.set_xlabel('Frame Index', color='white', fontsize=11)
        ax1.set_ylabel('AI Score (%)', color='white', fontsize=11)
        ax1.set_title('Per-Frame Deepfake Score', color='white', fontsize=13, fontweight='bold')
        ax1.tick_params(colors='white')
        ax1.legend(facecolor='#1a1a2e', labelcolor='white')
        ax1.set_ylim(0, 100)
        for spine in ax1.spines.values():
            spine.set_color('#333')

        # Pie chart
        ax2.set_facecolor('#1a1a2e')
        sizes = [ai_share * 100, (1 - ai_share) * 100]
        colors_pie = ['#ff6b6b', '#4ecdc4']
        ax2.pie(
            sizes, labels=['AI Generated', 'Real / Human'],
            colors=colors_pie, autopct='%1.1f%%',
            textprops={'color': 'white', 'fontsize': 12},
            startangle=90, explode=(0.02, 0.02)
        )
        # NOTE(review): these emoji may not have glyphs in matplotlib's
        # default font - confirm they render in the deployed environment.
        verdict = "🤖 AI GENERATED" if avg > 0.5 else "✅ REAL / HUMAN"
        ax2.set_title(f'Verdict: {verdict}', color='white', fontsize=13, fontweight='bold')
        for spine in ax2.spines.values():
            spine.set_color('#333')

        fig.tight_layout()
        fig.savefig(save_path, facecolor='#0d0d0d', bbox_inches='tight', dpi=120)
    finally:
        # Close this specific figure even when plotting/saving fails, so a
        # long-running server does not accumulate open figures.
        plt.close(fig)
    return save_path
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 388 |
+
# π MAIN DETECTION PIPELINES
|
| 389 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 390 |
|
| 391 |
def detect_audio(audio_file):
|
|
|
|
| 392 |
if audio_file is None:
|
| 393 |
return None, "β No audio file provided", None
|
| 394 |
|
| 395 |
try:
|
| 396 |
+
# Route to custom model if enabled and available
|
| 397 |
+
if USE_CUSTOM_MODEL and os.path.exists(CUSTOM_MODEL_PATH):
|
| 398 |
+
score = predict_audio_custom(audio_file)
|
| 399 |
+
y, sr = librosa.load(audio_file, sr=16000, duration=10)
|
| 400 |
+
frame_scores = [score] # Single score for custom models
|
| 401 |
+
temporal = 0.5
|
| 402 |
else:
|
| 403 |
+
score, frame_scores, temporal, y = AUDIO_ENSEMBLE.detect(audio_file)
|
| 404 |
+
sr = AUDIO_ENSEMBLE.sr
|
| 405 |
|
| 406 |
percentage = score * 100
|
| 407 |
verdict = "π€ AI GENERATED" if score > 0.5 else "β
REAL / HUMAN"
|
| 408 |
confidence = max(score, 1 - score) * 100
|
| 409 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
plot_path = '/tmp/audio_analysis.png'
|
| 411 |
+
plot_audio_analysis(y, sr, frame_scores, AUDIO_ENSEMBLE.hop_dur, plot_path)
|
| 412 |
+
|
| 413 |
+
status = 'π΄ HIGH RISK' if percentage > 75 else 'π SUSPICIOUS' if percentage > 50 else 'π’ LIKELY REAL'
|
| 414 |
|
| 415 |
result_text = f"""
|
| 416 |
+
## π Audio Forensics Report
|
| 417 |
|
| 418 |
| Metric | Value |
|
| 419 |
|--------|-------|
|
| 420 |
| **AI Probability** | {percentage:.1f}% |
|
| 421 |
| **Verdict** | {verdict} |
|
| 422 |
| **Confidence** | {confidence:.1f}% |
|
| 423 |
+
| **Temporal Regularity** | {temporal*100:.1f}% |
|
| 424 |
+
| **Status** | {status} |
|
| 425 |
+
|
| 426 |
+
### Methodology
|
| 427 |
+
| Feature | What it detects |
|
| 428 |
+
|---------|----------------|
|
| 429 |
+
| MFCC (40-dim) | Vocal tract anomalies |
|
| 430 |
+
| Spectral Centroid | Frequency center shifts |
|
| 431 |
+
| Zero Crossing Rate | Overly smooth transitions |
|
| 432 |
+
| Phase Coherence | Synthetic phase patterns |
|
| 433 |
+
| Spectral Flatness | Unnatural noise floor |
|
| 434 |
+
| Chroma / Tonnetz | Harmonic irregularities |
|
| 435 |
|
| 436 |
### Interpretation
|
| 437 |
+
- **0β30%**: Very likely genuine human speech
|
| 438 |
+
- **30β50%**: Some synthetic characteristics, inconclusive
|
| 439 |
+
- **50β75%**: Likely AI-generated, manual review recommended
|
| 440 |
+
- **75β100%**: Strong indicators of synthetic audio
|
| 441 |
"""
|
| 442 |
return plot_path, result_text, f"{percentage:.1f}%"
|
| 443 |
|
|
|
|
| 446 |
|
| 447 |
|
| 448 |
def detect_video(video_file):
    """Run the video ensemble on an uploaded file and build the report.

    Args:
        video_file: Path of the uploaded video, or ``None`` when nothing
            was uploaded.

    Returns:
        A 3-tuple ``(plot_path, report_markdown, score_label)``:
        the path of the rendered analysis image, the Markdown forensics
        report, and the AI probability formatted as a percentage string.
        When no file is given the tuple is ``(None, message, None)``; when
        analysis fails it is ``(None, error_message, "N/A")``.
    """
    # Local imports keep this block self-contained (stdlib only).
    import tempfile
    import uuid

    if video_file is None:
        return None, "❌ No video file provided", None

    try:
        score, frame_scores, temporal = VIDEO_ENSEMBLE.detect(video_file)

        percentage = score * 100
        verdict = "🤖 AI GENERATED" if score > 0.5 else "✅ REAL / HUMAN"
        confidence = max(score, 1 - score) * 100

        # One image file per request: a single fixed path would be
        # overwritten by any overlapping analysis, and a literal '/tmp'
        # does not exist on every platform.
        plot_path = os.path.join(
            tempfile.gettempdir(), f"video_analysis_{uuid.uuid4().hex}.png"
        )
        plot_video_analysis(frame_scores, plot_path)

        status = '🔴 HIGH RISK' if percentage > 75 else '🟠 SUSPICIOUS' if percentage > 50 else '🟢 LIKELY REAL'

        result_text = f"""
## 🎬 Video Forensics Report

| Metric | Value |
|--------|-------|
| **AI Probability** | {percentage:.1f}% |
| **Verdict** | {verdict} |
| **Confidence** | {confidence:.1f}% |
| **Frames Analyzed** | {len(frame_scores)} |
| **Temporal Variance** | {temporal*100:.1f}% |
| **Status** | {status} |

### Detection Methods
| Method | What it detects |
|--------|----------------|
| Optical Flow | Unnatural motion between frames |
| Face Detection | Blending boundary artifacts |
| Laplacian Variance | Over-smoothing / over-sharpening |
| Temporal Consistency | Frame-to-frame irregularities |

### Interpretation
- **0–30%**: 🟢 Very likely genuine
- **30–50%**: 🟡 Some AI characteristics
- **50–75%**: 🟠 Likely deepfake
- **75–100%**: 🔴 Strong deepfake indicators
"""
        return plot_path, result_text, f"{percentage:.1f}%"

    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return None, f"❌ Error analyzing video: {e}", "N/A"
|
| 494 |
|
| 495 |
|
| 496 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 497 |
+
# π¨ GRADIO UI
|
| 498 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
|
| 500 |
def build_ui():
|
| 501 |
with gr.Blocks(
|
|
|
|
| 505 |
neutral_hue="slate",
|
| 506 |
),
|
| 507 |
css="""
|
| 508 |
+
.gradio-container { max-width: 1200px; margin: auto; }
|
| 509 |
.result-box { border-radius: 12px; padding: 16px; }
|
| 510 |
+
h1 { text-align: center; letter-spacing: -0.5px; }
|
| 511 |
.score-display { font-size: 48px; font-weight: bold; text-align: center; }
|
| 512 |
+
.tab-button { font-weight: 600; }
|
| 513 |
"""
|
| 514 |
) as demo:
|
| 515 |
|
| 516 |
gr.HTML("""
|
| 517 |
+
<div style="text-align:center; padding: 24px 0 12px 0;">
|
| 518 |
+
<h1 style="font-size: 2.8em; font-weight: 800; margin-bottom: 8px;">
|
| 519 |
+
π DeepFake AI Detector
|
| 520 |
+
</h1>
|
| 521 |
+
<p style="font-size: 1.15em; color: #888; max-width: 600px; margin: auto;">
|
| 522 |
+
Upload audio or video to detect AI generation via spectral forensics
|
| 523 |
+
and temporal artifact analysis.
|
| 524 |
</p>
|
| 525 |
</div>
|
| 526 |
""")
|
|
|
|
| 528 |
with gr.Tabs():
|
| 529 |
|
| 530 |
# ββ Audio Tab ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 531 |
+
with gr.TabItem("π Audio Detection", id=0):
|
| 532 |
+
gr.Markdown("### Upload an audio file to analyze for synthetic speech")
|
| 533 |
with gr.Row():
|
| 534 |
with gr.Column(scale=1):
|
| 535 |
audio_input = gr.Audio(
|
| 536 |
+
label="Upload Audio (MP3, WAV, M4A, FLAC)",
|
| 537 |
type="filepath"
|
| 538 |
)
|
| 539 |
audio_btn = gr.Button("π Analyze Audio", variant="primary", size="lg")
|
| 540 |
+
gr.Markdown("""
|
| 541 |
+
**Supported formats:** WAV, MP3, M4A, FLAC
|
| 542 |
+
**Max duration:** 60 seconds (auto-trimmed)
|
| 543 |
+
""")
|
| 544 |
|
| 545 |
with gr.Column(scale=2):
|
| 546 |
audio_score = gr.Label(label="π― AI Score", num_top_classes=1)
|
| 547 |
+
audio_plot = gr.Image(label="π Forensic Visualization")
|
| 548 |
audio_result = gr.Markdown(label="π Detailed Report")
|
| 549 |
|
| 550 |
audio_btn.click(
|
|
|
|
| 553 |
outputs=[audio_plot, audio_result, audio_score]
|
| 554 |
)
|
| 555 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
# ββ Video Tab ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 557 |
+
with gr.TabItem("π¬ Video Detection", id=1):
|
| 558 |
+
gr.Markdown("### Upload a video file to check for deepfake artifacts")
|
| 559 |
with gr.Row():
|
| 560 |
with gr.Column(scale=1):
|
| 561 |
video_input = gr.Video(
|
| 562 |
label="Upload Video (MP4, AVI, MOV)",
|
| 563 |
)
|
| 564 |
video_btn = gr.Button("π Analyze Video", variant="primary", size="lg")
|
| 565 |
+
gr.Markdown("""
|
| 566 |
+
**Supported formats:** MP4, AVI, MOV
|
| 567 |
+
**Analyzes:** 24 uniformly sampled frames
|
| 568 |
+
""")
|
| 569 |
|
| 570 |
with gr.Column(scale=2):
|
| 571 |
video_score = gr.Label(label="π― AI Score", num_top_classes=1)
|
|
|
|
| 579 |
)
|
| 580 |
|
| 581 |
# ββ About Tab βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 582 |
+
with gr.TabItem("βΉοΈ How It Works", id=2):
|
| 583 |
gr.Markdown("""
|
| 584 |
+
## π§ Detection Architecture
|
| 585 |
+
|
| 586 |
+
### Audio Forensics Ensemble
|
| 587 |
+
This system uses a **multi-feature spectral ensemble** that analyzes:
|
| 588 |
+
|
| 589 |
+
| Feature Class | Specific Metrics | Synthetic Indicator |
|
| 590 |
+
|---------------|------------------|---------------------|
|
| 591 |
+
| **Timbre** | 40-dim MFCC + Ξ + ΞΒ² | Unnatural vocal tract patterns |
|
| 592 |
+
| **Temporal** | ZCR mean/std | Overly smooth frame transitions |
|
| 593 |
+
| **Spectral** | Centroid, bandwidth, rolloff | Frequency distribution anomalies |
|
| 594 |
+
| **Harmonic** | Chroma, Tonnetz | Artificial harmonic structure |
|
| 595 |
+
| **Phase** | Analytic signal phase std | Reduced phase coherence |
|
| 596 |
+
| **Dynamics** | RMS micro-dynamics | Compressed natural variation |
|
| 597 |
+
|
| 598 |
+
**Scoring**: Each 0.5-second frame is scored independently. The final verdict blends
|
| 599 |
+
mean frame probability with temporal variance (real speech is more irregular).
|
| 600 |
+
|
| 601 |
+
### Video Forensics
|
| 602 |
+
| Method | Artifact Detected |
|
| 603 |
+
|--------|-------------------|
|
| 604 |
+
| Optical Flow Farneback | Unnatural motion smoothness |
|
| 605 |
+
| Haar Face Detection | Boundary blending errors |
|
| 606 |
+
| Laplacian Variance | Over-sharpening / smoothing |
|
| 607 |
+
| Frame-to-frame StdDev | Temporal inconsistency |
|
| 608 |
|
| 609 |
### Score Interpretation
|
| 610 |
+
- **0β30%**: π’ Very likely genuine
|
| 611 |
+
- **30β50%**: π‘ Some AI characteristics, inconclusive
|
| 612 |
+
- **50β75%**: π Likely AI-generated, needs verification
|
| 613 |
+
- **75β100%**: π΄ Almost certainly AI-generated
|
| 614 |
|
| 615 |
### β οΈ Limitations
|
| 616 |
+
- No detector is 100% accurate against adversarial or novel generative models
|
| 617 |
+
- Performance degrades on heavily compressed or low-bitrate media
|
| 618 |
+
- Always combine automated scores with human expert review
|
| 619 |
+
- Maximum audio analysis length: 60 seconds
|
| 620 |
+
|
| 621 |
+
### π Custom Model Integration
|
| 622 |
+
To use your own trained model:
|
| 623 |
+
1. Set `USE_CUSTOM_MODEL = True` in `app.py`
|
| 624 |
+
2. Implement `load_custom_model()` and `predict_audio_custom()` with your Colab code
|
| 625 |
+
3. Upload your `.pth` weights to the Space repository root
|
| 626 |
""")
|
| 627 |
|
| 628 |
return demo
|
|
|
|
| 630 |
|
| 631 |
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it.
    # "0.0.0.0" listens on all network interfaces; the port is fixed at 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo = build_ui()
    demo.launch(**launch_options)
|
|
|
|
|
|
|
|
|
|
|
|