DEEPFAKEAI / app.py
Khalil09's picture
Create app.py
5249f52 verified
# app.py — This is your main Hugging Face Spaces app
import gradio as gr
import torch
import numpy as np
import librosa
import cv2
import os
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
from PIL import Image
import io
import base64
import warnings
warnings.filterwarnings('ignore')
# ─── Paste your model classes here (or import them) ──────────────────────────
# Copy the AudioDeepfakeDetector and VideoDeepfakeDetector classes from above
# OR use import statements if you structured it as a package:
# from model.audio_detector import AudioDeepfakeDetector, extract_audio_features, predict_audio
# from model.video_detector import VideoDeepfakeDetector, predict_video
# For Spaces, we'll use lightweight pretrained models from HuggingFace Hub
# as fallback if custom models aren't trained yet:
from transformers import pipeline
# Locations where trained weights are looked up (relative paths, resolved
# against the current working directory). Nothing is loaded here; these are
# only path constants. For production, place your trained weights at these paths.
AUDIO_MODEL_PATH = "audio_model.pth"
VIDEO_MODEL_PATH = "video_model.pth"
# Evaluated once at import time and based on the *audio* weights file only.
# detect_video additionally checks that VIDEO_MODEL_PATH exists.
USE_TRAINED_MODEL = os.path.exists(AUDIO_MODEL_PATH)
# ─── Fallback: Use transformers pipeline ─────────────────────────────────────
def analyze_audio_transformers(audio_path):
    """
    Heuristic fallback score for "how AI-like" an audio clip is.

    Despite the name (kept for backward compatibility), no transformers
    model is involved: the previous implementation downloaded a
    ``facebook/wav2vec2-base`` pipeline on every call and never used it.
    Replace this with your trained model for better accuracy.

    The score is based on how regular the zero-crossing rate and spectral
    centroid are over the first 5 seconds (resampled to 16 kHz).

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        A float in [0, 1]; higher means "more AI-like". Returns 0.5
        (uncertain) when the file cannot be analysed, is empty, or the
        computation yields a non-finite value.
    """
    try:
        y, sr = librosa.load(audio_path, sr=16000, duration=5)
        if y.size == 0:
            # Nothing to measure; mean/std of empty features would be NaN.
            return 0.5

        zcr = librosa.feature.zero_crossing_rate(y)
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)

        # Heuristic: AI audio tends to have very regular ZCR and centroid.
        # Regularity = 1 - coefficient of variation (epsilon avoids /0).
        zcr_regularity = 1 - (np.std(zcr) / (np.mean(zcr) + 1e-6))
        spectral_regularity = 1 - (
            np.std(spectral_centroid) / (np.mean(spectral_centroid) + 1e-6)
        )

        # Combine into a rough AI score.
        ai_score = float(np.clip((zcr_regularity + spectral_regularity) / 2, 0, 1))
        if not np.isfinite(ai_score):
            return 0.5
        return ai_score
    except Exception as e:
        # Deliberate best-effort: any failure maps to "uncertain".
        print(f"Fallback audio analysis error: {e}")
        return 0.5  # Uncertain
# ─── Main Detection Functions ────────────────────────────────────────────────
# Markdown report shown next to the plot; filled in with str.format().
_AUDIO_REPORT_TEMPLATE = """
## 🔊 Audio Analysis Result
| Metric | Value |
|--------|-------|
| **AI Probability** | {percentage:.1f}% |
| **Verdict** | {verdict} |
| **Confidence** | {confidence:.1f}% |
| **Status** | {status} |
### Interpretation
- **0-30%**: Very likely genuine/human-created
- **30-50%**: Possibly human, some AI characteristics
- **50-70%**: Likely AI-generated, needs verification
- **70-100%**: Almost certainly AI-generated
"""


def _render_audio_plot(audio_file):
    """Save a waveform + mel-spectrogram PNG for *audio_file* and return its path."""
    import tempfile

    # `import librosa` alone is not guaranteed to expose the plotting
    # submodule on every librosa release, so import it explicitly.
    import librosa.display

    y, sr = librosa.load(audio_file, sr=16000, duration=10)
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 6))
    try:
        fig.patch.set_facecolor('#0d0d0d')

        # Waveform
        ax1.set_facecolor('#1a1a2e')
        librosa.display.waveshow(y, sr=sr, ax=ax1, color='#00d4ff')
        ax1.set_title('Audio Waveform', color='white', fontsize=12)
        ax1.tick_params(colors='white')

        # Mel Spectrogram
        ax2.set_facecolor('#1a1a2e')
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        img = librosa.display.specshow(mel_db, sr=sr, ax=ax2,
                                       x_axis='time', y_axis='mel', cmap='magma')
        # Use the figure's own methods rather than pyplot's "current figure"
        # so concurrent requests cannot draw onto each other's figures.
        fig.colorbar(img, ax=ax2, format='%+2.0f dB')
        ax2.set_title('Mel Spectrogram', color='white', fontsize=12)
        ax2.tick_params(colors='white')
        ax2.yaxis.label.set_color('white')
        ax2.xaxis.label.set_color('white')
        fig.tight_layout()

        # Unique file per request: a fixed /tmp name would be overwritten by
        # a concurrent request before the UI has read it. The file is left
        # on disk because it is read after this function returns.
        fd, plot_path = tempfile.mkstemp(prefix='audio_analysis_', suffix='.png')
        os.close(fd)
        fig.savefig(plot_path, facecolor='#0d0d0d', bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting fails part-way.
        plt.close(fig)
    return plot_path


def detect_audio(audio_file):
    """Full audio analysis pipeline.

    Args:
        audio_file: Path to the uploaded audio file, or None when nothing
            was uploaded.

    Returns:
        A 3-tuple ``(plot_path, report_markdown, score_label)``:
        - no input: ``(None, <message>, None)``
        - success: path to a PNG, a Markdown report, and e.g. ``"73.2%"``
        - failure: ``(None, <error message>, "N/A")``; exceptions are
          reported in the message rather than raised.
    """
    if audio_file is None:
        return None, "❌ No audio file provided", None
    try:
        if USE_TRAINED_MODEL:
            # NOTE(review): predict_audio is not defined in this file; it must
            # be pasted/imported (see the header comments) before trained
            # weights are dropped in, otherwise this raises NameError.
            score = predict_audio(audio_file, AUDIO_MODEL_PATH)
        else:
            score = analyze_audio_transformers(audio_file)

        percentage = score * 100
        verdict = "🤖 AI GENERATED" if score > 0.5 else "✅ REAL / HUMAN"
        confidence = max(score, 1 - score) * 100
        if percentage > 75:
            status = '⚠️ HIGH RISK'
        elif percentage > 50:
            status = '🟡 SUSPICIOUS'
        else:
            status = '🟢 LIKELY REAL'

        # Generate waveform + spectrogram plot
        plot_path = _render_audio_plot(audio_file)

        result_text = _AUDIO_REPORT_TEMPLATE.format(
            percentage=percentage,
            verdict=verdict,
            confidence=confidence,
            status=status,
        )
        return plot_path, result_text, f"{percentage:.1f}%"
    except Exception as e:
        # UI boundary: surface the problem to the user instead of crashing.
        return None, f"❌ Error analyzing audio: {str(e)}", "N/A"
# Markdown report shown next to the plot; filled in with str.format().
_VIDEO_REPORT_TEMPLATE = """
## 🎬 Video Analysis Result
| Metric | Value |
|--------|-------|
| **AI Probability** | {percentage:.1f}% |
| **Verdict** | {verdict} |
| **Frames Analyzed** | {frame_count} |
| **Temporal Inconsistency** | {temporal_pct:.1f}% |
| **Status** | {status} |
### What we checked:
- ✓ Face region analysis per frame
- ✓ Temporal consistency between frames
- ✓ Artifact patterns typical of AI generation
- ✓ Natural motion flow analysis
"""


def _render_video_plot(score, frame_scores, verdict):
    """Save the per-frame score chart + verdict pie as a PNG and return its path."""
    import tempfile

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    try:
        fig.patch.set_facecolor('#0d0d0d')

        # Frame scores over time
        ax1.set_facecolor('#1a1a2e')
        frames_x = range(len(frame_scores))
        frame_pct = [f * 100 for f in frame_scores]
        ax1.plot(frames_x, frame_pct, color='cyan', linewidth=2)
        ax1.fill_between(frames_x, frame_pct, alpha=0.3, color='cyan')
        ax1.axhline(y=50, color='red', linestyle='--', alpha=0.7, label='Threshold (50%)')
        ax1.set_xlabel('Frame', color='white')
        ax1.set_ylabel('AI Score (%)', color='white')
        ax1.set_title('Per-Frame AI Score', color='white')
        ax1.tick_params(colors='white')
        ax1.legend(facecolor='#1a1a2e', labelcolor='white')
        ax1.set_ylim(0, 100)

        # Score breakdown pie
        ax2.set_facecolor('#1a1a2e')
        ax2.pie(
            [score * 100, (1 - score) * 100],
            labels=['AI Generated', 'Real/Human'],
            colors=['#FF4444', '#00CC44'],
            autopct='%1.1f%%',
            textprops={'color': 'white', 'fontsize': 12},
            startangle=90,
        )
        ax2.set_title(f'Final Verdict: {verdict}', color='white', fontsize=13)
        # Figure methods instead of pyplot's "current figure", so concurrent
        # requests cannot interfere with each other.
        fig.tight_layout()

        # Unique file per request: a fixed /tmp name would be overwritten by
        # a concurrent request before the UI has read it. The file is left
        # on disk because it is read after this function returns.
        fd, plot_path = tempfile.mkstemp(prefix='video_analysis_', suffix='.png')
        os.close(fd)
        fig.savefig(plot_path, facecolor='#0d0d0d', bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting fails part-way.
        plt.close(fig)
    return plot_path


def detect_video(video_file):
    """Full video analysis pipeline.

    Args:
        video_file: Path to the uploaded video file, or None when nothing
            was uploaded.

    Returns:
        A 3-tuple ``(plot_path, report_markdown, score_label)``:
        - no input: ``(None, <message>, None)``
        - success: path to a PNG, a Markdown report, and e.g. ``"41.0%"``
        - failure: ``(None, <error message>, "N/A")``; exceptions are
          reported in the message rather than raised.
    """
    if video_file is None:
        return None, "❌ No video file provided", None
    try:
        # NOTE(review): USE_TRAINED_MODEL reflects the *audio* weights file,
        # so the trained video path is only taken when both files exist.
        # predict_video is not defined in this file and must be
        # pasted/imported before trained weights are dropped in.
        if USE_TRAINED_MODEL and os.path.exists(VIDEO_MODEL_PATH):
            result = predict_video(video_file, VIDEO_MODEL_PATH)
            score = result['final_score']
            frame_scores = result['per_frame_scores']
            temporal = result['temporal_score']
        else:
            # Fallback: basic frame analysis
            score, frame_scores, temporal = analyze_video_basic(video_file)

        percentage = score * 100
        verdict = "🤖 AI GENERATED" if score > 0.5 else "✅ REAL / HUMAN"
        if percentage > 75:
            status = '⚠️ HIGH RISK'
        elif percentage > 50:
            status = '🟡 SUSPICIOUS'
        else:
            status = '🟢 LIKELY REAL'

        # Generate frame score visualization
        plot_path = _render_video_plot(score, frame_scores, verdict)

        result_text = _VIDEO_REPORT_TEMPLATE.format(
            percentage=percentage,
            verdict=verdict,
            frame_count=len(frame_scores),
            temporal_pct=temporal * 100,
            status=status,
        )
        return plot_path, result_text, f"{percentage:.1f}%"
    except Exception as e:
        # UI boundary: surface the problem to the user instead of crashing.
        return None, f"❌ Error analyzing video: {str(e)}", "N/A"
def analyze_video_basic(video_path, num_samples=20):
    """Basic video analysis fallback without trained model.

    Samples frames evenly across the video and scores each consecutive
    sampled pair by the standard deviation of their absolute grayscale
    difference (divided by 50 and clipped to [0, 1]).

    Args:
        video_path: Path to a video file readable by OpenCV.
        num_samples: Maximum number of frames to sample (default 20).
            Videos with fewer frames are sampled once per frame, so the
            same frame is never compared with itself.

    Returns:
        ``(avg_score, frame_scores, temporal)`` where ``avg_score`` is the
        mean pair score, ``frame_scores`` is the list of pair scores, and
        ``temporal`` is their standard deviation capped at 1.0. Returns
        ``(0.5, [0.5], 0.5)`` when no frame pair could be scored (video
        not opened, no frames reported, or fewer than two readable frames).
    """
    frame_scores = []
    cap = cv2.VideoCapture(video_path)
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if cap.isOpened() and total > 0 and num_samples > 0:
            # Never request more samples than there are frames; otherwise
            # linspace repeats indices and identical frames yield zero diffs.
            indices = np.linspace(0, total - 1, min(num_samples, total), dtype=int)
            prev_gray = None
            for idx in indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                ret, frame = cap.read()
                if not ret:
                    continue
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                if prev_gray is not None:
                    # Check for unnatural sharpness transitions (common in deepfakes)
                    diff = cv2.absdiff(gray, prev_gray)
                    frame_scores.append(float(np.clip(np.std(diff) / 50.0, 0, 1)))
                prev_gray = gray
    finally:
        # Release the capture handle even if decoding raises.
        cap.release()

    if not frame_scores:
        return 0.5, [0.5], 0.5
    avg_score = float(np.mean(frame_scores))
    temporal = float(np.std(frame_scores))
    return avg_score, frame_scores, min(temporal, 1.0)
# ─── Gradio UI ───────────────────────────────────────────────────────────────
# Static UI text, kept at module level so the layout code below stays readable.
# Emoji in these strings were repaired from mis-encoded (mojibake) text.
_APP_TITLE = "🔍 DeepFake AI Detector"

_CUSTOM_CSS = """
.gradio-container { max-width: 1100px; margin: auto; }
.result-box { border-radius: 12px; padding: 16px; }
h1 { text-align: center; }
.score-display { font-size: 48px; font-weight: bold; text-align: center; }
"""

_HEADER_HTML = """
<div style="text-align:center; padding: 20px 0;">
<h1 style="font-size: 2.5em; font-weight: 800;">🔍 DeepFake AI Detector</h1>
<p style="font-size: 1.1em; color: #888;">
Upload audio or video to detect AI generation.<br>
Get a precise percentage score of how much AI was used.
</p>
</div>
"""

_ABOUT_MARKDOWN = """
## 🧠 Detection Methodology
### Audio Analysis
| Feature | What it detects |
|---------|----------------|
| MFCC (40 coefficients) | Unnatural vocal tract patterns |
| Mel Spectrogram | Frequency distribution anomalies |
| Zero Crossing Rate | Overly smooth AI transitions |
| Spectral Centroid | Frequency center shifts |
| Tonnetz | Harmonic content irregularities |
### Video Analysis
| Method | What it detects |
|--------|----------------|
| EfficientNet-B4 CNN | Per-frame visual artifacts |
| Optical Flow | Temporal inconsistencies |
| Face Detection | Blending boundary anomalies |
| Ensemble Scoring | Combined confidence score |
### Score Interpretation
- **0-30%**: 🟢 Very likely genuine
- **30-50%**: 🟡 Some AI characteristics, inconclusive
- **50-70%**: 🟠 Likely AI-generated
- **70-100%**: 🔴 Almost certainly AI-generated
### ⚠️ Limitations
- No detector is 100% accurate
- Newer AI models may evade detection
- Low quality media reduces accuracy
- Always combine with human judgment
"""


def build_ui():
    """Build the Gradio interface and return it without launching.

    The app has three tabs: audio detection (wired to ``detect_audio``),
    video detection (wired to ``detect_video``) and a static "How It Works"
    page. Both detection tabs show a score label, an analysis image and a
    Markdown report.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    with gr.Blocks(
        title=_APP_TITLE,
        theme=gr.themes.Base(
            primary_hue="blue",
            neutral_hue="slate",
        ),
        css=_CUSTOM_CSS,
    ) as demo:
        gr.HTML(_HEADER_HTML)
        with gr.Tabs():
            # ── Audio Tab ──────────────────────────────────────────────
            with gr.TabItem("🔊 Audio Detection"):
                gr.Markdown("### Upload an audio file to check if it's AI-generated")
                with gr.Row():
                    with gr.Column(scale=1):
                        audio_input = gr.Audio(
                            label="Upload Audio (MP3, WAV, M4A)",
                            type="filepath",
                        )
                        audio_btn = gr.Button("🔍 Analyze Audio", variant="primary", size="lg")
                    with gr.Column(scale=2):
                        audio_score = gr.Label(label="🎯 AI Score", num_top_classes=1)
                        audio_plot = gr.Image(label="📊 Audio Analysis")
                        audio_result = gr.Markdown(label="📋 Detailed Report")
                # Output order must match detect_audio's return tuple.
                audio_btn.click(
                    fn=detect_audio,
                    inputs=[audio_input],
                    outputs=[audio_plot, audio_result, audio_score],
                )
                gr.Examples(
                    examples=[],
                    inputs=audio_input,
                    label="Example files (add your own samples)",
                )
            # ── Video Tab ──────────────────────────────────────────────
            with gr.TabItem("🎬 Video Detection"):
                gr.Markdown("### Upload a video file to check if it's a deepfake")
                with gr.Row():
                    with gr.Column(scale=1):
                        video_input = gr.Video(
                            label="Upload Video (MP4, AVI, MOV)",
                        )
                        video_btn = gr.Button("🔍 Analyze Video", variant="primary", size="lg")
                    with gr.Column(scale=2):
                        video_score = gr.Label(label="🎯 AI Score", num_top_classes=1)
                        video_plot = gr.Image(label="📊 Frame Analysis")
                        video_result = gr.Markdown(label="📋 Detailed Report")
                # Output order must match detect_video's return tuple.
                video_btn.click(
                    fn=detect_video,
                    inputs=[video_input],
                    outputs=[video_plot, video_result, video_score],
                )
            # ── About Tab ──────────────────────────────────────────────
            with gr.TabItem("ℹ️ How It Works"):
                gr.Markdown(_ABOUT_MARKDOWN)
    return demo
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it on all network
    # interfaces at port 7860. share=True requests a public URL for testing.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    demo = build_ui()
    demo.launch(**launch_options)