Spaces:
Sleeping
Sleeping
| # app.py β This is your main Hugging Face Spaces app | |
| import gradio as gr | |
| import torch | |
| import numpy as np | |
| import librosa | |
| import cv2 | |
| import os | |
| import matplotlib.pyplot as plt | |
| import matplotlib | |
| matplotlib.use('Agg') | |
| from PIL import Image | |
| import io | |
| import base64 | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| # βββ Paste your model classes here (or import them) ββββββββββββββββββββββββββ | |
| # Copy the AudioDeepfakeDetector and VideoDeepfakeDetector classes from above | |
| # OR use import statements if you structured it as a package: | |
| # from model.audio_detector import AudioDeepfakeDetector, extract_audio_features, predict_audio | |
| # from model.video_detector import VideoDeepfakeDetector, predict_video | |
| # For Spaces, we'll use lightweight pretrained models from HuggingFace Hub | |
| # as fallback if custom models aren't trained yet: | |
| from transformers import pipeline | |
| # Load pre-trained audio classifier (for demo) | |
| # For production, replace with your trained model weights | |
| AUDIO_MODEL_PATH = "audio_model.pth" | |
| VIDEO_MODEL_PATH = "video_model.pth" | |
| USE_TRAINED_MODEL = os.path.exists(AUDIO_MODEL_PATH) | |
| # βββ Fallback: Use transformers pipeline βββββββββββββββββββββββββββββββββββββ | |
| def analyze_audio_transformers(audio_path): | |
| """ | |
| Uses a HuggingFace pipeline for audio classification. | |
| Replace this with your trained model for better accuracy. | |
| """ | |
| try: | |
| classifier = pipeline( | |
| "audio-classification", | |
| model="facebook/wav2vec2-base", | |
| # For real deepfake detection use: | |
| # model="mo-aqrabi/deepfake-audio-detection" | |
| ) | |
| # This is a placeholder β replace with actual deepfake model | |
| # For now returns heuristic based on spectral analysis | |
| y, sr = librosa.load(audio_path, sr=16000, duration=5) | |
| mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13) | |
| zcr = librosa.feature.zero_crossing_rate(y) | |
| spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr) | |
| # Heuristic: AI audio tends to have very regular ZCR and centroid | |
| zcr_regularity = 1 - (np.std(zcr) / (np.mean(zcr) + 1e-6)) | |
| spectral_regularity = 1 - (np.std(spectral_centroid) / (np.mean(spectral_centroid) + 1e-6)) | |
| # Combine into a rough AI score | |
| ai_score = np.clip((zcr_regularity + spectral_regularity) / 2, 0, 1) | |
| return float(ai_score) | |
| except Exception as e: | |
| print(f"Fallback audio analysis error: {e}") | |
| return 0.5 # Uncertain | |
| # βββ Main Detection Functions βββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def detect_audio(audio_file): | |
| """Full audio analysis pipeline.""" | |
| if audio_file is None: | |
| return None, "β No audio file provided", None | |
| try: | |
| if USE_TRAINED_MODEL: | |
| score = predict_audio(audio_file, AUDIO_MODEL_PATH) | |
| else: | |
| score = analyze_audio_transformers(audio_file) | |
| percentage = score * 100 | |
| verdict = "π€ AI GENERATED" if score > 0.5 else "β REAL / HUMAN" | |
| confidence = max(score, 1 - score) * 100 | |
| # Generate waveform + spectrogram plot | |
| y, sr = librosa.load(audio_file, sr=16000, duration=10) | |
| fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 6)) | |
| fig.patch.set_facecolor('#0d0d0d') | |
| # Waveform | |
| ax1.set_facecolor('#1a1a2e') | |
| librosa.display.waveshow(y, sr=sr, ax=ax1, color='#00d4ff') | |
| ax1.set_title('Audio Waveform', color='white', fontsize=12) | |
| ax1.tick_params(colors='white') | |
| # Mel Spectrogram | |
| ax2.set_facecolor('#1a1a2e') | |
| mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128) | |
| mel_db = librosa.power_to_db(mel, ref=np.max) | |
| img = librosa.display.specshow(mel_db, sr=sr, ax=ax2, | |
| x_axis='time', y_axis='mel', cmap='magma') | |
| plt.colorbar(img, ax=ax2, format='%+2.0f dB') | |
| ax2.set_title('Mel Spectrogram', color='white', fontsize=12) | |
| ax2.tick_params(colors='white') | |
| ax2.yaxis.label.set_color('white') | |
| ax2.xaxis.label.set_color('white') | |
| plt.tight_layout() | |
| plot_path = '/tmp/audio_analysis.png' | |
| plt.savefig(plot_path, facecolor='#0d0d0d', bbox_inches='tight') | |
| plt.close() | |
| result_text = f""" | |
| ## π Audio Analysis Result | |
| | Metric | Value | | |
| |--------|-------| | |
| | **AI Probability** | {percentage:.1f}% | | |
| | **Verdict** | {verdict} | | |
| | **Confidence** | {confidence:.1f}% | | |
| | **Status** | {'β οΈ HIGH RISK' if percentage > 75 else 'π‘ SUSPICIOUS' if percentage > 50 else 'π’ LIKELY REAL'} | | |
| ### Interpretation | |
| - **0-30%**: Very likely genuine/human-created | |
| - **30-50%**: Possibly human, some AI characteristics | |
| - **50-70%**: Likely AI-generated, needs verification | |
| - **70-100%**: Almost certainly AI-generated | |
| """ | |
| return plot_path, result_text, f"{percentage:.1f}%" | |
| except Exception as e: | |
| return None, f"β Error analyzing audio: {str(e)}", "N/A" | |
| def detect_video(video_file): | |
| """Full video analysis pipeline.""" | |
| if video_file is None: | |
| return None, "β No video file provided", None | |
| try: | |
| if USE_TRAINED_MODEL and os.path.exists(VIDEO_MODEL_PATH): | |
| result = predict_video(video_file, VIDEO_MODEL_PATH) | |
| score = result['final_score'] | |
| frame_scores = result['per_frame_scores'] | |
| temporal = result['temporal_score'] | |
| else: | |
| # Fallback: basic frame analysis | |
| score, frame_scores, temporal = analyze_video_basic(video_file) | |
| percentage = score * 100 | |
| verdict = "π€ AI GENERATED" if score > 0.5 else "β REAL / HUMAN" | |
| # Generate frame score visualization | |
| fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) | |
| fig.patch.set_facecolor('#0d0d0d') | |
| # Frame scores over time | |
| ax1.set_facecolor('#1a1a2e') | |
| frames_x = range(len(frame_scores)) | |
| ax1.plot(frames_x, [f * 100 for f in frame_scores], 'cyan', linewidth=2) | |
| ax1.fill_between(frames_x, [f * 100 for f in frame_scores], alpha=0.3, color='cyan') | |
| ax1.axhline(y=50, color='red', linestyle='--', alpha=0.7, label='Threshold (50%)') | |
| ax1.set_facecolor('#1a1a2e') | |
| ax1.set_xlabel('Frame', color='white') | |
| ax1.set_ylabel('AI Score (%)', color='white') | |
| ax1.set_title('Per-Frame AI Score', color='white') | |
| ax1.tick_params(colors='white') | |
| ax1.legend(facecolor='#1a1a2e', labelcolor='white') | |
| ax1.set_ylim(0, 100) | |
| # Score breakdown pie | |
| ax2.set_facecolor('#1a1a2e') | |
| sizes = [score * 100, (1 - score) * 100] | |
| colors_pie = ['#FF4444', '#00CC44'] | |
| wedges, texts, autotexts = ax2.pie(sizes, labels=['AI Generated', 'Real/Human'], | |
| colors=colors_pie, autopct='%1.1f%%', | |
| textprops={'color': 'white', 'fontsize': 12}, | |
| startangle=90) | |
| ax2.set_title(f'Final Verdict: {verdict}', color='white', fontsize=13) | |
| plt.tight_layout() | |
| plot_path = '/tmp/video_analysis.png' | |
| plt.savefig(plot_path, facecolor='#0d0d0d', bbox_inches='tight') | |
| plt.close() | |
| result_text = f""" | |
| ## π¬ Video Analysis Result | |
| | Metric | Value | | |
| |--------|-------| | |
| | **AI Probability** | {percentage:.1f}% | | |
| | **Verdict** | {verdict} | | |
| | **Frames Analyzed** | {len(frame_scores)} | | |
| | **Temporal Inconsistency** | {temporal*100:.1f}% | | |
| | **Status** | {'β οΈ HIGH RISK' if percentage > 75 else 'π‘ SUSPICIOUS' if percentage > 50 else 'π’ LIKELY REAL'} | | |
| ### What we checked: | |
| - β Face region analysis per frame | |
| - β Temporal consistency between frames | |
| - β Artifact patterns typical of AI generation | |
| - β Natural motion flow analysis | |
| """ | |
| return plot_path, result_text, f"{percentage:.1f}%" | |
| except Exception as e: | |
| return None, f"β Error analyzing video: {str(e)}", "N/A" | |
| def analyze_video_basic(video_path): | |
| """Basic video analysis fallback without trained model.""" | |
| cap = cv2.VideoCapture(video_path) | |
| total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) | |
| indices = np.linspace(0, total - 1, 20, dtype=int) | |
| frame_scores = [] | |
| prev_gray = None | |
| for idx in indices: | |
| cap.set(cv2.CAP_PROP_POS_FRAMES, idx) | |
| ret, frame = cap.read() | |
| if not ret: | |
| continue | |
| gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) | |
| if prev_gray is not None: | |
| # Check for unnatural sharpness transitions (common in deepfakes) | |
| diff = cv2.absdiff(gray, prev_gray) | |
| score = float(np.std(diff)) / 50.0 | |
| score = np.clip(score, 0, 1) | |
| frame_scores.append(score) | |
| prev_gray = gray | |
| cap.release() | |
| if not frame_scores: | |
| return 0.5, [0.5], 0.5 | |
| avg_score = np.mean(frame_scores) | |
| temporal = float(np.std(frame_scores)) | |
| return avg_score, frame_scores, min(temporal, 1.0) | |
| # βββ Gradio UI ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def build_ui(): | |
| with gr.Blocks( | |
| title="π DeepFake AI Detector", | |
| theme=gr.themes.Base( | |
| primary_hue="blue", | |
| neutral_hue="slate", | |
| ), | |
| css=""" | |
| .gradio-container { max-width: 1100px; margin: auto; } | |
| .result-box { border-radius: 12px; padding: 16px; } | |
| h1 { text-align: center; } | |
| .score-display { font-size: 48px; font-weight: bold; text-align: center; } | |
| """ | |
| ) as demo: | |
| gr.HTML(""" | |
| <div style="text-align:center; padding: 20px 0;"> | |
| <h1 style="font-size: 2.5em; font-weight: 800;">π DeepFake AI Detector</h1> | |
| <p style="font-size: 1.1em; color: #888;"> | |
| Upload audio or video to detect AI generation.<br> | |
| Get a precise percentage score of how much AI was used. | |
| </p> | |
| </div> | |
| """) | |
| with gr.Tabs(): | |
| # ββ Audio Tab ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| with gr.TabItem("π Audio Detection"): | |
| gr.Markdown("### Upload an audio file to check if it's AI-generated") | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| audio_input = gr.Audio( | |
| label="Upload Audio (MP3, WAV, M4A)", | |
| type="filepath" | |
| ) | |
| audio_btn = gr.Button("π Analyze Audio", variant="primary", size="lg") | |
| with gr.Column(scale=2): | |
| audio_score = gr.Label(label="π― AI Score", num_top_classes=1) | |
| audio_plot = gr.Image(label="π Audio Analysis") | |
| audio_result = gr.Markdown(label="π Detailed Report") | |
| audio_btn.click( | |
| fn=detect_audio, | |
| inputs=[audio_input], | |
| outputs=[audio_plot, audio_result, audio_score] | |
| ) | |
| gr.Examples( | |
| examples=[], | |
| inputs=audio_input, | |
| label="Example files (add your own samples)" | |
| ) | |
| # ββ Video Tab ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| with gr.TabItem("π¬ Video Detection"): | |
| gr.Markdown("### Upload a video file to check if it's a deepfake") | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| video_input = gr.Video( | |
| label="Upload Video (MP4, AVI, MOV)", | |
| ) | |
| video_btn = gr.Button("π Analyze Video", variant="primary", size="lg") | |
| with gr.Column(scale=2): | |
| video_score = gr.Label(label="π― AI Score", num_top_classes=1) | |
| video_plot = gr.Image(label="π Frame Analysis") | |
| video_result = gr.Markdown(label="π Detailed Report") | |
| video_btn.click( | |
| fn=detect_video, | |
| inputs=[video_input], | |
| outputs=[video_plot, video_result, video_score] | |
| ) | |
| # ββ About Tab βββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| with gr.TabItem("βΉοΈ How It Works"): | |
| gr.Markdown(""" | |
| ## π§ Detection Methodology | |
| ### Audio Analysis | |
| | Feature | What it detects | | |
| |---------|----------------| | |
| | MFCC (40 coefficients) | Unnatural vocal tract patterns | | |
| | Mel Spectrogram | Frequency distribution anomalies | | |
| | Zero Crossing Rate | Overly smooth AI transitions | | |
| | Spectral Centroid | Frequency center shifts | | |
| | Tonnetz | Harmonic content irregularities | | |
| ### Video Analysis | |
| | Method | What it detects | | |
| |--------|----------------| | |
| | EfficientNet-B4 CNN | Per-frame visual artifacts | | |
| | Optical Flow | Temporal inconsistencies | | |
| | Face Detection | Blending boundary anomalies | | |
| | Ensemble Scoring | Combined confidence score | | |
| ### Score Interpretation | |
| - **0-30%**: π’ Very likely genuine | |
| - **30-50%**: π‘ Some AI characteristics, inconclusive | |
| - **50-70%**: π Likely AI-generated | |
| - **70-100%**: π΄ Almost certainly AI-generated | |
| ### β οΈ Limitations | |
| - No detector is 100% accurate | |
| - Newer AI models may evade detection | |
| - Low quality media reduces accuracy | |
| - Always combine with human judgment | |
| """) | |
| return demo | |
| if __name__ == "__main__": | |
| demo = build_ui() | |
| demo.launch( | |
| server_name="0.0.0.0", | |
| server_port=7860, | |
| share=True # Creates public URL for testing | |
| ) |