kimnamjoon0007 committed on
Commit
166d169
·
verified ·
1 Parent(s): 6517dff

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +83 -142
app.py CHANGED
@@ -19,7 +19,7 @@ TARGET_SR = 16000
19
  MAX_DURATION = 10.0
20
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21
 
22
- # Model architecture (must match training)
23
  class W2VBertDeepfakeDetector(nn.Module):
24
  def __init__(self, backbone, num_labels=2):
25
  super().__init__()
@@ -42,7 +42,6 @@ print("Loading model...")
42
  backbone = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53")
43
  model = W2VBertDeepfakeDetector(backbone, num_labels=2)
44
 
45
- # Try to load from HF Hub
46
  try:
47
  from huggingface_hub import hf_hub_download
48
  model_path = hf_hub_download(repo_id=MODEL_REPO, filename="best_model.pt")
@@ -51,7 +50,6 @@ try:
51
  print(f"✓ Loaded model from {MODEL_REPO}")
52
  except Exception as e:
53
  print(f"Warning: Could not load from HF Hub: {e}")
54
- # Fallback to local file
55
  if os.path.exists("best_model.pt"):
56
  model.load_state_dict(torch.load("best_model.pt", map_location="cpu"))
57
  print("✓ Loaded model from local file")
@@ -63,48 +61,42 @@ print(f"Model ready on {DEVICE}")
63
 
64
def load_audio(audio_path):
    """Load an audio file and return a mono 16 kHz float32 torch tensor.

    Decodes with pydub, downmixes to mono, normalizes to [-1, 1],
    resamples to TARGET_SR, and truncates to MAX_DURATION seconds.

    Args:
        audio_path: Path to any audio file pydub/ffmpeg can decode.

    Returns:
        1-D ``torch.FloatTensor`` of at most ``MAX_DURATION * TARGET_SR`` samples.

    Raises:
        gr.Error: If the file cannot be decoded or processed.
    """
    try:
        audio_segment = AudioSegment.from_file(audio_path)
        samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32)

        # Downmix multi-channel audio to mono by averaging channels.
        if audio_segment.channels > 1:
            samples = samples.reshape(-1, audio_segment.channels).mean(axis=1)

        # Normalize by the full-scale value for the actual sample width.
        # The original divided by a hard-coded 32767.0, which is only correct
        # for 16-bit audio and mis-scales 8-/24-/32-bit input.
        full_scale = float(1 << (8 * audio_segment.sample_width - 1))
        samples /= full_scale
        sr = audio_segment.frame_rate

        if sr != TARGET_SR:
            samples = librosa.resample(samples, orig_sr=sr, target_sr=TARGET_SR)

        # Truncate to max duration to bound inference cost.
        max_len = int(MAX_DURATION * TARGET_SR)
        if len(samples) > max_len:
            samples = samples[:max_len]

        return torch.from_numpy(samples).float()
    except Exception as e:
        # Surface decoding/processing failures to the Gradio UI, keeping
        # the original exception chained for server-side debugging.
        raise gr.Error(f"Error loading audio: {e}") from e
87
 
88
 
89
  def classify_audio(audio_input):
90
- """Main classification function for Gradio."""
91
  if audio_input is None:
92
- return "Please upload or record an audio file.", None
93
-
94
- # Handle both file upload and microphone input
95
- if isinstance(audio_input, tuple):
96
- # Microphone input: (sample_rate, numpy_array)
97
- sr, audio_data = audio_input
98
- temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
99
- import scipy.io.wavfile as wav
100
- wav.write(temp_file.name, sr, audio_data)
101
- audio_path = temp_file.name
102
- else:
103
- # File upload
104
- audio_path = audio_input
105
 
106
  try:
107
- # Load and preprocess
 
 
 
 
 
 
 
 
 
 
108
  waveform = load_audio(audio_path)
109
  input_values = waveform.unsqueeze(0).to(DEVICE)
110
 
@@ -115,117 +107,66 @@ def classify_audio(audio_input):
115
  pred_class = torch.argmax(probs, dim=-1).item()
116
  confidence = probs[0, pred_class].item()
117
 
118
- # Result
119
- label = "🤖 AI-GENERATED" if pred_class == 1 else "👤 HUMAN"
120
 
121
- # Create detailed result
122
- result_text = f"""
123
- ## Classification Result
124
-
125
- **Verdict:** {label}
 
 
 
 
126
 
127
- **Confidence:** {confidence:.1%}
128
 
129
  ---
130
 
131
- ### Probability Breakdown
132
- - Human: {probs[0, 0].item():.1%}
133
- - AI-Generated: {probs[0, 1].item():.1%}
 
 
 
 
134
  """
135
-
136
- # Create confidence bar data
137
- confidence_data = {
138
- "Human": float(probs[0, 0].item()),
139
- "AI-Generated": float(probs[0, 1].item())
140
- }
141
-
142
- return result_text, confidence_data
143
 
144
  except Exception as e:
145
- return f"Error: {str(e)}", None
146
 
147
  finally:
148
- # Cleanup temp file if created
149
- if isinstance(audio_input, tuple) and os.path.exists(audio_path):
150
- os.remove(audio_path)
151
-
152
-
153
# Gradio Interface — two-column layout: input controls on the left,
# verdict text and confidence scores on the right.
with gr.Blocks(
    title="AI Voice Detection",
    theme=gr.themes.Soft(primary_hue="blue"),
    css="""
    .gradio-container { max-width: 800px; margin: auto; }
    .result-box { font-size: 1.2em; }
    """
) as demo:

    # Page header and supported-language note.
    gr.Markdown("""
    # 🎤 AI Voice Detection

    Detect whether an audio clip is **AI-generated** or spoken by a **human**.

    ### Supported Languages
    Tamil • English • Hindi • Malayalam • Telugu

    ---
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # Accepts either a file upload or a live microphone recording;
            # both arrive in classify_audio as a filepath.
            audio_input = gr.Audio(
                label="Upload or Record Audio",
                type="filepath",
                sources=["upload", "microphone"]
            )

            analyze_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")

            gr.Markdown("""
            **Tips:**
            - Upload MP3, WAV, or other audio formats
            - Or use microphone to record directly
            - Audio will be analyzed up to 10 seconds
            """)

        with gr.Column(scale=1):
            result_output = gr.Markdown(
                label="Result",
                elem_classes=["result-box"]
            )

            confidence_chart = gr.Label(
                label="Confidence Scores",
                num_top_classes=2
            )

    # Both the button click and a new audio selection trigger the same
    # classification pipeline with identical wiring.
    handler = dict(
        fn=classify_audio,
        inputs=[audio_input],
        outputs=[result_output, confidence_chart]
    )
    analyze_btn.click(**handler)
    audio_input.change(**handler)

    # Footer: model card and headline metrics.
    gr.Markdown("""
    ---

    ### About

    This model uses **Wav2Vec2-large-xlsr-53** as the backbone, fine-tuned for AI voice detection.

    - **Accuracy:** 99.69%
    - **AUROC:** 1.0
    - **EER:** 0.25%

    [View Model on Hugging Face](https://huggingface.co/kimnamjoon0007/lkht-v440)
    """)

# Launch
if __name__ == "__main__":
    demo.launch()
 
19
  MAX_DURATION = 10.0
20
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21
 
22
+
23
  class W2VBertDeepfakeDetector(nn.Module):
24
  def __init__(self, backbone, num_labels=2):
25
  super().__init__()
 
42
  backbone = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53")
43
  model = W2VBertDeepfakeDetector(backbone, num_labels=2)
44
 
 
45
  try:
46
  from huggingface_hub import hf_hub_download
47
  model_path = hf_hub_download(repo_id=MODEL_REPO, filename="best_model.pt")
 
50
  print(f"✓ Loaded model from {MODEL_REPO}")
51
  except Exception as e:
52
  print(f"Warning: Could not load from HF Hub: {e}")
 
53
  if os.path.exists("best_model.pt"):
54
  model.load_state_dict(torch.load("best_model.pt", map_location="cpu"))
55
  print("✓ Loaded model from local file")
 
61
 
62
def load_audio(audio_path):
    """Load and preprocess an audio file into a mono 16 kHz float32 tensor.

    Decodes with pydub, downmixes to mono, normalizes to [-1, 1],
    resamples to TARGET_SR, and truncates to MAX_DURATION seconds.
    Exceptions propagate to the caller (classify_audio wraps this call).

    Args:
        audio_path: Path to any audio file pydub/ffmpeg can decode.

    Returns:
        1-D ``torch.FloatTensor`` of at most ``MAX_DURATION * TARGET_SR`` samples.
    """
    audio_segment = AudioSegment.from_file(audio_path)
    samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32)

    # Downmix multi-channel audio to mono by averaging channels.
    if audio_segment.channels > 1:
        samples = samples.reshape(-1, audio_segment.channels).mean(axis=1)

    # Normalize by the full-scale value for the actual sample width.
    # The original divided by a hard-coded 32767.0, which is only correct
    # for 16-bit audio and mis-scales 8-/24-/32-bit input.
    full_scale = float(1 << (8 * audio_segment.sample_width - 1))
    samples /= full_scale
    sr = audio_segment.frame_rate

    if sr != TARGET_SR:
        samples = librosa.resample(samples, orig_sr=sr, target_sr=TARGET_SR)

    # Truncate to max duration to bound inference cost.
    max_len = int(MAX_DURATION * TARGET_SR)
    if len(samples) > max_len:
        samples = samples[:max_len]

    return torch.from_numpy(samples).float()
 
 
 
 
81
 
82
 
83
  def classify_audio(audio_input):
84
+ """Main classification function."""
85
  if audio_input is None:
86
+ return "⚠️ Please upload or record an audio file."
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  try:
89
+ # Handle tuple input from microphone (sample_rate, audio_array)
90
+ if isinstance(audio_input, tuple):
91
+ import scipy.io.wavfile as wav
92
+ sr, audio_data = audio_input
93
+ temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
94
+ wav.write(temp_file.name, sr, audio_data)
95
+ audio_path = temp_file.name
96
+ else:
97
+ audio_path = audio_input
98
+
99
+ # Load and process
100
  waveform = load_audio(audio_path)
101
  input_values = waveform.unsqueeze(0).to(DEVICE)
102
 
 
107
  pred_class = torch.argmax(probs, dim=-1).item()
108
  confidence = probs[0, pred_class].item()
109
 
110
+ human_prob = probs[0, 0].item() * 100
111
+ ai_prob = probs[0, 1].item() * 100
112
 
113
+ if pred_class == 1:
114
+ verdict = "🤖 AI-GENERATED"
115
+ color = "red"
116
+ else:
117
+ verdict = "👤 HUMAN"
118
+ color = "green"
119
+
120
+ result = f"""
121
+ ## Result: {verdict}
122
 
123
+ **Confidence: {confidence:.1%}**
124
 
125
  ---
126
 
127
+ | Category | Probability |
128
+ |----------|-------------|
129
+ | 👤 Human | {human_prob:.1f}% |
130
+ | 🤖 AI-Generated | {ai_prob:.1f}% |
131
+
132
+ ---
133
+ *Model: Wav2Vec2-large-xlsr-53 fine-tuned for voice detection*
134
  """
135
+ return result
 
 
 
 
 
 
 
136
 
137
  except Exception as e:
138
+ return f"Error processing audio: {str(e)}"
139
 
140
  finally:
141
+ if isinstance(audio_input, tuple) and 'audio_path' in locals():
142
+ try:
143
+ os.remove(audio_path)
144
+ except:
145
+ pass
146
+
147
+
148
# Simple Gradio Interface
# Components are built first, then handed to gr.Interface; behavior is
# identical to constructing them inline.
audio_component = gr.Audio(
    label="Upload or Record Audio",
    type="filepath",
    sources=["upload", "microphone"]
)
result_component = gr.Markdown(label="Result")

demo = gr.Interface(
    fn=classify_audio,
    inputs=audio_component,
    outputs=result_component,
    title="🎤 AI Voice Detection",
    description="""
    **Detect if audio is AI-generated or Human speech**

    Supported languages: Tamil, English, Hindi, Malayalam, Telugu

    Upload an audio file (MP3, WAV, etc.) or record directly using your microphone.
    """,
    examples=[],
    theme=gr.themes.Soft(),
    allow_flagging="never"
)

# Launch for HuggingFace Spaces
if __name__ == "__main__":
    # Bind to all interfaces on the Spaces default port.
    demo.launch(server_name="0.0.0.0", server_port=7860)