Upload app.py with huggingface_hub
app.py
ADDED
@@ -0,0 +1,231 @@
"""
AI Voice Detection - Hugging Face Spaces demo.
Detects AI-generated vs. human voices in multilingual audio.
"""

import os
import tempfile

import gradio as gr
import librosa
import numpy as np
import torch
import torch.nn as nn
from pydub import AudioSegment
from transformers import Wav2Vec2Model

# Configuration
MODEL_REPO = "kimnamjoon0007/lkht-v440"
TARGET_SR = 16000
MAX_DURATION = 10.0
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model architecture (must match training)
class W2VBertDeepfakeDetector(nn.Module):
    def __init__(self, backbone, num_labels=2):
        super().__init__()
        self.backbone = backbone
        hidden_size = backbone.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_values, attention_mask=None):
        outputs = self.backbone(input_values=input_values, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        pooled = hidden_states.mean(dim=1)
        pooled = self.dropout(pooled)
        logits = self.classifier(pooled)
        return logits

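# Note: the classifier head mean-pools wav2vec2 frame embeddings over time and
# applies a single linear layer, so any checkpoint loaded below must have been
# trained with this exact head for the state_dict keys to match.
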
# Load model
print("Loading model...")
backbone = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53")
model = W2VBertDeepfakeDetector(backbone, num_labels=2)

# Try to load fine-tuned weights from the HF Hub
try:
    from huggingface_hub import hf_hub_download
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename="best_model.pt")
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    print(f"✓ Loaded model from {MODEL_REPO}")
except Exception as e:
    print(f"Warning: Could not load from HF Hub: {e}")
    # Fall back to a local checkpoint
    if os.path.exists("best_model.pt"):
        model.load_state_dict(torch.load("best_model.pt", map_location="cpu"))
        print("✓ Loaded model from local file")
    else:
        print("Warning: no checkpoint found; the classifier head is randomly initialized")

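# (The torch.load calls above could pass weights_only=True on recent PyTorch
# versions for safer loading of tensor-only checkpoints.)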
model.to(DEVICE)
model.eval()
print(f"Model ready on {DEVICE}")


def load_audio(audio_path):
    """Load and preprocess an audio file."""
    try:
        audio_segment = AudioSegment.from_file(audio_path)
        samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32)

        # Downmix multi-channel audio to mono
        if audio_segment.channels > 1:
            samples = samples.reshape(-1, audio_segment.channels).mean(axis=1)

        # Scale integer samples to [-1, 1] using the actual sample width
        # (32768 for 16-bit audio, rather than a hard-coded 32767)
        samples /= float(1 << (8 * audio_segment.sample_width - 1))
        sr = audio_segment.frame_rate

        if sr != TARGET_SR:
            samples = librosa.resample(samples, orig_sr=sr, target_sr=TARGET_SR)

        # Truncate to max duration
        max_len = int(MAX_DURATION * TARGET_SR)
        if len(samples) > max_len:
            samples = samples[:max_len]

        return torch.from_numpy(samples).float()
    except Exception as e:
        raise gr.Error(f"Error loading audio: {e}")


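# load_audio yields a 1-D float32 tensor at 16 kHz, capped at
# MAX_DURATION * TARGET_SR = 160,000 samples.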
def classify_audio(audio_input):
    """Main classification function for Gradio."""
    if audio_input is None:
        return "Please upload or record an audio file.", None

    # Handle both file upload and microphone input
    if isinstance(audio_input, tuple):
        # Raw microphone input: (sample_rate, numpy_array). With
        # type="filepath" Gradio normally passes a path even for recordings,
        # so this branch is a safeguard.
        import scipy.io.wavfile as wav
        sr, audio_data = audio_input
        temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        temp_file.close()  # release the handle so wav.write can reopen the path
        wav.write(temp_file.name, sr, audio_data)
        audio_path = temp_file.name
    else:
        # File upload
        audio_path = audio_input

    try:
        # Load and preprocess
        waveform = load_audio(audio_path)
        input_values = waveform.unsqueeze(0).to(DEVICE)

        # Inference
        with torch.no_grad():
            logits = model(input_values)
            probs = torch.softmax(logits, dim=-1)
            pred_class = torch.argmax(probs, dim=-1).item()
            confidence = probs[0, pred_class].item()

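        # Result: class indices follow the training label map (0 = human, 1 = AI-generated)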
        label = "🤖 AI-GENERATED" if pred_class == 1 else "👤 HUMAN"

        # Create detailed result
        result_text = f"""
## Classification Result

**Verdict:** {label}

**Confidence:** {confidence:.1%}

---

### Probability Breakdown
- Human: {probs[0, 0].item():.1%}
- AI-Generated: {probs[0, 1].item():.1%}
"""

        # Create confidence bar data
        confidence_data = {
            "Human": float(probs[0, 0].item()),
            "AI-Generated": float(probs[0, 1].item())
        }

        return result_text, confidence_data

    except Exception as e:
        return f"Error: {e}", None

    finally:
        # Clean up the temp file if one was created
        if isinstance(audio_input, tuple) and os.path.exists(audio_path):
            os.remove(audio_path)


# Gradio interface
with gr.Blocks(
    title="AI Voice Detection",
    theme=gr.themes.Soft(primary_hue="blue"),
    css="""
    .gradio-container { max-width: 800px; margin: auto; }
    .result-box { font-size: 1.2em; }
    """
) as demo:

    gr.Markdown("""
    # 🎤 AI Voice Detection

    Detect whether an audio clip is **AI-generated** or spoken by a **human**.

    ### Supported Languages
    Tamil • English • Hindi • Malayalam • Telugu

    ---
    """)

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Upload or Record Audio",
                type="filepath",
                sources=["upload", "microphone"]
            )

            submit_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")

            gr.Markdown("""
            **Tips:**
            - Upload an MP3, WAV, or other common audio format
            - Or use the microphone to record directly
            - Only the first 10 seconds of audio are analyzed
            """)

        with gr.Column(scale=1):
            result_output = gr.Markdown(
                label="Result",
                elem_classes=["result-box"]
            )

            confidence_chart = gr.Label(
                label="Confidence Scores",
                num_top_classes=2
            )

    # Event handlers
    submit_btn.click(
        fn=classify_audio,
        inputs=[audio_input],
        outputs=[result_output, confidence_chart]
    )

    audio_input.change(
        fn=classify_audio,
        inputs=[audio_input],
        outputs=[result_output, confidence_chart]
    )
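
    # A new upload or recording triggers analysis automatically via the change
    # event above, so clicking "Analyze" is optional.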

    gr.Markdown("""
    ---

    ### About

    This model uses **wav2vec2-large-xlsr-53** as the backbone, fine-tuned for AI voice detection.

    - **Accuracy:** 99.69%
    - **AUROC:** 1.0
    - **EER:** 0.25%

    [View Model on Hugging Face](https://huggingface.co/kimnamjoon0007/lkht-v440)
    """)

# Launch
if __name__ == "__main__":
    demo.launch()
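
This commit adds only app.py. For reference, here is a minimal sketch of the dependency list its imports imply; the Space's actual requirements.txt is not part of this commit, so the package names below are an assumption:

# requirements.txt (assumed from the imports in app.py, not part of this commit)
torch
transformers
huggingface_hub
gradio
pydub
librosa
numpy
scipy

pydub additionally needs the ffmpeg binary available on the system to decode non-WAV formats such as MP3.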