ranar110 committed
Commit · aee4240
Parent(s): f6d50b1

Upgrade: Replaced mock detector with Real AI Model and added Fine-Tuning Guide

Browse files:
- fine_tuning_guide.md +115 -0
- main.py +1 -1
- real_detector.py +120 -0
- requirements.txt +9 -0
fine_tuning_guide.md ADDED
@@ -0,0 +1,115 @@
# 🎓 Guide: Fine-Tuning Your Voice Detection Model

This guide explains how to improve your voice detection model's accuracy by fine-tuning it on specialized datasets like **ASVspoof** or **In-the-Wild**.

## 1. Prerequisites
You will need a GPU-enabled environment. **Google Colab (Free Tier)** is the easiest way to start.
- [Google Colab](https://colab.research.google.com/)
- A Hugging Face account

## 2. The Dataset
For audio deepfake detection, you need a dataset with labeled "Real" and "Fake" audio.

**Recommended Datasets:**
- **ASVspoof 2019/2021**: The gold standard for voice anti-spoofing.
- **WaveFake**: A dataset of deepfake audio.
- **In-the-Wild**: A dataset containing deepfakes of politicians and celebrities.

## 3. Fine-Tuning Steps (in Google Colab)

### Step A: Install Libraries
```python
!pip install transformers datasets torch librosa accelerate
```
### Step B: Load Your Dataset
Assuming you have a folder structure like `data/real/*.wav` and `data/fake/*.wav`.

```python
from datasets import load_dataset, Audio

# Load from a local folder or a Hugging Face dataset repo
dataset = load_dataset("audiofolder", data_dir="path_to_your_data")
# Split into train/test ("audiofolder" loads everything into a single "train" split)
dataset = dataset["train"].train_test_split(test_size=0.2)
```
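As a quick sanity check (not in the original guide), you can inspect what `audiofolder` produced; the exact label names depend on your folder names:

```python
print(dataset)  # DatasetDict with "train" and "test" splits
print(dataset["train"].features["label"].names)  # e.g. ["fake", "real"], inferred from subfolders
```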
### Step C: Preprocessing
Resample all audio to 16kHz (required by Wav2Vec2).

```python
from transformers import AutoFeatureExtractor

model_id = "MelodyMachine/Deepfake-audio-detection"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=16000,
        max_length=160000,  # 10 seconds at 16kHz
        truncation=True,
        padding="max_length"  # pad shorter clips so the default collator can stack batches
    )
    return inputs

dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
encoded_dataset = dataset.map(preprocess_function, remove_columns="audio", batched=True)
```
### Step D: Load Model & Training Config

```python
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

num_labels = 2
label2id = {"Fake": 0, "Real": 1}
id2label = {0: "Fake", 1: "Real"}

model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True  # Important when fine-tuning on new classes
)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    num_train_epochs=5,
)
```
### Step E: Train!

```python
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=feature_extractor,
)

trainer.train()
```
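The Trainer above only reports loss during evaluation. If you also want accuracy after each epoch, a minimal `compute_metrics` sketch (not in the original guide) can be passed in:

```python
import numpy as np

def compute_metrics(eval_pred):
    # Fraction of clips whose argmax prediction matches the ground-truth label
    predictions = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float((predictions == eval_pred.label_ids).mean())}

# Pass it to the Trainer: Trainer(..., compute_metrics=compute_metrics)
```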
### Step F: Save & Export
```python
model.save_pretrained("my_finetuned_model")
feature_extractor.save_pretrained("my_finetuned_model")
```
## 4. Using Your New Model
Once trained, upload your `my_finetuned_model` folder to the Hugging Face Hub.
Then simply update `MODEL_NAME` in your `real_detector.py`:

```python
MODEL_NAME = "your-username/my_finetuned_model"
```
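To get the folder onto the Hub, one option is the `push_to_hub` helper that ships with `transformers` (the repo name below is a placeholder):

```python
from huggingface_hub import login

login()  # paste a write-access token when prompted
model.push_to_hub("your-username/my_finetuned_model")
feature_extractor.push_to_hub("your-username/my_finetuned_model")
```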
## 💡 Tips for Accuracy
- **Diversity**: Ensure your "Fake" data includes many different TTS engines (ElevenLabs, Murf, Coqui, etc.).
- **Noise**: Add background noise to your training data to make the model robust against real-world recordings (a minimal sketch follows below).
main.py CHANGED
@@ -3,7 +3,7 @@ from fastapi.staticfiles import StaticFiles
 from fastapi.responses import FileResponse
 from auth import verify_api_key
 from audio_processor import process_audio
-from
+from real_detector import analyze_audio_real as analyze_audio
 from murf_generator import generate_audio_with_murf
 from pydantic import BaseModel
 from typing import Optional
real_detector.py ADDED
@@ -0,0 +1,120 @@
import torch
import librosa
import numpy as np
import os
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

# Global model cache
MODEL_CACHE = {}
MODEL_NAME = "MelodyMachine/Deepfake-audio-detection"  # A good starting model from HF

def load_model():
    """Load the model and feature extractor if not already loaded."""
    if MODEL_CACHE.get("model") is None:
        print(f"Loading model: {MODEL_NAME}...")
        try:
            # Load the feature extractor and model, then cache them for reuse
            feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
            model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME)

            MODEL_CACHE["feature_extractor"] = feature_extractor
            MODEL_CACHE["model"] = model
            print("Model loaded successfully.")
        except Exception as e:
            print(f"Error loading model: {e}")
            return None, None

    return MODEL_CACHE["model"], MODEL_CACHE["feature_extractor"]

def preprocess_audio(file_path, max_duration=10):
    """Load and preprocess an audio file for the model."""
    try:
        # Load the audio file, resampling to 16kHz as required by Wav2Vec2-style models
        audio, sample_rate = librosa.load(file_path, sr=16000, duration=max_duration)
        return audio, sample_rate
    except Exception as e:
        print(f"Error preprocessing audio: {e}")
        return None, None

def analyze_audio_real(metadata):
    """
    Run actual AI inference on the audio file.
    Replaces the mock logic with real deep learning model predictions.
    """
    file_path = metadata.get('file_path')

    if not file_path or not os.path.exists(file_path):
        return {
            "error": "File not found",
            "is_human": None,
            "confidence": 0.0
        }

    # Load model
    model, feature_extractor = load_model()
    if not model or not feature_extractor:
        # Fallback if the model fails to load (e.g. no internet or not enough memory)
        return {
            "error": "Model failed to load",
            "is_human": None,
            "confidence": 0.0
        }

    try:
        # Preprocess
        audio, sr = preprocess_audio(file_path)
        if audio is None:
            return {"error": "Invalid audio file", "is_human": None}

        # Prepare inputs
        inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt")

        # Inference
        with torch.no_grad():
            logits = model(**inputs).logits

        # Convert logits to probabilities (softmax)
        probs = torch.nn.functional.softmax(logits, dim=-1)

        # Get the predicted label and its score. The label order varies by model,
        # so read id2label from the model config instead of hardcoding the mapping.
        predicted_id = torch.argmax(logits, dim=-1).item()
        confidence = probs[0][predicted_id].item()
        id2label = model.config.id2label
        predicted_label = id2label[predicted_id]

        # Logic: if the label contains "real" or "bona-fide", treat it as human
        is_human = "real" in predicted_label.lower() or "bona" in predicted_label.lower()

        # Return structured analysis
        return {
            "is_human": is_human,
            "confidence": round(confidence, 4),
            "detected_language": "analyzed",
            "model_used": MODEL_NAME,
            "raw_label": predicted_label,
            "segments": [
                {"start": 0.0, "end": min(metadata.get('duration_seconds', 0), 10.0), "label": predicted_label}
            ]
        }

    except Exception as e:
        print(f"Inference error: {e}")
        return {
            "error": str(e),
            "is_human": None,
            "confidence": 0.0
        }
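For a quick local smoke test of the new detector, a hypothetical invocation might look like this (`sample.wav` and the duration are made-up values):

```python
from real_detector import analyze_audio_real

# Hypothetical smoke test; sample.wav is a placeholder path
result = analyze_audio_real({"file_path": "sample.wav", "duration_seconds": 7.0})
print(result)
```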
requirements.txt CHANGED
@@ -1,4 +1,13 @@
+# AI/ML Dependencies
+torch>=2.0.0
+transformers>=4.30.0
+librosa>=0.10.0
+numpy>=1.24.0
+scipy>=1.10.0
+
+# API & Server
 fastapi
 uvicorn
 python-multipart
 requests
+pydantic