ranar110 commited on
Commit
aee4240
·
1 Parent(s): f6d50b1

Upgrade: Replaced mock detector with Real AI Model and added Fine-Tuning Guide

Browse files
Files changed (4) hide show
  1. fine_tuning_guide.md +115 -0
  2. main.py +1 -1
  3. real_detector.py +120 -0
  4. requirements.txt +9 -0
fine_tuning_guide.md ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🎓 Guide: Fine-Tuning Your Voice Detection Model
2
+
3
+ This guide explains how to improve your voice detection model's accuracy by fine-tuning it on specialized datasets like **ASVspoof** or **In-the-Wild**.
4
+
5
+ ## 1. Prerequisites
6
+ You will need a GPU-enabled environment. **Google Colab (Free Tier)** is the easiest way to start.
7
+ - [Google Colab](https://colab.research.google.com/)
8
+ - Hugging Face Account
9
+
10
+ ## 2. The Dataset
11
+ For audio deepfake detection, you need a dataset with labeled "Real" and "Fake" audio.
12
+ **Recommended Datasets:**
13
+ - **ASVspoof 2019/2021**: The gold standard for voice anti-spoofing.
14
+ - **WaveFake**: A dataset of deepfake audio.
15
+ - **In-the-Wild**: Dataset containing deepfakes of politicians and celebrities.
16
+
17
+ ## 3. Fine-Tuning Steps (in Google Colab)
18
+
19
+ ### Step A: Install Libraries
20
+ ```python
21
+ !pip install transformers datasets torch librosa accelerate
22
+ ```
23
+
24
+ ### Step B: Load Your Dataset
25
+ Assuming you have a folder structure like `data/real/*.wav` and `data/fake/*.wav`.
26
+
27
+ ```python
28
+ from datasets import load_dataset, Audio
29
+
30
+ # Load from a local folder or a Hugging Face dataset repo
31
+ dataset = load_dataset("audiofolder", data_dir="path_to_your_data")
32
+ # Split into train/test
33
+ dataset = dataset.train_test_split(test_size=0.2)
34
+ ```
35
+
36
+ ### Step C: Preprocessing
37
+ Resample all audio to 16kHz (required by Wav2Vec2).
38
+
39
+ ```python
40
+ from transformers import AutoFeatureExtractor
41
+
42
+ model_id = "MelodyMachine/Deepfake-audio-detection"
43
+ feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
44
+
45
+ def preprocess_function(examples):
46
+ audio_arrays = [x["array"] for x in examples["audio"]]
47
+ inputs = feature_extractor(
48
+ audio_arrays,
49
+ sampling_rate=16000,
50
+ max_length=160000, # 10 seconds
51
+ truncation=True
52
+ )
53
+ return inputs
54
+
55
+ dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
56
+ encoded_dataset = dataset.map(preprocess_function, remove_columns="audio", batched=True)
57
+ ```
58
+
59
+ ### Step D: Load Model & Training Config
60
+
61
+ ```python
62
+ from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
63
+
64
+ num_labels = 2
65
+ label2id = {"Fake": 0, "Real": 1}
66
+ id2label = {0: "Fake", 1: "Real"}
67
+
68
+ model = AutoModelForAudioClassification.from_pretrained(
69
+ model_id,
70
+ num_labels=num_labels,
71
+ label2id=label2id,
72
+ id2label=id2label,
73
+ ignore_mismatched_sizes=True # Important when fine-tuning on new classes
74
+ )
75
+
76
+ training_args = TrainingArguments(
77
+ output_dir="./results",
78
+ evaluation_strategy="epoch",
79
+ learning_rate=3e-5,
80
+ per_device_train_batch_size=8,
81
+ num_train_epochs=5,
82
+ )
83
+ ```
84
+
85
+ ### Step E: Train!
86
+
87
+ ```python
88
+ trainer = Trainer(
89
+ model=model,
90
+ args=training_args,
91
+ train_dataset=encoded_dataset["train"],
92
+ eval_dataset=encoded_dataset["test"],
93
+ tokenizer=feature_extractor,
94
+ )
95
+
96
+ trainer.train()
97
+ ```
98
+
99
+ ### Step F: Save & Export
100
+ ```python
101
+ model.save_pretrained("my_finetuned_model")
102
+ feature_extractor.save_pretrained("my_finetuned_model")
103
+ ```
104
+
105
+ ## 4. Using Your New Model
106
+ Once trained, upload your "my_finetuned_model" folder to Hugging Face Hub.
107
+ Then, simply update `MODEL_NAME` in your `real_detector.py`:
108
+
109
+ ```python
110
+ MODEL_NAME = "your-username/my_finetuned_model"
111
+ ```
112
+
113
+ ## 💡 Tips for Accuracy
114
+ - **Diversity**: Ensure your "Fake" data includes many different TTS engines (ElevenLabs, Murf, Coqui, etc.).
115
+ - **Noise**: Add background noise to your training data to make the model robust against real-world recordings.
main.py CHANGED
@@ -3,7 +3,7 @@ from fastapi.staticfiles import StaticFiles
3
  from fastapi.responses import FileResponse
4
  from auth import verify_api_key
5
  from audio_processor import process_audio
6
- from detector import analyze_audio
7
  from murf_generator import generate_audio_with_murf
8
  from pydantic import BaseModel
9
  from typing import Optional
 
3
  from fastapi.responses import FileResponse
4
  from auth import verify_api_key
5
  from audio_processor import process_audio
6
+ from real_detector import analyze_audio_real as analyze_audio
7
  from murf_generator import generate_audio_with_murf
8
  from pydantic import BaseModel
9
  from typing import Optional
real_detector.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import librosa
import numpy as np
import os
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
import warnings

# Suppress warnings (transformers/librosa emit noisy deprecation warnings at import/load time)
warnings.filterwarnings("ignore")

# Global model cache: populated lazily by load_model() with the keys
# "model" and "feature_extractor" so the checkpoint is downloaded only once.
MODEL_CACHE = {}
MODEL_NAME = "MelodyMachine/Deepfake-audio-detection" # A good starting model from HF
15
def load_model():
    """Return the cached (model, feature_extractor) pair, loading on first use.

    The Hugging Face checkpoint named by MODEL_NAME is fetched once and kept
    in the module-level MODEL_CACHE. On any load failure (no network, out of
    memory, ...) the pair (None, None) is returned instead of raising.
    """
    if MODEL_CACHE.get("model") is None:
        print(f"Loading model: {MODEL_NAME}...")
        try:
            # Feature extractor handles resampling/normalisation expectations.
            extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
            # Classification head on top of the audio encoder.
            classifier = AutoModelForAudioClassification.from_pretrained(MODEL_NAME)
        except Exception as e:
            print(f"Error loading model: {e}")
            return None, None

        MODEL_CACHE["feature_extractor"] = extractor
        MODEL_CACHE["model"] = classifier
        print("Model loaded successfully.")

    return MODEL_CACHE["model"], MODEL_CACHE["feature_extractor"]
33
+
34
def preprocess_audio(file_path, max_duration=10):
    """Decode an audio file for the model.

    Loads at most *max_duration* seconds and resamples to 16 kHz (the rate
    Wav2Vec2-style models expect). Returns (waveform, sample_rate), or
    (None, None) if librosa cannot decode the file.
    """
    try:
        waveform, rate = librosa.load(file_path, sr=16000, duration=max_duration)
    except Exception as e:
        print(f"Error preprocessing audio: {e}")
        return None, None
    return waveform, rate
43
+
44
def analyze_audio_real(metadata):
    """
    Run actual AI inference on an audio file to classify it as human or
    AI-generated speech. Replaces the mock logic with real model predictions.

    Args:
        metadata: dict with at least 'file_path' (path to the audio file to
            analyse); 'duration_seconds' is used, when present, to bound the
            reported segment span.

    Returns:
        On success: dict with 'is_human' (bool), 'confidence' (float, 4 dp),
        'detected_language', 'model_used', 'raw_label', and 'segments'.
        On failure: dict with 'error' (str), 'is_human' set to None, and
        'confidence' set to 0.0 — every error payload has the same shape.
    """
    file_path = metadata.get('file_path')

    # Reject missing/invalid paths before paying the cost of model loading.
    if not file_path or not os.path.exists(file_path):
        return {
            "error": "File not found",
            "is_human": None,
            "confidence": 0.0
        }

    # Lazily load (and cache) the model; both pieces are required.
    model, feature_extractor = load_model()
    if not model or not feature_extractor:
        # Fallback if model fails to load (e.g. no internet/memory)
        return {
            "error": "Model failed to load",
            "is_human": None,
            "confidence": 0.0
        }

    try:
        # Decode + resample (first 10 seconds, 16 kHz) via librosa.
        audio, sr = preprocess_audio(file_path)
        if audio is None:
            # Fix: include "confidence" so this error payload matches the
            # shape of every other error return from this function.
            return {"error": "Invalid audio file", "is_human": None, "confidence": 0.0}

        # Prepare model inputs as PyTorch tensors.
        inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt")

        # Inference only — no gradients needed.
        with torch.no_grad():
            logits = model(**inputs).logits

        # Convert raw logits to class probabilities.
        probs = torch.nn.functional.softmax(logits, dim=-1)

        predicted_id = torch.argmax(logits, dim=-1).item()
        confidence = probs[0][predicted_id].item()

        # Label names/ordering vary between checkpoints, so read the mapping
        # from the model config instead of hard-coding index semantics.
        id2label = model.config.id2label
        predicted_label = id2label[predicted_id]

        # Labels containing "real" or "bona" (as in "bona-fide") mean human speech.
        is_human = "real" in predicted_label.lower() or "bona" in predicted_label.lower()

        # Return structured analysis
        return {
            "is_human": is_human,
            "confidence": round(confidence, 4),
            "detected_language": "analyzed",
            "model_used": MODEL_NAME,
            "raw_label": predicted_label,
            "segments": [
                # Only the first 10 s are analysed, so cap the segment end there.
                {"start": 0.0, "end": min(metadata.get('duration_seconds', 0), 10.0), "label": predicted_label}
            ]
        }

    except Exception as e:
        print(f"Inference error: {e}")
        return {
            "error": str(e),
            "is_human": None,
            "confidence": 0.0
        }
requirements.txt CHANGED
@@ -1,4 +1,13 @@
 
 
 
 
 
 
 
 
1
  fastapi
2
  uvicorn
3
  python-multipart
4
  requests
 
 
1
+ # AI/ML Dependencies
2
+ torch>=2.0.0
3
+ transformers>=4.30.0
4
+ librosa>=0.10.0
5
+ numpy>=1.24.0
6
+ scipy>=1.10.0
7
+
8
+ # API & Server
9
  fastapi
10
  uvicorn
11
  python-multipart
12
  requests
13
+ pydantic