# File size: 4,801 Bytes
# c5c9261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
Training script for fine-tuning the AI Voice Detector.
Usage: python train_model.py --dataset_path ./my_dataset --epochs 3

Dataset Structure:
/my_dataset
    /real
        human_audio1.mp3
        human_audio2.wav
    /fake
        ai_audio1.mp3
        ai_audio2.wav
"""
import os
import argparse
import torch
import numpy as np
import librosa
from datasets import load_dataset, Audio, ClassLabel, DatasetDict, Dataset
from transformers import (
    AutoFeatureExtractor, 
    AutoModelForAudioClassification, 
    TrainingArguments, 
    Trainer
)
import evaluate
from app.config import settings

# Configuration
# Base checkpoint name handed to `from_pretrained` for both the feature
# extractor and the classifier; the value comes from the app settings.
MODEL_NAME = settings.MODEL_NAME
# Sampling rate the audio column is cast to; also used to size the
# 5-second clip cap during preprocessing (presumably in Hz — confirm in settings).
SAMPLE_RATE = settings.SAMPLE_RATE
# Directory that receives training checkpoints and the final saved
# model and feature extractor.
OUTPUT_DIR = "./voice_detection_model_finetuned"

def setup_dataset(dataset_path, test_size=0.2, seed=42):
    """
    Build a train/test split from a folder-per-class audio dataset.

    ``dataset_path`` must contain a ``real`` directory (human speech,
    label 0) and a ``fake`` directory (AI-generated speech, label 1).
    Files ending in .wav, .mp3 or .flac (case-insensitive) are collected;
    every other entry is ignored.

    Args:
        dataset_path: Root directory holding the ``real`` and ``fake`` folders.
        test_size: Forwarded to ``train_test_split``; the share of rows held
            out for evaluation.
        seed: Seed for the shuffle done by the split, so repeated runs on the
            same files produce the same split.

    Returns:
        The object returned by ``Dataset.train_test_split`` (indexable by
        "train" and "test"). Each row has an "audio" file path and an
        integer "label".

    Raises:
        ValueError: If either class directory is missing (or is not a
            directory), or if a class directory contains no audio files.
    """
    audio_extensions = ('.wav', '.mp3', '.flac')
    # (sub-directory name, integer label)
    class_dirs = (("real", 0), ("fake", 1))

    resolved = [
        (os.path.join(dataset_path, name), label) for name, label in class_dirs
    ]

    # isdir rather than exists: a regular file called "real" must be rejected
    # here instead of failing later inside os.listdir.
    if not all(os.path.isdir(path) for path, _ in resolved):
        raise ValueError(f"Dataset path must contain 'real' and 'fake' subdirectories at {dataset_path}")

    data = []
    for class_dir, label in resolved:
        # os.listdir returns entries in arbitrary order; sorting makes the row
        # order (and therefore the seeded split) reproducible.
        filenames = [
            name
            for name in sorted(os.listdir(class_dir))
            if name.lower().endswith(audio_extensions)
        ]
        if not filenames:
            # A binary classifier cannot be trained without examples of both classes.
            raise ValueError(
                f"No audio files (.wav, .mp3, .flac) found in {class_dir}"
            )
        data.extend(
            {"audio": os.path.join(class_dir, name), "label": label}
            for name in filenames
        )

    # Create Dataset
    ds = Dataset.from_list(data)

    # Split
    return ds.train_test_split(test_size=test_size, seed=seed)

def preprocess_function(examples, feature_extractor):
    """
    Convert a batch of decoded audio entries into feature-extractor output.

    Each entry of ``examples["audio"]`` is read through its "array" key;
    the collected arrays are passed to ``feature_extractor`` with
    truncation and padding enabled and a cap of ``SAMPLE_RATE * 5`` values
    per clip. The extractor's return value is handed back unchanged.
    """
    waveforms = []
    for decoded in examples["audio"]:
        waveforms.append(decoded["array"])

    # NOTE(review): the rate reported to the extractor is its own
    # `sampling_rate`, while the length cap is derived from SAMPLE_RATE —
    # this assumes the two agree; confirm against settings.
    extractor_kwargs = {
        "sampling_rate": feature_extractor.sampling_rate,
        "max_length": SAMPLE_RATE * 5,  # 5 seconds max
        "truncation": True,
        "padding": True,
        "return_tensors": "pt",
    }
    return feature_extractor(waveforms, **extractor_kwargs)

def compute_metrics(eval_pred):
    """
    Compute classification accuracy from an evaluation prediction object.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example, or a tuple whose first element is that array) and
            ``label_ids`` (the reference class ids).

    Returns:
        ``{"accuracy": float}`` — the fraction of rows whose highest-scoring
        class equals the reference label.
    """
    scores = eval_pred.predictions
    # Models with several outputs hand back a tuple; the class scores are
    # taken from the first element.
    if isinstance(scores, tuple):
        scores = scores[0]

    predictions = np.argmax(scores, axis=1)
    references = np.asarray(eval_pred.label_ids)

    # Accuracy is computed directly with NumPy instead of calling
    # evaluate.load("accuracy") on every evaluation pass, which reloads the
    # metric each time and can require network access in the middle of training.
    return {"accuracy": float(np.mean(predictions == references))}

def main():
    """
    Command-line entry point: fine-tune the base model on a real/fake dataset.

    Steps, in order: parse CLI arguments, load the feature extractor for
    MODEL_NAME, build the dataset with ``setup_dataset``, cast and preprocess
    the audio, load the classifier with a two-label head, train with the
    Hugging Face ``Trainer``, then save the model and feature extractor to
    the output directory.

    CLI arguments:
        --dataset_path: Directory containing the 'real' and 'fake' folders (required).
        --epochs: Number of training epochs (default 3).
        --batch_size: Per-device train and eval batch size (default 4).
        --output_dir: Where checkpoints and the final model are written
            (default OUTPUT_DIR).
    """
    # Function-scope import: only the version-compatibility checks below use it.
    import inspect

    parser = argparse.ArgumentParser(description="Train AI Voice Detector")
    parser.add_argument("--dataset_path", type=str, required=True, help="Path to dataset containing 'real' and 'fake' folders")
    parser.add_argument("--epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--batch_size", type=int, default=4, help="Batch size")
    parser.add_argument("--output_dir", type=str, default=OUTPUT_DIR, help="Directory for checkpoints and the final model")
    args = parser.parse_args()

    # Reject values that cannot produce a training run before any model is loaded.
    if args.epochs < 1:
        parser.error("--epochs must be at least 1")
    if args.batch_size < 1:
        parser.error("--batch_size must be at least 1")

    print(f"Loading base model: {MODEL_NAME}")
    feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)

    print(f"Preparing dataset from {args.dataset_path}...")
    dataset = setup_dataset(args.dataset_path)

    # Cast audio column so file paths are decoded at SAMPLE_RATE when accessed
    dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))

    print("Preprocessing audio...")
    encoded_dataset = dataset.map(
        lambda batch: preprocess_function(batch, feature_extractor),
        batched=True,
        remove_columns=["audio"],
    )

    # Load Model
    # Label mapping: 0 -> HUMAN (Real), 1 -> AI_GENERATED (Fake)
    # Note: Check original model config to align labels if needed, but for fine-tuning we can redefine.
    id2label = {0: "HUMAN", 1: "AI_GENERATED"}
    label2id = {"HUMAN": 0, "AI_GENERATED": 1}

    model = AutoModelForAudioClassification.from_pretrained(
        MODEL_NAME,
        num_labels=2,
        label2id=label2id,
        id2label=id2label,
        # Allows loading even when the checkpoint's classification head was
        # sized for a different number of labels.
        ignore_mismatched_sizes=True,
    )

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy` and later dropped the old name; use whichever keyword
    # the installed version accepts.
    training_params = inspect.signature(TrainingArguments.__init__).parameters
    if "eval_strategy" in training_params:
        eval_strategy_key = "eval_strategy"
    else:
        eval_strategy_key = "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        save_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=4,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        warmup_ratio=0.1,
        logging_steps=10,
        # Evaluation and save cadence are both "epoch", as
        # load_best_model_at_end requires them to match.
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        push_to_hub=False,
        **{eval_strategy_key: "epoch"},
    )

    # Same idea for Trainer: `tokenizer` was deprecated in favour of
    # `processing_class` in newer releases.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    if "processing_class" in trainer_params:
        processor_key = "processing_class"
    else:
        processor_key = "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["test"],
        compute_metrics=compute_metrics,
        **{processor_key: feature_extractor},
    )

    print("Starting training...")
    trainer.train()

    # Save first, then report: the success message should only appear once
    # the artifacts are actually on disk.
    trainer.save_model(args.output_dir)
    feature_extractor.save_pretrained(args.output_dir)
    print(f"Training complete! Model saved to {args.output_dir}")

# Start training only when this file is executed as a script; importing the
# module does not call main().
if __name__ == "__main__":
    main()