Spaces:
Sleeping
Sleeping
File size: 4,801 Bytes
c5c9261 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | """
Training script for fine-tuning the AI Voice Detector.
Usage: python train_model.py --dataset_path ./my_dataset --epochs 3
Dataset Structure:
/my_dataset
/real
human_audio1.mp3
human_audio2.wav
/fake
ai_audio1.mp3
ai_audio2.wav
"""
import os
import argparse
import torch
import numpy as np
import librosa
from datasets import load_dataset, Audio, ClassLabel, DatasetDict, Dataset
from transformers import (
AutoFeatureExtractor,
AutoModelForAudioClassification,
TrainingArguments,
Trainer
)
import evaluate
from app.config import settings
# Configuration
MODEL_NAME = settings.MODEL_NAME
SAMPLE_RATE = settings.SAMPLE_RATE
OUTPUT_DIR = "./voice_detection_model_finetuned"
def setup_dataset(dataset_path):
"""
Load dataset from folders
"""
data = []
# Check if directories exist
real_dir = os.path.join(dataset_path, "real")
fake_dir = os.path.join(dataset_path, "fake")
if not os.path.exists(real_dir) or not os.path.exists(fake_dir):
raise ValueError(f"Dataset path must contain 'real' and 'fake' subdirectories at {dataset_path}")
# Load Real (Human) - Label 0
for filename in os.listdir(real_dir):
if filename.lower().endswith(('.wav', '.mp3', '.flac')):
data.append({"audio": os.path.join(real_dir, filename), "label": 0})
# Load Fake (AI) - Label 1
for filename in os.listdir(fake_dir):
if filename.lower().endswith(('.wav', '.mp3', '.flac')):
data.append({"audio": os.path.join(fake_dir, filename), "label": 1})
# Create Dataset
ds = Dataset.from_list(data)
# Split
ds = ds.train_test_split(test_size=0.2)
return ds
def preprocess_function(examples, feature_extractor):
audio_arrays = [x["array"] for x in examples["audio"]]
inputs = feature_extractor(
audio_arrays,
sampling_rate=feature_extractor.sampling_rate,
max_length=SAMPLE_RATE * 5, # 5 seconds max
truncation=True,
padding=True,
return_tensors="pt"
)
return inputs
def compute_metrics(eval_pred):
accuracy = evaluate.load("accuracy")
predictions = np.argmax(eval_pred.predictions, axis=1)
return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
def main():
parser = argparse.ArgumentParser(description="Train AI Voice Detector")
parser.add_argument("--dataset_path", type=str, required=True, help="Path to dataset containing 'real' and 'fake' folders")
parser.add_argument("--epochs", type=int, default=3, help="Number of training epochs")
parser.add_argument("--batch_size", type=int, default=4, help="Batch size")
args = parser.parse_args()
print(f"Loading base model: {MODEL_NAME}")
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
print(f"Preparing dataset from {args.dataset_path}...")
dataset = setup_dataset(args.dataset_path)
# Cast audio column
dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))
print("Preprocessing audio...")
encoded_dataset = dataset.map(
lambda x: preprocess_function(x, feature_extractor),
batched=True,
remove_columns=["audio"]
)
# Load Model
# Label mapping: 0 -> HUMAN (Real), 1 -> AI_GENERATED (Fake)
# Note: Check original model config to align permissions if needed, but for fine-tuning we can redefine.
id2label = {0: "HUMAN", 1: "AI_GENERATED"}
label2id = {"HUMAN": 0, "AI_GENERATED": 1}
model = AutoModelForAudioClassification.from_pretrained(
MODEL_NAME,
num_labels=2,
label2id=label2id,
id2label=id2label,
ignore_mismatched_sizes=True
)
training_args = TrainingArguments(
output_dir=OUTPUT_DIR,
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=3e-5,
per_device_train_batch_size=args.batch_size,
gradient_accumulation_steps=4,
per_device_eval_batch_size=args.batch_size,
num_train_epochs=args.epochs,
warmup_ratio=0.1,
logging_steps=10,
load_best_model_at_end=True,
metric_for_best_model="accuracy",
push_to_hub=False,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=encoded_dataset["train"],
eval_dataset=encoded_dataset["test"],
tokenizer=feature_extractor,
compute_metrics=compute_metrics,
)
print("Starting training...")
trainer.train()
print(f"Training complete! Model saved to {OUTPUT_DIR}")
trainer.save_model(OUTPUT_DIR)
feature_extractor.save_pretrained(OUTPUT_DIR)
if __name__ == "__main__":
main()
|