# vigilaudio/src/models/train.py
# Complete fine-tuning pipeline and inference engine (commit 8d8f544).
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix
import os
# 1. Dataset Class
class EmotionEmbeddingDataset(Dataset):
    """Serves precomputed wav2vec2 embeddings (.npy) paired with integer emotion labels.

    Rows of `metadata_df` whose embedding file is missing on disk are silently
    skipped; only rows with an existing `{split}_{emotion}_{stem}.npy` file are
    indexable.
    """

    def __init__(self, metadata_df, embedding_dir, label_map):
        # Reset index so positional lookups match the (possibly filtered) frame.
        self.df = metadata_df.reset_index(drop=True)
        self.embedding_dir = Path(embedding_dir)
        self.label_map = label_map
        # Keep only rows whose embedding file exists; store (row position, file path).
        self.valid_indices = []
        for row_idx in range(len(self.df)):
            row = self.df.iloc[row_idx]
            # File naming must match build_features.py: <split>_<emotion>_<stem>.npy
            candidate = self.embedding_dir / "{}_{}_{}.npy".format(
                row['split'], row['emotion'], Path(row['path']).stem
            )
            if candidate.exists():
                self.valid_indices.append((row_idx, str(candidate)))
        print(f"Loaded {len(self.valid_indices)} valid embeddings.")

    def __len__(self):
        # Only rows with an on-disk embedding count.
        return len(self.valid_indices)

    def __getitem__(self, idx):
        row_idx, npy_path = self.valid_indices[idx]
        # Load the embedding lazily from disk and map the emotion string to its id.
        features = torch.tensor(np.load(npy_path), dtype=torch.float32)
        target = self.label_map[self.df.iloc[row_idx]['emotion']]
        return features, torch.tensor(target, dtype=torch.long)
# 2. Simple Neural Network Architecture
class EmotionClassifier(nn.Module):
    """Small MLP head mapping a fixed-size embedding to emotion-class logits.

    Architecture: Linear -> BatchNorm -> ReLU -> Dropout(0.3)
                  -> Linear -> ReLU -> Linear (logits, no softmax).
    """

    def __init__(self, input_dim=768, hidden_dim=256, num_classes=7):
        super(EmotionClassifier, self).__init__()
        half_hidden = hidden_dim // 2
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, half_hidden),
            nn.ReLU(),
            nn.Linear(half_hidden, num_classes),
        ]
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        # Returns raw logits; CrossEntropyLoss applies log-softmax internally.
        logits = self.network(x)
        return logits
def train_model():
    """Train the emotion classifier on precomputed embeddings.

    Reads metadata from data/processed/metadata.csv, builds train/val
    DataLoaders over the wav2vec2 embedding files, trains for a fixed number
    of epochs, and checkpoints the best model (by validation accuracy) to
    models/emotion_classifier.pth along with the label map.
    """
    # Setup paths
    METADATA = "data/processed/metadata.csv"
    EMB_DIR = "data/embeddings/wav2vec2"
    MODEL_SAVE_PATH = "models/emotion_classifier.pth"
    # BUGFIX: torch.save does not create missing directories — ensure models/ exists
    # up front so the first checkpoint doesn't crash mid-training.
    Path(MODEL_SAVE_PATH).parent.mkdir(parents=True, exist_ok=True)

    # Label Mapping: sorted so the class -> index assignment is deterministic
    # across runs (and matches what inference will reload from the checkpoint).
    df = pd.read_csv(METADATA)
    emotions = sorted(df['emotion'].unique())
    label_map = {name: i for i, name in enumerate(emotions)}
    print(f"Label Map: {label_map}")

    # Prepare DataLoaders
    train_ds = EmotionEmbeddingDataset(df[df['split']=='train'], EMB_DIR, label_map)
    val_ds = EmotionEmbeddingDataset(df[df['split']=='val'], EMB_DIR, label_map)
    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=64)

    # Initialize Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = EmotionClassifier(num_classes=len(emotions)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training Loop
    epochs = 50
    best_val_acc = 0.0
    print("\nStarting Training...")
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                outputs = model(x)
                # Use `outputs` directly; `.data` is a legacy autograd escape hatch
                # and is unnecessary (and discouraged) inside no_grad.
                _, predicted = torch.max(outputs, 1)
                total += y.size(0)
                correct += (predicted == y).sum().item()
        # BUGFIX: guard against an empty validation set (total == 0) — previously
        # this raised ZeroDivisionError on the first epoch.
        val_acc = 100 * correct / total if total else 0.0
        avg_loss = train_loss / max(len(train_loader), 1)
        print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.4f} | Val Acc: {val_acc:.2f}%")

        # Checkpoint the best model; include label_map/emotions so inference can
        # decode predictions without re-reading the metadata CSV.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save({
                'model_state_dict': model.state_dict(),
                'label_map': label_map,
                'emotions': emotions
            }, MODEL_SAVE_PATH)
            print(f"Saved new best model!")

    print(f"\nTraining Complete. Best Val Accuracy: {best_val_acc:.2f}%")
# Entry point: run the full training pipeline when executed as a script.
if __name__ == "__main__":
    train_model()