Vishal-Padia
/

SentimentSound

Model card Files Files and versions

xet

Community

Vishal-Padia committed on Dec 2, 2024

Commit

c8fe9e1

verified ·

1 Parent(s): 987fbd4

Upload speech emotion recognition model

Browse files

Files changed (1) hide show

main.py +519 -0

main.py ADDED Viewed

	@@ -0,0 +1,519 @@

+import os
+import torch
+import wandb
+import librosa
+import torchaudio
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import torch.nn as nn
+import torch.optim as optim
+import matplotlib.pyplot as plt
+import torch.nn.functional as F
+from sklearn.utils import class_weight
+from torch.utils.data import Dataset, DataLoader
+from torch.optim.lr_scheduler import ReduceLROnPlateau
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.model_selection import train_test_split, StratifiedKFold
+# Advanced Configuration with More Options
+class Config:
+    """Enhanced configuration for emotion recognition project"""
+    # Data paths
+    DATA_DIR = "archive"
+    # Audio processing parameters
+    SAMPLE_RATE = 22050  # Standard sample rate
+    DURATION = 3  # seconds
+    N_MFCC = 20
+    # Model hyperparameters
+    BATCH_SIZE = 32
+    LEARNING_RATE = 0.001
+    NUM_EPOCHS = 20
+    # Feature extraction parameters
+    FEATURES = [
+        "mfcc",
+        "spectral_centroid",
+        "chroma",
+        "spectral_contrast",
+        "zero_crossing_rate",
+        "spectral_rolloff",
+    ]
+    # Augmentation parameters
+    AUGMENTATION = True
+    NOISE_FACTOR = 0.005
+    SCALE_RANGE = (0.9, 1.1)
+def extract_advanced_features(file_path):
+    """
+    Extract multiple audio features with more comprehensive approach
+    Args:
+        file_path (str): Path to the audio file
+    Returns:
+        numpy.ndarray: Concatenated feature vector
+    """
+    # Load the audio file
+    y, sr = librosa.load(file_path, duration=Config.DURATION, sr=Config.SAMPLE_RATE)
+    # Feature extraction
+    features = []
+    # MFCC features (increased resolution)
+    if "mfcc" in Config.FEATURES:
+        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=Config.N_MFCC)
+        mfccs_processed = np.mean(mfccs.T, axis=0)
+        features.append(mfccs_processed)
+    # Spectral Centroid
+    if "spectral_centroid" in Config.FEATURES:
+        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
+        spectral_centroids_processed = np.mean(spectral_centroids)
+        features.append([spectral_centroids_processed])
+    # Chroma Features
+    if "chroma" in Config.FEATURES:
+        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
+        chroma_processed = np.mean(chroma.T, axis=0)
+        features.append(chroma_processed)
+    # Spectral Contrast
+    if "spectral_contrast" in Config.FEATURES:
+        spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
+        spectral_contrast_processed = np.mean(spectral_contrast.T, axis=0)
+        features.append(spectral_contrast_processed)
+    # Zero Crossing Rate
+    if "zero_crossing_rate" in Config.FEATURES:
+        zcr = librosa.feature.zero_crossing_rate(y)
+        zcr_processed = np.mean(zcr)
+        features.append([zcr_processed])
+    # Spectral Rolloff
+    if "spectral_rolloff" in Config.FEATURES:
+        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
+        spectral_rolloff_processed = np.mean(spectral_rolloff)
+        features.append([spectral_rolloff_processed])
+    # Concatenate all features
+    return np.concatenate(features)
+def augment_features(
+    features, noise_factor=Config.NOISE_FACTOR, scale_range=Config.SCALE_RANGE
+):
+    """
+    Advanced feature augmentation technique
+    Args:
+        features (numpy.ndarray): Input feature array
+        noise_factor (float): Magnitude of noise to add
+        scale_range (tuple): Range for feature scaling
+    Returns:
+        numpy.ndarray: Augmented features
+    """
+    if not Config.AUGMENTATION:
+        return features
+    # Add Gaussian noise
+    noise = np.random.normal(0, noise_factor, features.shape)
+    augmented_features = features + noise
+    # Random scaling
+    scale_factor = np.random.uniform(scale_range[0], scale_range[1])
+    augmented_features *= scale_factor
+    return augmented_features
+def prepare_dataset(data_dir):
+    """
+    Prepare dataset with more robust feature extraction and potential augmentation
+    Args:
+        data_dir (str): Root directory containing actor subdirectories
+    Returns:
+        tuple: Features and labels
+    """
+    features = []
+    labels = []
+    # Emotion mapping with potential for expansion
+    emotion_map = {
+        "01": "neutral",
+        "02": "calm",
+        "03": "happy",
+        "04": "sad",
+        "05": "angry",
+        "06": "fearful",
+        "07": "disgust",
+        "08": "surprised",
+    }
+    # Walk through all directories and subdirectories
+    for root, dirs, files in os.walk(data_dir):
+        for filename in files:
+            if filename.endswith(".wav"):
+                # Full file path
+                file_path = os.path.join(root, filename)
+                try:
+                    # Extract emotion from filename
+                    emotion_code = filename.split("-")[2]
+                    emotion = emotion_map.get(emotion_code, "unknown")
+                    # Extract original features
+                    file_features = extract_advanced_features(file_path)
+                    features.append(file_features)
+                    labels.append(emotion)
+                    # Optional augmentation
+                    if Config.AUGMENTATION:
+                        augmented_features = augment_features(file_features)
+                        features.append(augmented_features)
+                        labels.append(emotion)
+                except Exception as e:
+                    print(f"Error processing {filename}: {e}")
+    # Informative print about dataset
+    print(f"Dataset Summary:")
+    print(f"Total files processed: {len(features)}")
+    # Count of emotions
+    from collections import Counter
+    emotion_counts = Counter(labels)
+    for emotion, count in emotion_counts.items():
+        print(f"{emotion.capitalize()} emotion: {count} samples")
+    return np.array(features), np.array(labels)
+class EmotionDataset(Dataset):
+    """Enhanced Custom PyTorch Dataset for Emotion Recognition"""
+    def __init__(self, features, labels, scaler=None):
+        # Standardize features
+        if scaler is None:
+            self.scaler = StandardScaler()
+            features = self.scaler.fit_transform(features)
+        else:
+            features = scaler.transform(features)
+        self.features = torch.FloatTensor(features)
+        # Encode labels
+        self.label_encoder = LabelEncoder()
+        self.labels = torch.LongTensor(self.label_encoder.fit_transform(labels))
+    def __len__(self):
+        return len(self.labels)
+    def __getitem__(self, idx):
+        return self.features[idx], self.labels[idx]
+    def get_num_classes(self):
+        return len(self.label_encoder.classes_)
+    def get_class_names(self):
+        return self.label_encoder.classes_
+class HybridEmotionRecognitionModel(nn.Module):
+    """Advanced Hybrid Neural Network for Emotion Recognition"""
+    def __init__(self, input_dim, num_classes):
+        super().__init__()
+        # Enhanced input projection with residual connection
+        self.input_projection = nn.Sequential(
+            nn.Linear(input_dim, 512),
+            nn.BatchNorm1d(512),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(512, 256),
+            nn.ReLU(),
+        )
+        # More complex convolutional layers with residual connections
+        self.conv_layers = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.Conv1d(1, 64, kernel_size=3, padding=1),
+                    nn.BatchNorm1d(64),
+                    nn.ReLU(),
+                    nn.MaxPool1d(2),
+                ),
+                nn.Sequential(
+                    nn.Conv1d(64, 128, kernel_size=3, padding=1),
+                    nn.BatchNorm1d(128),
+                    nn.ReLU(),
+                    nn.MaxPool1d(2),
+                ),
+            ]
+        )
+        # Bidirectional LSTM with more layers
+        self.lstm_layers = nn.LSTM(
+            input_size=128,
+            hidden_size=256,
+            num_layers=3,
+            batch_first=True,
+            bidirectional=True,
+            dropout=0.4,
+        )
+        # More complex fully connected layers
+        self.fc_layers = nn.Sequential(
+            nn.Linear(512, 256),  # Note the 512 due to bidirectional LSTM
+            nn.BatchNorm1d(256),
+            nn.ReLU(),
+            nn.Dropout(0.4),
+            nn.Linear(256, 128),
+            nn.BatchNorm1d(128),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+        )
+        self.output_layer = nn.Linear(128, num_classes)
+    def forward(self, x):
+        # Input projection
+        x = self.input_projection(x)
+        # Reshape for conv layers
+        x = x.unsqueeze(1)
+        # Convolutional layers with residual-like processing
+        for conv_layer in self.conv_layers:
+            x = conv_layer(x)
+        # Prepare for LSTM
+        x = x.permute(0, 2, 1)
+        # LSTM processing
+        lstm_out, _ = self.lstm_layers(x)
+        x = lstm_out[:, -1, :]
+        # Fully connected layers
+        x = self.fc_layers(x)
+        return self.output_layer(x)
+def train_model(model, train_loader, val_loader, labels, num_epochs=Config.NUM_EPOCHS):
+    """
+    Advanced training function with improved techniques
+    Args:
+        model (nn.Module): PyTorch model
+        train_loader (DataLoader): Training data loader
+        val_loader (DataLoader): Validation data loader
+        labels (numpy.ndarray): Original labels for class weight computation
+        num_epochs (int): Number of training epochs
+    """
+    # Compute class weights to handle class imbalance
+    class_weights = class_weight.compute_class_weight(
+        "balanced", classes=np.unique(labels), y=labels
+    )
+    class_weights = torch.FloatTensor(class_weights)
+    # Loss with class weights
+    criterion = nn.CrossEntropyLoss(weight=class_weights)
+    # Adam with weight decay (L2 regularization)
+    optimizer = optim.AdamW(
+        model.parameters(), lr=Config.LEARNING_RATE, weight_decay=1e-5
+    )
+    # Learning rate scheduler
+    scheduler = ReduceLROnPlateau(
+        optimizer, mode="min", factor=0.5, patience=5, verbose=True
+    )
+    # Initialize wandb
+    wandb.init(
+        project="SentimentSound",
+        config={
+            "learning_rate": Config.LEARNING_RATE,
+            "batch_size": Config.BATCH_SIZE,
+            "epochs": num_epochs,
+            "augmentation": Config.AUGMENTATION,
+        },
+    )
+    # Training loop with more advanced techniques
+    best_val_loss = float("inf")
+    for epoch in range(num_epochs):
+        model.train()
+        train_loss = 0
+        train_correct = 0
+        train_total = 0
+        for features, batch_labels in train_loader:
+            optimizer.zero_grad()
+            # Forward and backward pass
+            outputs = model(features)
+            loss = criterion(outputs, batch_labels)
+            loss.backward()
+            # Gradient clipping
+            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+            optimizer.step()
+            train_loss += loss.item()
+            _, predicted = torch.max(outputs.data, 1)
+            train_total += batch_labels.size(0)
+            train_correct += (predicted == batch_labels).sum().item()
+        # Validation
+        model.eval()
+        val_loss = 0
+        val_correct = 0
+        val_total = 0
+        with torch.no_grad():
+            for features, batch_labels in val_loader:
+                outputs = model(features)
+                loss = criterion(outputs, batch_labels)
+                val_loss += loss.item()
+                _, predicted = torch.max(outputs.data, 1)
+                val_total += batch_labels.size(0)
+                val_correct += (predicted == batch_labels).sum().item()
+        # Compute metrics
+        train_accuracy = 100 * train_correct / train_total
+        val_accuracy = 100 * val_correct / val_total
+        # Learning rate scheduling
+        scheduler.step(val_loss)
+        # Logging to wandb
+        wandb.log(
+            {
+                "train_loss": train_loss / len(train_loader),
+                "train_accuracy": train_accuracy,
+                "val_loss": val_loss / len(val_loader),
+                "val_accuracy": val_accuracy,
+            }
+        )
+        # Print epoch summary
+        print(f"Epoch {epoch+1}/{num_epochs}")
+        print(f"Train Loss: {train_loss / len(train_loader):.4f}")
+        print(f"Train Accuracy: {train_accuracy:.2f}%")
+        print(f"Val Loss: {val_loss / len(val_loader):.4f}")
+        print(f"Val Accuracy: {val_accuracy:.2f}%")
+        # Save best model
+        if val_loss < best_val_loss:
+            best_val_loss = val_loss
+            torch.save(model.state_dict(), "best_emotion_model.pth")
+    # Finish wandb run
+    wandb.finish()
+    return model
+def evaluate_model(model, test_loader, dataset):
+    """
+    Evaluate the model and generate detailed metrics
+    Args:
+        model (nn.Module): Trained PyTorch model
+        test_loader (DataLoader): Test data loader
+        dataset (EmotionDataset): Dataset for class names
+    """
+    model.eval()
+    all_preds = []
+    all_labels = []
+    with torch.no_grad():
+        for features, labels in test_loader:
+            outputs = model(features)
+            _, predicted = torch.max(outputs, 1)
+            all_preds.extend(predicted.numpy())
+            all_labels.extend(labels.numpy())
+    # Classification Report
+    class_names = dataset.get_class_names()
+    print("\nClassification Report:")
+    print(classification_report(all_labels, all_preds, target_names=class_names))
+    # Confusion Matrix Visualization
+    cm = confusion_matrix(all_labels, all_preds)
+    plt.figure(figsize=(10, 8))
+    sns.heatmap(
+        cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names
+    )
+    plt.title("Confusion Matrix")
+    plt.xlabel("Predicted")
+    plt.ylabel("Actual")
+    plt.tight_layout()
+    plt.savefig("confusion_matrix.png")
+    plt.close()
+def main():
+    # Set random seed for reproducibility
+    torch.manual_seed(42)
+    np.random.seed(42)
+    # Data Preparation
+    features, labels = prepare_dataset(Config.DATA_DIR)
+    # Split data
+    X_train, X_test, y_train, y_test = train_test_split(
+        features, labels, test_size=0.2, random_state=42
+    )
+    X_train, X_val, y_train, y_val = train_test_split(
+        X_train, y_train, test_size=0.2, random_state=42
+    )
+    # Create datasets
+    train_dataset = EmotionDataset(X_train, y_train)
+    val_dataset = EmotionDataset(X_val, y_val)
+    test_dataset = EmotionDataset(X_test, y_test)
+    # Data loaders
+    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True)
+    val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE)
+    test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE)
+    # Model Initialization
+    model = HybridEmotionRecognitionModel(
+        input_dim=len(X_train[0]), num_classes=train_dataset.get_num_classes()
+    )
+    # Train Model
+    train_model(
+        model,
+        train_loader,
+        val_loader,
+        labels,
+        num_epochs=Config.NUM_EPOCHS,
+    )
+    # Evaluate Model
+    evaluate_model(model, test_loader, train_dataset)
+if __name__ == "__main__":
+    main()