saadmannan
/

speech-emotion-recognition

+"""
+Data Preparation Module
+Extracts audio features from RAVDESS dataset
+"""
+import os
+import numpy as np
+import pandas as pd
+import librosa
+from pathlib import Path
+from tqdm import tqdm
+import pickle
# RAVDESS emotion codes ('01'..'08') mapped to their labels, in code order.
EMOTION_MAP = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            'neutral',
            'calm',
            'happy',
            'sad',
            'angry',
            'fearful',
            'disgust',
            'surprised',
        ),
        start=1,
    )
}
# Integer class index per emotion label, following EMOTION_MAP's insertion order.
EMOTION_TO_IDX = dict(zip(EMOTION_MAP.values(), range(len(EMOTION_MAP))))

# Audio processing parameters
SAMPLE_RATE = 16000  # target sampling rate (Hz) used when loading audio
N_MELS = 128         # number of mel bands in the mel-spectrogram
N_MFCC = 13          # number of MFCC coefficients
MAX_LENGTH = 128     # fixed number of time steps every feature matrix is padded/truncated to
def parse_filename(filename):
    """
    Parse a RAVDESS filename into its metadata fields.

    Format: Modality-VocalChannel-Emotion-EmotionIntensity-Statement-Repetition-Actor.wav
    Example: 03-01-05-02-01-01-12.wav

    Args:
        filename: Name of, or path to, the audio file. Accepts a ``str`` or a
            ``pathlib.Path``; only the stem (final path component without its
            last suffix) is inspected.

    Returns:
        A dict with the keys 'modality', 'vocal_channel', 'emotion',
        'emotion_code', 'intensity', 'statement', 'repetition' and 'actor'
        (all values are strings), or ``None`` when the stem does not split
        into exactly seven dash-separated fields. 'emotion' is 'unknown' when
        the emotion code is not present in EMOTION_MAP.
    """
    # Path() accepts both str and Path, so plain filename strings work too.
    parts = Path(filename).stem.split('-')
    if len(parts) != 7:
        return None

    modality, vocal_channel, emotion_code, intensity, statement, repetition, actor = parts
    return {
        'modality': modality,
        'vocal_channel': vocal_channel,
        'emotion': EMOTION_MAP.get(emotion_code, 'unknown'),
        'emotion_code': emotion_code,
        'intensity': intensity,
        'statement': statement,
        'repetition': repetition,
        'actor': actor,
    }
def extract_features(audio_path, sr=SAMPLE_RATE, duration=3.0, max_length=None):
    """
    Extract a fixed-size stack of frame-level audio features from one file.

    The following features are computed and stacked along the first axis:
    mel-spectrogram in dB (N_MELS rows), MFCCs (N_MFCC rows), their first and
    second deltas, chromagram, spectral contrast, tonnetz, zero crossing
    rate, spectral centroid, spectral rolloff and spectral bandwidth.

    Args:
        audio_path: Path of the audio file to load.
        sr: Sampling rate the audio is resampled to on load.
        duration: Maximum number of seconds read from the start of the file.
        max_length: Number of time steps (columns) in the returned matrix.
            ``None`` (the default) uses the module-level MAX_LENGTH.

    Returns:
        A 2-D numpy array of shape (n_features, max_length), zero-padded on
        the right or truncated along the time axis, or ``None`` if loading or
        feature extraction raised an exception (the error is printed).
    """
    if max_length is None:
        max_length = MAX_LENGTH

    # A single framing configuration shared by the extractors below: np.vstack
    # requires every feature matrix to have the same number of frames, so the
    # window and hop sizes must not drift apart between calls.
    n_fft = 2048
    hop_length = 512

    try:
        # Load audio (resampled to `sr`, limited to `duration` seconds)
        y, sr = librosa.load(audio_path, sr=sr, duration=duration)

        # 1. Mel-spectrogram (128 features)
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_mels=N_MELS,
            n_fft=n_fft,
            hop_length=hop_length
        )
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # 2. MFCCs (13 features)
        mfccs = librosa.feature.mfcc(
            y=y, sr=sr, n_mfcc=N_MFCC, n_fft=n_fft, hop_length=hop_length
        )
        # 3. Delta MFCCs - temporal dynamics (13 features)
        mfcc_delta = librosa.feature.delta(mfccs)
        # 4. Delta-Delta MFCCs - acceleration (13 features)
        mfcc_delta2 = librosa.feature.delta(mfccs, order=2)

        # 5. Chromagram - pitch content (12 features)
        chroma = librosa.feature.chroma_stft(
            y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
        )
        # 6. Spectral Contrast - texture (7 features)
        spectral_contrast = librosa.feature.spectral_contrast(
            y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
        )
        # 7. Tonnetz - harmonic content (6 features)
        # Left on librosa's default framing, as in the original call.
        # NOTE(review): presumably the default hop equals `hop_length` (512);
        # confirm against the installed librosa version.
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)

        # 8. Zero Crossing Rate (1 feature)
        zcr = librosa.feature.zero_crossing_rate(
            y, frame_length=n_fft, hop_length=hop_length
        )
        # 9. Spectral Centroid (1 feature)
        spectral_centroid = librosa.feature.spectral_centroid(
            y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
        )
        # 10. Spectral Rolloff (1 feature)
        spectral_rolloff = librosa.feature.spectral_rolloff(
            y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
        )
        # 11. Spectral Bandwidth (1 feature)
        spectral_bandwidth = librosa.feature.spectral_bandwidth(
            y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
        )

        # Stack all features vertically
        # Total: 128 + 13 + 13 + 13 + 12 + 7 + 6 + 1 + 1 + 1 + 1 = 196 features
        features = np.vstack([
            mel_spec_db,
            mfccs,
            mfcc_delta,
            mfcc_delta2,
            chroma,
            spectral_contrast,
            tonnetz,
            zcr,
            spectral_centroid,
            spectral_rolloff,
            spectral_bandwidth
        ])

        # Pad or truncate the time axis to a fixed length.
        n_frames = features.shape[1]
        if n_frames < max_length:
            # NOTE(review): the pad value 0 is also written into the dB-scaled
            # mel rows, where 0 dB is the reference maximum (ref=np.max), not
            # silence. Kept as-is to preserve the existing feature layout.
            features = np.pad(
                features, ((0, 0), (0, max_length - n_frames)), mode='constant'
            )
        else:
            features = features[:, :max_length]
        return features
    except Exception as e:
        # Deliberate best-effort: callers skip files for which None is returned.
        print(f"Error processing {audio_path}: {e}")
        return None
def prepare_dataset(data_dir, output_dir):
    """
    Process all audio files under a directory and write the dataset to disk.

    Every ``*.wav`` file found recursively under ``data_dir`` is parsed with
    parse_filename and converted with extract_features. Files whose name
    cannot be parsed, whose emotion code is unknown, or whose feature
    extraction fails are skipped.

    Files written to ``output_dir``:
        features.npy      normalized features (float32)
        labels.npy        integer class indices (int64)
        norm_params.json  the scalar 'mean' and 'std' used for normalization
        metadata.csv      one row of parsed filename metadata per sample

    Args:
        data_dir: Root directory that is searched recursively for .wav files.
        output_dir: Directory the outputs are written to; it is created
            (including missing parents) if it does not exist.

    Returns:
        Tuple ``(features_array, labels_array, metadata_df)`` holding the
        normalized features, the labels and the metadata DataFrame, all in
        the same sample order.

    Raises:
        FileNotFoundError: If no .wav file exists under ``data_dir``.
        ValueError: If files were found but none produced usable features.
    """
    import json  # only needed here, for norm_params.json

    data_dir = Path(data_dir)
    output_dir = Path(output_dir)
    # parents=True so a nested output path does not fail on a fresh checkout.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Find all audio files; sorted so the sample order is reproducible
    # regardless of the order in which the filesystem lists entries.
    audio_files = sorted(data_dir.rglob("*.wav"))
    print(f"Found {len(audio_files)} audio files")
    if not audio_files:
        raise FileNotFoundError(f"No .wav files found under {data_dir}")

    # Process files
    features_list = []
    labels_list = []
    metadata_list = []
    for audio_file in tqdm(audio_files, desc="Extracting features"):
        # Skip files that do not follow the naming convention or carry an
        # emotion code that has no label.
        metadata = parse_filename(audio_file)
        if metadata is None or metadata['emotion'] == 'unknown':
            continue

        # Skip files whose feature extraction failed (reported by the extractor).
        features = extract_features(audio_file)
        if features is None:
            continue

        features_list.append(features)
        labels_list.append(EMOTION_TO_IDX[metadata['emotion']])
        metadata_list.append(metadata)

    # Without this guard the statistics below would be NaN and the function
    # would still report success.
    if not features_list:
        raise ValueError(
            f"None of the {len(audio_files)} audio files under {data_dir} "
            "produced usable features"
        )

    # Convert to arrays
    features_array = np.array(features_list, dtype=np.float32)
    labels_array = np.array(labels_list, dtype=np.int64)
    print(f"\nDataset shape: {features_array.shape}")
    print(f"Labels shape: {labels_array.shape}")

    # Normalize features (important for training stability!)
    # A single scalar mean/std over all samples, feature rows and frames is
    # used, and both values are persisted so the same scaling can be reapplied.
    # NOTE(review): the statistics cover every sample, i.e. they are computed
    # before any train/test split a consumer may perform.
    print("\nNormalizing features...")
    mean = features_array.mean()
    std = features_array.std()
    print(f"Before normalization - Mean: {mean:.4f}, Std: {std:.4f}")
    # The epsilon guards against division by zero for constant input.
    features_array = (features_array - mean) / (std + 1e-8)
    print(f"After normalization - Mean: {features_array.mean():.4f}, Std: {features_array.std():.4f}")

    # Save processed data
    np.save(output_dir / "features.npy", features_array)
    np.save(output_dir / "labels.npy", labels_array)

    # Save normalization parameters
    norm_params = {'mean': float(mean), 'std': float(std)}
    with open(output_dir / "norm_params.json", 'w', encoding="utf-8") as f:
        json.dump(norm_params, f)

    # Save metadata
    metadata_df = pd.DataFrame(metadata_list)
    metadata_df.to_csv(output_dir / "metadata.csv", index=False)

    # Print class distribution
    print("\nClass distribution:")
    for emotion, idx in EMOTION_TO_IDX.items():
        count = np.sum(labels_array == idx)
        print(f"  {emotion}: {count} samples")

    print("\n✓ Dataset prepared successfully!")
    print(f"✓ Saved to: {output_dir.absolute()}")
    return features_array, labels_array, metadata_df
if __name__ == "__main__":
    # Input ("ravdess") and output ("processed") folders both sit next to this script.
    script_dir = Path(__file__).parent
    features, labels, metadata = prepare_dataset(
        script_dir / "ravdess",
        script_dir / "processed",
    )