""" Data Preparation Module Extracts audio features from RAVDESS dataset """ import os import numpy as np import pandas as pd import librosa from pathlib import Path from tqdm import tqdm import pickle # Emotion mapping based on RAVDESS filename convention EMOTION_MAP = { '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad', '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised' } EMOTION_TO_IDX = {emotion: idx for idx, emotion in enumerate(EMOTION_MAP.values())} # Audio processing parameters SAMPLE_RATE = 16000 N_MELS = 128 N_MFCC = 13 MAX_LENGTH = 128 # Fixed length for spectrograms (time steps) def parse_filename(filename): """ Parse RAVDESS filename to extract metadata Format: Modality-VocalChannel-Emotion-EmotionIntensity-Statement-Repetition-Actor.wav Example: 03-01-05-02-01-01-12.wav """ parts = filename.stem.split('-') if len(parts) == 7: return { 'modality': parts[0], 'vocal_channel': parts[1], 'emotion': EMOTION_MAP.get(parts[2], 'unknown'), 'emotion_code': parts[2], 'intensity': parts[3], 'statement': parts[4], 'repetition': parts[5], 'actor': parts[6] } return None def extract_features(audio_path, sr=SAMPLE_RATE): """ Extract enhanced audio features for better emotion recognition """ try: # Load audio y, sr = librosa.load(audio_path, sr=sr, duration=3.0) # Limit to 3 seconds # 1. Mel-spectrogram (128 features) mel_spec = librosa.feature.melspectrogram( y=y, sr=sr, n_mels=N_MELS, n_fft=2048, hop_length=512 ) mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max) # 2. MFCCs (13 features) mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC) # 3. Delta MFCCs - temporal dynamics (13 features) mfcc_delta = librosa.feature.delta(mfccs) # 4. Delta-Delta MFCCs - acceleration (13 features) mfcc_delta2 = librosa.feature.delta(mfccs, order=2) # 5. Chromagram - pitch content (12 features) chroma = librosa.feature.chroma_stft(y=y, sr=sr, n_fft=2048, hop_length=512) # 6. Spectral Contrast - texture (7 features) spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr, n_fft=2048, hop_length=512) # 7. Tonnetz - harmonic content (6 features) tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr) # 8. Zero Crossing Rate (1 feature) zcr = librosa.feature.zero_crossing_rate(y) # 9. Spectral Centroid (1 feature) spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr) # 10. Spectral Rolloff (1 feature) spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr) # 11. Spectral Bandwidth (1 feature) spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr) # Stack all features vertically # Total: 128 + 13 + 13 + 13 + 12 + 7 + 6 + 1 + 1 + 1 + 1 = 196 features features = np.vstack([ mel_spec_db, mfccs, mfcc_delta, mfcc_delta2, chroma, spectral_contrast, tonnetz, zcr, spectral_centroid, spectral_rolloff, spectral_bandwidth ]) # Pad or truncate to fixed length if features.shape[1] < MAX_LENGTH: # Pad with zeros pad_width = MAX_LENGTH - features.shape[1] features = np.pad(features, ((0, 0), (0, pad_width)), mode='constant') else: # Truncate features = features[:, :MAX_LENGTH] return features except Exception as e: print(f"Error processing {audio_path}: {e}") return None def prepare_dataset(data_dir, output_dir): """ Process all audio files and create dataset """ data_dir = Path(data_dir) output_dir = Path(output_dir) output_dir.mkdir(exist_ok=True) # Find all audio files audio_files = list(data_dir.rglob("*.wav")) print(f"Found {len(audio_files)} audio files") # Process files features_list = [] labels_list = [] metadata_list = [] for audio_file in tqdm(audio_files, desc="Extracting features"): # Parse filename metadata = parse_filename(audio_file) if metadata is None or metadata['emotion'] == 'unknown': continue # Extract features features = extract_features(audio_file) if features is None: continue features_list.append(features) labels_list.append(EMOTION_TO_IDX[metadata['emotion']]) metadata_list.append(metadata) # Convert to arrays features_array = np.array(features_list, dtype=np.float32) labels_array = np.array(labels_list, dtype=np.int64) print(f"\nDataset shape: {features_array.shape}") print(f"Labels shape: {labels_array.shape}") # Normalize features (important for training stability!) print("\nNormalizing features...") print(f"Before normalization - Mean: {features_array.mean():.4f}, Std: {features_array.std():.4f}") # Standardize to zero mean and unit variance mean = features_array.mean() std = features_array.std() features_array = (features_array - mean) / (std + 1e-8) print(f"After normalization - Mean: {features_array.mean():.4f}, Std: {features_array.std():.4f}") # Save processed data np.save(output_dir / "features.npy", features_array) np.save(output_dir / "labels.npy", labels_array) # Save normalization parameters norm_params = {'mean': float(mean), 'std': float(std)} import json with open(output_dir / "norm_params.json", 'w') as f: json.dump(norm_params, f) # Save metadata metadata_df = pd.DataFrame(metadata_list) metadata_df.to_csv(output_dir / "metadata.csv", index=False) # Print class distribution print("\nClass distribution:") for emotion, idx in EMOTION_TO_IDX.items(): count = np.sum(labels_array == idx) print(f" {emotion}: {count} samples") print(f"\nāœ“ Dataset prepared successfully!") print(f"āœ“ Saved to: {output_dir.absolute()}") return features_array, labels_array, metadata_df if __name__ == "__main__": # Paths data_dir = Path(__file__).parent / "ravdess" output_dir = Path(__file__).parent / "processed" # Prepare dataset features, labels, metadata = prepare_dataset(data_dir, output_dir)