|
|
""" |
|
|
Data Preparation Module |
|
|
Extracts audio features from RAVDESS dataset |
|
|
""" |
|
|
|
|
|
import json
import os
import pickle
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
|
|
|
|
|
|
|
# RAVDESS emotion codes (third field of the filename) -> human-readable label.
EMOTION_MAP = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

# Label -> integer class index (0-7), following EMOTION_MAP insertion order.
EMOTION_TO_IDX = {emotion: idx for idx, emotion in enumerate(EMOTION_MAP.values())}

# Feature-extraction settings.
SAMPLE_RATE = 16000  # audio is resampled to 16 kHz on load
N_MELS = 128  # mel bands in the mel spectrogram
N_MFCC = 13  # MFCC coefficients (deltas and delta-deltas are added on top)
MAX_LENGTH = 128  # fixed number of time frames per feature matrix (pad/truncate)
|
|
|
def parse_filename(filename):
    """
    Parse a RAVDESS filename (a Path) into a metadata dict.

    Format: Modality-VocalChannel-Emotion-EmotionIntensity-Statement-Repetition-Actor.wav
    Example: 03-01-05-02-01-01-12.wav

    Returns None when the stem does not have exactly seven dash-separated fields.
    """
    fields = filename.stem.split('-')
    if len(fields) != 7:
        return None

    modality, vocal_channel, emotion_code, intensity, statement, repetition, actor = fields
    return {
        'modality': modality,
        'vocal_channel': vocal_channel,
        'emotion': EMOTION_MAP.get(emotion_code, 'unknown'),
        'emotion_code': emotion_code,
        'intensity': intensity,
        'statement': statement,
        'repetition': repetition,
        'actor': actor,
    }
|
|
|
|
|
def extract_features(audio_path, sr=SAMPLE_RATE):
    """
    Extract enhanced audio features for better emotion recognition.

    Loads up to 3 seconds of audio, stacks mel spectrogram (dB), MFCCs with
    first/second deltas, chroma, spectral contrast, tonnetz, zero-crossing
    rate, and spectral centroid/rolloff/bandwidth into one matrix, then
    pads or truncates the time axis to MAX_LENGTH frames.

    Returns a 2-D numpy array, or None when the file cannot be processed.
    """
    try:
        signal, sr = librosa.load(audio_path, sr=sr, duration=3.0)

        # Shared STFT settings for the frame-based features.
        stft_kwargs = {'n_fft': 2048, 'hop_length': 512}

        mel_db = librosa.power_to_db(
            librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=N_MELS, **stft_kwargs),
            ref=np.max,
        )
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=N_MFCC)

        # Row order is part of the dataset contract — keep it stable.
        stacked = np.vstack([
            mel_db,
            mfcc,
            librosa.feature.delta(mfcc),
            librosa.feature.delta(mfcc, order=2),
            librosa.feature.chroma_stft(y=signal, sr=sr, **stft_kwargs),
            librosa.feature.spectral_contrast(y=signal, sr=sr, **stft_kwargs),
            librosa.feature.tonnetz(y=librosa.effects.harmonic(signal), sr=sr),
            librosa.feature.zero_crossing_rate(signal),
            librosa.feature.spectral_centroid(y=signal, sr=sr),
            librosa.feature.spectral_rolloff(y=signal, sr=sr),
            librosa.feature.spectral_bandwidth(y=signal, sr=sr),
        ])

        # Force a fixed time dimension: zero-pad short clips, truncate long ones.
        n_frames = stacked.shape[1]
        if n_frames < MAX_LENGTH:
            stacked = np.pad(stacked, ((0, 0), (0, MAX_LENGTH - n_frames)), mode='constant')
        else:
            stacked = stacked[:, :MAX_LENGTH]

        return stacked

    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None
|
|
|
|
|
def prepare_dataset(data_dir, output_dir):
    """
    Process all audio files under data_dir and create the dataset in output_dir.

    Recursively finds *.wav files, parses each filename for its emotion label,
    extracts a fixed-size feature matrix per clip, applies global z-score
    normalization, and writes:

      - features.npy      float32 features, shape (n_samples, n_feats, frames)
      - labels.npy        int64 class indices (see EMOTION_TO_IDX)
      - norm_params.json  global mean/std used for normalization
      - metadata.csv      per-clip metadata parsed from the filenames

    Returns (features_array, labels_array, metadata_df).

    Raises:
        ValueError: if no usable audio files are found under data_dir.
    """
    data_dir = Path(data_dir)
    output_dir = Path(output_dir)
    # parents=True so a nested output path does not fail on first run.
    output_dir.mkdir(parents=True, exist_ok=True)

    audio_files = list(data_dir.rglob("*.wav"))
    print(f"Found {len(audio_files)} audio files")

    features_list = []
    labels_list = []
    metadata_list = []

    for audio_file in tqdm(audio_files, desc="Extracting features"):
        # Skip files whose names don't follow the 7-field RAVDESS scheme.
        metadata = parse_filename(audio_file)
        if metadata is None or metadata['emotion'] == 'unknown':
            continue

        # Skip files that fail feature extraction (corrupt/unreadable audio).
        features = extract_features(audio_file)
        if features is None:
            continue

        features_list.append(features)
        labels_list.append(EMOTION_TO_IDX[metadata['emotion']])
        metadata_list.append(metadata)

    if not features_list:
        # Fail loudly instead of silently producing empty/NaN arrays downstream.
        raise ValueError(f"No usable .wav files found under {data_dir}")

    features_array = np.array(features_list, dtype=np.float32)
    labels_array = np.array(labels_list, dtype=np.int64)

    print(f"\nDataset shape: {features_array.shape}")
    print(f"Labels shape: {labels_array.shape}")

    # Global (dataset-wide) z-score normalization; the saved parameters must
    # be reapplied to any audio processed at inference time.
    print("\nNormalizing features...")
    print(f"Before normalization - Mean: {features_array.mean():.4f}, Std: {features_array.std():.4f}")

    mean = features_array.mean()
    std = features_array.std()
    features_array = (features_array - mean) / (std + 1e-8)  # epsilon guards std == 0

    print(f"After normalization - Mean: {features_array.mean():.4f}, Std: {features_array.std():.4f}")

    np.save(output_dir / "features.npy", features_array)
    np.save(output_dir / "labels.npy", labels_array)

    # Persist normalization parameters so inference can reproduce the scaling.
    norm_params = {'mean': float(mean), 'std': float(std)}
    with open(output_dir / "norm_params.json", 'w') as f:
        json.dump(norm_params, f)

    metadata_df = pd.DataFrame(metadata_list)
    metadata_df.to_csv(output_dir / "metadata.csv", index=False)

    print("\nClass distribution:")
    for emotion, idx in EMOTION_TO_IDX.items():
        count = np.sum(labels_array == idx)
        print(f" {emotion}: {count} samples")

    print(f"\n✓ Dataset prepared successfully!")
    print(f"✓ Saved to: {output_dir.absolute()}")

    return features_array, labels_array, metadata_df
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
data_dir = Path(__file__).parent / "ravdess" |
|
|
output_dir = Path(__file__).parent / "processed" |
|
|
|
|
|
|
|
|
features, labels, metadata = prepare_dataset(data_dir, output_dir) |
|
|
|