saadmannan committed on
Commit c34e908 · verified · 1 Parent(s): 093d5c6

Upload prepare_data.py with huggingface_hub

Files changed (1)
  1. prepare_data.py +214 -0
prepare_data.py ADDED
@@ -0,0 +1,214 @@
"""
Data Preparation Module
Extracts audio features from the RAVDESS dataset
"""

import os
import json
import numpy as np
import pandas as pd
import librosa
from pathlib import Path
from tqdm import tqdm
import pickle

# Emotion mapping based on the RAVDESS filename convention
EMOTION_MAP = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

EMOTION_TO_IDX = {emotion: idx for idx, emotion in enumerate(EMOTION_MAP.values())}

# Audio processing parameters
SAMPLE_RATE = 16000
N_MELS = 128
N_MFCC = 13
MAX_LENGTH = 128  # Fixed number of time steps per feature matrix


def parse_filename(filename):
    """
    Parse a RAVDESS filename to extract its metadata.
    Format: Modality-VocalChannel-Emotion-EmotionIntensity-Statement-Repetition-Actor.wav
    Example: 03-01-05-02-01-01-12.wav
    """
    parts = filename.stem.split('-')
    if len(parts) == 7:
        return {
            'modality': parts[0],
            'vocal_channel': parts[1],
            'emotion': EMOTION_MAP.get(parts[2], 'unknown'),
            'emotion_code': parts[2],
            'intensity': parts[3],
            'statement': parts[4],
            'repetition': parts[5],
            'actor': parts[6]
        }
    return None


def extract_features(audio_path, sr=SAMPLE_RATE):
    """
    Extract enhanced audio features for better emotion recognition.
    """
    try:
        # Load audio (limit to 3 seconds)
        y, sr = librosa.load(audio_path, sr=sr, duration=3.0)

        # 1. Mel-spectrogram (128 features)
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_mels=N_MELS,
            n_fft=2048,
            hop_length=512
        )
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # 2. MFCCs (13 features)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)

        # 3. Delta MFCCs - temporal dynamics (13 features)
        mfcc_delta = librosa.feature.delta(mfccs)

        # 4. Delta-delta MFCCs - acceleration (13 features)
        mfcc_delta2 = librosa.feature.delta(mfccs, order=2)

        # 5. Chromagram - pitch content (12 features)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr, n_fft=2048, hop_length=512)

        # 6. Spectral contrast - texture (7 features)
        spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr, n_fft=2048, hop_length=512)

        # 7. Tonnetz - harmonic content (6 features)
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)

        # 8. Zero crossing rate (1 feature)
        zcr = librosa.feature.zero_crossing_rate(y)

        # 9. Spectral centroid (1 feature)
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)

        # 10. Spectral rolloff (1 feature)
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)

        # 11. Spectral bandwidth (1 feature)
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)

        # Stack all features vertically
        # Total: 128 + 13 + 13 + 13 + 12 + 7 + 6 + 1 + 1 + 1 + 1 = 196 features
        features = np.vstack([
            mel_spec_db,
            mfccs,
            mfcc_delta,
            mfcc_delta2,
            chroma,
            spectral_contrast,
            tonnetz,
            zcr,
            spectral_centroid,
            spectral_rolloff,
            spectral_bandwidth
        ])

        # Pad or truncate to a fixed number of time steps
        if features.shape[1] < MAX_LENGTH:
            # Pad with zeros
            pad_width = MAX_LENGTH - features.shape[1]
            features = np.pad(features, ((0, 0), (0, pad_width)), mode='constant')
        else:
            # Truncate
            features = features[:, :MAX_LENGTH]

        return features

    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None


def prepare_dataset(data_dir, output_dir):
    """
    Process all audio files and create the dataset.
    """
    data_dir = Path(data_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True)

    # Find all audio files
    audio_files = list(data_dir.rglob("*.wav"))
    print(f"Found {len(audio_files)} audio files")

    # Process files
    features_list = []
    labels_list = []
    metadata_list = []

    for audio_file in tqdm(audio_files, desc="Extracting features"):
        # Parse filename
        metadata = parse_filename(audio_file)
        if metadata is None or metadata['emotion'] == 'unknown':
            continue

        # Extract features
        features = extract_features(audio_file)
        if features is None:
            continue

        features_list.append(features)
        labels_list.append(EMOTION_TO_IDX[metadata['emotion']])
        metadata_list.append(metadata)

    # Convert to arrays
    features_array = np.array(features_list, dtype=np.float32)
    labels_array = np.array(labels_list, dtype=np.int64)

    print(f"\nDataset shape: {features_array.shape}")
    print(f"Labels shape: {labels_array.shape}")

    # Normalize features (important for training stability!)
    print("\nNormalizing features...")
    print(f"Before normalization - Mean: {features_array.mean():.4f}, Std: {features_array.std():.4f}")

    # Standardize to zero mean and unit variance
    mean = features_array.mean()
    std = features_array.std()
    features_array = (features_array - mean) / (std + 1e-8)

    print(f"After normalization - Mean: {features_array.mean():.4f}, Std: {features_array.std():.4f}")

    # Save processed data
    np.save(output_dir / "features.npy", features_array)
    np.save(output_dir / "labels.npy", labels_array)

    # Save normalization parameters
    norm_params = {'mean': float(mean), 'std': float(std)}
    with open(output_dir / "norm_params.json", 'w') as f:
        json.dump(norm_params, f)

    # Save metadata
    metadata_df = pd.DataFrame(metadata_list)
    metadata_df.to_csv(output_dir / "metadata.csv", index=False)

    # Print class distribution
    print("\nClass distribution:")
    for emotion, idx in EMOTION_TO_IDX.items():
        count = np.sum(labels_array == idx)
        print(f"  {emotion}: {count} samples")

    print("\n✓ Dataset prepared successfully!")
    print(f"✓ Saved to: {output_dir.absolute()}")

    return features_array, labels_array, metadata_df


if __name__ == "__main__":
    # Paths
    data_dir = Path(__file__).parent / "ravdess"
    output_dir = Path(__file__).parent / "processed"

    # Prepare dataset
    features, labels, metadata = prepare_dataset(data_dir, output_dir)
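
For context, here is a minimal sketch (not part of the uploaded file) of how a downstream training script might load the artifacts that prepare_dataset() writes to processed/ (features.npy, labels.npy, norm_params.json, metadata.csv). The idx_to_emotion helper is introduced here purely for illustration and is not defined in prepare_data.py.

import json
from pathlib import Path

import numpy as np
import pandas as pd

# Directory written by prepare_data.py's __main__ block
processed_dir = Path(__file__).parent / "processed"

# Feature matrices: shape (num_clips, 196, 128) -> (samples, feature rows, time steps)
features = np.load(processed_dir / "features.npy")
# Integer labels: shape (num_clips,), values 0-7 following EMOTION_TO_IDX
labels = np.load(processed_dir / "labels.npy")

# Normalization parameters saved by prepare_dataset(); apply the same mean/std
# to any new audio before feeding it to a model trained on this data.
with open(processed_dir / "norm_params.json") as f:
    norm_params = json.load(f)

# Per-file metadata (actor, intensity, statement, ...) useful for stratified splits
metadata = pd.read_csv(processed_dir / "metadata.csv")

# Hypothetical helper: invert the emotion mapping to read label indices back as names
EMOTION_MAP = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'
}
idx_to_emotion = {idx: emotion for idx, emotion in enumerate(EMOTION_MAP.values())}

print(features.shape, labels.shape)
print("First clip emotion:", idx_to_emotion[int(labels[0])])
print("Norm params:", norm_params)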