gulabjam commited on
Commit
9306638
·
1 Parent(s): b243717

Remove all files except .gitattributes, best_ast_model.pth, and AST_README.md

Browse files
AST_Pipeline.py DELETED
@@ -1,397 +0,0 @@
1
- import os
2
- import glob
3
- from networkx import display
4
- import numpy as np
5
- import pandas as pd
6
- from tqdm import tqdm
7
- import librosa
8
- import librosa.display
9
- import matplotlib.pyplot as plt
10
- import random
11
- import torch
12
- import wandb
13
- from torchsummary import summary
14
- from torch.utils.data.dataset import Dataset
15
- from torch.utils.data import DataLoader
16
- from torch.optim.lr_scheduler import ReduceLROnPlateau
17
- from transformers import ASTForAudioClassification, ASTConfig
18
- import torch.nn as nn
19
- from sklearn.model_selection import StratifiedShuffleSplit
20
- from collections import Counter
21
- import seaborn as sns
22
- from sklearn.tree import DecisionTreeClassifier
23
- from sklearn.metrics import f1_score
24
- import warnings
25
- warnings.filterwarnings("ignore")
26
-
27
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
28
- print(f"Is GPU available? {torch.cuda.is_available()}")
29
- print(f"Current device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
30
-
31
- DATA_SEED = 67
32
- TRAINING_SEED = 1234
33
- SR = 22050
34
- DURATION = 5.0
35
- N_FFT = 2048
36
- HOP_LENGTH = 512
37
- N_MELS = 128
38
- TOP_DB=20
39
- TARGET_SNR_DB = 10
40
-
41
- random.seed(DATA_SEED)
42
- np.random.seed(DATA_SEED)
43
- torch.manual_seed(DATA_SEED)
44
- torch.cuda.manual_seed(DATA_SEED)
45
-
46
- DATA_ROOT = '/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/genres_stems'
47
- GENRES = [] # Make the list of all genres available
48
- for genre in os.listdir(path=DATA_ROOT):
49
- GENRES.append(genre)
50
- display(GENRES)
51
- STEMS = {} # Write here stems file name
52
- STEM_KEYS = ['drums', 'vocals', 'bass', 'other']
53
- GENRE_TO_TEST = 'rock'
54
- SONG_INDEX = 0
55
- genre_to_id = {genre: i for i, genre in enumerate(GENRES)}
56
- id_to_genre = {i: genre for i, genre in enumerate(GENRES)}
57
-
58
- # Build the dataset in the format {genre: {stem_type: [file_paths]}}
59
- def build_dataset(root_dir, val_split=0.17, seed=42):
60
- train_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
61
- val_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
62
- rows = []
63
- for genre in GENRES:
64
- genre_path = os.path.join(root_dir, genre)
65
- for song in os.listdir(genre_path):
66
- rows.append({
67
- "label" : genre,
68
- "song" : song
69
- })
70
- df = pd.DataFrame(rows)
71
- X = df['song']
72
- y = df['label']
73
-
74
- sss = StratifiedShuffleSplit(n_splits=1, test_size=val_split, random_state=seed)
75
-
76
- for train_index, val_index in sss.split(X, y):
77
- train_df = df.iloc[train_index].reset_index(drop=True)
78
- val_df = df.iloc[val_index].reset_index(drop=True)
79
-
80
- for idx, row in train_df.iterrows():
81
- genre = row['label']
82
- song = row['song']
83
- song_path = os.path.join(root_dir, genre, song)
84
- for stem in os.listdir(song_path):
85
- train_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))
86
-
87
- for idx, row in val_df.iterrows():
88
- genre = row['label']
89
- song = row['song']
90
- song_path = os.path.join(root_dir, genre, song)
91
- for stem in os.listdir(song_path):
92
- val_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))
93
- return train_dataset, val_dataset
94
-
95
- tr, val = build_dataset(DATA_ROOT)
96
-
97
-
98
-
99
- AST_SR = 16000
100
- AST_TARGET_FRAMES = 1024
101
-
102
- # A custom Dataset class that will handle the loading of audio files, applying random augmentations, and generating the mel spectrograms on-the-fly during training and validation.
103
- class ASTAudioDataset(Dataset):
104
- def __init__(self, data_dict, length, genres, stem_types, noise_files, duration=10.0):
105
- self.data_dict = data_dict
106
- self.length = length
107
- self.genres = genres
108
- self.stem_types = stem_types
109
- self.noise_files = noise_files
110
- self.duration = duration
111
- self.sr = AST_SR
112
-
113
- def __len__(self):
114
- return self.length
115
-
116
- def __getitem__(self, idx, retries=0):
117
- if retries > 10:
118
- return self.__getitem__(random.randint(0, self.length - 1), 0)
119
-
120
- genre = random.choice(self.genres) # Choose a random genre for this sample
121
- max_start = max(0, 28 - self.duration) # Calculate the maximum start position
122
- base_start = random.uniform(0, max_start)
123
- target_len = int(self.sr * self.duration)
124
- stems_data = []
125
-
126
- for stem_type in self.stem_types:
127
- if random.random() < 0.15:
128
- continue
129
-
130
- try:
131
- path = random.choice(self.data_dict[genre][stem_type])
132
- y, _ = librosa.load(path, sr=self.sr, offset=base_start, duration=self.duration)
133
- y = librosa.util.fix_length(y, size=target_len)
134
-
135
- weight = random.uniform(0.4, 1.2)
136
- y = y * weight # Apply random gain to the stem
137
-
138
- if np.sqrt(np.mean(y**2)) > 0.001:
139
- stems_data.append(y)
140
- except:
141
- continue
142
-
143
- if not stems_data:
144
- return self.__getitem__(idx, retries + 1)
145
-
146
- mix = np.sum(stems_data, axis=0)
147
- if np.max(np.abs(mix)) > 0:
148
- mix = mix / np.max(np.abs(mix)) # Normalize the mixed audio to prevent clipping
149
-
150
- noise_path = random.choice(self.noise_files)
151
- noise_y, _ = librosa.load(noise_path, sr=self.sr, offset=base_start, duration=self.duration)
152
- noise_y = librosa.util.fix_length(noise_y, size=target_len)
153
-
154
- # Calculate the power of the signal and noise, then scale the noise to achieve the target SNR
155
- p_signal = np.mean(mix**2) + 1e-9
156
- p_noise = np.mean(noise_y**2) + 1e-9
157
-
158
- snr_divisor = random.uniform(2.0, 8.0)
159
- target_noise_power = p_signal / snr_divisor
160
- scaling_factor = np.sqrt(target_noise_power / p_noise)
161
- final_audio = mix + (scaling_factor * noise_y)
162
-
163
- mel = librosa.feature.melspectrogram(
164
- y=final_audio,
165
- sr=self.sr,
166
- n_mels=128,
167
- n_fft=400,
168
- hop_length=160
169
- )
170
-
171
- mel_db = librosa.power_to_db(mel, ref=np.max)
172
- mel_db = (mel_db + 4.26) / 4.56
173
- mel_tensor = torch.tensor(mel_db).float()
174
-
175
- mel_tensor = mel_tensor.T
176
-
177
- # Pad or truncate to ensure consistent shape (AST expects 1024 frames)
178
- if mel_tensor.shape[0] < AST_TARGET_FRAMES:
179
- padding_needed = AST_TARGET_FRAMES - mel_tensor.shape[0]
180
- mel_tensor = torch.nn.functional.pad(mel_tensor, (0, 0, 0, padding_needed))
181
- else:
182
- mel_tensor = mel_tensor[:AST_TARGET_FRAMES, :]
183
-
184
- genre_idx = genre_to_id[genre]
185
- return mel_tensor, genre_idx
186
- noise_files_list = [os.path.join('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/ESC-50-master/audio', f)
187
- for f in os.listdir('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/ESC-50-master/audio') if f.endswith('.wav')]
188
-
189
- train_ds = ASTAudioDataset(
190
- data_dict=tr,
191
- length=1000,
192
- genres=GENRES,
193
- stem_types=STEM_KEYS,
194
- noise_files=noise_files_list
195
- )
196
-
197
- val_ds = ASTAudioDataset(
198
- data_dict=val,
199
- length=500,
200
- genres=GENRES,
201
- stem_types=STEM_KEYS,
202
- noise_files=noise_files_list,
203
- )
204
-
205
- train_loader = DataLoader(
206
- train_ds,
207
- batch_size=4,
208
- shuffle=True,
209
- num_workers=2,
210
- pin_memory=True
211
- )
212
-
213
- val_loader = DataLoader(
214
- val_ds,
215
- batch_size=4,
216
- shuffle=False,
217
- num_workers=2,
218
- pin_memory=True
219
- )
220
-
221
- # Loading pre trained model
222
- class MusicGenreAST(nn.Module):
223
- def __init__(self, num_classes):
224
- super(MusicGenreAST, self).__init__()
225
- self.ast = ASTForAudioClassification.from_pretrained(
226
- "MIT/ast-finetuned-audioset-10-10-0.4593",
227
- num_labels=num_classes,
228
- ignore_mismatched_sizes=True
229
- )
230
-
231
- def forward(self, x):
232
- outputs = self.ast(x)
233
- return outputs
234
-
235
- #Train loop
236
- def train_ast(model_instance):
237
- num_epochs = 15
238
- patience_counter = 0
239
- patience_limit = 7
240
- best_score = float('-inf')
241
- accumulation_steps = 4
242
-
243
- model = model_instance.to(device)
244
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01) #ADAMW optimizer with weight decay for better generalization
245
- scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='max', factor=0.5, patience=2, min_lr=1e-7) # Reduce learning rate when the F1 score plateaus, with a minimum learning rate threshold
246
- criterion = nn.CrossEntropyLoss()
247
-
248
- for epoch in range(num_epochs):
249
- # Training Phase
250
- model.train()
251
- tot_train_loss = 0
252
- optimizer.zero_grad()
253
-
254
- for i, (mel_spec, genre_id) in enumerate(train_loader):
255
- mel_spec, genre_id = mel_spec.to(device), genre_id.to(device)
256
-
257
- outputs = model(mel_spec).logits
258
- loss = criterion(outputs, genre_id)
259
- loss = loss / accumulation_steps
260
- loss.backward()
261
-
262
- if (i + 1) % accumulation_steps == 0:
263
- optimizer.step()
264
- optimizer.zero_grad()
265
-
266
- tot_train_loss += (loss.item() * accumulation_steps) * mel_spec.size(0)
267
-
268
- epoch_train_loss = tot_train_loss / len(train_loader.dataset)
269
- # Validation Phase
270
- model.eval()
271
- tot_val_loss = 0
272
- genre_pred_tot = []
273
- genre_true_tot = []
274
-
275
- with torch.no_grad():
276
- for mel_spec, genre_id in val_loader:
277
- mel_spec, genre_id = mel_spec.to(device), genre_id.to(device)
278
-
279
- outputs = model(mel_spec).logits
280
- val_loss = criterion(outputs, genre_id)
281
- preds = torch.argmax(outputs, dim=1)
282
-
283
- genre_true_tot.extend(genre_id.cpu().numpy())
284
- genre_pred_tot.extend(preds.cpu().numpy())
285
- tot_val_loss += val_loss.item() * mel_spec.size(0)
286
-
287
- epoch_validate_loss = tot_val_loss / len(val_loader.dataset)
288
- genre_score = f1_score(y_true=genre_true_tot, y_pred=genre_pred_tot, average='macro')
289
- current_lr = optimizer.param_groups[0]['lr']
290
-
291
- print(f'Epoch {epoch+1}/{num_epochs}')
292
- print(f'Train Loss: {epoch_train_loss:.4f} | Val Loss: {epoch_validate_loss:.4f}')
293
- print(f'F1 Score: {genre_score:.4f} | LR: {current_lr:.6f}\n')
294
-
295
- wandb.log({
296
- "epoch": epoch + 1,
297
- "train_loss": epoch_train_loss,
298
- "val_loss": epoch_validate_loss,
299
- "score": genre_score,
300
- "learning_rate": current_lr
301
- })
302
-
303
- scheduler.step(genre_score)
304
-
305
- if genre_score > best_score:
306
- best_score = genre_score
307
- patience_counter = 0
308
- torch.save(model.state_dict(), 'best_ast_model.pth')
309
- print(f"New Best Score! Model saved.")
310
- else:
311
- patience_counter += 1
312
- torch.cuda.empty_cache()
313
-
314
- if patience_counter >= patience_limit:
315
- print('----- Early Stopping Triggered -----')
316
- break
317
-
318
- wandb.finish()
319
-
320
- train_ast(MusicGenreAST(10))
321
-
322
- # Prediction dataset for test set
323
- class ASTTestDataset(Dataset):
324
- def __init__(self, test_df):
325
- self.data = test_df
326
- self.sr = 16000
327
- self.duration = 10.0
328
- self.target_frames = 1024
329
-
330
- def __len__(self):
331
- return len(self.data)
332
-
333
- def __getitem__(self, idx):
334
- filename = self.data.iloc[idx]['filename']
335
- path = os.path.join('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup', filename)
336
-
337
- y, _ = librosa.load(path, sr=self.sr, offset=0, duration=self.duration)
338
-
339
- target_len = int(self.sr * self.duration)
340
- y = librosa.util.fix_length(y, size=target_len)
341
-
342
- if np.max(np.abs(y)) > 0:
343
- y = y / np.max(np.abs(y))
344
-
345
- mel = librosa.feature.melspectrogram(
346
- y=y,
347
- sr=self.sr,
348
- n_mels=128,
349
- n_fft=400,
350
- hop_length=160
351
- )
352
-
353
- mel_db = librosa.power_to_db(mel, ref=np.max)
354
- mel_db = (mel_db + 4.26) / 4.56
355
-
356
- mel_tensor = torch.tensor(mel_db).float().T
357
-
358
- current_frames = mel_tensor.shape[0]
359
- if current_frames < self.target_frames:
360
- padding_needed = self.target_frames - current_frames
361
- mel_tensor = torch.nn.functional.pad(mel_tensor, (0, 0, 0, padding_needed))
362
- else:
363
- mel_tensor = mel_tensor[:self.target_frames, :]
364
-
365
- return mel_tensor, filename
366
- test_df = pd.read_csv('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/test.csv')
367
-
368
- test_data = ASTTestDataset(test_df)
369
-
370
- test_loader = DataLoader(
371
- dataset=test_data,
372
- batch_size=4,
373
- shuffle=False,
374
- num_workers=2,
375
- pin_memory=True if device == 'cuda' else False
376
- )
377
-
378
- # Prediction loop for test set
379
- def predict(model_instance, model_path):
380
- model = model_instance.to(device)
381
- model.load_state_dict(torch.load(model_path))
382
- model.eval()
383
-
384
- genre_pred = []
385
- for mel_specs,_ in test_loader:
386
- mel_specs = mel_specs.to(device)
387
- if mel_specs.dim() == 4:
388
- mel_specs = mel_specs.squeeze(1)
389
-
390
- with torch.no_grad():
391
- outputs = model(mel_specs).logits
392
- preds = torch.argmax(outputs, dim=1).cpu().numpy()
393
- genre_pred.extend(preds.tolist())
394
- final_result = [id_to_genre[g] for g in genre_pred]
395
- return final_result
396
-
397
- res = predict(MusicGenreAST(10), '/kaggle/input/models/bhavin273/ast-audio-model/pytorch/default/1/best_ast_model.pth')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CRNN_pipeline.py DELETED
@@ -1,366 +0,0 @@
1
- import os
2
- import glob
3
- import numpy as np
4
- import pandas as pd
5
- from tqdm import tqdm
6
- import librosa
7
- import librosa.display
8
- import matplotlib.pyplot as plt
9
- import random
10
- import torch
11
- import wandb
12
- from torchsummary import summary
13
- from torch.utils.data.dataset import Dataset
14
- from torch.utils.data import DataLoader
15
- from torch.optim.lr_scheduler import ReduceLROnPlateau
16
- from transformers import ASTForAudioClassification, ASTConfig
17
- import torch.nn as nn
18
- from sklearn.model_selection import StratifiedShuffleSplit
19
- from collections import Counter
20
- import seaborn as sns
21
- from sklearn.tree import DecisionTreeClassifier
22
- from sklearn.metrics import f1_score
23
- import warnings
24
- warnings.filterwarnings("ignore")
25
-
26
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
27
- print(f"Is GPU available? {torch.cuda.is_available()}")
28
- print(f"Current device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
29
-
30
- DATA_SEED = 67
31
- TRAINING_SEED = 1234
32
- SR = 22050
33
- DURATION = 5.0
34
- N_FFT = 2048
35
- HOP_LENGTH = 512
36
- N_MELS = 128
37
- TOP_DB=20
38
- TARGET_SNR_DB = 10
39
-
40
- random.seed(DATA_SEED)
41
- np.random.seed(DATA_SEED)
42
- torch.manual_seed(DATA_SEED)
43
- torch.cuda.manual_seed(DATA_SEED)
44
-
45
- DATA_ROOT = '/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/genres_stems'
46
- GENRES = [] # Make the list of all genres available
47
- for genre in os.listdir(path=DATA_ROOT):
48
- GENRES.append(genre)
49
- display(GENRES)
50
-
51
- STEMS = {} # Write here stems file name
52
- STEM_KEYS = ['drums', 'vocals', 'bass', 'other']
53
- GENRE_TO_TEST = 'rock'
54
- SONG_INDEX = 0
55
- genre_to_id = {genre: i for i, genre in enumerate(GENRES)}
56
- id_to_genre = {i: genre for i, genre in enumerate(GENRES)}
57
-
58
-
59
- # Function to build the dataset by splitting the songs into train and validation sets, and then creating a dictionary with genre as key and stem types as sub-keys containing the list of file paths for each stem type.
60
- def build_dataset(root_dir, val_split=0.17, seed=42):
61
- train_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
62
- val_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
63
- rows = []
64
- for genre in GENRES:
65
- genre_path = os.path.join(root_dir, genre)
66
- for song in os.listdir(genre_path):
67
- rows.append({
68
- "label" : genre,
69
- "song" : song
70
- })
71
- df = pd.DataFrame(rows)
72
- X = df['song']
73
- y = df['label']
74
-
75
- sss = StratifiedShuffleSplit(n_splits=1, test_size=val_split, random_state=seed)
76
-
77
- for train_index, val_index in sss.split(X, y):
78
- train_df = df.iloc[train_index].reset_index(drop=True)
79
- val_df = df.iloc[val_index].reset_index(drop=True)
80
-
81
- for idx, row in train_df.iterrows():
82
- genre = row['label']
83
- song = row['song']
84
- song_path = os.path.join(root_dir, genre, song)
85
- for stem in os.listdir(song_path):
86
- train_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))
87
-
88
- for idx, row in val_df.iterrows():
89
- genre = row['label']
90
- song = row['song']
91
- song_path = os.path.join(root_dir, genre, song)
92
- for stem in os.listdir(song_path):
93
- val_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))
94
- return train_dataset, val_dataset
95
-
96
- tr, val = build_dataset(DATA_ROOT)
97
-
98
- # Example of how the dataset is structured
99
- class audioDataset (Dataset):
100
- def __init__ (self, data_dict, length, genres, stem_types, noise_files, sr=SR, duration=DURATION, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS):
101
- self.data_dict = data_dict
102
- self.sr = sr
103
- self.genres = genres
104
- self.stem_types = stem_types
105
- self.duration = duration
106
- self.n_fft = n_fft
107
- self.hop_length = hop_length
108
- self.n_mels = n_mels
109
- self.noise_files = noise_files
110
- self.length = length
111
-
112
- def __len__(self):
113
- return self.length
114
-
115
- def __getitem__(self, idx, retries = 0):
116
- if retries > 10:
117
- idx = random.randint(0, self.length-1)
118
- return self.__getitem__(idx, 0)
119
- genre = random.choice(self.genres)
120
- start_time = random.uniform(0, 24)
121
- target_len = int(22050 * 5.0)
122
- stems_data = []
123
-
124
- for stem_type in self.stem_types:
125
- path = random.choice(self.data_dict[genre][stem_type])
126
- y, _ = librosa.load(path, sr=22050, offset=start_time, duration=self.duration)
127
- y = librosa.util.fix_length(y, size=target_len)
128
- rms = np.sqrt(np.mean(y**2))
129
- if rms < 0.001:
130
- return self.__getitem__(idx, retries + 1)
131
- stems_data.append(y)
132
- mix = np.sum(stems_data, axis=0) / 4.0
133
- if np.max(np.abs(mix)) > 0:
134
- mix = mix / np.max(np.abs(mix))
135
-
136
- noise_path = random.choice(self.noise_files)
137
- noise_y, _ = librosa.load(noise_path, sr=22050, offset=start_time, duration=self.duration)
138
- noise_y = librosa.util.fix_length(noise_y, size=target_len)
139
-
140
- p_signal = np.mean(mix**2) + 1e-9
141
- p_noise = np.mean(noise_y**2) + 1e-9
142
- target_noise_power = p_signal / 10.0
143
- scaling_factor = np.sqrt(target_noise_power / p_noise)
144
-
145
- final_audio = mix + (scaling_factor * noise_y)
146
-
147
- mfcc = librosa.feature.mfcc(
148
- y=final_audio,
149
- sr=self.sr,
150
- n_mfcc=40,
151
- n_fft=self.n_fft,
152
- hop_length=self.hop_length,
153
- n_mels=self.n_mels
154
- )
155
-
156
- mfcc_db = librosa.power_to_db(mfcc, ref=np.max)
157
- return torch.tensor(mfcc_db).float().unsqueeze(0), genre_to_id[genre] #shape(1, 40, 216)
158
-
159
- noise_files_list = [os.path.join('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/ESC-50-master/audio', f)
160
- for f in os.listdir('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/ESC-50-master/audio') if f.endswith('.wav')]
161
-
162
- train_ds = audioDataset(tr, 1000, GENRES, STEM_KEYS, noise_files_list)
163
- val_ds = audioDataset(val, 200, GENRES, STEM_KEYS, noise_files_list)
164
-
165
- train_loader = DataLoader(dataset=train_ds,
166
- batch_size=32,
167
- shuffle=True,
168
- num_workers=2,
169
- pin_memory=True if device == 'cuda' else False)
170
-
171
- val_loader = DataLoader(dataset=val_ds,
172
- batch_size=32,
173
- shuffle=False,
174
- num_workers=2,
175
- pin_memory=True if device == 'cuda' else False)
176
-
177
- #Model architecture with 3 convolutional layers followed by a GRU layer and a fully connected layer for genre classification. The convolutional layers extract features from the input MFCCs, while the GRU layer captures temporal dependencies in the features. The final fully connected layer outputs the genre predictions.
178
- class audioCRNN(nn.Module):
179
- def __init__(self):
180
- super(audioCRNN, self).__init__()
181
-
182
- self.conv1 = nn.Sequential(
183
- nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
184
- nn.BatchNorm2d(32),
185
- nn.ReLU(),
186
- nn.MaxPool2d(kernel_size=2)
187
- ) # Output Shape (32, 32, 20, 108)
188
-
189
- self.conv2 = nn.Sequential(
190
- nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
191
- nn.BatchNorm2d(64),
192
- nn.ReLU(),
193
- nn.MaxPool2d(kernel_size=(2,1))
194
- ) # Output Shape (32, 64, 10, 108)
195
-
196
- self.conv3 = nn.Sequential(
197
- nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
198
- nn.BatchNorm2d(128),
199
- nn.ReLU(),
200
- nn.MaxPool2d(kernel_size=(2,1))
201
- ) # Output Shape (32, 128, 5, 108)
202
-
203
- self.gru = nn.GRU(input_size=128 * 5, hidden_size=128,
204
- num_layers=2, batch_first=True,
205
- bidirectional=True, dropout=0.3) # Ouput size (32, 108, 256)
206
-
207
- self.genre_head = nn.Linear(256, 10)
208
-
209
-
210
- def forward (self, x):
211
- x = self.conv1(x)
212
- x = self.conv2(x)
213
- x = self.conv3(x)
214
- x = x.permute (0, 3, 1, 2)
215
- batch, time, channels, mels = x.size()
216
- x = x.contiguous().view(batch, time, channels*mels)
217
- x, _ = self.gru(x)
218
- x = x[:, -1, :]
219
-
220
- return self.genre_head(x)
221
-
222
-
223
- model = audioCRNN().to(device) # Printing Model Summary
224
- summary(model, (1, 40, 216))
225
-
226
- #The training fn
227
- def train_epoch (train_batch, audioCRNN, optimizer, criterion):
228
- audioCRNN.train() #Sets model in train Mode
229
- mel_freq, genre_id = train_batch
230
- mel_freq, genre_id = mel_freq.to(device), genre_id.to(device)
231
- optimizer.zero_grad() #Initializes the gradients to zero
232
- genre_output = audioCRNN(mel_freq)
233
- genre_loss = criterion(genre_output, genre_id)
234
- genre_loss.backward() #Back propagation
235
- optimizer.step() #Weights are adjusted
236
-
237
- return genre_loss.item()
238
-
239
- #The validation fn
240
- def validate_epoch (validation_batch, audioCRNN, criterion):
241
- audioCRNN.eval() # Runs Model in evaluation mode
242
- mel_freq, genre_id = validation_batch
243
- mel_freq, genre_id = mel_freq.to(device), genre_id.to(device)
244
- with torch.no_grad(): #Gradients disabled
245
- genre_output = audioCRNN(mel_freq)
246
- genre_loss = criterion(genre_output, genre_id)
247
- genre_pred = torch.softmax(genre_output, dim=1)
248
- genre_pred = torch.argmax(genre_pred, dim=1).cpu().numpy() # Gender prediction with softmax followed by argmax
249
- return genre_loss.item(), genre_pred
250
-
251
- #The training loop
252
- def train (audioCRNN):
253
- num_epochs = 17
254
- patience_counter = 0
255
- patience_limit = 5
256
- best_score = float('-inf')
257
- model = audioCRNN.to(device)
258
- optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
259
- # scheduler to adjust the optimizer in case the curve flattens
260
- scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='max', factor=0.5, patience=2, min_lr=1e-6)
261
- criterion = nn.CrossEntropyLoss()
262
-
263
- for epoch in range (num_epochs):
264
- tot_train_loss = 0
265
- # Training loop
266
- for mel_freq, genre_id in train_loader:
267
- train_loss = train_epoch((mel_freq, genre_id), model, optimizer, criterion)
268
- tot_train_loss += train_loss * mel_freq.size(0) # Batch mean loss multiplied by batch size for total batch loss
269
- epoch_train_loss = tot_train_loss/len(train_loader.dataset) # Total Train loss per epoch
270
- tot_val_loss = 0
271
- genre_pred_tot = []
272
- genre_true_tot = []
273
- # Validation loop
274
- for mel_freq, genre_id in val_loader:
275
- genre_true_tot.extend(genre_id.cpu().numpy())
276
- val_loss, genre_pred = validate_epoch((mel_freq, genre_id), model, criterion)
277
- genre_pred_tot.extend(genre_pred)
278
- tot_val_loss += val_loss * mel_freq.size(0)
279
- epoch_validate_loss = tot_val_loss/len(val_loader.dataset)
280
- genre_score = f1_score(y_true=genre_true_tot, y_pred=genre_pred_tot, average='macro') # f1 macro for evaluating the genre score
281
- current_lr = optimizer.param_groups[0]['lr']
282
-
283
-
284
- print(f'Epoch {epoch+1}/{num_epochs}')
285
- print(f'Train Loss : {epoch_train_loss:.4f} | Validation Loss : {epoch_validate_loss:.4f}')
286
- print(f'Score : {genre_score:.4f}')
287
- print(f'Current learning Rate is : {current_lr:.6f}\n')
288
-
289
- wandb.log({
290
- "epoch": epoch + 1,
291
- "train_loss": epoch_train_loss,
292
- "val_loss": epoch_validate_loss,
293
- "score": genre_score,
294
- "learning_rate": current_lr
295
- })
296
-
297
-
298
- scheduler.step(genre_score)
299
- if genre_score > best_score:
300
- patience_counter = 0
301
- best_score = genre_score
302
- torch.save(model.state_dict(), 'best_finetuned_model.pth') # save model
303
- else:
304
- patience_counter += 1
305
-
306
- if patience_counter >= patience_limit:
307
- print('-----Early Stopping------')
308
- break
309
- wandb.finish()
310
-
311
- train(audioCRNN())
312
-
313
- test_df = pd.read_csv('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/test.csv')
314
-
315
- class TestDataSet (Dataset):
316
- def __init__ (self, test_df):
317
- self.data = test_df
318
-
319
- def __len__(self):
320
- return len (self.data)
321
-
322
- def __getitem__(self, idx):
323
- target_len = int(22050 * 5.0)
324
- path = os.path.join('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup', self.data.iloc[idx]['filename'])
325
- y, _ = librosa.load(path, sr=22050, offset=0, duration=5.0)
326
- y = librosa.util.fix_length(y, size=target_len)
327
- if np.max(np.abs(y)) > 0:
328
- y = y / np.max(np.abs(y))
329
- mfcc = librosa.feature.mfcc(
330
- y=y,
331
- sr=22050,
332
- n_mfcc=40,
333
- n_fft=2048,
334
- hop_length=512,
335
- n_mels=128
336
- )
337
-
338
- mfcc_db = librosa.power_to_db(mfcc, ref=np.max)
339
- return torch.tensor(mfcc_db).float().unsqueeze(0)
340
-
341
-
342
- test_data = TestDataSet(test_df)
343
-
344
- test_loader = DataLoader(dataset=test_data,
345
- batch_size=32,
346
- num_workers=2,
347
- pin_memory=True if device == 'cuda' else False)
348
-
349
- genre_pred = []
350
- def predict (model_name, model_path):
351
- model = model_name.to(device)
352
- model.load_state_dict(torch.load(model_path))
353
-
354
- model.eval()
355
-
356
- for mfccs in test_loader:
357
- mfccs = mfccs.to(device)
358
- with torch.no_grad():
359
- genre_id = model (mfccs).logits
360
- genre_out = torch.softmax(genre_id, dim=1)
361
- genre_out = torch.argmax(genre_out, dim=1).cpu().numpy()
362
- genre_pred.extend(genre_out.tolist())
363
- final_result = [id_to_genre[g] for g in genre_pred]
364
- return final_result
365
-
366
- res = predict (audioCRNN(), '/kaggle/input/models/bhavin273/crnn-model/pytorch/default/1/best_crnnModel.pth')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Efficient_Net_Pipeline.py DELETED
@@ -1,326 +0,0 @@
1
- import os
2
- import glob
3
- from networkx import display
4
- import numpy as np
5
- import pandas as pd
6
- from tqdm import tqdm
7
- import librosa
8
- import librosa.display
9
- import matplotlib.pyplot as plt
10
- import random
11
- import torch
12
- import timm
13
- import wandb
14
- from torchsummary import summary
15
- from torch.utils.data.dataset import Dataset
16
- from torch.utils.data import DataLoader
17
- from torch.optim.lr_scheduler import ReduceLROnPlateau
18
- from transformers import ASTForAudioClassification, ASTConfig
19
- import torch.nn as nn
20
- from sklearn.model_selection import StratifiedShuffleSplit
21
- from collections import Counter
22
- import seaborn as sns
23
- from sklearn.tree import DecisionTreeClassifier
24
- from sklearn.metrics import f1_score
25
- import warnings
26
- warnings.filterwarnings("ignore")
27
-
28
# Select the compute device and report GPU availability.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Is GPU available? {torch.cuda.is_available()}")
print(f"Current device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

# Experiment-wide constants (audio loading + feature-extraction settings).
DATA_SEED = 67
TRAINING_SEED = 1234
SR = 22050
DURATION = 5.0
N_FFT = 2048
HOP_LENGTH = 512
N_MELS = 128
TOP_DB = 20
TARGET_SNR_DB = 10

# Seed every RNG we rely on so dataset construction is reproducible.
random.seed(DATA_SEED)
np.random.seed(DATA_SEED)
torch.manual_seed(DATA_SEED)
torch.cuda.manual_seed(DATA_SEED)

DATA_ROOT = '/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/genres_stems'
# One genre per sub-directory of DATA_ROOT.
GENRES = [genre for genre in os.listdir(path=DATA_ROOT)]
display(GENRES)
STEMS = {}  # Write here stems file name
STEM_KEYS = ['drums', 'vocals', 'bass', 'other']
GENRE_TO_TEST = 'rock'
SONG_INDEX = 0
# Bidirectional label <-> integer-id maps used by datasets and predict().
genre_to_id = {genre: i for i, genre in enumerate(GENRES)}
id_to_genre = {i: genre for i, genre in enumerate(GENRES)}
58
-
59
# Build the dataset in the format {genre: {stem_type: [file_paths]}}
def build_dataset(root_dir, val_split=0.17, seed=42):
    """Split songs into train/val sets and index every stem file by genre.

    Args:
        root_dir: directory laid out as ``root_dir/<genre>/<song>/<stem>.wav``.
        val_split: fraction of songs held out for validation (split is
            stratified by genre so every genre appears in both sets).
        seed: random_state for the stratified split.

    Returns:
        (train_dataset, val_dataset): each shaped
        ``{genre: {stem_key: [file paths]}}``.
    """
    rows = []
    for genre in GENRES:
        genre_path = os.path.join(root_dir, genre)
        for song in os.listdir(genre_path):
            rows.append({"label": genre, "song": song})
    df = pd.DataFrame(rows)

    sss = StratifiedShuffleSplit(n_splits=1, test_size=val_split, random_state=seed)
    # n_splits=1, so this loop body runs exactly once.
    for train_index, val_index in sss.split(df['song'], df['label']):
        train_df = df.iloc[train_index].reset_index(drop=True)
        val_df = df.iloc[val_index].reset_index(drop=True)

    def _collect(split_df):
        # Index every stem file of every song in the split (the original
        # duplicated this loop for train and val).
        dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
        for _, row in split_df.iterrows():
            song_path = os.path.join(root_dir, row['label'], row['song'])
            for stem in os.listdir(song_path):
                # 'drums.wav' -> 'drums'; assumes only known stem files exist
                # in each song folder — TODO confirm against the dataset.
                dataset[row['label']][stem.replace('.wav', '')].append(
                    os.path.join(song_path, stem))
        return dataset

    return _collect(train_df), _collect(val_df)


tr, val = build_dataset(DATA_ROOT)
97
-
98
-
99
-
100
AST_SR = 16000
AST_TARGET_FRAMES = 1024


# A custom Dataset that mixes random stems, injects ESC-50 noise at a fixed
# SNR, and returns an MFCC "image" plus the genre id — all generated
# on-the-fly, so an epoch never sees the exact same mix twice.
class audioDataset(Dataset):
    def __init__(self, data_dict, length, genres, stem_types, noise_files,
                 sr=SR, duration=DURATION, n_fft=N_FFT,
                 hop_length=HOP_LENGTH, n_mels=N_MELS):
        self.data_dict = data_dict      # {genre: {stem: [paths]}}
        self.sr = sr
        self.genres = genres
        self.stem_types = stem_types
        self.duration = duration
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.noise_files = noise_files  # ESC-50 wav paths for augmentation
        self.length = length            # virtual epoch size

    def __len__(self):
        return self.length

    def __getitem__(self, idx, retries=0):
        # After too many near-silent draws, restart from a fresh random index.
        if retries > 10:
            idx = random.randint(0, self.length - 1)
            return self.__getitem__(idx, 0)
        genre = random.choice(self.genres)
        start_time = random.uniform(0, 24)
        target_len = int(22050 * 5.0)
        stems_data = []
        for stem_type in self.stem_types:
            path = random.choice(self.data_dict[genre][stem_type])
            y, _ = librosa.load(path, sr=22050, offset=start_time, duration=self.duration)
            y = librosa.util.fix_length(y, size=target_len)
            rms = np.sqrt(np.mean(y**2))
            if rms < 0.001:
                # Stem is essentially silent — resample the whole item.
                return self.__getitem__(idx, retries + 1)
            stems_data.append(y)
        mix = np.sum(stems_data, axis=0) / 4.0
        if np.max(np.abs(mix)) > 0:
            mix = mix / np.max(np.abs(mix))  # peak-normalise the mix
        # Add an ESC-50 clip scaled for ~10 dB SNR (signal/noise power = 10).
        noise_path = random.choice(self.noise_files)
        noise_y, _ = librosa.load(noise_path, sr=22050, offset=start_time, duration=self.duration)
        noise_y = librosa.util.fix_length(noise_y, size=target_len)
        p_signal = np.mean(mix**2) + 1e-9
        p_noise = np.mean(noise_y**2) + 1e-9
        target_noise_power = p_signal / 10.0
        scaling_factor = np.sqrt(target_noise_power / p_noise)
        final_audio = mix + (scaling_factor * noise_y)
        # NOTE(review): n_mfcc is not passed, so librosa's default of 20
        # coefficients is used (n_mels=128 only sizes the internal mel
        # filterbank); the output is (1, 20, 216), NOT (1, 128, 216) as the
        # original comment claimed. Kept as-is for checkpoint compatibility.
        mfcc = librosa.feature.mfcc(
            y=final_audio,
            sr=self.sr,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels
        )
        # NOTE(review): power_to_db on MFCCs is unusual (MFCCs are not a
        # power spectrum) but the saved weights were trained on it — keep.
        mfcc_db = librosa.power_to_db(mfcc, ref=np.max)
        return torch.tensor(mfcc_db).float().unsqueeze(0), genre_to_id[genre]
154
-
155
# All ESC-50 clips used as additive background noise.
noise_files_list = [
    os.path.join('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/ESC-50-master/audio', f)
    for f in os.listdir('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/ESC-50-master/audio')
    if f.endswith('.wav')
]

train_ds = audioDataset(tr, 1000, GENRES, STEM_KEYS, noise_files_list)
val_ds = audioDataset(val, 200, GENRES, STEM_KEYS, noise_files_list)

# BUG FIX: `device` is a torch.device object, so `device == 'cuda'` compared
# an object to a string (version-dependent and historically always False);
# checking `.type` makes pin_memory reliably turn on when a GPU is present.
train_loader = DataLoader(dataset=train_ds,
                          batch_size=32,
                          shuffle=True,
                          num_workers=2,
                          pin_memory=(device.type == 'cuda'))

val_loader = DataLoader(dataset=val_ds,
                        batch_size=32,
                        shuffle=False,
                        num_workers=2,
                        pin_memory=(device.type == 'cuda'))
172
-
173
# Pretrained EfficientNet-B0 adapted to single-channel spectrogram input.
class EfficientNetAudio(nn.Module):
    """Thin wrapper around a timm EfficientNet-B0 for audio classification."""

    def __init__(self, num_classes=10, model_name='efficientnet_b0'):
        super(EfficientNetAudio, self).__init__()
        # in_chans=1 because MFCC "images" carry a single channel, not RGB.
        self.model = timm.create_model(
            model_name,
            pretrained=True,
            in_chans=1,
            num_classes=num_classes,
        )

    def forward(self, x):
        # Delegate straight to the backbone; it already ends in a
        # num_classes-way classification head.
        return self.model(x)
186
-
187
def train_epoch(train_batch, EffNet, optimizer, criterion):
    """Run one optimisation step on a single batch; return the batch loss."""
    EffNet.train()  # enable dropout/batch-norm training behaviour
    mel_freq, genre_id = train_batch
    mel_freq = mel_freq.to(device)
    genre_id = genre_id.to(device)

    optimizer.zero_grad()                              # clear stale gradients
    genre_loss = criterion(EffNet(mel_freq), genre_id)
    genre_loss.backward()                              # backpropagation
    optimizer.step()                                   # apply weight update

    return genre_loss.item()
198
-
199
def validate_epoch(validation_batch, EffNet, criterion):
    """Evaluate one batch; return (loss, predicted class ids as a numpy array)."""
    EffNet.eval()  # evaluation mode: no dropout, frozen batch-norm stats
    mel_freq, genre_id = validation_batch
    mel_freq = mel_freq.to(device)
    genre_id = genre_id.to(device)
    with torch.no_grad():  # gradients disabled during validation
        genre_output = EffNet(mel_freq)
        genre_loss = criterion(genre_output, genre_id)
        # Genre prediction: softmax over classes, then argmax.
        probs = torch.softmax(genre_output, dim=1)
        genre_pred = torch.argmax(probs, dim=1).cpu().numpy()
    return genre_loss.item(), genre_pred
209
-
210
-
211
# Train loop: early stopping on macro-F1, LR scheduling, WandB logging.
def train(audioCRNN):
    # NOTE(review): the parameter name is a leftover from the CRNN pipeline;
    # any nn.Module with a 10-class output works. Kept for interface
    # compatibility.
    num_epochs = 20
    patience_counter = 0
    patience_limit = 5
    best_score = float('-inf')

    model = audioCRNN.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    # Halve the LR when the validation F1 plateaus for 2 epochs.
    scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='max', factor=0.5,
                                  patience=2, min_lr=1e-6)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        # ---- training pass ----
        tot_train_loss = 0
        for mel_freq, genre_id in train_loader:
            batch_loss = train_epoch((mel_freq, genre_id), model, optimizer, criterion)
            # Mean batch loss * batch size => total loss over the batch.
            tot_train_loss += batch_loss * mel_freq.size(0)
        epoch_train_loss = tot_train_loss / len(train_loader.dataset)

        # ---- validation pass ----
        tot_val_loss = 0
        genre_pred_tot = []
        genre_true_tot = []
        for mel_freq, genre_id in val_loader:
            genre_true_tot.extend(genre_id.cpu().numpy())
            val_loss, genre_pred = validate_epoch((mel_freq, genre_id), model, criterion)
            genre_pred_tot.extend(genre_pred)
            tot_val_loss += val_loss * mel_freq.size(0)
        epoch_validate_loss = tot_val_loss / len(val_loader.dataset)

        # Macro F1 weighs every genre equally regardless of class frequency.
        genre_score = f1_score(y_true=genre_true_tot, y_pred=genre_pred_tot, average='macro')
        current_lr = optimizer.param_groups[0]['lr']

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Train Loss : {epoch_train_loss:.4f} | Validation Loss : {epoch_validate_loss:.4f}')
        print(f'Score : {genre_score:.4f}')
        print(f'Current learning Rate is : {current_lr:.6f}\n')

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": epoch_train_loss,
            "val_loss": epoch_validate_loss,
            "score": genre_score,
            "learning_rate": current_lr
        })

        scheduler.step(genre_score)
        if genre_score > best_score:
            # New best F1: reset patience and checkpoint the weights.
            patience_counter = 0
            best_score = genre_score
            torch.save(model.state_dict(), 'best_efficientnet_model.pth')
        else:
            patience_counter += 1

        if patience_counter >= patience_limit:
            print('-----Early Stopping------')
            break

train(EfficientNetAudio())
271
-
272
# Prediction dataset for test set
test_df = pd.read_csv('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/test.csv')


class TestDataSet(Dataset):
    """Deterministic test-set dataset: first 5 s of each clip -> MFCC tensor."""

    def __init__(self, test_df):
        self.data = test_df  # DataFrame; must contain a 'filename' column

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        target_len = int(22050 * 5.0)
        path = os.path.join('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup',
                            self.data.iloc[idx]['filename'])
        y, _ = librosa.load(path, sr=22050, offset=0, duration=5.0)
        y = librosa.util.fix_length(y, size=target_len)
        if np.max(np.abs(y)) > 0:
            y = y / np.max(np.abs(y))  # peak-normalise, matching training
        # NOTE(review): no n_mfcc -> librosa default of 20 coefficients,
        # which matches what audioDataset feeds the model at train time.
        mfcc = librosa.feature.mfcc(
            y=y,
            sr=22050,
            n_fft=2048,
            hop_length=512,
            n_mels=128
        )
        mfcc_db = librosa.power_to_db(mfcc, ref=np.max)
        return torch.tensor(mfcc_db).float().unsqueeze(0)


test_data = TestDataSet(test_df)

# No shuffle: predictions must line up with test_df row order.
test_loader = DataLoader(dataset=test_data,
                         batch_size=32,
                         num_workers=2,
                         # BUG FIX: compare device.type, not the torch.device
                         # object, with the string 'cuda'.
                         pin_memory=(device.type == 'cuda'))
307
-
308
# Prediction loop for test set
genre_pred = []  # kept for backward compatibility; predict() no longer mutates it


def predict(model_name, model_path):
    """Load `model_path` weights into `model_name` and predict test genres.

    Args:
        model_name: an nn.Module whose forward pass returns raw class logits.
        model_path: path to a ``state_dict`` checkpoint.

    Returns:
        list[str]: one genre label per test sample, in test_loader order.
    """
    model = model_name.to(device)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    # BUG FIX: accumulate into a local list instead of the module-level
    # `genre_pred`, so calling predict() more than once (e.g. for several
    # checkpoints, as the notebook does) doesn't prepend stale predictions.
    predictions = []
    for mfccs in test_loader:
        mfccs = mfccs.to(device)
        with torch.no_grad():
            genre_id = model(mfccs)
        genre_out = torch.softmax(genre_id, dim=1)
        genre_out = torch.argmax(genre_out, dim=1).cpu().numpy()
        predictions.extend(genre_out.tolist())
    return [id_to_genre[g] for g in predictions]


res = predict(EfficientNetAudio(), '/kaggle/input/models/bhavin273/efficientnet-model/pytorch/default/1/best_efficientnet_model.pth')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md DELETED
@@ -1,247 +0,0 @@
1
- # Music Genre Classification from Audio
2
-
3
- A deep learning project for classifying audio tracks into **10 music genres** using the **messy_mashup** dataset from the Kaggle competition (`jan-2026-dl-gen-ai-project`). Three model architectures are explored: a scratch-built CRNN, a pretrained EfficientNet-B0, and a fine-tuned Audio Spectrogram Transformer (AST) — with AST achieving the best leaderboard score of **0.857**.
4
-
5
- ---
6
-
7
- ## Table of Contents
8
-
9
- - [Project Overview](#project-overview)
10
- - [Dataset](#dataset)
11
- - [Project Structure](#project-structure)
12
- - [Installation](#installation)
13
- - [Preprocessing Pipeline](#preprocessing-pipeline)
14
- - [Models](#models)
15
- - [Model 1: CRNN (CNN + Bidirectional GRU)](#model-1-crnn-cnn--bidirectional-gru)
16
- - [Model 2: Audio Spectrogram Transformer (AST)](#model-2-audio-spectrogram-transformer-ast)
17
- - [Model 3: EfficientNet-B0](#model-3-efficientnet-b0)
18
- - [Training](#training)
19
- - [Results](#results)
20
- - [Usage](#usage)
21
- - [Acknowledgements](#acknowledgements)
22
-
23
- ---
24
-
25
- ## Project Overview
26
-
27
- The goal is to predict the genre of a music track from its raw audio waveform. The audio data consists of separated stems (drums, vocals, bass, other) per song, which are mixed together with environmental noise augmentation to simulate real-world conditions. Three progressively more powerful model architectures are trained and compared.
28
-
29
- ---
30
-
31
- ## Dataset
32
-
33
- | Property | Details |
34
- |----------|---------|
35
- | **Name** | messy_mashup |
36
- | **Genres (10)** | blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, rock |
37
- | **Stems per song** | `drums`, `vocals`, `bass`, `other` |
38
- | **Noise source** | ESC-50 environmental sound dataset |
39
- | **Train/Val split** | 83% / 17% (Stratified Shuffle Split, seed=42) |
40
-
41
- ### Data Organization
42
-
43
- ```
44
- genres_stems/
45
- ├── blues/
46
- │ ├── song_001/
47
- │ │ ├── drums.wav
48
- │ │ ├── vocals.wav
49
- │ │ ├── bass.wav
50
- │ │ └── other.wav
51
- │ └── ...
52
- ├── classical/
53
- │ └── ...
54
- └── ...
55
- ```
56
-
57
- ---
58
-
59
- ## Project Structure
60
-
61
- ```
62
- ├── dl-23f3002677-notebook-t12026.ipynb # Main Kaggle notebook (all models + submission)
63
- ├── CRNN_pipeline.py # Standalone CRNN model pipeline
64
- ├── AST_Pipeline.py # Standalone AST model pipeline
65
- ├── Efficient_Net_Pipeline.py # Standalone EfficientNet model pipeline
66
- ├── milestone1.py # Milestone 1 — EDA & audio preprocessing
67
- ├── milestone_2.py # Milestone 2 — Feature extraction & baseline
68
- ├── best_ast_model.pth # Saved AST model weights
69
- ├── requirements.txt # Python dependencies
70
- └── README.md
71
- ```
72
-
73
- ### File Descriptions
74
-
75
- | File | Purpose |
76
- |------|---------|
77
- | **dl-23f3002677-notebook-t12026.ipynb** | End-to-end notebook: data visualization, dataset building, all 3 model architectures (train + predict), performance comparison, and final submission generation |
78
- | **milestone1.py** | Data exploration — builds train/val split, detects silence segments in stems |
79
- | **milestone_2.py** | Feature engineering (tempo, spectral centroid, ZCR, rolloff) + Decision Tree baseline classifier |
80
- | **CRNN_pipeline.py** | CRNN model definition, dataset, training loop, and prediction |
81
- | **AST_Pipeline.py** | Audio Spectrogram Transformer pipeline with HuggingFace pretrained model |
82
- | **Efficient_Net_Pipeline.py** | EfficientNet-B0 pipeline using `timm` pretrained model |
83
-
84
- ---
85
-
86
- ## Installation
87
-
88
- ```bash
89
- pip install -r requirements.txt
90
- ```
91
-
92
- ### Dependencies
93
-
94
- - **numpy** — numerical computation
95
- - **pandas** — data manipulation
96
- - **librosa** — audio loading & feature extraction
97
- - **torch** / **torchaudio** — deep learning framework
98
- - **torchsummary** — model architecture summaries
99
- - **transformers** — HuggingFace pretrained AST model
100
- - **timm** — pretrained EfficientNet-B0
101
- - **scikit-learn** — stratified splitting, F1 scoring, Decision Tree baseline
102
- - **matplotlib** / **seaborn** — visualization
103
- - **wandb** — experiment tracking
104
- - **soundfile** — audio I/O
105
- - **tqdm** — progress bars
106
-
107
- ---
108
-
109
- ## Preprocessing Pipeline
110
-
111
- ### Audio Mixing
112
-
113
- 1. **Stem selection**: For each sample, randomly select a genre and load stems (drums, vocals, bass, other) from a random song at a random offset
114
- 2. **Mixing**: Sum all stem waveforms together and peak-normalize the mix
115
- 3. **Noise injection**: Add a random ESC-50 environmental sound clip, scaled to a target SNR
116
-
117
- ### Feature Extraction
118
-
119
- | Model | Feature | Sample Rate | Duration | Output Shape |
120
- |-------|---------|-------------|----------|-------------|
121
- | CRNN | 40 MFCCs → power_to_dB | 22,050 Hz | 5s | `(1, 40, 216)` |
122
- | EfficientNet | MFCCs (librosa default of 20 coefficients over a 128-mel filterbank) → power_to_dB | 22,050 Hz | 5s | `(1, 20, 216)` |
123
- | AST | 128-mel spectrogram → power_to_dB → normalized | 16,000 Hz | 10s | `(1024, 128)` |
124
-
125
- ### AST-Specific Augmentations
126
-
127
- - **Stem dropout**: 15% chance of skipping each stem type
128
- - **Random gain**: Each stem scaled by a random factor in [0.4, 1.2]
129
- - **Variable SNR**: Noise scaling divisor randomized between 2.0 and 8.0
130
- - **Normalization**: `(mel_dB + 4.26) / 4.56`
131
-
132
- ---
133
-
134
- ## Models
135
-
136
- ### Model 1: CRNN (CNN + Bidirectional GRU)
137
-
138
- A scratch-built architecture combining convolutional feature extraction with recurrent temporal modeling.
139
-
140
- ```
141
- Input (1, 40, 216)
142
- → Conv2d(1→32) + BatchNorm + ReLU + MaxPool(2×2) → (32, 20, 108)
143
- → Conv2d(32→64) + BatchNorm + ReLU + MaxPool(2×1) → (64, 10, 108)
144
- → Conv2d(64→128) + BatchNorm + ReLU + MaxPool(2×1) → (128, 5, 108)
145
- → Reshape to (108, 640)
146
- → Bidirectional GRU (hidden=128, 2 layers, dropout=0.3) → (108, 256)
147
- → Last timestep → Linear(256→10)
148
- ```
149
-
150
- ### Model 2: Audio Spectrogram Transformer (AST)
151
-
152
- Fine-tuned from the pretrained [`MIT/ast-finetuned-audioset-10-10-0.4593`](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) checkpoint via HuggingFace Transformers. The classification head is replaced with a 10-class output layer.
153
-
154
- ### Model 3: EfficientNet-B0
155
-
156
- Pretrained EfficientNet-B0 loaded via `timm` with `in_chans=1` (single-channel MFCC input) and `num_classes=10`.
157
-
158
- ---
159
-
160
- ## Training
161
-
162
- ### Hyperparameters
163
-
164
- | Parameter | CRNN | AST | EfficientNet |
165
- |-----------|------|-----|-------------|
166
- | **Optimizer** | Adam | AdamW | Adam |
167
- | **Learning rate** | 1e-3 | 1e-5 | 1e-3 |
168
- | **Weight decay** | 1e-5 | 0.01 | 1e-5 |
169
- | **Batch size** | 32 | 4 | 32 |
170
- | **Max epochs** | 17 | 15 | 20 |
171
- | **Early stopping patience** | 5 | 7 | 5 |
172
- | **Gradient accumulation** | — | 4 steps | — |
173
- | **Scheduler** | ReduceLROnPlateau | ReduceLROnPlateau | ReduceLROnPlateau |
174
- | **Loss function** | CrossEntropyLoss | CrossEntropyLoss | CrossEntropyLoss |
175
- | **Train samples** | 1,000 | 1,000 | 1,000 |
176
- | **Val samples** | 200 | 500 | 200 |
177
-
178
- ### Experiment Tracking
179
-
180
- All training runs are logged to [Weights & Biases (WandB)](https://wandb.ai/) with metrics for train loss, validation loss, F1 score, and learning rate per epoch.
181
-
182
- ---
183
-
184
- ## Results
185
-
186
- | Model | Max F1 (Validation) | Kaggle Leaderboard Score |
187
- |-------|:-------------------:|:------------------------:|
188
- | CRNN (scratch) | 0.5800 | 0.33103 |
189
- | EfficientNet-B0 | 0.5258 | 0.31641 |
190
- | **Audio Spectrogram Transformer** | **0.8861** | **0.85708** |
191
-
192
- The **AST model** significantly outperforms the other two, achieving nearly **0.86 on the leaderboard** — a 2.5× improvement over the CNN-based models. Key factors behind its success:
193
-
194
- - **Pretrained on AudioSet**: Large-scale audio pretraining provides strong feature representations
195
- - **Longer input duration**: 10s vs 5s captures more musical context
196
- - **Mel spectrogram input**: Richer frequency representation compared to MFCCs
197
- - **Stronger augmentation**: Stem dropout, variable gain, and variable SNR improve robustness
198
-
199
- ---
200
-
201
- ## Usage
202
-
203
- ### Training (in Kaggle notebook)
204
-
205
- 1. Set up the dataset paths in the notebook
206
- 2. Uncomment the training cell for the desired model:
207
- ```python
208
- # CRNN
209
- train(audioCRNN())
210
-
211
- # AST
212
- train_ast(MusicGenreAST(10))
213
-
214
- # EfficientNet
215
- train(EfficientNetAudio())
216
- ```
217
-
218
- ### Prediction
219
-
220
- ```python
221
- # AST (best model)
222
- res = predict(MusicGenreAST(10), 'best_ast_model.pth')
223
-
224
- # CRNN
225
- res = predict(audioCRNN(), 'best_crnnModel.pth')
226
-
227
- # EfficientNet
228
- res = predict(EfficientNetAudio(), 'best_efficientnet_model.pth')
229
- ```
230
-
231
- ### Generating Submission
232
-
233
- ```python
234
- submission_df = pd.read_csv('sample_submission.csv')
235
- submission = pd.DataFrame({"id": submission_df['id'], "genre": res})
236
- submission.to_csv("submission.csv", index=False)
237
- ```
238
-
239
- ---
240
-
241
- ## Acknowledgements
242
-
243
- - [MIT AST](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) — pretrained Audio Spectrogram Transformer
244
- - [ESC-50](https://github.com/karolpiczak/ESC-50) — environmental sound dataset used for noise augmentation
245
- - [timm](https://github.com/huggingface/pytorch-image-models) — PyTorch Image Models library
246
- - [Weights & Biases](https://wandb.ai/) — experiment tracking
247
- - [librosa](https://librosa.org/) — audio analysis library
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dl-23f3002677-notebook-t12026.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
milestone1.py DELETED
@@ -1,232 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """MileStone1.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1X-yYuntHy5LGU1wh8QNz1rTO1geAS-qh
8
- """
9
-
10
- import os
11
- import glob
12
- import numpy as np
13
- import pandas as pd
14
- from tqdm import tqdm
15
- import librosa
16
- import librosa.display
17
- import matplotlib.pyplot as plt
18
- import random
19
- import torch
20
- from sklearn.model_selection import StratifiedShuffleSplit
21
- from google.colab import drive
22
- from collections import Counter
23
-
24
- import warnings
25
- warnings.filterwarnings("ignore")
26
-
27
- drive.mount('/content/drive')
28
-
29
-
30
- drive_zip = '/content/drive/MyDrive/jan-2026-dl-gen-ai-project.zip'
31
- local_zip = '/content/data.zip'
32
- extract_to = '/content/dataset'
33
-
34
- if not os.path.exists(extract_to):
35
- print("Moving 16 GB from Drive to local SSD...")
36
- !rsync -ah --progress "{drive_zip}" "{local_zip}"
37
-
38
- print("Unzipping 25GB... (This will take a few minutes)")
39
- !unzip -q "{local_zip}" -d "{extract_to}"
40
-
41
- print("Deleting zip to save space...")
42
- os.remove(local_zip)
43
- print("Ready! Data is in /content/dataset")
44
- else:
45
- print("Data already exists.")
46
-
47
- DATA_SEED = 67
48
- TRAINING_SEED = 1234
49
- SR = 22050
50
- DURATION = 5.0
51
- N_FFT = 2048
52
- HOP_LENGTH = 512
53
- N_MELS = 128
54
- TOP_DB=20
55
- TARGET_SNR_DB = 10
56
-
57
- random.seed(DATA_SEED)
58
- np.random.seed(DATA_SEED)
59
- torch.manual_seed(DATA_SEED)
60
- torch.cuda.manual_seed(DATA_SEED)
61
-
62
- # CONFIGURATION
63
- DATA_ROOT = '/content/dataset/messy_mashup/genres_stems'
64
- GENRES = [] # Make the list of all genres available
65
- for genre in os.listdir(path=DATA_ROOT):
66
- GENRES.append(genre)
67
- display(GENRES)
68
- STEMS = {} # Write here stems file name
69
- STEM_KEYS = ['drums', 'vocals', 'bass', 'other']
70
- GENRE_TO_TEST = 'rock'
71
- SONG_INDEX = 0
72
-
73
def build_dataset(root_dir, val_split=0.17, seed=42):
    """Stratified train/val split of songs, returning stem-path indexes.

    Args:
        root_dir: directory laid out as ``root_dir/<genre>/<song>/<stem>.wav``.
        val_split: fraction of songs held out for validation.
        seed: random_state for the stratified split.

    Returns:
        (train_dataset, val_dataset): each shaped
        ``{genre: {stem_key: [file paths]}}``.
    """
    train_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
    val_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
    rows = []
    for genre in GENRES:
        genre_path = os.path.join(root_dir, genre)
        for song in os.listdir(genre_path):
            rows.append({
                "label": genre,
                "song": song
            })
    df = pd.DataFrame(rows)
    # BUG FIX: the original called display(song_df.head()) but `song_df` is
    # never defined (NameError on every call); the frame built here is `df`.
    display(df.head())
    X = df['song']
    y = df['label']

    sss = StratifiedShuffleSplit(n_splits=1, test_size=val_split, random_state=seed)

    # n_splits=1, so this loop body runs exactly once.
    for train_index, val_index in sss.split(X, y):
        train_df = df.iloc[train_index].reset_index(drop=True)
        val_df = df.iloc[val_index].reset_index(drop=True)

    for idx, row in train_df.iterrows():
        genre = row['label']
        song = row['song']
        song_path = os.path.join(root_dir, genre, song)
        for stem in os.listdir(song_path):
            train_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))

    for idx, row in val_df.iterrows():
        genre = row['label']
        song = row['song']
        song_path = os.path.join(root_dir, genre, song)
        for stem in os.listdir(song_path):
            val_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))

    # BUG FIX: the original printed len(val['jazz']['drums']), referencing the
    # module-level `val` before it exists (NameError on the first call); the
    # local split just built is `val_dataset`.
    print(len(val_dataset['jazz']['drums']))
    return train_dataset, val_dataset

tr, val = build_dataset(DATA_ROOT)
113
-
114
def find_long_silences(dataset_dict, sr=SR, threshold_sec=DURATION, top_db=TOP_DB):
    """
    Input:
        dataset_dict: The dictionary structure {genre: {stem: [paths...]}}
    Output:
        df: Pandas DataFrame containing details of all files with silence >= 5s
    """
    records = []
    for genre in GENRES:
        for key in STEM_KEYS:
            for file in dataset_dict[genre][key]:
                silence_type = []
                max_silence = 0
                y, sr = librosa.load(file, sr=sr)
                # Non-silent intervals; anything quieter than top_db counts
                # as silence.
                intervals = librosa.effects.split(y, top_db=top_db)
                file_length = len(y) / sr

                if intervals.size == 0:
                    # The entire file is below the silence threshold.
                    silence_type.append("FULL")
                    max_silence = file_length
                    total_silence_duration = file_length
                else:
                    # Leading silence before the first sounded interval.
                    start_gap = intervals[0][0] / sr
                    if start_gap > 0:
                        silence_type.append("START")
                        max_silence = max(max_silence, start_gap)

                    # Gaps between consecutive sounded intervals.
                    mid_silence_total = 0
                    for i in range(1, len(intervals)):
                        gap = (intervals[i][0] - intervals[i-1][1]) / sr
                        if gap > 0:
                            if "MIDDLE" not in silence_type:
                                silence_type.append("MIDDLE")
                            max_silence = max(max_silence, gap)
                            mid_silence_total += gap

                    # Trailing silence after the last sounded interval.
                    end_gap = (len(y) - intervals[-1][1]) / sr
                    if end_gap > 0:
                        silence_type.append("END")
                        max_silence = max(max_silence, end_gap)
                    total_silence_duration = start_gap + mid_silence_total + end_gap

                if max_silence >= threshold_sec:
                    records.append({
                        "Genre": genre,
                        "Stem": key,
                        # NOTE(review): "Duration" records the TOTAL silent
                        # time, not the file length.
                        "Duration": round(total_silence_duration, 2),
                        "Max_Silence_Sec": round(max_silence, 2),
                        "Silence_Location": ", ".join(silence_type),
                        "File_Path": file
                    })
    return pd.DataFrame(records)
171
-
172
-
173
# --- EXECUTION ---
# Scan the training dictionary `tr` for stems with long silent stretches.
df_silence = find_long_silences(tr, threshold_sec=DURATION, top_db=TOP_DB)

# --- RESULTS ANALYSIS ---
# Hint: Create a pivot Table: Count by Genre vs Stem

# Files whose only recorded silence is at the very start.
df_silence[df_silence["Silence_Location"] == 'START'].head()

# How many jazz drum stems contain a silent gap longer than 10 s?
df_silence[
    (df_silence['Stem'] == 'drums') &
    (df_silence['Genre'] == 'jazz') &
    (df_silence['Max_Silence_Sec'] > 10)
].shape
191
-
192
# Load the four stems of one rock song (first 5 s of each).
stems_audio = []
path = '/content/dataset/messy_mashup/genres_stems/rock/rock.00000'
try:
    for key in STEM_KEYS:
        y, sr = librosa.load(os.path.join(path, f'{key}.wav'), sr=SR, duration=5.0)
        stems_audio.append(y)
    print("Audio loaded successfully.")
except IndexError:
    print(f"ERROR: Song index {SONG_INDEX} out of range for genre {GENRE_TO_TEST}.")
except Exception as e:
    print(f"ERROR: {e}")

# Stack the stems into one (4, samples) numpy array.
stems_stack = np.asarray(stems_audio)
print(stems_stack.shape)

# Mix by summing the stems element-wise.
mix_raw = np.sum(stems_stack, axis=0)
print(mix_raw.shape)

# RMS amplitude computed manually: sqrt(mean of squares).
rms_val = np.sqrt(np.sum(mix_raw**2) / mix_raw.shape[0])
print(rms_val)

# Peak normalisation: divide by the largest absolute sample value.
max_val = np.max(np.abs(mix_raw))
print(max_val)

if max_val > 0:
    mix_norm = mix_raw / max_val
else:
    mix_norm = mix_raw  # all-zero signal: nothing to scale

print(np.max(np.abs(mix_norm)))

# VALIDATION: after peak normalisation the peak must be exactly 1.
assert np.isclose(np.max(np.abs(mix_norm)), 1.0), "Normalization failed."
232
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
milestone_2.py DELETED
@@ -1,258 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """Milestone_2.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1-r7NZ_PYS1JKGoinWyPplhGkitD7lHuV
8
- """
9
-
10
- import os
11
- import glob
12
- import numpy as np
13
- import pandas as pd
14
- from tqdm import tqdm
15
- import librosa
16
- import librosa.display
17
- import matplotlib.pyplot as plt
18
- import random
19
- import torch
20
- import soundfile as sf
21
- from sklearn.model_selection import StratifiedShuffleSplit
22
- from google.colab import drive
23
- from collections import Counter
24
- import matplotlib.pyplot as plt
25
- import seaborn as sns
26
- from sklearn.model_selection import train_test_split
27
- from sklearn.tree import DecisionTreeClassifier
28
- from sklearn.metrics import f1_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
29
- import warnings
30
- warnings.filterwarnings("ignore")
31
-
32
- drive.mount('/content/drive')
33
-
34
-
35
- drive_zip = '/content/drive/MyDrive/jan-2026-dl-gen-ai-project.zip'
36
- local_zip = '/content/data.zip'
37
- extract_to = '/content/dataset'
38
-
39
- if not os.path.exists(extract_to):
40
- print("Moving 16 GB from Drive to local SSD...")
41
- !rsync -ah --progress "{drive_zip}" "{local_zip}"
42
-
43
- print("Unzipping 25GB... (This will take a few minutes)")
44
- !unzip -q "{local_zip}" -d "{extract_to}"
45
-
46
- print("Deleting zip to save space...")
47
- os.remove(local_zip)
48
- print("Ready! Data is in /content/dataset")
49
- else:
50
- print("Data already exists.")
51
-
52
- DATA_SEED = 67
53
- TRAINING_SEED = 1234
54
- SR = 22050
55
- DURATION = 5.0
56
- N_FFT = 2048
57
- HOP_LENGTH = 512
58
- N_MELS = 128
59
- TOP_DB=20
60
- TARGET_SNR_DB = 10
61
-
62
- random.seed(DATA_SEED)
63
- np.random.seed(DATA_SEED)
64
- torch.manual_seed(DATA_SEED)
65
- torch.cuda.manual_seed(DATA_SEED)
66
-
67
- # CONFIGURATION
68
- DATA_ROOT = '/content/dataset/messy_mashup/genres_stems'
69
- GENRES = [] # Make the list of all genres available
70
- for genre in os.listdir(path=DATA_ROOT):
71
- GENRES.append(genre)
72
- display(GENRES)
73
- STEMS = {} # Write here stems file name
74
- STEM_KEYS = ['drums', 'vocals', 'bass', 'other']
75
- GENRE_TO_TEST = 'rock'
76
- SONG_INDEX = 0
77
-
78
- def build_dataset(root_dir, val_split=0.17, seed=42):
79
- train_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
80
- val_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
81
- rows = []
82
- for genre in GENRES:
83
- genre_path = os.path.join(root_dir, genre)
84
- for song in os.listdir(genre_path):
85
- rows.append({
86
- "label" : genre,
87
- "song" : song
88
- })
89
- df = pd.DataFrame(rows)
90
- X = df['song']
91
- y = df['label']
92
-
93
- sss = StratifiedShuffleSplit(n_splits=1, test_size=val_split, random_state=seed)
94
-
95
- for train_index, val_index in sss.split(X, y):
96
- train_df = df.iloc[train_index].reset_index(drop=True)
97
- val_df = df.iloc[val_index].reset_index(drop=True)
98
-
99
- for idx, row in train_df.iterrows():
100
- genre = row['label']
101
- song = row['song']
102
- song_path = os.path.join(root_dir, genre, song)
103
- for stem in os.listdir(song_path):
104
- train_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))
105
-
106
- for idx, row in val_df.iterrows():
107
- genre = row['label']
108
- song = row['song']
109
- song_path = os.path.join(root_dir, genre, song)
110
- for stem in os.listdir(song_path):
111
- val_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))
112
- return train_dataset, val_dataset
113
-
114
- tr, val = build_dataset(DATA_ROOT)
115
-
116
- # Question_1 and Question_2
117
- duration_arr = []
118
- for stem in STEM_KEYS:
119
- for song in tr['jazz'][stem]:
120
- y, sr = librosa.load(path=song)
121
- duration = librosa.get_duration(y=y, sr=sr)
122
- duration_arr.append(duration)
123
- display(np.mean(np.array(duration_arr)))
124
-
125
- # Question_2
126
- sr_set=set()
127
- for genre in GENRES:
128
- for stem in STEM_KEYS:
129
- for song in tr[genre][stem]:
130
- y, sr = librosa.load(path=song)
131
- sr_set.add(sr)
132
- display(list(sr_set))
133
-
134
- #Question_3
135
- counter = 0
136
- for genre in GENRES:
137
- for stem in STEM_KEYS:
138
- for song in tr[genre][stem]:
139
- if os.path.getsize(song) == 0:
140
- counter += 1
141
- display(counter)
142
-
143
- #Question_4
144
- amplitude_arr = []
145
-
146
- for genre in GENRES:
147
- for song in tr[genre]['vocals']:
148
- y, sr = sf.read(song)
149
-
150
- peak_linear = np.max(np.abs(y))
151
-
152
- if peak_linear > 0:
153
- peak_db = librosa.amplitude_to_db(np.array([peak_linear]), ref=1.0)[0]
154
- amplitude_arr.append(peak_db)
155
-
156
- average_peak_db = np.mean(amplitude_arr)
157
- print(f"Average Peak Level: {average_peak_db:.2f} dBFS")
158
-
159
- #Question_5
160
- centroids = []
161
- for stem in STEM_KEYS:
162
- for song in tr['blues'][stem]:
163
- y, sr = librosa.load(song)
164
- sc = librosa.feature.spectral_centroid(y=y, sr=sr)
165
- centroids.append(np.mean(sc))
166
- mean_blues_centroid = np.mean(centroids)
167
- print(f"Mean Spectral Centroid for Blues: {mean_blues_centroid:.2f} Hz")
168
-
169
- #Question_6
170
- genre_means = {}
171
- for genre in GENRES:
172
- genre_centroids = []
173
- for stem in STEM_KEYS:
174
- for song_path in tr[genre][stem]:
175
- y, sr = librosa.load(song_path)
176
- sc = librosa.feature.spectral_centroid(y=y, sr=sr)
177
- genre_centroids.append(np.mean(sc))
178
-
179
- genre_means[genre] = np.mean(genre_centroids)
180
-
181
- highest_genre = max(genre_means, key=genre_means.get)
182
- print(f"Genre with highest mean spectral centroid: {highest_genre} ({genre_means[highest_genre]:.2f} Hz)")
183
-
184
- #Question_7
185
- silence_count = 0
186
- threshold = librosa.db_to_amplitude(-20)
187
-
188
- for genre in GENRES:
189
- for stem_type in STEM_KEYS:
190
- for song_path in tr[genre][stem_type]:
191
- y, sr = librosa.load(song_path, duration=0.5)
192
- if np.max(np.abs(y)) < threshold:
193
- silence_count += 1
194
-
195
- print(f"Stems quieter than -20dB in the first 0.5s: {silence_count}")
196
-
197
- # --- 1. Setup and Preprocessing ---
198
- ROOT = '/content/dataset/messy_mashup'
199
- STEMS_PATH = os.path.join(ROOT, 'genres_stems')
200
- GENRES = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]
201
-
202
- def extract_features(song_path):
203
- # Load 10s at 22050Hz
204
- y, sr = librosa.load(os.path.join(song_path, 'other.wav'), sr=22050, duration=10)
205
- tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
206
- spec_cent = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
207
- zcr = np.mean(librosa.feature.zero_crossing_rate(y))
208
- rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
209
- return [float(tempo), spec_cent, zcr, rolloff]
210
-
211
- # --- 2. Data Preparation & Stratified Split ---
212
- data = []
213
- for g in GENRES:
214
- gp = os.path.join(STEMS_PATH, g)
215
- songs = [s for s in os.listdir(gp) if os.path.isdir(os.path.join(gp, s))]
216
- for s in songs[:50]: # Sampling 50 for speed; use all for final
217
- data.append({'path': os.path.join(gp, s), 'genre': g})
218
-
219
- df = pd.DataFrame(data)
220
- train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['genre'], random_state=42)
221
-
222
- # --- 3. Model Training (Decision Tree) ---
223
- X_train = np.array([extract_features(p) for p in train_df['path']])
224
- y_train = train_df['genre']
225
- X_val = np.array([extract_features(p) for p in val_df['path']])
226
- y_val = val_df['genre']
227
-
228
- clf = DecisionTreeClassifier(max_depth=5, random_state=42)
229
- clf.fit(X_train, y_train)
230
-
231
- y_pred = clf.predict(X_val)
232
- display(f'f1 Score is {f1_score(y_pred=y_pred, y_true=y_val, average='macro'):.2f}')
233
-
234
- report_dict = classification_report(y_val, y_pred, output_dict=True)
235
- df_report = pd.DataFrame(report_dict).transpose()
236
- plt.figure(figsize=(10, 6))
237
- sns.heatmap(df_report, annot=True, cmap="YlGnBu", fmt=".2f", linewidths=.5)
238
- plt.title('Genre Classification Performance')
239
- plt.show()
240
-
241
- display(np.mean(np.array(y_pred==y_val)))
242
-
243
- cm = confusion_matrix(y_val, y_pred)
244
- plt.figure(figsize=(10, 8))
245
- disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
246
- disp.plot(cmap='Blues', xticks_rotation='vertical', ax=plt.gca())
247
- plt.title('Confusion Matrix: Predicted vs True Genres')
248
- plt.show()
249
-
250
- true_positives = np.diag(cm)
251
- tp_per_genre = dict(zip(clf.classes_, true_positives))
252
- highest_tp_genre = max(tp_per_genre, key=tp_per_genre.get)
253
- print(f"Genre with the highest True Positives: {highest_tp_genre}")
254
-
255
- false_negatives = np.sum(cm, axis=1) - np.diag(cm)
256
- fn_per_genre = dict(zip(clf.classes_, false_negatives))
257
- best_genre = min(fn_per_genre, key=fn_per_genre.get)
258
- print(f"Genre with the lowest False Negatives: {best_genre}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt DELETED
@@ -1,14 +0,0 @@
1
- numpy
2
- pandas
3
- tqdm
4
- librosa
5
- matplotlib
6
- seaborn
7
- torch
8
- torchaudio
9
- torchsummary
10
- transformers
11
- timm
12
- scikit-learn
13
- soundfile
14
- wandb