gulabjam committed on
Commit ·
9306638
1
Parent(s): b243717
Remove all files except .gitattributes, best_ast_model.pth, and AST_README.md
Browse files- AST_Pipeline.py +0 -397
- CRNN_pipeline.py +0 -366
- Efficient_Net_Pipeline.py +0 -326
- README.md +0 -247
- dl-23f3002677-notebook-t12026.ipynb +0 -0
- milestone1.py +0 -232
- milestone_2.py +0 -258
- requirements.txt +0 -14
AST_Pipeline.py
DELETED
|
@@ -1,397 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import glob
|
| 3 |
-
from networkx import display
|
| 4 |
-
import numpy as np
|
| 5 |
-
import pandas as pd
|
| 6 |
-
from tqdm import tqdm
|
| 7 |
-
import librosa
|
| 8 |
-
import librosa.display
|
| 9 |
-
import matplotlib.pyplot as plt
|
| 10 |
-
import random
|
| 11 |
-
import torch
|
| 12 |
-
import wandb
|
| 13 |
-
from torchsummary import summary
|
| 14 |
-
from torch.utils.data.dataset import Dataset
|
| 15 |
-
from torch.utils.data import DataLoader
|
| 16 |
-
from torch.optim.lr_scheduler import ReduceLROnPlateau
|
| 17 |
-
from transformers import ASTForAudioClassification, ASTConfig
|
| 18 |
-
import torch.nn as nn
|
| 19 |
-
from sklearn.model_selection import StratifiedShuffleSplit
|
| 20 |
-
from collections import Counter
|
| 21 |
-
import seaborn as sns
|
| 22 |
-
from sklearn.tree import DecisionTreeClassifier
|
| 23 |
-
from sklearn.metrics import f1_score
|
| 24 |
-
import warnings
|
| 25 |
-
# Silence noisy library warnings (librosa / transformers emit many).
warnings.filterwarnings("ignore")

# Run on GPU when available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Is GPU available? {torch.cuda.is_available()}")
print(f"Current device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

# Experiment-wide constants (audio front-end parameters + seeds).
DATA_SEED = 67
TRAINING_SEED = 1234  # NOTE(review): defined but never used below — confirm intent
SR = 22050
DURATION = 5.0
N_FFT = 2048
HOP_LENGTH = 512
N_MELS = 128
TOP_DB = 20
TARGET_SNR_DB = 10

# Seed every RNG so the split / augmentation stream is reproducible.
random.seed(DATA_SEED)
np.random.seed(DATA_SEED)
torch.manual_seed(DATA_SEED)
torch.cuda.manual_seed(DATA_SEED)

DATA_ROOT = '/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/genres_stems'
# One sub-directory per genre; keep os.listdir order unchanged so the
# label ids below stay compatible with previously trained checkpoints.
GENRES = list(os.listdir(DATA_ROOT))
# BUG FIX: `display` came from `from networkx import display`, which is not
# a documented networkx API (display is an IPython builtin in notebooks);
# plain print works in any environment.
print(GENRES)
STEMS = {}  # Write here stems file name
STEM_KEYS = ['drums', 'vocals', 'bass', 'other']
GENRE_TO_TEST = 'rock'
SONG_INDEX = 0
# Bidirectional genre-name <-> integer-id maps used by datasets and predict().
genre_to_id = {genre: i for i, genre in enumerate(GENRES)}
id_to_genre = {i: genre for i, genre in enumerate(GENRES)}
|
| 57 |
-
|
| 58 |
-
# Build the dataset in the format {genre: {stem_type: [file_paths]}}
|
| 59 |
-
# Build the dataset in the format {genre: {stem_type: [file_paths]}}
def build_dataset(root_dir, val_split=0.17, seed=42):
    """Split songs into stratified train/val sets and index their stem files.

    Args:
        root_dir: directory laid out as root_dir/<genre>/<song>/<stem>.wav.
        val_split: fraction of songs per genre held out for validation.
        seed: random_state for the stratified split.

    Returns:
        (train_dataset, val_dataset), each {genre: {stem_type: [paths]}}.
    """
    train_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
    val_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}

    rows = [{"label": genre, "song": song}
            for genre in GENRES
            for song in os.listdir(os.path.join(root_dir, genre))]
    df = pd.DataFrame(rows)

    sss = StratifiedShuffleSplit(n_splits=1, test_size=val_split, random_state=seed)
    # n_splits=1, so take the single split directly instead of looping and
    # leaking loop variables out of scope (the original relied on that leak).
    train_index, val_index = next(sss.split(df['song'], df['label']))
    train_df = df.iloc[train_index].reset_index(drop=True)
    val_df = df.iloc[val_index].reset_index(drop=True)

    def _index_stems(split_df, dataset):
        # Register every stem wav of every song under its genre/stem bucket.
        for _, row in split_df.iterrows():
            genre = row['label']
            song_path = os.path.join(root_dir, genre, row['song'])
            for stem in os.listdir(song_path):
                key = stem.replace('.wav', '')
                # Guard: the original raised KeyError on any file that was not
                # one of STEM_KEYS; skip unexpected files instead.
                if key in dataset[genre]:
                    dataset[genre][key].append(os.path.join(song_path, stem))

    _index_stems(train_df, train_dataset)
    _index_stems(val_df, val_dataset)
    return train_dataset, val_dataset
|
| 94 |
-
|
| 95 |
-
# Build the stem-file indexes once for the whole script.
tr, val = build_dataset(DATA_ROOT)

# AST (Audio Spectrogram Transformer) input spec: 16 kHz audio and
# spectrograms padded/truncated to 1024 time frames.
AST_SR = 16000
AST_TARGET_FRAMES = 1024
|
| 101 |
-
|
| 102 |
-
# A custom Dataset that mixes random stems on-the-fly, adds environmental
# noise at a random SNR, and returns an AST-ready log-mel tensor of shape
# (AST_TARGET_FRAMES, 128) plus the integer genre label.
class ASTAudioDataset(Dataset):
    def __init__(self, data_dict, length, genres, stem_types, noise_files, duration=10.0):
        """
        Args:
            data_dict: {genre: {stem_type: [wav paths]}} (from build_dataset).
            length: virtual epoch size — every item is generated randomly.
            genres: list of genre names to sample from.
            stem_types: stem keys to mix (e.g. drums/vocals/bass/other).
            noise_files: wav paths used as additive background noise.
            duration: seconds of audio per sample.
        """
        self.data_dict = data_dict
        self.length = length
        self.genres = genres
        self.stem_types = stem_types
        self.noise_files = noise_files
        self.duration = duration
        self.sr = AST_SR

    def __len__(self):
        return self.length

    def __getitem__(self, idx, retries=0):
        # After too many failed mixes, restart from a fresh random index.
        if retries > 10:
            return self.__getitem__(random.randint(0, self.length - 1), 0)

        genre = random.choice(self.genres)  # random genre for this sample
        # Source clips appear to be ~28 s long — TODO confirm against data.
        max_start = max(0, 28 - self.duration)
        base_start = random.uniform(0, max_start)
        target_len = int(self.sr * self.duration)
        stems_data = []

        for stem_type in self.stem_types:
            # Randomly drop ~15% of stems as augmentation.
            if random.random() < 0.15:
                continue
            try:
                path = random.choice(self.data_dict[genre][stem_type])
                y, _ = librosa.load(path, sr=self.sr, offset=base_start, duration=self.duration)
                y = librosa.util.fix_length(y, size=target_len)
                y = y * random.uniform(0.4, 1.2)  # random per-stem gain
                # Keep only stems with audible energy (RMS threshold).
                if np.sqrt(np.mean(y**2)) > 0.001:
                    stems_data.append(y)
            # BUG FIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; catch Exception only.
            except Exception:
                continue

        if not stems_data:
            return self.__getitem__(idx, retries + 1)

        mix = np.sum(stems_data, axis=0)
        peak = np.max(np.abs(mix))
        if peak > 0:
            mix = mix / peak  # peak-normalize the mix to prevent clipping

        # Add background noise at a random SNR (signal/noise power 2x–8x).
        noise_path = random.choice(self.noise_files)
        noise_y, _ = librosa.load(noise_path, sr=self.sr, offset=base_start, duration=self.duration)
        noise_y = librosa.util.fix_length(noise_y, size=target_len)

        p_signal = np.mean(mix**2) + 1e-9
        p_noise = np.mean(noise_y**2) + 1e-9
        target_noise_power = p_signal / random.uniform(2.0, 8.0)
        final_audio = mix + np.sqrt(target_noise_power / p_noise) * noise_y

        # AST front-end: 128 mel bins, 400-sample window / 160-sample hop.
        mel = librosa.feature.melspectrogram(
            y=final_audio,
            sr=self.sr,
            n_mels=128,
            n_fft=400,
            hop_length=160
        )
        mel_db = librosa.power_to_db(mel, ref=np.max)
        # Fixed normalization constants — presumably AST's AudioSet
        # mean/std stats; confirm against the pretrained model's config.
        mel_db = (mel_db + 4.26) / 4.56
        mel_tensor = torch.tensor(mel_db).float().T  # (frames, mels)

        # Pad or truncate to exactly AST_TARGET_FRAMES frames.
        if mel_tensor.shape[0] < AST_TARGET_FRAMES:
            padding_needed = AST_TARGET_FRAMES - mel_tensor.shape[0]
            mel_tensor = torch.nn.functional.pad(mel_tensor, (0, 0, 0, padding_needed))
        else:
            mel_tensor = mel_tensor[:AST_TARGET_FRAMES, :]

        return mel_tensor, genre_to_id[genre]
|
| 186 |
-
# Every ESC-50 clip doubles as a background-noise source for augmentation.
noise_files_list = [os.path.join('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/ESC-50-master/audio', f)
                    for f in os.listdir('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/ESC-50-master/audio') if f.endswith('.wav')]

# Virtual datasets: 1000 random mixes per training epoch, 500 for validation.
train_ds = ASTAudioDataset(
    data_dict=tr,
    length=1000,
    genres=GENRES,
    stem_types=STEM_KEYS,
    noise_files=noise_files_list
)

val_ds = ASTAudioDataset(
    data_dict=val,
    length=500,
    genres=GENRES,
    stem_types=STEM_KEYS,
    noise_files=noise_files_list,
)

# NOTE(review): batch_size=4 presumably chosen for GPU memory given the
# large (1024 x 128) AST inputs — confirm before changing.
train_loader = DataLoader(
    train_ds,
    batch_size=4,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

val_loader = DataLoader(
    val_ds,
    batch_size=4,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)
|
| 220 |
-
|
| 221 |
-
# Loading pre-trained model: wrap the Audio Spectrogram Transformer and
# replace its classification head with one sized for our genre set.
class MusicGenreAST(nn.Module):
    def __init__(self, num_classes):
        super(MusicGenreAST, self).__init__()
        # ignore_mismatched_sizes lets from_pretrained swap the original
        # classifier head for a num_classes-way one.
        self.ast = ASTForAudioClassification.from_pretrained(
            "MIT/ast-finetuned-audioset-10-10-0.4593",
            num_labels=num_classes,
            ignore_mismatched_sizes=True
        )

    def forward(self, x):
        # Return the full transformers output object (callers read .logits).
        return self.ast(x)
|
| 234 |
-
|
| 235 |
-
#Train loop
def train_ast(model_instance):
    """Fine-tune the AST genre classifier.

    Uses gradient accumulation (effective batch = 4 x loader batch),
    ReduceLROnPlateau on macro-F1, checkpointing of the best model to
    'best_ast_model.pth', early stopping, and wandb metric logging.
    """
    num_epochs = 15
    patience_counter = 0
    patience_limit = 7
    best_score = float('-inf')
    accumulation_steps = 4

    model = model_instance.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)  # AdamW with weight decay for better generalization
    scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='max', factor=0.5, patience=2, min_lr=1e-7)  # reduce LR when macro-F1 plateaus
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        # Training Phase
        model.train()
        tot_train_loss = 0
        optimizer.zero_grad()
        num_batches = 0

        for i, (mel_spec, genre_id) in enumerate(train_loader):
            mel_spec, genre_id = mel_spec.to(device), genre_id.to(device)

            outputs = model(mel_spec).logits
            loss = criterion(outputs, genre_id)
            loss = loss / accumulation_steps  # scale so accumulated grads average correctly
            loss.backward()

            if (i + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            tot_train_loss += (loss.item() * accumulation_steps) * mel_spec.size(0)
            num_batches = i + 1

        # BUG FIX: flush a trailing partial accumulation window. Previously,
        # when len(train_loader) was not a multiple of accumulation_steps,
        # the final batches' gradients were silently discarded every epoch.
        if num_batches % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        epoch_train_loss = tot_train_loss / len(train_loader.dataset)

        # Validation Phase
        model.eval()
        tot_val_loss = 0
        genre_pred_tot = []
        genre_true_tot = []

        with torch.no_grad():
            for mel_spec, genre_id in val_loader:
                mel_spec, genre_id = mel_spec.to(device), genre_id.to(device)

                outputs = model(mel_spec).logits
                val_loss = criterion(outputs, genre_id)
                preds = torch.argmax(outputs, dim=1)

                genre_true_tot.extend(genre_id.cpu().numpy())
                genre_pred_tot.extend(preds.cpu().numpy())
                tot_val_loss += val_loss.item() * mel_spec.size(0)

        epoch_validate_loss = tot_val_loss / len(val_loader.dataset)
        genre_score = f1_score(y_true=genre_true_tot, y_pred=genre_pred_tot, average='macro')
        current_lr = optimizer.param_groups[0]['lr']

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Train Loss: {epoch_train_loss:.4f} | Val Loss: {epoch_validate_loss:.4f}')
        print(f'F1 Score: {genre_score:.4f} | LR: {current_lr:.6f}\n')

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": epoch_train_loss,
            "val_loss": epoch_validate_loss,
            "score": genre_score,
            "learning_rate": current_lr
        })

        scheduler.step(genre_score)

        if genre_score > best_score:
            best_score = genre_score
            patience_counter = 0
            torch.save(model.state_dict(), 'best_ast_model.pth')
            print(f"New Best Score! Model saved.")
        else:
            patience_counter += 1
        torch.cuda.empty_cache()  # release cached GPU memory between epochs

        if patience_counter >= patience_limit:
            print('----- Early Stopping Triggered -----')
            break

    wandb.finish()

train_ast(MusicGenreAST(10))
|
| 321 |
-
|
| 322 |
-
# Prediction dataset for the test set: deterministic (no augmentation),
# same mel front-end and frame padding as the training pipeline.
class ASTTestDataset(Dataset):
    def __init__(self, test_df):
        self.data = test_df
        self.sr = 16000
        self.duration = 10.0
        self.target_frames = 1024

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        filename = self.data.iloc[idx]['filename']
        path = os.path.join('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup', filename)

        # Load a fixed-length window from the start of the clip.
        audio, _ = librosa.load(path, sr=self.sr, offset=0, duration=self.duration)
        audio = librosa.util.fix_length(audio, size=int(self.sr * self.duration))

        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak  # peak-normalize

        spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.sr,
            n_mels=128,
            n_fft=400,
            hop_length=160
        )
        # Same fixed normalization constants as the training dataset.
        spec_db = (librosa.power_to_db(spec, ref=np.max) + 4.26) / 4.56
        frames = torch.tensor(spec_db).float().T  # (frames, mels)

        # Pad or crop the time axis to exactly target_frames.
        n = frames.shape[0]
        if n < self.target_frames:
            frames = torch.nn.functional.pad(frames, (0, 0, 0, self.target_frames - n))
        else:
            frames = frames[:self.target_frames, :]

        return frames, filename
|
| 366 |
-
# Load the test manifest and build a deterministic inference loader.
test_df = pd.read_csv('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/test.csv')

test_data = ASTTestDataset(test_df)

test_loader = DataLoader(
    dataset=test_data,
    batch_size=4,
    shuffle=False,
    num_workers=2,
    # BUG FIX: `device == 'cuda'` compared a torch.device to a str and was
    # always False; compare the device *type* so pinning works on GPU hosts.
    pin_memory=(device.type == 'cuda')
)
|
| 377 |
-
|
| 378 |
-
# Prediction loop for test set
def predict(model_instance, model_path):
    """Load weights from model_path and return predicted genre names
    (one per row of test_loader's dataset, in order)."""
    model = model_instance.to(device)
    # BUG FIX: map_location keeps loading working on CPU-only hosts even
    # when the checkpoint was saved from a GPU run.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    genre_pred = []
    for mel_specs, _ in test_loader:
        mel_specs = mel_specs.to(device)
        if mel_specs.dim() == 4:
            mel_specs = mel_specs.squeeze(1)  # drop a stray channel dim if present

        with torch.no_grad():
            outputs = model(mel_specs).logits
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        genre_pred.extend(preds.tolist())
    return [id_to_genre[g] for g in genre_pred]

res = predict(MusicGenreAST(10), '/kaggle/input/models/bhavin273/ast-audio-model/pytorch/default/1/best_ast_model.pth')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CRNN_pipeline.py
DELETED
|
@@ -1,366 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import glob
|
| 3 |
-
import numpy as np
|
| 4 |
-
import pandas as pd
|
| 5 |
-
from tqdm import tqdm
|
| 6 |
-
import librosa
|
| 7 |
-
import librosa.display
|
| 8 |
-
import matplotlib.pyplot as plt
|
| 9 |
-
import random
|
| 10 |
-
import torch
|
| 11 |
-
import wandb
|
| 12 |
-
from torchsummary import summary
|
| 13 |
-
from torch.utils.data.dataset import Dataset
|
| 14 |
-
from torch.utils.data import DataLoader
|
| 15 |
-
from torch.optim.lr_scheduler import ReduceLROnPlateau
|
| 16 |
-
from transformers import ASTForAudioClassification, ASTConfig
|
| 17 |
-
import torch.nn as nn
|
| 18 |
-
from sklearn.model_selection import StratifiedShuffleSplit
|
| 19 |
-
from collections import Counter
|
| 20 |
-
import seaborn as sns
|
| 21 |
-
from sklearn.tree import DecisionTreeClassifier
|
| 22 |
-
from sklearn.metrics import f1_score
|
| 23 |
-
import warnings
|
| 24 |
-
# Silence noisy library warnings.
warnings.filterwarnings("ignore")

# Run on GPU when available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Is GPU available? {torch.cuda.is_available()}")
print(f"Current device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

# Experiment-wide constants (audio front-end parameters + seeds).
DATA_SEED = 67
TRAINING_SEED = 1234  # NOTE(review): defined but never used below — confirm intent
SR = 22050
DURATION = 5.0
N_FFT = 2048
HOP_LENGTH = 512
N_MELS = 128
TOP_DB = 20
TARGET_SNR_DB = 10

# Seed every RNG so the split / augmentation stream is reproducible.
random.seed(DATA_SEED)
np.random.seed(DATA_SEED)
torch.manual_seed(DATA_SEED)
torch.cuda.manual_seed(DATA_SEED)

DATA_ROOT = '/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/genres_stems'
# One sub-directory per genre; keep os.listdir order unchanged so the
# label ids below stay compatible with previously trained checkpoints.
GENRES = list(os.listdir(DATA_ROOT))
# BUG FIX: `display` is never imported in this file, so as a plain script
# display(GENRES) raised NameError (it only works inside a notebook).
print(GENRES)

STEMS = {}  # Write here stems file name
STEM_KEYS = ['drums', 'vocals', 'bass', 'other']
GENRE_TO_TEST = 'rock'
SONG_INDEX = 0
# Bidirectional genre-name <-> integer-id maps used by datasets and predict().
genre_to_id = {genre: i for i, genre in enumerate(GENRES)}
id_to_genre = {i: genre for i, genre in enumerate(GENRES)}
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
# Function to build the dataset: splits songs into stratified train/val sets,
# then indexes each song's stem files as {genre: {stem_type: [file_paths]}}.
def build_dataset(root_dir, val_split=0.17, seed=42):
    """Split songs into stratified train/val sets and index their stem files.

    Args:
        root_dir: directory laid out as root_dir/<genre>/<song>/<stem>.wav.
        val_split: fraction of songs per genre held out for validation.
        seed: random_state for the stratified split.

    Returns:
        (train_dataset, val_dataset), each {genre: {stem_type: [paths]}}.
    """
    train_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
    val_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}

    rows = [{"label": genre, "song": song}
            for genre in GENRES
            for song in os.listdir(os.path.join(root_dir, genre))]
    df = pd.DataFrame(rows)

    sss = StratifiedShuffleSplit(n_splits=1, test_size=val_split, random_state=seed)
    # n_splits=1, so take the single split directly instead of looping and
    # leaking loop variables out of scope (the original relied on that leak).
    train_index, val_index = next(sss.split(df['song'], df['label']))
    train_df = df.iloc[train_index].reset_index(drop=True)
    val_df = df.iloc[val_index].reset_index(drop=True)

    def _index_stems(split_df, dataset):
        # Register every stem wav of every song under its genre/stem bucket.
        for _, row in split_df.iterrows():
            genre = row['label']
            song_path = os.path.join(root_dir, genre, row['song'])
            for stem in os.listdir(song_path):
                key = stem.replace('.wav', '')
                # Guard: the original raised KeyError on any file that was not
                # one of STEM_KEYS; skip unexpected files instead.
                if key in dataset[genre]:
                    dataset[genre][key].append(os.path.join(song_path, stem))

    _index_stems(train_df, train_dataset)
    _index_stems(val_df, val_dataset)
    return train_dataset, val_dataset
|
| 95 |
-
|
| 96 |
-
# Build the stem-file indexes once for the whole script.
tr, val = build_dataset(DATA_ROOT)
|
| 97 |
-
|
| 98 |
-
# On-the-fly training dataset: mixes the four stems of a random genre, adds
# ESC-50 background noise at a fixed power ratio, and returns an MFCC "image"
# of shape (1, 40, 216) plus the integer genre label.
class audioDataset (Dataset):
    def __init__ (self, data_dict, length, genres, stem_types, noise_files, sr=SR, duration=DURATION, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS):
        self.data_dict = data_dict
        self.sr = sr
        self.genres = genres
        self.stem_types = stem_types
        self.duration = duration
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.noise_files = noise_files
        self.length = length  # virtual epoch size — every item is random

    def __len__(self):
        return self.length

    def __getitem__(self, idx, retries=0):
        # After too many silent draws, restart from a fresh random index.
        if retries > 10:
            idx = random.randint(0, self.length - 1)
            return self.__getitem__(idx, 0)
        genre = random.choice(self.genres)
        # 0–24 s offset window — assumes clips are long enough; TODO confirm.
        start_time = random.uniform(0, 24)
        # FIX (consistency): use the configured sr/duration instead of the
        # hard-coded 22050/5.0 — identical behavior with the defaults, and
        # correct when the constructor parameters are changed.
        target_len = int(self.sr * self.duration)
        stems_data = []

        for stem_type in self.stem_types:
            path = random.choice(self.data_dict[genre][stem_type])
            y, _ = librosa.load(path, sr=self.sr, offset=start_time, duration=self.duration)
            y = librosa.util.fix_length(y, size=target_len)
            # Reject mixes containing an essentially silent stem and retry.
            rms = np.sqrt(np.mean(y**2))
            if rms < 0.001:
                return self.__getitem__(idx, retries + 1)
            stems_data.append(y)

        mix = np.sum(stems_data, axis=0) / 4.0  # average the four stems
        if np.max(np.abs(mix)) > 0:
            mix = mix / np.max(np.abs(mix))  # peak-normalize

        noise_path = random.choice(self.noise_files)
        noise_y, _ = librosa.load(noise_path, sr=self.sr, offset=start_time, duration=self.duration)
        noise_y = librosa.util.fix_length(noise_y, size=target_len)

        # Scale noise to 1/10 of the signal power (a 10 dB power ratio).
        p_signal = np.mean(mix**2) + 1e-9
        p_noise = np.mean(noise_y**2) + 1e-9
        target_noise_power = p_signal / 10.0
        scaling_factor = np.sqrt(target_noise_power / p_noise)

        final_audio = mix + (scaling_factor * noise_y)

        mfcc = librosa.feature.mfcc(
            y=final_audio,
            sr=self.sr,
            n_mfcc=40,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels
        )

        # NOTE(review): power_to_db on MFCCs (already cepstral values) is
        # unusual; kept unchanged because trained checkpoints expect it.
        mfcc_db = librosa.power_to_db(mfcc, ref=np.max)
        return torch.tensor(mfcc_db).float().unsqueeze(0), genre_to_id[genre] #shape(1, 40, 216)
|
| 158 |
-
|
| 159 |
-
# Every ESC-50 clip doubles as a background-noise source for augmentation.
noise_files_list = [os.path.join('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/ESC-50-master/audio', f)
                    for f in os.listdir('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/ESC-50-master/audio') if f.endswith('.wav')]

# Virtual datasets: 1000 random mixes per training epoch, 200 for validation.
train_ds = audioDataset(tr, 1000, GENRES, STEM_KEYS, noise_files_list)
val_ds = audioDataset(val, 200, GENRES, STEM_KEYS, noise_files_list)

# BUG FIX (both loaders): `device == 'cuda'` compared a torch.device to a
# str and was always False; compare device.type so pinning works on GPU.
train_loader = DataLoader(dataset=train_ds,
                          batch_size=32,
                          shuffle=True,
                          num_workers=2,
                          pin_memory=(device.type == 'cuda'))

val_loader = DataLoader(dataset=val_ds,
                        batch_size=32,
                        shuffle=False,
                        num_workers=2,
                        pin_memory=(device.type == 'cuda'))
|
| 176 |
-
|
| 177 |
-
# Model architecture: three Conv-BN-ReLU-Pool blocks distill the (1, 40, 216)
# MFCC image, a 2-layer bidirectional GRU models the time axis, and a linear
# head emits 10 genre logits.
class audioCRNN(nn.Module):
    def __init__(self):
        super(audioCRNN, self).__init__()

        def conv_block(in_ch, out_ch, pool):
            # Conv (padding keeps H x W) -> BN -> ReLU -> MaxPool.
            return nn.Sequential(
                nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=pool)
            )

        self.conv1 = conv_block(1, 32, 2)         # -> (B, 32, 20, 108)
        self.conv2 = conv_block(32, 64, (2, 1))   # -> (B, 64, 10, 108)
        self.conv3 = conv_block(64, 128, (2, 1))  # -> (B, 128, 5, 108)

        self.gru = nn.GRU(input_size=128 * 5, hidden_size=128,
                          num_layers=2, batch_first=True,
                          bidirectional=True, dropout=0.3)  # -> (B, 108, 256)

        self.genre_head = nn.Linear(256, 10)

    def forward(self, x):
        feats = self.conv3(self.conv2(self.conv1(x)))
        # Move time to axis 1 and fold (channels, mels) into one feature axis.
        feats = feats.permute(0, 3, 1, 2)
        b, t, c, m = feats.size()
        seq = feats.contiguous().view(b, t, c * m)
        out, _ = self.gru(seq)
        # Classify from the GRU output at the final time step.
        return self.genre_head(out[:, -1, :])
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
# Instantiate once up front to inspect layer shapes / parameter counts.
model = audioCRNN().to(device) # Printing Model Summary
summary(model, (1, 40, 216))
|
| 225 |
-
|
| 226 |
-
# The training fn: one optimization step on a single batch.
# Returns the batch's mean loss as a Python float.
def train_epoch (train_batch, audioCRNN, optimizer, criterion):
    audioCRNN.train()  # enable dropout / batch-norm updates
    inputs, targets = train_batch
    inputs = inputs.to(device)
    targets = targets.to(device)

    optimizer.zero_grad()            # clear stale gradients
    logits = audioCRNN(inputs)
    batch_loss = criterion(logits, targets)
    batch_loss.backward()            # backpropagate
    optimizer.step()                 # apply the weight update
    return batch_loss.item()
|
| 238 |
-
|
| 239 |
-
# The validation fn: evaluate one batch without gradients.
# Returns (mean loss as float, argmax genre predictions as a numpy array).
def validate_epoch (validation_batch, audioCRNN, criterion):
    audioCRNN.eval()  # evaluation mode: dropout off, BN uses running stats
    inputs, targets = validation_batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    with torch.no_grad():  # gradients disabled for validation
        logits = audioCRNN(inputs)
        batch_loss = criterion(logits, targets)
        # softmax then argmax gives the predicted genre index per sample
        probs = torch.softmax(logits, dim=1)
        preds = torch.argmax(probs, dim=1).cpu().numpy()
    return batch_loss.item(), preds
|
| 250 |
-
|
| 251 |
-
#The training loop
def train (audioCRNN):
    """Train the CRNN with LR-on-plateau scheduling, early stopping,
    wandb logging, and best-checkpoint saving (macro-F1 criterion).

    Saves the best model to 'best_finetuned_model.pth'.
    """
    num_epochs = 17
    patience_counter = 0
    patience_limit = 5
    best_score = float('-inf')
    model = audioCRNN.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    # scheduler to adjust the optimizer in case the curve flattens
    scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='max', factor=0.5, patience=2, min_lr=1e-6)
    criterion = nn.CrossEntropyLoss()

    for epoch in range (num_epochs):
        tot_train_loss = 0
        # Training loop
        for mel_freq, genre_id in train_loader:
            train_loss = train_epoch((mel_freq, genre_id), model, optimizer, criterion)
            tot_train_loss += train_loss * mel_freq.size(0) # Batch mean loss multiplied by batch size for total batch loss
        epoch_train_loss = tot_train_loss/len(train_loader.dataset) # Total Train loss per epoch
        tot_val_loss = 0
        genre_pred_tot = []
        genre_true_tot = []
        # Validation loop
        for mel_freq, genre_id in val_loader:
            genre_true_tot.extend(genre_id.cpu().numpy())
            val_loss, genre_pred = validate_epoch((mel_freq, genre_id), model, criterion)
            genre_pred_tot.extend(genre_pred)
            tot_val_loss += val_loss * mel_freq.size(0)
        epoch_validate_loss = tot_val_loss/len(val_loader.dataset)
        genre_score = f1_score(y_true=genre_true_tot, y_pred=genre_pred_tot, average='macro') # f1 macro for evaluating the genre score
        current_lr = optimizer.param_groups[0]['lr']

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Train Loss : {epoch_train_loss:.4f} | Validation Loss : {epoch_validate_loss:.4f}')
        print(f'Score : {genre_score:.4f}')
        print(f'Current learning Rate is : {current_lr:.6f}\n')

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": epoch_train_loss,
            "val_loss": epoch_validate_loss,
            "score": genre_score,
            "learning_rate": current_lr
        })

        # Scheduler tracks the validation F1, not the loss (mode='max').
        scheduler.step(genre_score)
        if genre_score > best_score:
            patience_counter = 0
            best_score = genre_score
            torch.save(model.state_dict(), 'best_finetuned_model.pth') # save model
        else:
            patience_counter += 1

        if patience_counter >= patience_limit:
            print('-----Early Stopping------')
            break
    wandb.finish()

train(audioCRNN())
|
| 312 |
-
|
| 313 |
-
# Test manifest: one row per clip, with a 'filename' column.
test_df = pd.read_csv('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/test.csv')

# Inference dataset: deterministic 5-second MFCC features per test clip,
# shaped (1, 40, 216) to match the training pipeline.
class TestDataSet (Dataset):
    def __init__ (self, test_df):
        self.data = test_df

    def __len__(self):
        return len (self.data)

    def __getitem__(self, idx):
        clip_len = int(22050 * 5.0)
        rel_name = self.data.iloc[idx]['filename']
        path = os.path.join('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup', rel_name)

        audio, _ = librosa.load(path, sr=22050, offset=0, duration=5.0)
        audio = librosa.util.fix_length(audio, size=clip_len)

        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak  # peak-normalize, matching the training mixes

        coeffs = librosa.feature.mfcc(
            y=audio,
            sr=22050,
            n_mfcc=40,
            n_fft=2048,
            hop_length=512,
            n_mels=128
        )
        coeffs_db = librosa.power_to_db(coeffs, ref=np.max)
        return torch.tensor(coeffs_db).float().unsqueeze(0)
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
test_data = TestDataSet(test_df)
|
| 343 |
-
|
| 344 |
-
test_loader = DataLoader(dataset=test_data,
|
| 345 |
-
batch_size=32,
|
| 346 |
-
num_workers=2,
|
| 347 |
-
pin_memory=True if device == 'cuda' else False)
|
| 348 |
-
|
| 349 |
-
genre_pred = []
|
| 350 |
-
def predict (model_name, model_path):
|
| 351 |
-
model = model_name.to(device)
|
| 352 |
-
model.load_state_dict(torch.load(model_path))
|
| 353 |
-
|
| 354 |
-
model.eval()
|
| 355 |
-
|
| 356 |
-
for mfccs in test_loader:
|
| 357 |
-
mfccs = mfccs.to(device)
|
| 358 |
-
with torch.no_grad():
|
| 359 |
-
genre_id = model (mfccs).logits
|
| 360 |
-
genre_out = torch.softmax(genre_id, dim=1)
|
| 361 |
-
genre_out = torch.argmax(genre_out, dim=1).cpu().numpy()
|
| 362 |
-
genre_pred.extend(genre_out.tolist())
|
| 363 |
-
final_result = [id_to_genre[g] for g in genre_pred]
|
| 364 |
-
return final_result
|
| 365 |
-
|
| 366 |
-
res = predict (audioCRNN(), '/kaggle/input/models/bhavin273/crnn-model/pytorch/default/1/best_crnnModel.pth')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Efficient_Net_Pipeline.py
DELETED
|
@@ -1,326 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import glob
|
| 3 |
-
from networkx import display
|
| 4 |
-
import numpy as np
|
| 5 |
-
import pandas as pd
|
| 6 |
-
from tqdm import tqdm
|
| 7 |
-
import librosa
|
| 8 |
-
import librosa.display
|
| 9 |
-
import matplotlib.pyplot as plt
|
| 10 |
-
import random
|
| 11 |
-
import torch
|
| 12 |
-
import timm
|
| 13 |
-
import wandb
|
| 14 |
-
from torchsummary import summary
|
| 15 |
-
from torch.utils.data.dataset import Dataset
|
| 16 |
-
from torch.utils.data import DataLoader
|
| 17 |
-
from torch.optim.lr_scheduler import ReduceLROnPlateau
|
| 18 |
-
from transformers import ASTForAudioClassification, ASTConfig
|
| 19 |
-
import torch.nn as nn
|
| 20 |
-
from sklearn.model_selection import StratifiedShuffleSplit
|
| 21 |
-
from collections import Counter
|
| 22 |
-
import seaborn as sns
|
| 23 |
-
from sklearn.tree import DecisionTreeClassifier
|
| 24 |
-
from sklearn.metrics import f1_score
|
| 25 |
-
import warnings
|
| 26 |
-
warnings.filterwarnings("ignore")
|
| 27 |
-
|
| 28 |
-
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 29 |
-
print(f"Is GPU available? {torch.cuda.is_available()}")
|
| 30 |
-
print(f"Current device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
|
| 31 |
-
|
| 32 |
-
DATA_SEED = 67
|
| 33 |
-
TRAINING_SEED = 1234
|
| 34 |
-
SR = 22050
|
| 35 |
-
DURATION = 5.0
|
| 36 |
-
N_FFT = 2048
|
| 37 |
-
HOP_LENGTH = 512
|
| 38 |
-
N_MELS = 128
|
| 39 |
-
TOP_DB=20
|
| 40 |
-
TARGET_SNR_DB = 10
|
| 41 |
-
|
| 42 |
-
random.seed(DATA_SEED)
|
| 43 |
-
np.random.seed(DATA_SEED)
|
| 44 |
-
torch.manual_seed(DATA_SEED)
|
| 45 |
-
torch.cuda.manual_seed(DATA_SEED)
|
| 46 |
-
|
| 47 |
-
DATA_ROOT = '/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/genres_stems'
|
| 48 |
-
GENRES = [] # Make the list of all genres available
|
| 49 |
-
for genre in os.listdir(path=DATA_ROOT):
|
| 50 |
-
GENRES.append(genre)
|
| 51 |
-
display(GENRES)
|
| 52 |
-
STEMS = {} # Write here stems file name
|
| 53 |
-
STEM_KEYS = ['drums', 'vocals', 'bass', 'other']
|
| 54 |
-
GENRE_TO_TEST = 'rock'
|
| 55 |
-
SONG_INDEX = 0
|
| 56 |
-
genre_to_id = {genre: i for i, genre in enumerate(GENRES)}
|
| 57 |
-
id_to_genre = {i: genre for i, genre in enumerate(GENRES)}
|
| 58 |
-
|
| 59 |
-
# Build the dataset in the format {genre: {stem_type: [file_paths]}}
|
| 60 |
-
def build_dataset(root_dir, val_split=0.17, seed=42):
|
| 61 |
-
train_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
|
| 62 |
-
val_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
|
| 63 |
-
rows = []
|
| 64 |
-
for genre in GENRES:
|
| 65 |
-
genre_path = os.path.join(root_dir, genre)
|
| 66 |
-
for song in os.listdir(genre_path):
|
| 67 |
-
rows.append({
|
| 68 |
-
"label" : genre,
|
| 69 |
-
"song" : song
|
| 70 |
-
})
|
| 71 |
-
df = pd.DataFrame(rows)
|
| 72 |
-
X = df['song']
|
| 73 |
-
y = df['label']
|
| 74 |
-
|
| 75 |
-
sss = StratifiedShuffleSplit(n_splits=1, test_size=val_split, random_state=seed)
|
| 76 |
-
|
| 77 |
-
for train_index, val_index in sss.split(X, y):
|
| 78 |
-
train_df = df.iloc[train_index].reset_index(drop=True)
|
| 79 |
-
val_df = df.iloc[val_index].reset_index(drop=True)
|
| 80 |
-
|
| 81 |
-
for idx, row in train_df.iterrows():
|
| 82 |
-
genre = row['label']
|
| 83 |
-
song = row['song']
|
| 84 |
-
song_path = os.path.join(root_dir, genre, song)
|
| 85 |
-
for stem in os.listdir(song_path):
|
| 86 |
-
train_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))
|
| 87 |
-
|
| 88 |
-
for idx, row in val_df.iterrows():
|
| 89 |
-
genre = row['label']
|
| 90 |
-
song = row['song']
|
| 91 |
-
song_path = os.path.join(root_dir, genre, song)
|
| 92 |
-
for stem in os.listdir(song_path):
|
| 93 |
-
val_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))
|
| 94 |
-
return train_dataset, val_dataset
|
| 95 |
-
|
| 96 |
-
tr, val = build_dataset(DATA_ROOT)
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
AST_SR = 16000
|
| 101 |
-
AST_TARGET_FRAMES = 1024
|
| 102 |
-
|
| 103 |
-
# A custom Dataset class that will handle the loading of audio files, applying random augmentations, and generating the mel spectrograms on-the-fly during training and validation.
|
| 104 |
-
class audioDataset (Dataset):
|
| 105 |
-
def __init__ (self, data_dict, length, genres, stem_types, noise_files, sr=SR, duration=DURATION, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS):
|
| 106 |
-
self.data_dict = data_dict
|
| 107 |
-
self.sr = sr
|
| 108 |
-
self.genres = genres
|
| 109 |
-
self.stem_types = stem_types
|
| 110 |
-
self.duration = duration
|
| 111 |
-
self.n_fft = n_fft
|
| 112 |
-
self.hop_length = hop_length
|
| 113 |
-
self.n_mels = n_mels
|
| 114 |
-
self.noise_files = noise_files
|
| 115 |
-
self.length = length
|
| 116 |
-
def __len__(self):
|
| 117 |
-
return self.length
|
| 118 |
-
def __getitem__(self, idx, retries = 0):
|
| 119 |
-
if retries > 10:
|
| 120 |
-
idx = random.randint(0, self.length-1)
|
| 121 |
-
return self.__getitem__(idx, 0)
|
| 122 |
-
genre = random.choice(self.genres)
|
| 123 |
-
start_time = random.uniform(0, 24)
|
| 124 |
-
target_len = int(22050 * 5.0)
|
| 125 |
-
stems_data = []
|
| 126 |
-
for stem_type in self.stem_types:
|
| 127 |
-
path = random.choice(self.data_dict[genre][stem_type])
|
| 128 |
-
y, _ = librosa.load(path, sr=22050, offset=start_time, duration=self.duration)
|
| 129 |
-
y = librosa.util.fix_length(y, size=target_len)
|
| 130 |
-
rms = np.sqrt(np.mean(y**2))
|
| 131 |
-
if rms < 0.001:
|
| 132 |
-
return self.__getitem__(idx, retries + 1)
|
| 133 |
-
stems_data.append(y)
|
| 134 |
-
mix = np.sum(stems_data, axis=0) / 4.0
|
| 135 |
-
if np.max(np.abs(mix)) > 0:
|
| 136 |
-
mix = mix / np.max(np.abs(mix))
|
| 137 |
-
noise_path = random.choice(self.noise_files)
|
| 138 |
-
noise_y, _ = librosa.load(noise_path, sr=22050, offset=start_time, duration=self.duration)
|
| 139 |
-
noise_y = librosa.util.fix_length(noise_y, size=target_len)
|
| 140 |
-
p_signal = np.mean(mix**2) + 1e-9
|
| 141 |
-
p_noise = np.mean(noise_y**2) + 1e-9
|
| 142 |
-
target_noise_power = p_signal / 10.0
|
| 143 |
-
scaling_factor = np.sqrt(target_noise_power / p_noise)
|
| 144 |
-
final_audio = mix + (scaling_factor * noise_y)
|
| 145 |
-
mfcc = librosa.feature.mfcc(
|
| 146 |
-
y=final_audio,
|
| 147 |
-
sr=self.sr,
|
| 148 |
-
n_fft=self.n_fft,
|
| 149 |
-
hop_length=self.hop_length,
|
| 150 |
-
n_mels=self.n_mels
|
| 151 |
-
)
|
| 152 |
-
mfcc_db = librosa.power_to_db(mfcc, ref=np.max)
|
| 153 |
-
return torch.tensor(mfcc_db).float().unsqueeze(0), genre_to_id[genre] #shape(1, 128, 216)
|
| 154 |
-
|
| 155 |
-
noise_files_list = [os.path.join('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/ESC-50-master/audio', f)
|
| 156 |
-
for f in os.listdir('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/ESC-50-master/audio') if f.endswith('.wav')]
|
| 157 |
-
|
| 158 |
-
train_ds = audioDataset(tr, 1000, GENRES, STEM_KEYS, noise_files_list)
|
| 159 |
-
val_ds = audioDataset(val, 200, GENRES, STEM_KEYS, noise_files_list)
|
| 160 |
-
|
| 161 |
-
train_loader = DataLoader(dataset=train_ds,
|
| 162 |
-
batch_size=32,
|
| 163 |
-
shuffle=True,
|
| 164 |
-
num_workers=2,
|
| 165 |
-
pin_memory=True if device == 'cuda' else False)
|
| 166 |
-
|
| 167 |
-
val_loader = DataLoader(dataset=val_ds,
|
| 168 |
-
batch_size=32,
|
| 169 |
-
shuffle=False,
|
| 170 |
-
num_workers=2,
|
| 171 |
-
pin_memory=True if device == 'cuda' else False)
|
| 172 |
-
|
| 173 |
-
# Loading pre trained model
|
| 174 |
-
class EfficientNetAudio(nn.Module):
|
| 175 |
-
def __init__(self, num_classes=10, model_name='efficientnet_b0'):
|
| 176 |
-
super(EfficientNetAudio, self).__init__()
|
| 177 |
-
self.model = timm.create_model(
|
| 178 |
-
model_name,
|
| 179 |
-
pretrained=True,
|
| 180 |
-
in_chans=1,
|
| 181 |
-
num_classes=num_classes
|
| 182 |
-
)
|
| 183 |
-
|
| 184 |
-
def forward(self, x):
|
| 185 |
-
return self.model(x)
|
| 186 |
-
|
| 187 |
-
def train_epoch (train_batch, EffNet, optimizer, criterion):
|
| 188 |
-
EffNet.train() #Sets model in train Mode
|
| 189 |
-
mel_freq, genre_id = train_batch
|
| 190 |
-
mel_freq, genre_id = mel_freq.to(device), genre_id.to(device)
|
| 191 |
-
optimizer.zero_grad() #Initializes the gradients to zero
|
| 192 |
-
genre_output = EffNet(mel_freq)
|
| 193 |
-
genre_loss = criterion(genre_output, genre_id)
|
| 194 |
-
genre_loss.backward() #Back propagation
|
| 195 |
-
optimizer.step() #Weights are adjusted
|
| 196 |
-
|
| 197 |
-
return genre_loss.item()
|
| 198 |
-
|
| 199 |
-
def validate_epoch (validation_batch, EffNet, criterion):
|
| 200 |
-
EffNet.eval() # Runs Model in evaluation mode
|
| 201 |
-
mel_freq, genre_id = validation_batch
|
| 202 |
-
mel_freq, genre_id = mel_freq.to(device), genre_id.to(device)
|
| 203 |
-
with torch.no_grad(): #Gradients disabled
|
| 204 |
-
genre_output = EffNet(mel_freq)
|
| 205 |
-
genre_loss = criterion(genre_output, genre_id)
|
| 206 |
-
genre_pred = torch.softmax(genre_output, dim=1)
|
| 207 |
-
genre_pred = torch.argmax(genre_pred, dim=1).cpu().numpy() # Gender prediction with softmax followed by argmax
|
| 208 |
-
return genre_loss.item(), genre_pred
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
#Train loop
|
| 212 |
-
def train (audioCRNN):
|
| 213 |
-
num_epochs = 20
|
| 214 |
-
patience_counter = 0
|
| 215 |
-
patience_limit = 5
|
| 216 |
-
best_score = float('-inf')
|
| 217 |
-
model = audioCRNN.to(device)
|
| 218 |
-
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
|
| 219 |
-
# scheduler to adjust the optimizer in case the curve flattens
|
| 220 |
-
scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='max', factor=0.5, patience=2, min_lr=1e-6)
|
| 221 |
-
criterion = nn.CrossEntropyLoss()
|
| 222 |
-
|
| 223 |
-
for epoch in range (num_epochs):
|
| 224 |
-
tot_train_loss = 0
|
| 225 |
-
# Training loop
|
| 226 |
-
for mel_freq, genre_id in train_loader:
|
| 227 |
-
train_loss = train_epoch((mel_freq, genre_id), model, optimizer, criterion)
|
| 228 |
-
tot_train_loss += train_loss * mel_freq.size(0) # Batch mean loss multiplied by batch size for total batch loss
|
| 229 |
-
epoch_train_loss = tot_train_loss/len(train_loader.dataset) # Total Train loss per epoch
|
| 230 |
-
tot_val_loss = 0
|
| 231 |
-
genre_pred_tot = []
|
| 232 |
-
genre_true_tot = []
|
| 233 |
-
# Validation loop
|
| 234 |
-
for mel_freq, genre_id in val_loader:
|
| 235 |
-
genre_true_tot.extend(genre_id.cpu().numpy())
|
| 236 |
-
val_loss, genre_pred = validate_epoch((mel_freq, genre_id), model, criterion)
|
| 237 |
-
genre_pred_tot.extend(genre_pred)
|
| 238 |
-
tot_val_loss += val_loss * mel_freq.size(0)
|
| 239 |
-
epoch_validate_loss = tot_val_loss/len(val_loader.dataset)
|
| 240 |
-
genre_score = f1_score(y_true=genre_true_tot, y_pred=genre_pred_tot, average='macro') # f1 macro for evaluating the genre score
|
| 241 |
-
current_lr = optimizer.param_groups[0]['lr']
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
print(f'Epoch {epoch+1}/{num_epochs}')
|
| 245 |
-
print(f'Train Loss : {epoch_train_loss:.4f} | Validation Loss : {epoch_validate_loss:.4f}')
|
| 246 |
-
print(f'Score : {genre_score:.4f}')
|
| 247 |
-
print(f'Current learning Rate is : {current_lr:.6f}\n')
|
| 248 |
-
|
| 249 |
-
wandb.log({
|
| 250 |
-
"epoch": epoch + 1,
|
| 251 |
-
"train_loss": epoch_train_loss,
|
| 252 |
-
"val_loss": epoch_validate_loss,
|
| 253 |
-
"score": genre_score,
|
| 254 |
-
"learning_rate": current_lr
|
| 255 |
-
})
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
scheduler.step(genre_score)
|
| 259 |
-
if genre_score > best_score:
|
| 260 |
-
patience_counter = 0
|
| 261 |
-
best_score = genre_score
|
| 262 |
-
torch.save(model.state_dict(), 'best_efficientnet_model.pth') # save model
|
| 263 |
-
else:
|
| 264 |
-
patience_counter += 1
|
| 265 |
-
|
| 266 |
-
if patience_counter >= patience_limit:
|
| 267 |
-
print('-----Early Stopping------')
|
| 268 |
-
break
|
| 269 |
-
|
| 270 |
-
train(EfficientNetAudio())
|
| 271 |
-
|
| 272 |
-
# Prediction dataset for test set
|
| 273 |
-
test_df = pd.read_csv('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/test.csv')
|
| 274 |
-
|
| 275 |
-
class TestDataSet (Dataset):
|
| 276 |
-
def __init__ (self, test_df):
|
| 277 |
-
self.data = test_df
|
| 278 |
-
|
| 279 |
-
def __len__(self):
|
| 280 |
-
return len (self.data)
|
| 281 |
-
|
| 282 |
-
def __getitem__(self, idx):
|
| 283 |
-
target_len = int(22050 * 5.0)
|
| 284 |
-
path = os.path.join('/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup', self.data.iloc[idx]['filename'])
|
| 285 |
-
y, _ = librosa.load(path, sr=22050, offset=0, duration=5.0)
|
| 286 |
-
y = librosa.util.fix_length(y, size=target_len)
|
| 287 |
-
if np.max(np.abs(y)) > 0:
|
| 288 |
-
y = y / np.max(np.abs(y))
|
| 289 |
-
mfcc = librosa.feature.mfcc(
|
| 290 |
-
y=y,
|
| 291 |
-
sr=22050,
|
| 292 |
-
n_fft=2048,
|
| 293 |
-
hop_length=512,
|
| 294 |
-
n_mels=128
|
| 295 |
-
)
|
| 296 |
-
|
| 297 |
-
mfcc_db = librosa.power_to_db(mfcc, ref=np.max)
|
| 298 |
-
return torch.tensor(mfcc_db).float().unsqueeze(0)
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
test_data = TestDataSet(test_df)
|
| 302 |
-
|
| 303 |
-
test_loader = DataLoader(dataset=test_data,
|
| 304 |
-
batch_size=32,
|
| 305 |
-
num_workers=2,
|
| 306 |
-
pin_memory=True if device == 'cuda' else False)
|
| 307 |
-
|
| 308 |
-
# Prediction loop for test set
|
| 309 |
-
genre_pred = []
|
| 310 |
-
def predict (model_name, model_path):
|
| 311 |
-
model = model_name.to(device)
|
| 312 |
-
model.load_state_dict(torch.load(model_path))
|
| 313 |
-
|
| 314 |
-
model.eval()
|
| 315 |
-
|
| 316 |
-
for mfccs in test_loader:
|
| 317 |
-
mfccs = mfccs.to(device)
|
| 318 |
-
with torch.no_grad():
|
| 319 |
-
genre_id = model (mfccs)
|
| 320 |
-
genre_out = torch.softmax(genre_id, dim=1)
|
| 321 |
-
genre_out = torch.argmax(genre_out, dim=1).cpu().numpy()
|
| 322 |
-
genre_pred.extend(genre_out.tolist())
|
| 323 |
-
final_result = [id_to_genre[g] for g in genre_pred]
|
| 324 |
-
return final_result
|
| 325 |
-
|
| 326 |
-
res = predict (EfficientNetAudio(), '/kaggle/input/models/bhavin273/efficientnet-model/pytorch/default/1/best_efficientnet_model.pth')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
DELETED
|
@@ -1,247 +0,0 @@
|
|
| 1 |
-
# Music Genre Classification from Audio
|
| 2 |
-
|
| 3 |
-
A deep learning project for classifying audio tracks into **10 music genres** using the **messy_mashup** dataset from the Kaggle competition (`jan-2026-dl-gen-ai-project`). Three model architectures are explored: a scratch-built CRNN, a pretrained EfficientNet-B0, and a fine-tuned Audio Spectrogram Transformer (AST) — with AST achieving the best leaderboard score of **0.857**.
|
| 4 |
-
|
| 5 |
-
---
|
| 6 |
-
|
| 7 |
-
## Table of Contents
|
| 8 |
-
|
| 9 |
-
- [Project Overview](#project-overview)
|
| 10 |
-
- [Dataset](#dataset)
|
| 11 |
-
- [Project Structure](#project-structure)
|
| 12 |
-
- [Installation](#installation)
|
| 13 |
-
- [Preprocessing Pipeline](#preprocessing-pipeline)
|
| 14 |
-
- [Models](#models)
|
| 15 |
-
- [Model 1: CRNN (CNN + Bidirectional GRU)](#model-1-crnn-cnn--bidirectional-gru)
|
| 16 |
-
- [Model 2: Audio Spectrogram Transformer (AST)](#model-2-audio-spectrogram-transformer-ast)
|
| 17 |
-
- [Model 3: EfficientNet-B0](#model-3-efficientnet-b0)
|
| 18 |
-
- [Training](#training)
|
| 19 |
-
- [Results](#results)
|
| 20 |
-
- [Usage](#usage)
|
| 21 |
-
- [Acknowledgements](#acknowledgements)
|
| 22 |
-
|
| 23 |
-
---
|
| 24 |
-
|
| 25 |
-
## Project Overview
|
| 26 |
-
|
| 27 |
-
The goal is to predict the genre of a music track from its raw audio waveform. The audio data consists of separated stems (drums, vocals, bass, other) per song, which are mixed together with environmental noise augmentation to simulate real-world conditions. Three progressively more powerful model architectures are trained and compared.
|
| 28 |
-
|
| 29 |
-
---
|
| 30 |
-
|
| 31 |
-
## Dataset
|
| 32 |
-
|
| 33 |
-
| Property | Details |
|
| 34 |
-
|----------|---------|
|
| 35 |
-
| **Name** | messy_mashup |
|
| 36 |
-
| **Genres (10)** | blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, rock |
|
| 37 |
-
| **Stems per song** | `drums`, `vocals`, `bass`, `other` |
|
| 38 |
-
| **Noise source** | ESC-50 environmental sound dataset |
|
| 39 |
-
| **Train/Val split** | 83% / 17% (Stratified Shuffle Split, seed=42) |
|
| 40 |
-
|
| 41 |
-
### Data Organization
|
| 42 |
-
|
| 43 |
-
```
|
| 44 |
-
genres_stems/
|
| 45 |
-
├── blues/
|
| 46 |
-
│ ├── song_001/
|
| 47 |
-
│ │ ├── drums.wav
|
| 48 |
-
│ │ ├── vocals.wav
|
| 49 |
-
│ │ ├── bass.wav
|
| 50 |
-
│ │ └── other.wav
|
| 51 |
-
│ └── ...
|
| 52 |
-
├── classical/
|
| 53 |
-
│ └── ...
|
| 54 |
-
└── ...
|
| 55 |
-
```
|
| 56 |
-
|
| 57 |
-
---
|
| 58 |
-
|
| 59 |
-
## Project Structure
|
| 60 |
-
|
| 61 |
-
```
|
| 62 |
-
├── dl-23f3002677-notebook-t12026.ipynb # Main Kaggle notebook (all models + submission)
|
| 63 |
-
├── CRNN_pipeline.py # Standalone CRNN model pipeline
|
| 64 |
-
├── AST_Pipeline.py # Standalone AST model pipeline
|
| 65 |
-
├── Efficient_Net_Pipeline.py # Standalone EfficientNet model pipeline
|
| 66 |
-
├── milestone1.py # Milestone 1 — EDA & audio preprocessing
|
| 67 |
-
├── milestone_2.py # Milestone 2 — Feature extraction & baseline
|
| 68 |
-
├── best_ast_model.pth # Saved AST model weights
|
| 69 |
-
├── requirements.txt # Python dependencies
|
| 70 |
-
└── README.md
|
| 71 |
-
```
|
| 72 |
-
|
| 73 |
-
### File Descriptions
|
| 74 |
-
|
| 75 |
-
| File | Purpose |
|
| 76 |
-
|------|---------|
|
| 77 |
-
| **dl-23f3002677-notebook-t12026.ipynb** | End-to-end notebook: data visualization, dataset building, all 3 model architectures (train + predict), performance comparison, and final submission generation |
|
| 78 |
-
| **milestone1.py** | Data exploration — builds train/val split, detects silence segments in stems |
|
| 79 |
-
| **milestone_2.py** | Feature engineering (tempo, spectral centroid, ZCR, rolloff) + Decision Tree baseline classifier |
|
| 80 |
-
| **CRNN_pipeline.py** | CRNN model definition, dataset, training loop, and prediction |
|
| 81 |
-
| **AST_Pipeline.py** | Audio Spectrogram Transformer pipeline with HuggingFace pretrained model |
|
| 82 |
-
| **Efficient_Net_Pipeline.py** | EfficientNet-B0 pipeline using `timm` pretrained model |
|
| 83 |
-
|
| 84 |
-
---
|
| 85 |
-
|
| 86 |
-
## Installation
|
| 87 |
-
|
| 88 |
-
```bash
|
| 89 |
-
pip install -r requirements.txt
|
| 90 |
-
```
|
| 91 |
-
|
| 92 |
-
### Dependencies
|
| 93 |
-
|
| 94 |
-
- **numpy** — numerical computation
|
| 95 |
-
- **pandas** — data manipulation
|
| 96 |
-
- **librosa** — audio loading & feature extraction
|
| 97 |
-
- **torch** / **torchaudio** — deep learning framework
|
| 98 |
-
- **torchsummary** — model architecture summaries
|
| 99 |
-
- **transformers** — HuggingFace pretrained AST model
|
| 100 |
-
- **timm** — pretrained EfficientNet-B0
|
| 101 |
-
- **scikit-learn** — stratified splitting, F1 scoring, Decision Tree baseline
|
| 102 |
-
- **matplotlib** / **seaborn** — visualization
|
| 103 |
-
- **wandb** — experiment tracking
|
| 104 |
-
- **soundfile** — audio I/O
|
| 105 |
-
- **tqdm** — progress bars
|
| 106 |
-
|
| 107 |
-
---
|
| 108 |
-
|
| 109 |
-
## Preprocessing Pipeline
|
| 110 |
-
|
| 111 |
-
### Audio Mixing
|
| 112 |
-
|
| 113 |
-
1. **Stem selection**: For each sample, randomly select a genre and load stems (drums, vocals, bass, other) from a random song at a random offset
|
| 114 |
-
2. **Mixing**: Sum all stem waveforms together and peak-normalize the mix
|
| 115 |
-
3. **Noise injection**: Add a random ESC-50 environmental sound clip, scaled to a target SNR
|
| 116 |
-
|
| 117 |
-
### Feature Extraction
|
| 118 |
-
|
| 119 |
-
| Model | Feature | Sample Rate | Duration | Output Shape |
|
| 120 |
-
|-------|---------|-------------|----------|-------------|
|
| 121 |
-
| CRNN | 40 MFCCs → power_to_dB | 22,050 Hz | 5s | `(1, 40, 216)` |
|
| 122 |
-
| EfficientNet | 128 MFCCs → power_to_dB | 22,050 Hz | 5s | `(1, 128, 216)` |
|
| 123 |
-
| AST | 128-mel spectrogram → power_to_dB → normalized | 16,000 Hz | 10s | `(1024, 128)` |
|
| 124 |
-
|
| 125 |
-
### AST-Specific Augmentations
|
| 126 |
-
|
| 127 |
-
- **Stem dropout**: 15% chance of skipping each stem type
|
| 128 |
-
- **Random gain**: Each stem scaled by a random factor in [0.4, 1.2]
|
| 129 |
-
- **Variable SNR**: Noise scaling divisor randomized between 2.0 and 8.0
|
| 130 |
-
- **Normalization**: `(mel_dB + 4.26) / 4.56`
|
| 131 |
-
|
| 132 |
-
---
|
| 133 |
-
|
| 134 |
-
## Models
|
| 135 |
-
|
| 136 |
-
### Model 1: CRNN (CNN + Bidirectional GRU)
|
| 137 |
-
|
| 138 |
-
A scratch-built architecture combining convolutional feature extraction with recurrent temporal modeling.
|
| 139 |
-
|
| 140 |
-
```
|
| 141 |
-
Input (1, 40, 216)
|
| 142 |
-
→ Conv2d(1→32) + BatchNorm + ReLU + MaxPool(2×2) → (32, 20, 108)
|
| 143 |
-
→ Conv2d(32→64) + BatchNorm + ReLU + MaxPool(2×1) → (64, 10, 108)
|
| 144 |
-
→ Conv2d(64→128) + BatchNorm + ReLU + MaxPool(2×1) → (128, 5, 108)
|
| 145 |
-
→ Reshape to (108, 640)
|
| 146 |
-
→ Bidirectional GRU (hidden=128, 2 layers, dropout=0.3) → (108, 256)
|
| 147 |
-
→ Last timestep → Linear(256→10)
|
| 148 |
-
```
|
| 149 |
-
|
| 150 |
-
### Model 2: Audio Spectrogram Transformer (AST)
|
| 151 |
-
|
| 152 |
-
Fine-tuned from the pretrained [`MIT/ast-finetuned-audioset-10-10-0.4593`](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) checkpoint via HuggingFace Transformers. The classification head is replaced with a 10-class output layer.
|
| 153 |
-
|
| 154 |
-
### Model 3: EfficientNet-B0
|
| 155 |
-
|
| 156 |
-
Pretrained EfficientNet-B0 loaded via `timm` with `in_chans=1` (single-channel MFCC input) and `num_classes=10`.
|
| 157 |
-
|
| 158 |
-
---
|
| 159 |
-
|
| 160 |
-
## Training
|
| 161 |
-
|
| 162 |
-
### Hyperparameters
|
| 163 |
-
|
| 164 |
-
| Parameter | CRNN | AST | EfficientNet |
|
| 165 |
-
|-----------|------|-----|-------------|
|
| 166 |
-
| **Optimizer** | Adam | AdamW | Adam |
|
| 167 |
-
| **Learning rate** | 1e-3 | 1e-5 | 1e-3 |
|
| 168 |
-
| **Weight decay** | 1e-5 | 0.01 | 1e-5 |
|
| 169 |
-
| **Batch size** | 32 | 4 | 32 |
|
| 170 |
-
| **Max epochs** | 17 | 15 | 20 |
|
| 171 |
-
| **Early stopping patience** | 5 | 7 | 5 |
|
| 172 |
-
| **Gradient accumulation** | — | 4 steps | — |
|
| 173 |
-
| **Scheduler** | ReduceLROnPlateau | ReduceLROnPlateau | ReduceLROnPlateau |
|
| 174 |
-
| **Loss function** | CrossEntropyLoss | CrossEntropyLoss | CrossEntropyLoss |
|
| 175 |
-
| **Train samples** | 1,000 | 1,000 | 1,000 |
|
| 176 |
-
| **Val samples** | 200 | 500 | 200 |
|
| 177 |
-
|
| 178 |
-
### Experiment Tracking
|
| 179 |
-
|
| 180 |
-
All training runs are logged to [Weights & Biases (WandB)](https://wandb.ai/) with metrics for train loss, validation loss, F1 score, and learning rate per epoch.
|
| 181 |
-
|
| 182 |
-
---
|
| 183 |
-
|
| 184 |
-
## Results
|
| 185 |
-
|
| 186 |
-
| Model | Max F1 (Validation) | Kaggle Leaderboard Score |
|
| 187 |
-
|-------|:-------------------:|:------------------------:|
|
| 188 |
-
| CRNN (scratch) | 0.5800 | 0.33103 |
|
| 189 |
-
| EfficientNet-B0 | 0.5258 | 0.31641 |
|
| 190 |
-
| **Audio Spectrogram Transformer** | **0.8861** | **0.85708** |
|
| 191 |
-
|
| 192 |
-
The **AST model** significantly outperforms the other two, achieving nearly **0.86 on the leaderboard** — a 2.5× improvement over the CNN-based models. Key factors behind its success:
|
| 193 |
-
|
| 194 |
-
- **Pretrained on AudioSet**: Large-scale audio pretraining provides strong feature representations
|
| 195 |
-
- **Longer input duration**: 10s vs 5s captures more musical context
|
| 196 |
-
- **Mel spectrogram input**: Richer frequency representation compared to MFCCs
|
| 197 |
-
- **Stronger augmentation**: Stem dropout, variable gain, and variable SNR improve robustness
|
| 198 |
-
|
| 199 |
-
---
|
| 200 |
-
|
| 201 |
-
## Usage
|
| 202 |
-
|
| 203 |
-
### Training (in Kaggle notebook)
|
| 204 |
-
|
| 205 |
-
1. Set up the dataset paths in the notebook
|
| 206 |
-
2. Uncomment the training cell for the desired model:
|
| 207 |
-
```python
|
| 208 |
-
# CRNN
|
| 209 |
-
train(audioCRNN())
|
| 210 |
-
|
| 211 |
-
# AST
|
| 212 |
-
train_ast(MusicGenreAST(10))
|
| 213 |
-
|
| 214 |
-
# EfficientNet
|
| 215 |
-
train(EfficientNetAudio())
|
| 216 |
-
```
|
| 217 |
-
|
| 218 |
-
### Prediction
|
| 219 |
-
|
| 220 |
-
```python
|
| 221 |
-
# AST (best model)
|
| 222 |
-
res = predict(MusicGenreAST(10), 'best_ast_model.pth')
|
| 223 |
-
|
| 224 |
-
# CRNN
|
| 225 |
-
res = predict(audioCRNN(), 'best_crnnModel.pth')
|
| 226 |
-
|
| 227 |
-
# EfficientNet
|
| 228 |
-
res = predict(EfficientNetAudio(), 'best_efficientnet_model.pth')
|
| 229 |
-
```
|
| 230 |
-
|
| 231 |
-
### Generating Submission
|
| 232 |
-
|
| 233 |
-
```python
|
| 234 |
-
submission_df = pd.read_csv('sample_submission.csv')
|
| 235 |
-
submission = pd.DataFrame({"id": submission_df['id'], "genre": res})
|
| 236 |
-
submission.to_csv("submission.csv", index=False)
|
| 237 |
-
```
|
| 238 |
-
|
| 239 |
-
---
|
| 240 |
-
|
| 241 |
-
## Acknowledgements
|
| 242 |
-
|
| 243 |
-
- [MIT AST](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) — pretrained Audio Spectrogram Transformer
|
| 244 |
-
- [ESC-50](https://github.com/karolpiczak/ESC-50) — environmental sound dataset used for noise augmentation
|
| 245 |
-
- [timm](https://github.com/huggingface/pytorch-image-models) — PyTorch Image Models library
|
| 246 |
-
- [Weights & Biases](https://wandb.ai/) — experiment tracking
|
| 247 |
-
- [librosa](https://librosa.org/) — audio analysis library
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
dl-23f3002677-notebook-t12026.ipynb
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
milestone1.py
DELETED
|
@@ -1,232 +0,0 @@
|
|
| 1 |
-
# -*- coding: utf-8 -*-
|
| 2 |
-
"""MileStone1.ipynb
|
| 3 |
-
|
| 4 |
-
Automatically generated by Colab.
|
| 5 |
-
|
| 6 |
-
Original file is located at
|
| 7 |
-
https://colab.research.google.com/drive/1X-yYuntHy5LGU1wh8QNz1rTO1geAS-qh
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
-
import os
|
| 11 |
-
import glob
|
| 12 |
-
import numpy as np
|
| 13 |
-
import pandas as pd
|
| 14 |
-
from tqdm import tqdm
|
| 15 |
-
import librosa
|
| 16 |
-
import librosa.display
|
| 17 |
-
import matplotlib.pyplot as plt
|
| 18 |
-
import random
|
| 19 |
-
import torch
|
| 20 |
-
from sklearn.model_selection import StratifiedShuffleSplit
|
| 21 |
-
from google.colab import drive
|
| 22 |
-
from collections import Counter
|
| 23 |
-
|
| 24 |
-
import warnings
|
| 25 |
-
warnings.filterwarnings("ignore")
|
| 26 |
-
|
| 27 |
-
drive.mount('/content/drive')
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
drive_zip = '/content/drive/MyDrive/jan-2026-dl-gen-ai-project.zip'
|
| 31 |
-
local_zip = '/content/data.zip'
|
| 32 |
-
extract_to = '/content/dataset'
|
| 33 |
-
|
| 34 |
-
if not os.path.exists(extract_to):
|
| 35 |
-
print("Moving 16 GB from Drive to local SSD...")
|
| 36 |
-
!rsync -ah --progress "{drive_zip}" "{local_zip}"
|
| 37 |
-
|
| 38 |
-
print("Unzipping 25GB... (This will take a few minutes)")
|
| 39 |
-
!unzip -q "{local_zip}" -d "{extract_to}"
|
| 40 |
-
|
| 41 |
-
print("Deleting zip to save space...")
|
| 42 |
-
os.remove(local_zip)
|
| 43 |
-
print("Ready! Data is in /content/dataset")
|
| 44 |
-
else:
|
| 45 |
-
print("Data already exists.")
|
| 46 |
-
|
| 47 |
-
DATA_SEED = 67
|
| 48 |
-
TRAINING_SEED = 1234
|
| 49 |
-
SR = 22050
|
| 50 |
-
DURATION = 5.0
|
| 51 |
-
N_FFT = 2048
|
| 52 |
-
HOP_LENGTH = 512
|
| 53 |
-
N_MELS = 128
|
| 54 |
-
TOP_DB=20
|
| 55 |
-
TARGET_SNR_DB = 10
|
| 56 |
-
|
| 57 |
-
random.seed(DATA_SEED)
|
| 58 |
-
np.random.seed(DATA_SEED)
|
| 59 |
-
torch.manual_seed(DATA_SEED)
|
| 60 |
-
torch.cuda.manual_seed(DATA_SEED)
|
| 61 |
-
|
| 62 |
-
# CONFIGURATION
|
| 63 |
-
DATA_ROOT = '/content/dataset/messy_mashup/genres_stems'
|
| 64 |
-
GENRES = [] # Make the list of all genres available
|
| 65 |
-
for genre in os.listdir(path=DATA_ROOT):
|
| 66 |
-
GENRES.append(genre)
|
| 67 |
-
display(GENRES)
|
| 68 |
-
STEMS = {} # Write here stems file name
|
| 69 |
-
STEM_KEYS = ['drums', 'vocals', 'bass', 'other']
|
| 70 |
-
GENRE_TO_TEST = 'rock'
|
| 71 |
-
SONG_INDEX = 0
|
| 72 |
-
|
| 73 |
-
def build_dataset(root_dir, val_split=0.17, seed=42):
|
| 74 |
-
train_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
|
| 75 |
-
val_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
|
| 76 |
-
rows = []
|
| 77 |
-
for genre in GENRES:
|
| 78 |
-
genre_path = os.path.join(root_dir, genre)
|
| 79 |
-
for song in os.listdir(genre_path):
|
| 80 |
-
rows.append({
|
| 81 |
-
"label" : genre,
|
| 82 |
-
"song" : song
|
| 83 |
-
})
|
| 84 |
-
df = pd.DataFrame(rows)
|
| 85 |
-
display(song_df.head())
|
| 86 |
-
X = df['song']
|
| 87 |
-
y = df['label']
|
| 88 |
-
|
| 89 |
-
sss = StratifiedShuffleSplit(n_splits=1, test_size=val_split, random_state=seed)
|
| 90 |
-
|
| 91 |
-
for train_index, val_index in sss.split(X, y):
|
| 92 |
-
train_df = df.iloc[train_index].reset_index(drop=True)
|
| 93 |
-
val_df = df.iloc[val_index].reset_index(drop=True)
|
| 94 |
-
|
| 95 |
-
for idx, row in train_df.iterrows():
|
| 96 |
-
genre = row['label']
|
| 97 |
-
song = row['song']
|
| 98 |
-
song_path = os.path.join(root_dir, genre, song)
|
| 99 |
-
for stem in os.listdir(song_path):
|
| 100 |
-
train_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))
|
| 101 |
-
|
| 102 |
-
for idx, row in val_df.iterrows():
|
| 103 |
-
genre = row['label']
|
| 104 |
-
song = row['song']
|
| 105 |
-
song_path = os.path.join(root_dir, genre, song)
|
| 106 |
-
for stem in os.listdir(song_path):
|
| 107 |
-
val_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))
|
| 108 |
-
|
| 109 |
-
print(len(val['jazz']['drums']))
|
| 110 |
-
return train_dataset, val_dataset
|
| 111 |
-
|
| 112 |
-
tr, val = build_dataset(DATA_ROOT)
|
| 113 |
-
|
| 114 |
-
def find_long_silences(dataset_dict, sr=SR, threshold_sec=DURATION, top_db=TOP_DB):
|
| 115 |
-
"""
|
| 116 |
-
Input:
|
| 117 |
-
dataset_dict: The dictionary structure {genre: {stem: [paths...]}}
|
| 118 |
-
Output:
|
| 119 |
-
df: Pandas DataFrame containing details of all files with silence >= 5s
|
| 120 |
-
"""
|
| 121 |
-
records = []
|
| 122 |
-
for genre in GENRES:
|
| 123 |
-
for key in STEM_KEYS:
|
| 124 |
-
for file in dataset_dict[genre][key]:
|
| 125 |
-
silence_type = []
|
| 126 |
-
total_duration = 0
|
| 127 |
-
max_silence = 0
|
| 128 |
-
y, sr = librosa.load(file, sr=sr)
|
| 129 |
-
intervals = librosa.effects.split(y, top_db=top_db)
|
| 130 |
-
file_length = len(y) / sr
|
| 131 |
-
|
| 132 |
-
if intervals.size == 0:
|
| 133 |
-
silence_type.append("FULL")
|
| 134 |
-
max_silence = file_length
|
| 135 |
-
total_silence_duration = file_length
|
| 136 |
-
else:
|
| 137 |
-
# Start
|
| 138 |
-
start_gap = intervals[0][0] / sr
|
| 139 |
-
if start_gap > 0:
|
| 140 |
-
silence_type.append("START")
|
| 141 |
-
max_silence = max(max_silence, start_gap)
|
| 142 |
-
|
| 143 |
-
# Middle
|
| 144 |
-
mid_silence_total = 0
|
| 145 |
-
for i in range(1, len(intervals)):
|
| 146 |
-
gap = (intervals[i][0] - intervals[i-1][1]) / sr
|
| 147 |
-
if gap > 0:
|
| 148 |
-
if "MIDDLE" not in silence_type: silence_type.append("MIDDLE")
|
| 149 |
-
max_silence = max(max_silence, gap)
|
| 150 |
-
mid_silence_total += gap
|
| 151 |
-
|
| 152 |
-
# End
|
| 153 |
-
end_gap = (len(y) - intervals[-1][1]) / sr
|
| 154 |
-
if end_gap > 0:
|
| 155 |
-
silence_type.append("END")
|
| 156 |
-
max_silence = max(max_silence, end_gap)
|
| 157 |
-
total_silence_duration = start_gap + mid_silence_total + end_gap
|
| 158 |
-
|
| 159 |
-
if max_silence >= threshold_sec:
|
| 160 |
-
records.append({
|
| 161 |
-
"Genre": genre,
|
| 162 |
-
"Stem": key,
|
| 163 |
-
"Duration": round(total_silence_duration, 2),
|
| 164 |
-
"Max_Silence_Sec": round(max_silence, 2),
|
| 165 |
-
"Silence_Location": ", ".join(silence_type),
|
| 166 |
-
"File_Path": file
|
| 167 |
-
})
|
| 168 |
-
#-------------------------------------------------------------------------
|
| 169 |
-
df = pd.DataFrame(records)
|
| 170 |
-
return df
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
# --- EXECUTION ---
|
| 174 |
-
# Pass your 'tr' (training) dictionary here.
|
| 175 |
-
# Ensure 'tr' is defined from your previous build_dataset code.
|
| 176 |
-
df_silence = find_long_silences(tr, threshold_sec=DURATION, top_db=TOP_DB)
|
| 177 |
-
|
| 178 |
-
# --- RESULTS ANALYSIS ---
|
| 179 |
-
|
| 180 |
-
# ------------------- write your code here -------------------------------
|
| 181 |
-
#-------------------------------------------------------------------------
|
| 182 |
-
# Hint: Create a pivot Table: Count by Genre vs Stem
|
| 183 |
-
|
| 184 |
-
df_silence[df_silence["Silence_Location"] == 'START'].head()
|
| 185 |
-
|
| 186 |
-
df_silence[
|
| 187 |
-
(df_silence['Stem'] == 'drums') &
|
| 188 |
-
(df_silence['Genre'] == 'jazz') &
|
| 189 |
-
(df_silence['Max_Silence_Sec'] > 10)
|
| 190 |
-
].shape
|
| 191 |
-
|
| 192 |
-
stems_audio = []
|
| 193 |
-
path = '/content/dataset/messy_mashup/genres_stems/rock/rock.00000'
|
| 194 |
-
try:
|
| 195 |
-
for key in STEM_KEYS:
|
| 196 |
-
y, sr = librosa.load(os.path.join(path, f'{key}.wav'), sr=SR, duration=5.0)
|
| 197 |
-
stems_audio.append(y)
|
| 198 |
-
print("Audio loaded successfully.")
|
| 199 |
-
|
| 200 |
-
except IndexError:
|
| 201 |
-
print(f"ERROR: Song index {SONG_INDEX} out of range for genre {GENRE_TO_TEST}.")
|
| 202 |
-
except Exception as e:
|
| 203 |
-
print(f"ERROR: {e}")
|
| 204 |
-
|
| 205 |
-
# ------------------- write your code here -------------------------------
|
| 206 |
-
|
| 207 |
-
# Stack them into a numpy array (Shape: 4 x Samples)
|
| 208 |
-
stems_stack = np.asarray(stems_audio)
|
| 209 |
-
print(stems_stack.shape)
|
| 210 |
-
|
| 211 |
-
# Mix the stems by summing them element-wise
|
| 212 |
-
mix_raw = np.sum(stems_stack, axis=0)
|
| 213 |
-
print(mix_raw.shape)
|
| 214 |
-
|
| 215 |
-
# Calculate RMS Amplitude MANUALLY
|
| 216 |
-
rms_val = np.sqrt(np.sum(mix_raw**2) / mix_raw.shape[0])
|
| 217 |
-
print(rms_val)
|
| 218 |
-
|
| 219 |
-
# Peak Normalization (FIXED)
|
| 220 |
-
max_val = np.max(np.abs(mix_raw)) # <-- IMPORTANT CHANGE
|
| 221 |
-
print(max_val)
|
| 222 |
-
|
| 223 |
-
if max_val > 0:
|
| 224 |
-
mix_norm = mix_raw / max_val
|
| 225 |
-
else:
|
| 226 |
-
mix_norm = mix_raw
|
| 227 |
-
|
| 228 |
-
print(np.max(np.abs(mix_norm)))
|
| 229 |
-
|
| 230 |
-
# VALIDATION
|
| 231 |
-
assert np.isclose(np.max(np.abs(mix_norm)), 1.0), "Normalization failed."
|
| 232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
milestone_2.py
DELETED
|
@@ -1,258 +0,0 @@
|
|
| 1 |
-
# -*- coding: utf-8 -*-
|
| 2 |
-
"""Milestone_2.ipynb
|
| 3 |
-
|
| 4 |
-
Automatically generated by Colab.
|
| 5 |
-
|
| 6 |
-
Original file is located at
|
| 7 |
-
https://colab.research.google.com/drive/1-r7NZ_PYS1JKGoinWyPplhGkitD7lHuV
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
-
import os
|
| 11 |
-
import glob
|
| 12 |
-
import numpy as np
|
| 13 |
-
import pandas as pd
|
| 14 |
-
from tqdm import tqdm
|
| 15 |
-
import librosa
|
| 16 |
-
import librosa.display
|
| 17 |
-
import matplotlib.pyplot as plt
|
| 18 |
-
import random
|
| 19 |
-
import torch
|
| 20 |
-
import soundfile as sf
|
| 21 |
-
from sklearn.model_selection import StratifiedShuffleSplit
|
| 22 |
-
from google.colab import drive
|
| 23 |
-
from collections import Counter
|
| 24 |
-
import matplotlib.pyplot as plt
|
| 25 |
-
import seaborn as sns
|
| 26 |
-
from sklearn.model_selection import train_test_split
|
| 27 |
-
from sklearn.tree import DecisionTreeClassifier
|
| 28 |
-
from sklearn.metrics import f1_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
|
| 29 |
-
import warnings
|
| 30 |
-
warnings.filterwarnings("ignore")
|
| 31 |
-
|
| 32 |
-
drive.mount('/content/drive')
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
drive_zip = '/content/drive/MyDrive/jan-2026-dl-gen-ai-project.zip'
|
| 36 |
-
local_zip = '/content/data.zip'
|
| 37 |
-
extract_to = '/content/dataset'
|
| 38 |
-
|
| 39 |
-
if not os.path.exists(extract_to):
|
| 40 |
-
print("Moving 16 GB from Drive to local SSD...")
|
| 41 |
-
!rsync -ah --progress "{drive_zip}" "{local_zip}"
|
| 42 |
-
|
| 43 |
-
print("Unzipping 25GB... (This will take a few minutes)")
|
| 44 |
-
!unzip -q "{local_zip}" -d "{extract_to}"
|
| 45 |
-
|
| 46 |
-
print("Deleting zip to save space...")
|
| 47 |
-
os.remove(local_zip)
|
| 48 |
-
print("Ready! Data is in /content/dataset")
|
| 49 |
-
else:
|
| 50 |
-
print("Data already exists.")
|
| 51 |
-
|
| 52 |
-
DATA_SEED = 67
|
| 53 |
-
TRAINING_SEED = 1234
|
| 54 |
-
SR = 22050
|
| 55 |
-
DURATION = 5.0
|
| 56 |
-
N_FFT = 2048
|
| 57 |
-
HOP_LENGTH = 512
|
| 58 |
-
N_MELS = 128
|
| 59 |
-
TOP_DB=20
|
| 60 |
-
TARGET_SNR_DB = 10
|
| 61 |
-
|
| 62 |
-
random.seed(DATA_SEED)
|
| 63 |
-
np.random.seed(DATA_SEED)
|
| 64 |
-
torch.manual_seed(DATA_SEED)
|
| 65 |
-
torch.cuda.manual_seed(DATA_SEED)
|
| 66 |
-
|
| 67 |
-
# CONFIGURATION
|
| 68 |
-
DATA_ROOT = '/content/dataset/messy_mashup/genres_stems'
|
| 69 |
-
GENRES = [] # Make the list of all genres available
|
| 70 |
-
for genre in os.listdir(path=DATA_ROOT):
|
| 71 |
-
GENRES.append(genre)
|
| 72 |
-
display(GENRES)
|
| 73 |
-
STEMS = {} # Write here stems file name
|
| 74 |
-
STEM_KEYS = ['drums', 'vocals', 'bass', 'other']
|
| 75 |
-
GENRE_TO_TEST = 'rock'
|
| 76 |
-
SONG_INDEX = 0
|
| 77 |
-
|
| 78 |
-
def build_dataset(root_dir, val_split=0.17, seed=42):
|
| 79 |
-
train_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
|
| 80 |
-
val_dataset = {g: {s: [] for s in STEM_KEYS} for g in GENRES}
|
| 81 |
-
rows = []
|
| 82 |
-
for genre in GENRES:
|
| 83 |
-
genre_path = os.path.join(root_dir, genre)
|
| 84 |
-
for song in os.listdir(genre_path):
|
| 85 |
-
rows.append({
|
| 86 |
-
"label" : genre,
|
| 87 |
-
"song" : song
|
| 88 |
-
})
|
| 89 |
-
df = pd.DataFrame(rows)
|
| 90 |
-
X = df['song']
|
| 91 |
-
y = df['label']
|
| 92 |
-
|
| 93 |
-
sss = StratifiedShuffleSplit(n_splits=1, test_size=val_split, random_state=seed)
|
| 94 |
-
|
| 95 |
-
for train_index, val_index in sss.split(X, y):
|
| 96 |
-
train_df = df.iloc[train_index].reset_index(drop=True)
|
| 97 |
-
val_df = df.iloc[val_index].reset_index(drop=True)
|
| 98 |
-
|
| 99 |
-
for idx, row in train_df.iterrows():
|
| 100 |
-
genre = row['label']
|
| 101 |
-
song = row['song']
|
| 102 |
-
song_path = os.path.join(root_dir, genre, song)
|
| 103 |
-
for stem in os.listdir(song_path):
|
| 104 |
-
train_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))
|
| 105 |
-
|
| 106 |
-
for idx, row in val_df.iterrows():
|
| 107 |
-
genre = row['label']
|
| 108 |
-
song = row['song']
|
| 109 |
-
song_path = os.path.join(root_dir, genre, song)
|
| 110 |
-
for stem in os.listdir(song_path):
|
| 111 |
-
val_dataset[genre][stem.replace('.wav', '')].append(os.path.join(song_path, stem))
|
| 112 |
-
return train_dataset, val_dataset
|
| 113 |
-
|
| 114 |
-
tr, val = build_dataset(DATA_ROOT)
|
| 115 |
-
|
| 116 |
-
# Question_1 and Question_2
|
| 117 |
-
duration_arr = []
|
| 118 |
-
for stem in STEM_KEYS:
|
| 119 |
-
for song in tr['jazz'][stem]:
|
| 120 |
-
y, sr = librosa.load(path=song)
|
| 121 |
-
duration = librosa.get_duration(y=y, sr=sr)
|
| 122 |
-
duration_arr.append(duration)
|
| 123 |
-
display(np.mean(np.array(duration_arr)))
|
| 124 |
-
|
| 125 |
-
# Question_2
|
| 126 |
-
sr_set=set()
|
| 127 |
-
for genre in GENRES:
|
| 128 |
-
for stem in STEM_KEYS:
|
| 129 |
-
for song in tr[genre][stem]:
|
| 130 |
-
y, sr = librosa.load(path=song)
|
| 131 |
-
sr_set.add(sr)
|
| 132 |
-
display(list(sr_set))
|
| 133 |
-
|
| 134 |
-
#Question_3
|
| 135 |
-
counter = 0
|
| 136 |
-
for genre in GENRES:
|
| 137 |
-
for stem in STEM_KEYS:
|
| 138 |
-
for song in tr[genre][stem]:
|
| 139 |
-
if os.path.getsize(song) == 0:
|
| 140 |
-
counter += 1
|
| 141 |
-
display(counter)
|
| 142 |
-
|
| 143 |
-
#Question_4
|
| 144 |
-
amplitude_arr = []
|
| 145 |
-
|
| 146 |
-
for genre in GENRES:
|
| 147 |
-
for song in tr[genre]['vocals']:
|
| 148 |
-
y, sr = sf.read(song)
|
| 149 |
-
|
| 150 |
-
peak_linear = np.max(np.abs(y))
|
| 151 |
-
|
| 152 |
-
if peak_linear > 0:
|
| 153 |
-
peak_db = librosa.amplitude_to_db(np.array([peak_linear]), ref=1.0)[0]
|
| 154 |
-
amplitude_arr.append(peak_db)
|
| 155 |
-
|
| 156 |
-
average_peak_db = np.mean(amplitude_arr)
|
| 157 |
-
print(f"Average Peak Level: {average_peak_db:.2f} dBFS")
|
| 158 |
-
|
| 159 |
-
#Question_5
|
| 160 |
-
centroids = []
|
| 161 |
-
for stem in STEM_KEYS:
|
| 162 |
-
for song in tr['blues'][stem]:
|
| 163 |
-
y, sr = librosa.load(song)
|
| 164 |
-
sc = librosa.feature.spectral_centroid(y=y, sr=sr)
|
| 165 |
-
centroids.append(np.mean(sc))
|
| 166 |
-
mean_blues_centroid = np.mean(centroids)
|
| 167 |
-
print(f"Mean Spectral Centroid for Blues: {mean_blues_centroid:.2f} Hz")
|
| 168 |
-
|
| 169 |
-
#Question_6
|
| 170 |
-
genre_means = {}
|
| 171 |
-
for genre in GENRES:
|
| 172 |
-
genre_centroids = []
|
| 173 |
-
for stem in STEM_KEYS:
|
| 174 |
-
for song_path in tr[genre][stem]:
|
| 175 |
-
y, sr = librosa.load(song_path)
|
| 176 |
-
sc = librosa.feature.spectral_centroid(y=y, sr=sr)
|
| 177 |
-
genre_centroids.append(np.mean(sc))
|
| 178 |
-
|
| 179 |
-
genre_means[genre] = np.mean(genre_centroids)
|
| 180 |
-
|
| 181 |
-
highest_genre = max(genre_means, key=genre_means.get)
|
| 182 |
-
print(f"Genre with highest mean spectral centroid: {highest_genre} ({genre_means[highest_genre]:.2f} Hz)")
|
| 183 |
-
|
| 184 |
-
#Question_7
|
| 185 |
-
silence_count = 0
|
| 186 |
-
threshold = librosa.db_to_amplitude(-20)
|
| 187 |
-
|
| 188 |
-
for genre in GENRES:
|
| 189 |
-
for stem_type in STEM_KEYS:
|
| 190 |
-
for song_path in tr[genre][stem_type]:
|
| 191 |
-
y, sr = librosa.load(song_path, duration=0.5)
|
| 192 |
-
if np.max(np.abs(y)) < threshold:
|
| 193 |
-
silence_count += 1
|
| 194 |
-
|
| 195 |
-
print(f"Stems quieter than -20dB in the first 0.5s: {silence_count}")
|
| 196 |
-
|
| 197 |
-
# --- 1. Setup and Preprocessing ---
|
| 198 |
-
ROOT = '/content/dataset/messy_mashup'
|
| 199 |
-
STEMS_PATH = os.path.join(ROOT, 'genres_stems')
|
| 200 |
-
GENRES = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]
|
| 201 |
-
|
| 202 |
-
def extract_features(song_path):
|
| 203 |
-
# Load 10s at 22050Hz
|
| 204 |
-
y, sr = librosa.load(os.path.join(song_path, 'other.wav'), sr=22050, duration=10)
|
| 205 |
-
tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
|
| 206 |
-
spec_cent = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
|
| 207 |
-
zcr = np.mean(librosa.feature.zero_crossing_rate(y))
|
| 208 |
-
rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
|
| 209 |
-
return [float(tempo), spec_cent, zcr, rolloff]
|
| 210 |
-
|
| 211 |
-
# --- 2. Data Preparation & Stratified Split ---
|
| 212 |
-
data = []
|
| 213 |
-
for g in GENRES:
|
| 214 |
-
gp = os.path.join(STEMS_PATH, g)
|
| 215 |
-
songs = [s for s in os.listdir(gp) if os.path.isdir(os.path.join(gp, s))]
|
| 216 |
-
for s in songs[:50]: # Sampling 50 for speed; use all for final
|
| 217 |
-
data.append({'path': os.path.join(gp, s), 'genre': g})
|
| 218 |
-
|
| 219 |
-
df = pd.DataFrame(data)
|
| 220 |
-
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['genre'], random_state=42)
|
| 221 |
-
|
| 222 |
-
# --- 3. Model Training (Decision Tree) ---
|
| 223 |
-
X_train = np.array([extract_features(p) for p in train_df['path']])
|
| 224 |
-
y_train = train_df['genre']
|
| 225 |
-
X_val = np.array([extract_features(p) for p in val_df['path']])
|
| 226 |
-
y_val = val_df['genre']
|
| 227 |
-
|
| 228 |
-
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
|
| 229 |
-
clf.fit(X_train, y_train)
|
| 230 |
-
|
| 231 |
-
y_pred = clf.predict(X_val)
|
| 232 |
-
display(f'f1 Score is {f1_score(y_pred=y_pred, y_true=y_val, average='macro'):.2f}')
|
| 233 |
-
|
| 234 |
-
report_dict = classification_report(y_val, y_pred, output_dict=True)
|
| 235 |
-
df_report = pd.DataFrame(report_dict).transpose()
|
| 236 |
-
plt.figure(figsize=(10, 6))
|
| 237 |
-
sns.heatmap(df_report, annot=True, cmap="YlGnBu", fmt=".2f", linewidths=.5)
|
| 238 |
-
plt.title('Genre Classification Performance')
|
| 239 |
-
plt.show()
|
| 240 |
-
|
| 241 |
-
display(np.mean(np.array(y_pred==y_val)))
|
| 242 |
-
|
| 243 |
-
cm = confusion_matrix(y_val, y_pred)
|
| 244 |
-
plt.figure(figsize=(10, 8))
|
| 245 |
-
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
|
| 246 |
-
disp.plot(cmap='Blues', xticks_rotation='vertical', ax=plt.gca())
|
| 247 |
-
plt.title('Confusion Matrix: Predicted vs True Genres')
|
| 248 |
-
plt.show()
|
| 249 |
-
|
| 250 |
-
true_positives = np.diag(cm)
|
| 251 |
-
tp_per_genre = dict(zip(clf.classes_, true_positives))
|
| 252 |
-
highest_tp_genre = max(tp_per_genre, key=tp_per_genre.get)
|
| 253 |
-
print(f"Genre with the highest True Positives: {highest_tp_genre}")
|
| 254 |
-
|
| 255 |
-
false_negatives = np.sum(cm, axis=1) - np.diag(cm)
|
| 256 |
-
fn_per_genre = dict(zip(clf.classes_, false_negatives))
|
| 257 |
-
best_genre = min(fn_per_genre, key=fn_per_genre.get)
|
| 258 |
-
print(f"Genre with the lowest False Negatives: {best_genre}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
numpy
|
| 2 |
-
pandas
|
| 3 |
-
tqdm
|
| 4 |
-
librosa
|
| 5 |
-
matplotlib
|
| 6 |
-
seaborn
|
| 7 |
-
torch
|
| 8 |
-
torchaudio
|
| 9 |
-
torchsummary
|
| 10 |
-
transformers
|
| 11 |
-
timm
|
| 12 |
-
scikit-learn
|
| 13 |
-
soundfile
|
| 14 |
-
wandb
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|