Heinrich Dinkel committed on
Commit ·
9dd5fff
1
Parent(s): 5340ead
updated
Browse files
- notebook.ipynb +61 -91
notebook.ipynb
CHANGED
|
@@ -17,7 +17,6 @@
|
|
| 17 |
"source": [
|
| 18 |
"import torch\n",
|
| 19 |
"import torch.nn as nn\n",
|
| 20 |
-
"from torch.utils.data import Dataset, DataLoader\n",
|
| 21 |
"from transformers import AutoModel\n",
|
| 22 |
"import librosa\n",
|
| 23 |
"import os\n",
|
|
@@ -25,8 +24,7 @@
|
|
| 25 |
"from sklearn.metrics import accuracy_score\n",
|
| 26 |
"import numpy as np\n",
|
| 27 |
"from tqdm import tqdm\n",
|
| 28 |
-
"import pickle
|
| 29 |
-
"from concurrent.futures import ThreadPoolExecutor"
|
| 30 |
]
|
| 31 |
},
|
| 32 |
{
|
|
@@ -35,31 +33,6 @@
|
|
| 35 |
"metadata": {},
|
| 36 |
"outputs": [],
|
| 37 |
"source": [
|
| 38 |
-
"class ESC50Dataset(Dataset):\n",
|
| 39 |
-
" def __init__(self, metadata_path, audio_dir, sr=16000):\n",
|
| 40 |
-
" self.metadata = pd.read_csv(metadata_path)\n",
|
| 41 |
-
" self.audio_dir = audio_dir\n",
|
| 42 |
-
" self.sr = sr\n",
|
| 43 |
-
" \n",
|
| 44 |
-
" def __len__(self):\n",
|
| 45 |
-
" return len(self.metadata)\n",
|
| 46 |
-
" \n",
|
| 47 |
-
" def __getitem__(self, idx):\n",
|
| 48 |
-
" row = self.metadata.iloc[idx]\n",
|
| 49 |
-
" filename = row['filename']\n",
|
| 50 |
-
" label = row['target']\n",
|
| 51 |
-
" fold = row['fold']\n",
|
| 52 |
-
" \n",
|
| 53 |
-
" audio_path = os.path.join(self.audio_dir, filename)\n",
|
| 54 |
-
" \n",
|
| 55 |
-
" try:\n",
|
| 56 |
-
" audio, sr = librosa.load(audio_path, sr=self.sr)\n",
|
| 57 |
-
" return audio, label, fold\n",
|
| 58 |
-
" except Exception as e:\n",
|
| 59 |
-
" print(f\"Error loading {audio_path}: {e}\")\n",
|
| 60 |
-
" # Return zeros if file can't be loaded\n",
|
| 61 |
-
" return np.zeros(16000), label, fold\n",
|
| 62 |
-
"\n",
|
| 63 |
"def download_esc50():\n",
|
| 64 |
" import urllib.request\n",
|
| 65 |
" import zipfile\n",
|
|
@@ -73,46 +46,17 @@
|
|
| 73 |
" zip_ref.extractall('.')\n",
|
| 74 |
" os.rename('ESC-50-master', 'ESC-50')\n",
|
| 75 |
" os.remove('esc50.zip')\n",
|
| 76 |
-
" print(\"ESC-50 dataset downloaded and extracted\")
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
" audio = np.pad(audio, (0, 16000 - len(audio)))\n",
|
| 86 |
-
" elif len(audio) > 16000:\n",
|
| 87 |
-
" audio = audio[:16000]\n",
|
| 88 |
-
" \n",
|
| 89 |
-
" audio_tensor = torch.tensor(audio).float().unsqueeze(0).to(device)\n",
|
| 90 |
-
" audio_tensors.append(audio_tensor)\n",
|
| 91 |
-
" \n",
|
| 92 |
-
" # Batch process\n",
|
| 93 |
-
" if audio_tensors:\n",
|
| 94 |
-
" batch_audio = torch.cat(audio_tensors, dim=0)\n",
|
| 95 |
-
" \n",
|
| 96 |
-
" with torch.no_grad(), torch.autocast(device_type='cuda' if device.type == 'cuda' else 'cpu'):\n",
|
| 97 |
-
" features = model.encode(batch_audio)\n",
|
| 98 |
-
" if isinstance(features, dict):\n",
|
| 99 |
-
" for key in ['last_hidden_state', 'embeddings', 'audio']:\n",
|
| 100 |
-
" if key in features:\n",
|
| 101 |
-
" features = features[key]\n",
|
| 102 |
-
" break\n",
|
| 103 |
-
" else:\n",
|
| 104 |
-
" features = list(features.values())[0]\n",
|
| 105 |
-
" \n",
|
| 106 |
-
" # Global average pooling\n",
|
| 107 |
-
" if features.dim() > 2:\n",
|
| 108 |
-
" features = features.mean(dim=1)\n",
|
| 109 |
-
" \n",
|
| 110 |
-
" return features.cpu().numpy()\n",
|
| 111 |
-
" \n",
|
| 112 |
-
" return np.array([])\n",
|
| 113 |
-
"\n",
|
| 114 |
"def extract_features():\n",
|
| 115 |
-
" \"\"\"Extract and save features for all ESC-50 audio files
|
| 116 |
" \n",
|
| 117 |
" if os.path.exists('esc50_features.pkl'):\n",
|
| 118 |
" print(\"Features already extracted, loading from file...\")\n",
|
|
@@ -125,30 +69,50 @@
|
|
| 125 |
" device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 126 |
" model.to(device)\n",
|
| 127 |
" \n",
|
| 128 |
-
" #
|
| 129 |
" metadata_path = 'ESC-50/meta/esc50.csv'\n",
|
| 130 |
-
"
|
| 131 |
-
" dataset = ESC50Dataset(metadata_path, audio_dir)\n",
|
| 132 |
-
" \n",
|
| 133 |
-
" # Batch processing\n",
|
| 134 |
-
" batch_size = 16\n",
|
| 135 |
-
" dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)\n",
|
| 136 |
" \n",
|
| 137 |
" features_list = []\n",
|
| 138 |
" labels_list = []\n",
|
| 139 |
" folds_list = []\n",
|
| 140 |
" \n",
|
| 141 |
-
" print(\"Extracting features
|
| 142 |
-
" for
|
| 143 |
-
"
|
|
|
|
|
|
|
| 144 |
" \n",
|
| 145 |
-
"
|
| 146 |
-
" batch_features = extract_features_batch(audio_batch, model, device)\n",
|
| 147 |
" \n",
|
| 148 |
-
"
|
| 149 |
-
"
|
| 150 |
-
"
|
| 151 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
" \n",
|
| 153 |
" # Save features\n",
|
| 154 |
" features_data = {\n",
|
|
@@ -162,8 +126,15 @@
|
|
| 162 |
" pickle.dump(features_data, f)\n",
|
| 163 |
" \n",
|
| 164 |
" print(f\"Features extracted: {len(features_list)} samples, embedding dim: {features_data['embedding_dim']}\")\n",
|
| 165 |
-
" return features_data
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
"# Download dataset and extract features\n",
|
| 168 |
"download_esc50()\n",
|
| 169 |
"features_data = extract_features()\n",
|
|
@@ -214,7 +185,7 @@
|
|
| 214 |
" classifier.to(device)\n",
|
| 215 |
" \n",
|
| 216 |
" # Training setup\n",
|
| 217 |
-
" optimizer = torch.optim.Adam(classifier.parameters(), lr=
|
| 218 |
" criterion = nn.CrossEntropyLoss()\n",
|
| 219 |
" \n",
|
| 220 |
" # Training loop\n",
|
|
@@ -233,10 +204,9 @@
|
|
| 233 |
" batch_features = X_train_tensor[i:i+batch_size].to(device)\n",
|
| 234 |
" batch_labels = y_train_tensor[i:i+batch_size].to(device)\n",
|
| 235 |
" \n",
|
| 236 |
-
" # Forward pass
|
| 237 |
-
"
|
| 238 |
-
"
|
| 239 |
-
" loss = criterion(logits, batch_labels)\n",
|
| 240 |
" \n",
|
| 241 |
" # Backward pass\n",
|
| 242 |
" optimizer.zero_grad()\n",
|
|
@@ -252,7 +222,7 @@
|
|
| 252 |
" \n",
|
| 253 |
" # Validation\n",
|
| 254 |
" classifier.eval()\n",
|
| 255 |
-
" with torch.no_grad()
|
| 256 |
" val_features = X_val_tensor.to(device)\n",
|
| 257 |
" val_labels = y_val_tensor.cpu().numpy()\n",
|
| 258 |
" \n",
|
|
|
|
| 17 |
"source": [
|
| 18 |
"import torch\n",
|
| 19 |
"import torch.nn as nn\n",
|
|
|
|
| 20 |
"from transformers import AutoModel\n",
|
| 21 |
"import librosa\n",
|
| 22 |
"import os\n",
|
|
|
|
| 24 |
"from sklearn.metrics import accuracy_score\n",
|
| 25 |
"import numpy as np\n",
|
| 26 |
"from tqdm import tqdm\n",
|
| 27 |
+
"import pickle"
|
|
|
|
| 28 |
]
|
| 29 |
},
|
| 30 |
{
|
|
|
|
| 33 |
"metadata": {},
|
| 34 |
"outputs": [],
|
| 35 |
"source": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
"def download_esc50():\n",
|
| 37 |
" import urllib.request\n",
|
| 38 |
" import zipfile\n",
|
|
|
|
| 46 |
" zip_ref.extractall('.')\n",
|
| 47 |
" os.rename('ESC-50-master', 'ESC-50')\n",
|
| 48 |
" os.remove('esc50.zip')\n",
|
| 49 |
+
" print(\"ESC-50 dataset downloaded and extracted\")"
|
| 50 |
+
]
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"cell_type": "code",
|
| 54 |
+
"execution_count": null,
|
| 55 |
+
"metadata": {},
|
| 56 |
+
"outputs": [],
|
| 57 |
+
"source": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
"def extract_features():\n",
|
| 59 |
+
" \"\"\"Extract and save features for all ESC-50 audio files\"\"\"\n",
|
| 60 |
" \n",
|
| 61 |
" if os.path.exists('esc50_features.pkl'):\n",
|
| 62 |
" print(\"Features already extracted, loading from file...\")\n",
|
|
|
|
| 69 |
" device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 70 |
" model.to(device)\n",
|
| 71 |
" \n",
|
| 72 |
+
" # Load metadata\n",
|
| 73 |
" metadata_path = 'ESC-50/meta/esc50.csv'\n",
|
| 74 |
+
" df = pd.read_csv(metadata_path)\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
" \n",
|
| 76 |
" features_list = []\n",
|
| 77 |
" labels_list = []\n",
|
| 78 |
" folds_list = []\n",
|
| 79 |
" \n",
|
| 80 |
+
" print(\"Extracting features...\")\n",
|
| 81 |
+
" for idx, row in tqdm(df.iterrows(), total=len(df)):\n",
|
| 82 |
+
" filename = row['filename']\n",
|
| 83 |
+
" label = row['target']\n",
|
| 84 |
+
" fold = row['fold']\n",
|
| 85 |
" \n",
|
| 86 |
+
" audio_path = os.path.join('ESC-50/audio', filename)\n",
|
|
|
|
| 87 |
" \n",
|
| 88 |
+
" try:\n",
|
| 89 |
+
" # Load and preprocess audio\n",
|
| 90 |
+
" audio, sr = librosa.load(audio_path, sr=16000)\n",
|
| 91 |
+
" audio_tensor = torch.tensor(audio).float().unsqueeze(0).to(device)\n",
|
| 92 |
+
" \n",
|
| 93 |
+
" # Extract features\n",
|
| 94 |
+
" with torch.no_grad(),torch.autocast(device_type='cuda'):\n",
|
| 95 |
+
" features = model.encode(audio_tensor)\n",
|
| 96 |
+
" if isinstance(features, dict):\n",
|
| 97 |
+
" for key in ['last_hidden_state', 'embeddings', 'audio']:\n",
|
| 98 |
+
" if key in features:\n",
|
| 99 |
+
" features = features[key]\n",
|
| 100 |
+
" break\n",
|
| 101 |
+
" else:\n",
|
| 102 |
+
" features = list(features.values())[0]\n",
|
| 103 |
+
" \n",
|
| 104 |
+
" # Global average pooling\n",
|
| 105 |
+
" if features.dim() > 2:\n",
|
| 106 |
+
" features = features.mean(dim=1)\n",
|
| 107 |
+
" \n",
|
| 108 |
+
" features = features.squeeze().cpu().numpy()\n",
|
| 109 |
+
" \n",
|
| 110 |
+
" features_list.append(features)\n",
|
| 111 |
+
" labels_list.append(label)\n",
|
| 112 |
+
" folds_list.append(fold)\n",
|
| 113 |
+
" \n",
|
| 114 |
+
" except Exception as e:\n",
|
| 115 |
+
" print(f\"Error processing {filename}: {e}\")\n",
|
| 116 |
" \n",
|
| 117 |
" # Save features\n",
|
| 118 |
" features_data = {\n",
|
|
|
|
| 126 |
" pickle.dump(features_data, f)\n",
|
| 127 |
" \n",
|
| 128 |
" print(f\"Features extracted: {len(features_list)} samples, embedding dim: {features_data['embedding_dim']}\")\n",
|
| 129 |
+
" return features_data"
|
| 130 |
+
]
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"cell_type": "code",
|
| 134 |
+
"execution_count": null,
|
| 135 |
+
"metadata": {},
|
| 136 |
+
"outputs": [],
|
| 137 |
+
"source": [
|
| 138 |
"# Download dataset and extract features\n",
|
| 139 |
"download_esc50()\n",
|
| 140 |
"features_data = extract_features()\n",
|
|
|
|
| 185 |
" classifier.to(device)\n",
|
| 186 |
" \n",
|
| 187 |
" # Training setup\n",
|
| 188 |
+
" optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)\n",
|
| 189 |
" criterion = nn.CrossEntropyLoss()\n",
|
| 190 |
" \n",
|
| 191 |
" # Training loop\n",
|
|
|
|
| 204 |
" batch_features = X_train_tensor[i:i+batch_size].to(device)\n",
|
| 205 |
" batch_labels = y_train_tensor[i:i+batch_size].to(device)\n",
|
| 206 |
" \n",
|
| 207 |
+
" # Forward pass\n",
|
| 208 |
+
" logits = classifier(batch_features)\n",
|
| 209 |
+
" loss = criterion(logits, batch_labels)\n",
|
|
|
|
| 210 |
" \n",
|
| 211 |
" # Backward pass\n",
|
| 212 |
" optimizer.zero_grad()\n",
|
|
|
|
| 222 |
" \n",
|
| 223 |
" # Validation\n",
|
| 224 |
" classifier.eval()\n",
|
| 225 |
+
" with torch.no_grad():\n",
|
| 226 |
" val_features = X_val_tensor.to(device)\n",
|
| 227 |
" val_labels = y_val_tensor.cpu().numpy()\n",
|
| 228 |
" \n",
|