Heinrich Dinkel committed on
Commit
9dd5fff
·
1 Parent(s): 5340ead
Files changed (1) hide show
  1. notebook.ipynb +61 -91
notebook.ipynb CHANGED
@@ -17,7 +17,6 @@
17
  "source": [
18
  "import torch\n",
19
  "import torch.nn as nn\n",
20
- "from torch.utils.data import Dataset, DataLoader\n",
21
  "from transformers import AutoModel\n",
22
  "import librosa\n",
23
  "import os\n",
@@ -25,8 +24,7 @@
25
  "from sklearn.metrics import accuracy_score\n",
26
  "import numpy as np\n",
27
  "from tqdm import tqdm\n",
28
- "import pickle\n",
29
- "from concurrent.futures import ThreadPoolExecutor"
30
  ]
31
  },
32
  {
@@ -35,31 +33,6 @@
35
  "metadata": {},
36
  "outputs": [],
37
  "source": [
38
- "class ESC50Dataset(Dataset):\n",
39
- " def __init__(self, metadata_path, audio_dir, sr=16000):\n",
40
- " self.metadata = pd.read_csv(metadata_path)\n",
41
- " self.audio_dir = audio_dir\n",
42
- " self.sr = sr\n",
43
- " \n",
44
- " def __len__(self):\n",
45
- " return len(self.metadata)\n",
46
- " \n",
47
- " def __getitem__(self, idx):\n",
48
- " row = self.metadata.iloc[idx]\n",
49
- " filename = row['filename']\n",
50
- " label = row['target']\n",
51
- " fold = row['fold']\n",
52
- " \n",
53
- " audio_path = os.path.join(self.audio_dir, filename)\n",
54
- " \n",
55
- " try:\n",
56
- " audio, sr = librosa.load(audio_path, sr=self.sr)\n",
57
- " return audio, label, fold\n",
58
- " except Exception as e:\n",
59
- " print(f\"Error loading {audio_path}: {e}\")\n",
60
- " # Return zeros if file can't be loaded\n",
61
- " return np.zeros(16000), label, fold\n",
62
- "\n",
63
  "def download_esc50():\n",
64
  " import urllib.request\n",
65
  " import zipfile\n",
@@ -73,46 +46,17 @@
73
  " zip_ref.extractall('.')\n",
74
  " os.rename('ESC-50-master', 'ESC-50')\n",
75
  " os.remove('esc50.zip')\n",
76
- " print(\"ESC-50 dataset downloaded and extracted\")\n",
77
- "\n",
78
- "def extract_features_batch(audio_batch, model, device):\n",
79
- " \"\"\"Extract features for a batch of audio\"\"\"\n",
80
- " audio_tensors = []\n",
81
- " \n",
82
- " for audio in audio_batch:\n",
83
- " # Ensure audio is the right length\n",
84
- " if len(audio) < 16000:\n",
85
- " audio = np.pad(audio, (0, 16000 - len(audio)))\n",
86
- " elif len(audio) > 16000:\n",
87
- " audio = audio[:16000]\n",
88
- " \n",
89
- " audio_tensor = torch.tensor(audio).float().unsqueeze(0).to(device)\n",
90
- " audio_tensors.append(audio_tensor)\n",
91
- " \n",
92
- " # Batch process\n",
93
- " if audio_tensors:\n",
94
- " batch_audio = torch.cat(audio_tensors, dim=0)\n",
95
- " \n",
96
- " with torch.no_grad(), torch.autocast(device_type='cuda' if device.type == 'cuda' else 'cpu'):\n",
97
- " features = model.encode(batch_audio)\n",
98
- " if isinstance(features, dict):\n",
99
- " for key in ['last_hidden_state', 'embeddings', 'audio']:\n",
100
- " if key in features:\n",
101
- " features = features[key]\n",
102
- " break\n",
103
- " else:\n",
104
- " features = list(features.values())[0]\n",
105
- " \n",
106
- " # Global average pooling\n",
107
- " if features.dim() > 2:\n",
108
- " features = features.mean(dim=1)\n",
109
- " \n",
110
- " return features.cpu().numpy()\n",
111
- " \n",
112
- " return np.array([])\n",
113
- "\n",
114
  "def extract_features():\n",
115
- " \"\"\"Extract and save features for all ESC-50 audio files using batch processing\"\"\"\n",
116
  " \n",
117
  " if os.path.exists('esc50_features.pkl'):\n",
118
  " print(\"Features already extracted, loading from file...\")\n",
@@ -125,30 +69,50 @@
125
  " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
126
  " model.to(device)\n",
127
  " \n",
128
- " # Create dataset\n",
129
  " metadata_path = 'ESC-50/meta/esc50.csv'\n",
130
- " audio_dir = 'ESC-50/audio'\n",
131
- " dataset = ESC50Dataset(metadata_path, audio_dir)\n",
132
- " \n",
133
- " # Batch processing\n",
134
- " batch_size = 16\n",
135
- " dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)\n",
136
  " \n",
137
  " features_list = []\n",
138
  " labels_list = []\n",
139
  " folds_list = []\n",
140
  " \n",
141
- " print(\"Extracting features with batch processing...\")\n",
142
- " for batch in tqdm(dataloader):\n",
143
- " audio_batch, label_batch, fold_batch = batch\n",
 
 
144
  " \n",
145
- " # Extract features for this batch\n",
146
- " batch_features = extract_features_batch(audio_batch, model, device)\n",
147
  " \n",
148
- " if len(batch_features) > 0:\n",
149
- " features_list.extend(batch_features)\n",
150
- " labels_list.extend(label_batch.numpy())\n",
151
- " folds_list.extend(fold_batch.numpy())\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  " \n",
153
  " # Save features\n",
154
  " features_data = {\n",
@@ -162,8 +126,15 @@
162
  " pickle.dump(features_data, f)\n",
163
  " \n",
164
  " print(f\"Features extracted: {len(features_list)} samples, embedding dim: {features_data['embedding_dim']}\")\n",
165
- " return features_data\n",
166
- "\n",
 
 
 
 
 
 
 
167
  "# Download dataset and extract features\n",
168
  "download_esc50()\n",
169
  "features_data = extract_features()\n",
@@ -214,7 +185,7 @@
214
  " classifier.to(device)\n",
215
  " \n",
216
  " # Training setup\n",
217
- " optimizer = torch.optim.Adam(classifier.parameters(), lr=8e-3)\n",
218
  " criterion = nn.CrossEntropyLoss()\n",
219
  " \n",
220
  " # Training loop\n",
@@ -233,10 +204,9 @@
233
  " batch_features = X_train_tensor[i:i+batch_size].to(device)\n",
234
  " batch_labels = y_train_tensor[i:i+batch_size].to(device)\n",
235
  " \n",
236
- " # Forward pass with autocast\n",
237
- " with torch.autocast(device_type='cuda' if device.type == 'cuda' else 'cpu'):\n",
238
- " logits = classifier(batch_features)\n",
239
- " loss = criterion(logits, batch_labels)\n",
240
  " \n",
241
  " # Backward pass\n",
242
  " optimizer.zero_grad()\n",
@@ -252,7 +222,7 @@
252
  " \n",
253
  " # Validation\n",
254
  " classifier.eval()\n",
255
- " with torch.no_grad(), torch.autocast(device_type='cuda' if device.type == 'cuda' else 'cpu'):\n",
256
  " val_features = X_val_tensor.to(device)\n",
257
  " val_labels = y_val_tensor.cpu().numpy()\n",
258
  " \n",
 
17
  "source": [
18
  "import torch\n",
19
  "import torch.nn as nn\n",
 
20
  "from transformers import AutoModel\n",
21
  "import librosa\n",
22
  "import os\n",
 
24
  "from sklearn.metrics import accuracy_score\n",
25
  "import numpy as np\n",
26
  "from tqdm import tqdm\n",
27
+ "import pickle"
 
28
  ]
29
  },
30
  {
 
33
  "metadata": {},
34
  "outputs": [],
35
  "source": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  "def download_esc50():\n",
37
  " import urllib.request\n",
38
  " import zipfile\n",
 
46
  " zip_ref.extractall('.')\n",
47
  " os.rename('ESC-50-master', 'ESC-50')\n",
48
  " os.remove('esc50.zip')\n",
49
+ " print(\"ESC-50 dataset downloaded and extracted\")"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  "def extract_features():\n",
59
+ " \"\"\"Extract and save features for all ESC-50 audio files\"\"\"\n",
60
  " \n",
61
  " if os.path.exists('esc50_features.pkl'):\n",
62
  " print(\"Features already extracted, loading from file...\")\n",
 
69
  " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
70
  " model.to(device)\n",
71
  " \n",
72
+ " # Load metadata\n",
73
  " metadata_path = 'ESC-50/meta/esc50.csv'\n",
74
+ " df = pd.read_csv(metadata_path)\n",
 
 
 
 
 
75
  " \n",
76
  " features_list = []\n",
77
  " labels_list = []\n",
78
  " folds_list = []\n",
79
  " \n",
80
+ " print(\"Extracting features...\")\n",
81
+ " for idx, row in tqdm(df.iterrows(), total=len(df)):\n",
82
+ " filename = row['filename']\n",
83
+ " label = row['target']\n",
84
+ " fold = row['fold']\n",
85
  " \n",
86
+ " audio_path = os.path.join('ESC-50/audio', filename)\n",
 
87
  " \n",
88
+ " try:\n",
89
+ " # Load and preprocess audio\n",
90
+ " audio, sr = librosa.load(audio_path, sr=16000)\n",
91
+ " audio_tensor = torch.tensor(audio).float().unsqueeze(0).to(device)\n",
92
+ " \n",
93
+ " # Extract features\n",
94
+ " with torch.no_grad(),torch.autocast(device_type='cuda'):\n",
95
+ " features = model.encode(audio_tensor)\n",
96
+ " if isinstance(features, dict):\n",
97
+ " for key in ['last_hidden_state', 'embeddings', 'audio']:\n",
98
+ " if key in features:\n",
99
+ " features = features[key]\n",
100
+ " break\n",
101
+ " else:\n",
102
+ " features = list(features.values())[0]\n",
103
+ " \n",
104
+ " # Global average pooling\n",
105
+ " if features.dim() > 2:\n",
106
+ " features = features.mean(dim=1)\n",
107
+ " \n",
108
+ " features = features.squeeze().cpu().numpy()\n",
109
+ " \n",
110
+ " features_list.append(features)\n",
111
+ " labels_list.append(label)\n",
112
+ " folds_list.append(fold)\n",
113
+ " \n",
114
+ " except Exception as e:\n",
115
- " print(f\"Error processing {audio_path}: {e}\")\n",
116
  " \n",
117
  " # Save features\n",
118
  " features_data = {\n",
 
126
  " pickle.dump(features_data, f)\n",
127
  " \n",
128
  " print(f\"Features extracted: {len(features_list)} samples, embedding dim: {features_data['embedding_dim']}\")\n",
129
+ " return features_data"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": null,
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
  "# Download dataset and extract features\n",
139
  "download_esc50()\n",
140
  "features_data = extract_features()\n",
 
185
  " classifier.to(device)\n",
186
  " \n",
187
  " # Training setup\n",
188
+ " optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)\n",
189
  " criterion = nn.CrossEntropyLoss()\n",
190
  " \n",
191
  " # Training loop\n",
 
204
  " batch_features = X_train_tensor[i:i+batch_size].to(device)\n",
205
  " batch_labels = y_train_tensor[i:i+batch_size].to(device)\n",
206
  " \n",
207
+ " # Forward pass\n",
208
+ " logits = classifier(batch_features)\n",
209
+ " loss = criterion(logits, batch_labels)\n",
 
210
  " \n",
211
  " # Backward pass\n",
212
  " optimizer.zero_grad()\n",
 
222
  " \n",
223
  " # Validation\n",
224
  " classifier.eval()\n",
225
+ " with torch.no_grad():\n",
226
  " val_features = X_val_tensor.to(device)\n",
227
  " val_labels = y_val_tensor.cpu().numpy()\n",
228
  " \n",