Heinrich Dinkel committed on
Commit
0f35cd3
·
1 Parent(s): c123c63
Files changed (1) hide show
  1. notebook.ipynb +90 -60
notebook.ipynb CHANGED
@@ -17,6 +17,7 @@
17
  "source": [
18
  "import torch\n",
19
  "import torch.nn as nn\n",
 
20
  "from transformers import AutoModel\n",
21
  "import librosa\n",
22
  "import os\n",
@@ -24,7 +25,8 @@
24
  "from sklearn.metrics import accuracy_score\n",
25
  "import numpy as np\n",
26
  "from tqdm import tqdm\n",
27
- "import pickle"
 
28
  ]
29
  },
30
  {
@@ -33,6 +35,31 @@
33
  "metadata": {},
34
  "outputs": [],
35
  "source": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  "def download_esc50():\n",
37
  " import urllib.request\n",
38
  " import zipfile\n",
@@ -46,17 +73,46 @@
46
  " zip_ref.extractall('.')\n",
47
  " os.rename('ESC-50-master', 'ESC-50')\n",
48
  " os.remove('esc50.zip')\n",
49
- " print(\"ESC-50 dataset downloaded and extracted\")"
50
- ]
51
- },
52
- {
53
- "cell_type": "code",
54
- "execution_count": null,
55
- "metadata": {},
56
- "outputs": [],
57
- "source": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  "def extract_features():\n",
59
- " \"\"\"Extract and save features for all ESC-50 audio files\"\"\"\n",
60
  " \n",
61
  " if os.path.exists('esc50_features.pkl'):\n",
62
  " print(\"Features already extracted, loading from file...\")\n",
@@ -69,50 +125,30 @@
69
  " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
70
  " model.to(device)\n",
71
  " \n",
72
- " # Load metadata\n",
73
  " metadata_path = 'ESC-50/meta/esc50.csv'\n",
74
- " df = pd.read_csv(metadata_path)\n",
 
 
 
 
 
75
  " \n",
76
  " features_list = []\n",
77
  " labels_list = []\n",
78
  " folds_list = []\n",
79
  " \n",
80
- " print(\"Extracting features...\")\n",
81
- " for idx, row in tqdm(df.iterrows(), total=len(df)):\n",
82
- " filename = row['filename']\n",
83
- " label = row['target']\n",
84
- " fold = row['fold']\n",
85
  " \n",
86
- " audio_path = os.path.join('ESC-50/audio', filename)\n",
 
87
  " \n",
88
- " try:\n",
89
- " # Load and preprocess audio\n",
90
- " audio, sr = librosa.load(audio_path, sr=16000)\n",
91
- " audio_tensor = torch.tensor(audio).float().unsqueeze(0).to(device)\n",
92
- " \n",
93
- " # Extract features\n",
94
- " with torch.no_grad(),torch.autocast(device_type='cuda'):\n",
95
- " features = model.encode(audio_tensor)\n",
96
- " if isinstance(features, dict):\n",
97
- " for key in ['last_hidden_state', 'embeddings', 'audio']:\n",
98
- " if key in features:\n",
99
- " features = features[key]\n",
100
- " break\n",
101
- " else:\n",
102
- " features = list(features.values())[0]\n",
103
- " \n",
104
- " # Global average pooling\n",
105
- " if features.dim() > 2:\n",
106
- " features = features.mean(dim=1)\n",
107
- " \n",
108
- " features = features.squeeze().cpu().numpy()\n",
109
- " \n",
110
- " features_list.append(features)\n",
111
- " labels_list.append(label)\n",
112
- " folds_list.append(fold)\n",
113
- " \n",
114
- " except Exception as e:\n",
115
- " print(f\"Error processing {filename}: {e}\")\n",
116
  " \n",
117
  " # Save features\n",
118
  " features_data = {\n",
@@ -126,15 +162,8 @@
126
  " pickle.dump(features_data, f)\n",
127
  " \n",
128
  " print(f\"Features extracted: {len(features_list)} samples, embedding dim: {features_data['embedding_dim']}\")\n",
129
- " return features_data"
130
- ]
131
- },
132
- {
133
- "cell_type": "code",
134
- "execution_count": null,
135
- "metadata": {},
136
- "outputs": [],
137
- "source": [
138
  "# Download dataset and extract features\n",
139
  "download_esc50()\n",
140
  "features_data = extract_features()\n",
@@ -204,9 +233,10 @@
204
  " batch_features = X_train_tensor[i:i+batch_size].to(device)\n",
205
  " batch_labels = y_train_tensor[i:i+batch_size].to(device)\n",
206
  " \n",
207
- " # Forward pass\n",
208
- " logits = classifier(batch_features)\n",
209
- " loss = criterion(logits, batch_labels)\n",
 
210
  " \n",
211
  " # Backward pass\n",
212
  " optimizer.zero_grad()\n",
@@ -222,7 +252,7 @@
222
  " \n",
223
  " # Validation\n",
224
  " classifier.eval()\n",
225
- " with torch.no_grad():\n",
226
  " val_features = X_val_tensor.to(device)\n",
227
  " val_labels = y_val_tensor.cpu().numpy()\n",
228
  " \n",
 
17
  "source": [
18
  "import torch\n",
19
  "import torch.nn as nn\n",
20
+ "from torch.utils.data import Dataset, DataLoader\n",
21
  "from transformers import AutoModel\n",
22
  "import librosa\n",
23
  "import os\n",
 
25
  "from sklearn.metrics import accuracy_score\n",
26
  "import numpy as np\n",
27
  "from tqdm import tqdm\n",
28
+ "import pickle\n",
29
+ "from concurrent.futures import ThreadPoolExecutor"
30
  ]
31
  },
32
  {
 
35
  "metadata": {},
36
  "outputs": [],
37
  "source": [
38
+ "class ESC50Dataset(Dataset):\n",
39
+ " def __init__(self, metadata_path, audio_dir, sr=16000):\n",
40
+ " self.metadata = pd.read_csv(metadata_path)\n",
41
+ " self.audio_dir = audio_dir\n",
42
+ " self.sr = sr\n",
43
+ " \n",
44
+ " def __len__(self):\n",
45
+ " return len(self.metadata)\n",
46
+ " \n",
47
+ " def __getitem__(self, idx):\n",
48
+ " row = self.metadata.iloc[idx]\n",
49
+ " filename = row['filename']\n",
50
+ " label = row['target']\n",
51
+ " fold = row['fold']\n",
52
+ " \n",
53
+ " audio_path = os.path.join(self.audio_dir, filename)\n",
54
+ " \n",
55
+ " try:\n",
56
+ " audio, sr = librosa.load(audio_path, sr=self.sr)\n",
57
+ " return audio, label, fold\n",
58
+ " except Exception as e:\n",
59
+ " print(f\"Error loading {audio_path}: {e}\")\n",
60
+ " # Return zeros if file can't be loaded\n",
61
+ " return np.zeros(16000), label, fold\n",
62
+ "\n",
63
  "def download_esc50():\n",
64
  " import urllib.request\n",
65
  " import zipfile\n",
 
73
  " zip_ref.extractall('.')\n",
74
  " os.rename('ESC-50-master', 'ESC-50')\n",
75
  " os.remove('esc50.zip')\n",
76
+ " print(\"ESC-50 dataset downloaded and extracted\")\n",
77
+ "\n",
78
+ "def extract_features_batch(audio_batch, model, device):\n",
79
+ " \"\"\"Extract features for a batch of audio\"\"\"\n",
80
+ " audio_tensors = []\n",
81
+ " \n",
82
+ " for audio in audio_batch:\n",
83
+ " # Ensure audio is the right length\n",
84
+ " if len(audio) < 16000:\n",
85
+ " audio = np.pad(audio, (0, 16000 - len(audio)))\n",
86
+ " elif len(audio) > 16000:\n",
87
+ " audio = audio[:16000]\n",
88
+ " \n",
89
+ " audio_tensor = torch.tensor(audio).float().unsqueeze(0).to(device)\n",
90
+ " audio_tensors.append(audio_tensor)\n",
91
+ " \n",
92
+ " # Batch process\n",
93
+ " if audio_tensors:\n",
94
+ " batch_audio = torch.cat(audio_tensors, dim=0)\n",
95
+ " \n",
96
+ " with torch.no_grad(), torch.autocast(device_type='cuda' if device.type == 'cuda' else 'cpu'):\n",
97
+ " features = model.encode(batch_audio)\n",
98
+ " if isinstance(features, dict):\n",
99
+ " for key in ['last_hidden_state', 'embeddings', 'audio']:\n",
100
+ " if key in features:\n",
101
+ " features = features[key]\n",
102
+ " break\n",
103
+ " else:\n",
104
+ " features = list(features.values())[0]\n",
105
+ " \n",
106
+ " # Global average pooling\n",
107
+ " if features.dim() > 2:\n",
108
+ " features = features.mean(dim=1)\n",
109
+ " \n",
110
+ " return features.cpu().numpy()\n",
111
+ " \n",
112
+ " return np.array([])\n",
113
+ "\n",
114
  "def extract_features():\n",
115
+ " \"\"\"Extract and save features for all ESC-50 audio files using batch processing\"\"\"\n",
116
  " \n",
117
  " if os.path.exists('esc50_features.pkl'):\n",
118
  " print(\"Features already extracted, loading from file...\")\n",
 
125
  " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
126
  " model.to(device)\n",
127
  " \n",
128
+ " # Create dataset\n",
129
  " metadata_path = 'ESC-50/meta/esc50.csv'\n",
130
+ " audio_dir = 'ESC-50/audio'\n",
131
+ " dataset = ESC50Dataset(metadata_path, audio_dir)\n",
132
+ " \n",
133
+ " # Batch processing\n",
134
+ " batch_size = 16\n",
135
+ " dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)\n",
136
  " \n",
137
  " features_list = []\n",
138
  " labels_list = []\n",
139
  " folds_list = []\n",
140
  " \n",
141
+ " print(\"Extracting features with batch processing...\")\n",
142
+ " for batch in tqdm(dataloader):\n",
143
+ " audio_batch, label_batch, fold_batch = batch\n",
 
 
144
  " \n",
145
+ " # Extract features for this batch\n",
146
+ " batch_features = extract_features_batch(audio_batch, model, device)\n",
147
  " \n",
148
+ " if len(batch_features) > 0:\n",
149
+ " features_list.extend(batch_features)\n",
150
+ " labels_list.extend(label_batch.numpy())\n",
151
+ " folds_list.extend(fold_batch.numpy())\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  " \n",
153
  " # Save features\n",
154
  " features_data = {\n",
 
162
  " pickle.dump(features_data, f)\n",
163
  " \n",
164
  " print(f\"Features extracted: {len(features_list)} samples, embedding dim: {features_data['embedding_dim']}\")\n",
165
+ " return features_data\n",
166
+ "\n",
 
 
 
 
 
 
 
167
  "# Download dataset and extract features\n",
168
  "download_esc50()\n",
169
  "features_data = extract_features()\n",
 
233
  " batch_features = X_train_tensor[i:i+batch_size].to(device)\n",
234
  " batch_labels = y_train_tensor[i:i+batch_size].to(device)\n",
235
  " \n",
236
+ " # Forward pass with autocast\n",
237
+ " with torch.autocast(device_type='cuda' if device.type == 'cuda' else 'cpu'):\n",
238
+ " logits = classifier(batch_features)\n",
239
+ " loss = criterion(logits, batch_labels)\n",
240
  " \n",
241
  " # Backward pass\n",
242
  " optimizer.zero_grad()\n",
 
252
  " \n",
253
  " # Validation\n",
254
  " classifier.eval()\n",
255
+ " with torch.no_grad(), torch.autocast(device_type='cuda' if device.type == 'cuda' else 'cpu'):\n",
256
  " val_features = X_val_tensor.to(device)\n",
257
  " val_labels = y_val_tensor.cpu().numpy()\n",
258
  " \n",