Heinrich Dinkel committed on
Commit ·
e393f34
1
Parent(s): 63ada3a
Updated notebook
Browse files- notebook.ipynb +7 -50
notebook.ipynb
CHANGED
|
@@ -55,15 +55,8 @@
|
|
| 55 |
" audio_tensor = torch.tensor(audio).float()\n",
|
| 56 |
" label_tensor = torch.tensor(label).long()\n",
|
| 57 |
" \n",
|
| 58 |
-
" return audio_tensor, label_tensor"
|
| 59 |
-
|
| 60 |
-
},
|
| 61 |
-
{
|
| 62 |
-
"cell_type": "code",
|
| 63 |
-
"execution_count": null,
|
| 64 |
-
"metadata": {},
|
| 65 |
-
"outputs": [],
|
| 66 |
-
"source": [
|
| 67 |
"def download_esc50():\n",
|
| 68 |
" import urllib.request\n",
|
| 69 |
" import zipfile\n",
|
|
@@ -80,34 +73,6 @@
|
|
| 80 |
" print(\"ESC-50 dataset downloaded and extracted\")"
|
| 81 |
]
|
| 82 |
},
|
| 83 |
-
{
|
| 84 |
-
"cell_type": "code",
|
| 85 |
-
"execution_count": null,
|
| 86 |
-
"metadata": {},
|
| 87 |
-
"outputs": [],
|
| 88 |
-
"source": [
|
| 89 |
-
"def get_embedding_dim(model):\n",
|
| 90 |
-
" dummy_input = torch.randn(1, 160000)\n",
|
| 91 |
-
" with torch.no_grad():\n",
|
| 92 |
-
" output = model(dummy_input)\n",
|
| 93 |
-
" if isinstance(output, dict):\n",
|
| 94 |
-
" for key in ['last_hidden_state', 'embeddings', 'audio']:\n",
|
| 95 |
-
" if key in output:\n",
|
| 96 |
-
" features = output[key]\n",
|
| 97 |
-
" break\n",
|
| 98 |
-
" else:\n",
|
| 99 |
-
" features = list(output.values())[0]\n",
|
| 100 |
-
" else:\n",
|
| 101 |
-
" features = output\n",
|
| 102 |
-
" \n",
|
| 103 |
-
" if features.dim() > 2:\n",
|
| 104 |
-
" embedding_dim = features.shape[-1]\n",
|
| 105 |
-
" else:\n",
|
| 106 |
-
" embedding_dim = features.shape[-1]\n",
|
| 107 |
-
" \n",
|
| 108 |
-
" return embedding_dim"
|
| 109 |
-
]
|
| 110 |
-
},
|
| 111 |
{
|
| 112 |
"cell_type": "code",
|
| 113 |
"execution_count": null,
|
|
@@ -121,7 +86,7 @@
|
|
| 121 |
"model = AutoModel.from_pretrained(\"mispeech/dashengtokenizer\", trust_remote_code=True)\n",
|
| 122 |
"\n",
|
| 123 |
"# Get embedding dimension\n",
|
| 124 |
-
"embedding_dim = get_embedding_dim(model)\n",
|
| 125 |
"print(f\"Model embedding dimension: {embedding_dim}\")\n",
|
| 126 |
"\n",
|
| 127 |
"# Freeze model\n",
|
|
@@ -135,15 +100,7 @@
|
|
| 135 |
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 136 |
"model.to(device)\n",
|
| 137 |
"classifier.to(device)\n",
|
| 138 |
-
"print(f\"Using device: {device}\")"
|
| 139 |
-
]
|
| 140 |
-
},
|
| 141 |
-
{
|
| 142 |
-
"cell_type": "code",
|
| 143 |
-
"execution_count": null,
|
| 144 |
-
"metadata": {},
|
| 145 |
-
"outputs": [],
|
| 146 |
-
"source": [
|
| 147 |
"# Create datasets\n",
|
| 148 |
"audio_dir = 'ESC-50/audio'\n",
|
| 149 |
"metadata_path = 'ESC-50/meta/esc50.csv'\n",
|
|
@@ -259,10 +216,10 @@
|
|
| 259 |
}
|
| 260 |
],
|
| 261 |
"metadata": {
|
| 262 |
-
|
| 263 |
"colab": {
|
| 264 |
-
|
| 265 |
-
|
| 266 |
},
|
| 267 |
"kernelspec": {
|
| 268 |
"display_name": "Python 3 (ipykernel)",
|
|
|
|
| 55 |
" audio_tensor = torch.tensor(audio).float()\n",
|
| 56 |
" label_tensor = torch.tensor(label).long()\n",
|
| 57 |
" \n",
|
| 58 |
+
" return audio_tensor, label_tensor\n",
|
| 59 |
+
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
"def download_esc50():\n",
|
| 61 |
" import urllib.request\n",
|
| 62 |
" import zipfile\n",
|
|
|
|
| 73 |
" print(\"ESC-50 dataset downloaded and extracted\")"
|
| 74 |
]
|
| 75 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
{
|
| 77 |
"cell_type": "code",
|
| 78 |
"execution_count": null,
|
|
|
|
| 86 |
"model = AutoModel.from_pretrained(\"mispeech/dashengtokenizer\", trust_remote_code=True)\n",
|
| 87 |
"\n",
|
| 88 |
"# Get embedding dimension\n",
|
| 89 |
+
"embedding_dim = 1280\n",
|
| 90 |
"print(f\"Model embedding dimension: {embedding_dim}\")\n",
|
| 91 |
"\n",
|
| 92 |
"# Freeze model\n",
|
|
|
|
| 100 |
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 101 |
"model.to(device)\n",
|
| 102 |
"classifier.to(device)\n",
|
| 103 |
+
"print(f\"Using device: {device}\")\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
"# Create datasets\n",
|
| 105 |
"audio_dir = 'ESC-50/audio'\n",
|
| 106 |
"metadata_path = 'ESC-50/meta/esc50.csv'\n",
|
|
|
|
| 216 |
}
|
| 217 |
],
|
| 218 |
"metadata": {
|
| 219 |
+
"accelerator": "GPU",
|
| 220 |
"colab": {
|
| 221 |
+
"gpuType": "T4",
|
| 222 |
+
"provenance": []
|
| 223 |
},
|
| 224 |
"kernelspec": {
|
| 225 |
"display_name": "Python 3 (ipykernel)",
|