File size: 18,373 Bytes
d036110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "text": [
      "A    audioset/README.md\r\nA    audioset/mel_features.py\r\nA    audioset/vggish_inference_demo.py\r\nA    audioset/vggish_input.py\r\nA    audioset/vggish_params.py\r\nA    audioset/vggish_postprocess.py\r\nA    audioset/vggish_slim.py\r\nA    audioset/vggish_smoke_test.py\r\n",
      "A    audioset/vggish_train_demo.py\r\n",
      "Checked out revision 9495.\r\n",
      "Requirement already satisfied: numpy in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (1.16.3)\r\nRequirement already satisfied: scipy in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (1.2.1)\r\n",
      "Requirement already satisfied: resampy in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (0.2.1)\r\nRequirement already satisfied: tensorflow in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (1.13.1)\r\nRequirement already satisfied: six in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (1.12.0)\r\nRequirement already satisfied: soundfile in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (0.10.2)\r\nRequirement already satisfied: numpy>=1.10 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from resampy) (1.16.3)\r\nRequirement already satisfied: scipy>=0.13 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from resampy) (1.2.1)\r\n",
      "Requirement already satisfied: numba>=0.32 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from resampy) (0.43.1)\r\nRequirement already satisfied: gast>=0.2.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (0.2.2)\r\nRequirement already satisfied: termcolor>=1.1.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (1.1.0)\r\nRequirement already satisfied: tensorflow-estimator<1.14.0rc0,>=1.13.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (1.13.0)\r\nRequirement already satisfied: wheel>=0.26 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (0.33.1)\r\nRequirement already satisfied: grpcio>=1.8.6 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (1.20.1)\r\nRequirement already satisfied: astor>=0.6.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (0.7.1)\r\nRequirement already satisfied: absl-py>=0.1.6 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (0.7.1)\r\nRequirement already satisfied: protobuf>=3.6.1 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (3.7.1)\r\nRequirement already satisfied: keras-applications>=1.0.6 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (1.0.7)\r\nRequirement already satisfied: keras-preprocessing>=1.0.5 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (1.0.9)\r\nRequirement already satisfied: tensorboard<1.14.0,>=1.13.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (1.13.1)\r\n",
      "Requirement already satisfied: cffi>=1.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from soundfile) (1.12.3)\r\nRequirement already satisfied: llvmlite>=0.28.0dev0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from numba>=0.32->resampy) (0.28.0)\r\nRequirement already satisfied: mock>=2.0.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow-estimator<1.14.0rc0,>=1.13.0->tensorflow) (3.0.5)\r\nRequirement already satisfied: setuptools in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from protobuf>=3.6.1->tensorflow) (41.0.1)\r\nRequirement already satisfied: h5py in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from keras-applications>=1.0.6->tensorflow) (2.9.0)\r\nRequirement already satisfied: werkzeug>=0.11.15 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorboard<1.14.0,>=1.13.0->tensorflow) (0.15.2)\r\nRequirement already satisfied: markdown>=2.6.8 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorboard<1.14.0,>=1.13.0->tensorflow) (3.1)\r\nRequirement already satisfied: pycparser in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from cffi>=1.0->soundfile) (2.19)\r\n",
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\r\n                                 Dload  Upload   Total   Spent    Left  Speed\r\n\r  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0",
      "\r  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0",
      "\r  0  277M    0  3870    0     0   2314      0 34:56:43  0:00:01 34:56:42  2313",
      "\r  1  277M    1 4096k    0     0  1413k      0  0:03:21  0:00:02  0:03:19 1413k",
      "\r  4  277M    4 13.1M    0     0  4110k      0  0:01:09  0:00:03  0:01:06 4110k",
      "\r 13  277M   13 37.2M    0     0  8940k      0  0:00:31  0:00:04  0:00:27 8939k",
      "\r 25  277M   25 71.5M    0     0  13.5M      0  0:00:20  0:00:05  0:00:15 17.7M",
      "\r 34  277M   34 96.8M    0     0  15.4M      0  0:00:17  0:00:06  0:00:11 21.0M",
      "\r 43  277M   43  120M    0     0  15.8M      0  0:00:17  0:00:07  0:00:10 24.6M",
      "\r 48  277M   48  136M    0     0  16.4M      0  0:00:16  0:00:08  0:00:08 24.4M",
      "\r 60  277M   60  166M    0     0  17.9M      0  0:00:15  0:00:09  0:00:06 25.8M",
      "\r 71  277M   71  197M    0     0  19.2M      0  0:00:14  0:00:10  0:00:04 25.1M",
      "\r 76  277M   76  212M    0     0  18.8M      0  0:00:14  0:00:11  0:00:03 23.0M",
      "\r 83  277M   83  232M    0     0  17.8M      0  0:00:15  0:00:12  0:00:03 20.8M",
      "\r 86  277M   86 ",
      " 240M    0     0  18.0M      0  0:00:15  0:00:13  0:00:02 20.8M",
      "\r 95  277M   95  264M    0     0  18.2M      0  0:00:15  0:00:14  0:00:01 18.6M",
      "\r100  277M  100  277M    0     0  18.6M      0  0:00:14  0:00:14 --:--:-- 17.3M\r\n",
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\r\n                                 Dload  Upload   Total   Spent    Left  Speed\r\n\r  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0",
      "\r  1 73020    1  1284    0     0   3139      0  0:00:23 --:--:--  0:00:23  3139\r100 73020  100 73020    0     0   163k      0 --:--:-- --:--:-- --:--:--  163k\r\n"
     ],
     "output_type": "stream"
    }
   ],
   "source": [
    "\"\"\"\n",
    "This notebook demonstrates how to replicate converting tensorflow\n",
    "weights from tensorflow's vggish to torchvggish\n",
    "\"\"\"\n",
    "\n",
    "# Download the audioset directory using subversion\n",
    "# !apt-get -qq install subversion   # uncomment if on linux\n",
    "!svn checkout https://github.com/tensorflow/models/trunk/research/audioset\n",
    "\n",
    "# Download audioset requirements.\n",
    "# Use %pip (not !pip) so the packages install into the kernel's own\n",
    "# environment rather than whichever pip is first on the shell PATH.\n",
    "%pip install numpy scipy\n",
    "%pip install resampy tensorflow six soundfile\n",
    "\n",
    "# grab the VGGish model checkpoints & PCA params\n",
    "!curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt\n",
    "!curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "text": [
      "\nWARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.\nFor more information, please see:\n  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n  * https://github.com/tensorflow/addons\nIf you depend on functionality not listed there, please file an issue.\n\n\nTesting your install of VGGish\n\n",
      "Log Mel Spectrogram example:  [[-4.47297436 -4.29457354 -4.14940631 ... -3.9747003  -3.94774997\n  -3.78687669]\n [-4.48589533 -4.28825497 -4.139964   ... -3.98368686 -3.94976505\n  -3.7951698 ]\n [-4.46158065 -4.29329706 -4.14905953 ... -3.96442484 -3.94895483\n  -3.78619839]\n ...\n [-4.46152626 -4.29365061 -4.14848608 ... -3.96638113 -3.95057575\n  -3.78538167]\n [-4.46152595 -4.2936572  -4.14848104 ... -3.96640507 -3.95059567\n  -3.78537143]\n [-4.46152565 -4.29366386 -4.14847603 ... -3.96642906 -3.95061564\n  -3.78536116]]\nWARNING:tensorflow:From /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\nInstructions for updating:\nColocations handled automatically by placer.\n",
      "WARNING:tensorflow:From /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages/tensorflow/contrib/layers/python/layers/layers.py:1624: flatten (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\nInstructions for updating:\nUse keras.layers.flatten instead.\n",
      "WARNING:tensorflow:From /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.\nInstructions for updating:\nUse standard file APIs to check for files with this prefix.\n",
      "INFO:tensorflow:Restoring parameters from vggish_model.ckpt\n",
      "VGGish embedding:  [0.         0.         0.         0.         0.         0.\n 0.         0.16137293 0.         0.         0.         0.\n 0.         0.         0.         0.         0.         0.80695796\n 0.         0.         0.         0.         0.         0.\n 0.         0.36792755 0.03582409 0.         0.         0.\n 0.         0.38027024 0.1375593  0.9174708  0.8065634  0.\n 0.         0.         0.         0.04036281 0.7076243  0.\n 0.497839   0.24081808 0.21565434 0.88492286 1.19568    0.6706197\n 0.20779458 0.01639861 0.17471863 0.         0.         0.25100806\n 0.         0.         0.14607918 0.         0.39887053 0.30542105\n 0.12896761 0.         0.         0.         0.         0.\n 0.5385133  0.         0.         0.04941072 0.42527416 0.18537284\n 0.         0.         0.14753515 0.         0.         0.69933873\n 0.45541188 0.05174822 0.         0.01992539 0.         0.\n 0.5181578  0.565576   0.6587975  0.         0.         0.41056332\n 0.         0.         0.         0.25765193 0.23232114 0.24026448\n 0.         0.         0.         0.         0.         0.26523757\n 0.         0.48460823 0.         0.         0.19325787 0.\n 0.20123348 0.         0.03368621 0.         0.         0.\n 0.         0.17836356 0.024749   0.06889972 0.         0.\n 0.         0.08246281 0.         0.         0.         0.\n 0.         0.        ]\nPostprocessed VGGish embedding:  [169  10 154 127 191  66 124  69 157 232 142  21 128 131  43   3  33 111\n 198 153  76 255 194  60  71 179 146 131 167  60  79  76 192  84 102 160\n  23  91 173  13 149 186 115 202 252 163  84 145 107 255   5 198  81   0\n 203 110  35 104 101 131 255   0   0 158 136  74 115 152  77 154  54 151\n  82 243  57 116 165 153  85 181 152   0 255 122  29 255  46 105 110  43\n   0  90  58  13 255 108  96 255  84 121 255  75 176 111 176  64  83 231\n 255  82 255  94  81 144  99 173 255   0   0 158  31 230 112 255   0 255\n  20 255]\n\nLooks Good To Me!\n\n"
     ],
     "output_type": "stream"
    }
   ],
   "source": [
    "# Test install: the smoke-test script executes at import time, so a\n",
    "# plain import runs it without the namespace pollution a wildcard\n",
    "# `from vggish_smoke_test import *` would cause.\n",
    "!mv audioset/* .\n",
    "import vggish_smoke_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "text": [
      "INFO:tensorflow:Restoring parameters from vggish_model.ckpt\n",
      "vggish/conv1/weights:0\n\t(3, 3, 1, 64)\nvggish/conv1/biases:0\n\t(64,)\nvggish/conv2/weights:0\n\t(3, 3, 64, 128)\nvggish/conv2/biases:0\n\t(128,)\nvggish/conv3/conv3_1/weights:0\n\t(3, 3, 128, 256)\nvggish/conv3/conv3_1/biases:0\n\t(256,)\nvggish/conv3/conv3_2/weights:0\n\t(3, 3, 256, 256)\nvggish/conv3/conv3_2/biases:0\n\t(256,)\nvggish/conv4/conv4_1/weights:0\n\t(3, 3, 256, 512)\nvggish/conv4/conv4_1/biases:0\n\t(512,)\nvggish/conv4/conv4_2/weights:0\n\t(3, 3, 512, 512)\nvggish/conv4/conv4_2/biases:0\n\t(512,)\nvggish/fc1/fc1_1/weights:0\n\t(12288, 4096)\nvggish/fc1/fc1_1/biases:0\n\t(4096,)\nvggish/fc1/fc1_2/weights:0\n\t(4096, 4096)\nvggish/fc1/fc1_2/biases:0\n\t(4096,)\nvggish/fc2/weights:0\n\t(4096, 128)\nvggish/fc2/biases:0\n\t(128,)\nvalues written to vggish_dict\n"
     ],
     "output_type": "stream"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import vggish_slim\n",
    "\n",
    "vggish_dict = {}\n",
    "# Load the checkpoint into a fresh graph and copy each trainable\n",
    "# variable's value (a numpy array) into `vggish_dict`, keyed by name.\n",
    "with tf.Graph().as_default(), tf.Session() as sess:\n",
    "    vggish_slim.define_vggish_slim(training=True)\n",
    "    vggish_slim.load_vggish_slim_checkpoint(sess, \"vggish_model.ckpt\")\n",
    "\n",
    "    trainable = tf.trainable_variables()\n",
    "    values = sess.run(trainable)\n",
    "\n",
    "    for variable, value in zip(trainable, values):\n",
    "        print(variable.name)\n",
    "        print(\"\\t\" + str(variable.shape))\n",
    "        vggish_dict[variable.name] = value\n",
    "    print(\"values written to vggish_dict\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Define torch model for vggish\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import numpy as np\n",
    "\n",
    "# From vggish_slim:\n",
    "# The VGG stack of alternating convolutions and max-pools.\n",
    "#     net = slim.conv2d(net, 64, scope='conv1')\n",
    "#     net = slim.max_pool2d(net, scope='pool1')\n",
    "#     net = slim.conv2d(net, 128, scope='conv2')\n",
    "#     net = slim.max_pool2d(net, scope='pool2')\n",
    "#     net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')\n",
    "#     net = slim.max_pool2d(net, scope='pool3')\n",
    "#     net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')\n",
    "#     net = slim.max_pool2d(net, scope='pool4')\n",
    "#     # Flatten before entering fully-connected layers\n",
    "#     net = slim.flatten(net)\n",
    "#     net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')\n",
    "#     # The embedding layer.\n",
    "#     net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')\n",
    "\n",
    "# Checkpoint values in layer-creation order (Python 3.7 dicts preserve\n",
    "# insertion order, and `vggish_dict` was filled in that order above).\n",
    "vggish_list = list(vggish_dict.values())\n",
    "\n",
    "def next_param():\n",
    "    \"\"\"Pop the next TF value, transpose it to PyTorch layout, and\n",
    "    return it wrapped as an nn.Parameter.\n",
    "\n",
    "    NOTE(review): np.transpose with no axes reverses ALL axes, so a TF\n",
    "    conv kernel (H, W, in, out) becomes (out, in, W, H) -- H and W are\n",
    "    swapped. The 3x3 kernels used here are square, so the shapes still\n",
    "    line up; confirm the spatial transpose is intended before reusing\n",
    "    this on models with non-square kernels.\n",
    "    \"\"\"\n",
    "    param = vggish_list.pop(0)\n",
    "    return torch.nn.Parameter(torch.from_numpy(np.transpose(param)))\n",
    "\n",
    "class VGGish(nn.Module):\n",
    "    \"\"\"PyTorch port of the VGGish architecture from vggish_slim.\"\"\"\n",
    "    def __init__(self):\n",
    "        super(VGGish, self).__init__()\n",
    "        self.features = nn.Sequential(\n",
    "            nn.Conv2d(1, 64, 3, 1, 1),\n",
    "            nn.ReLU(inplace=True),\n",
    "            nn.MaxPool2d(2, 2),\n",
    "            nn.Conv2d(64, 128, 3, 1, 1),\n",
    "            nn.ReLU(inplace=True),\n",
    "            nn.MaxPool2d(2, 2),\n",
    "            nn.Conv2d(128, 256, 3, 1, 1),\n",
    "            nn.ReLU(inplace=True),\n",
    "            nn.Conv2d(256, 256, 3, 1, 1),\n",
    "            nn.ReLU(inplace=True),\n",
    "            nn.MaxPool2d(2, 2),\n",
    "            nn.Conv2d(256, 512, 3, 1, 1),\n",
    "            nn.ReLU(inplace=True),\n",
    "            nn.Conv2d(512, 512, 3, 1, 1),\n",
    "            nn.ReLU(inplace=True),\n",
    "            nn.MaxPool2d(2, 2))\n",
    "        self.embeddings = nn.Sequential(\n",
    "            nn.Linear(512 * 24, 4096),\n",
    "            nn.ReLU(inplace=True),\n",
    "            nn.Linear(4096, 4096),\n",
    "            nn.ReLU(inplace=True),\n",
    "            nn.Linear(4096, 128),\n",
    "            nn.ReLU(inplace=True))\n",
    "\n",
    "        # Copy checkpoint values into every Conv2d/Linear layer, in order.\n",
    "        # (The original used next(param_generator()), re-creating the\n",
    "        # generator on every call; it only worked because the generator\n",
    "        # mutated the shared list. A plain function makes the pop explicit.\n",
    "        # A positive isinstance check replaces the string type-name test.)\n",
    "        for seq in (self.features, self.embeddings):\n",
    "            for layer in seq:\n",
    "                if isinstance(layer, (nn.Conv2d, nn.Linear)):\n",
    "                    layer.weight = next_param()\n",
    "                    layer.bias = next_param()\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.features(x)\n",
    "        x = x.view(x.size(0), -1)\n",
    "        x = self.embeddings(x)\n",
    "        return x\n",
    "\n",
    "net = VGGish()\n",
    "net.eval()\n",
    "\n",
    "# Save weights to disk\n",
    "torch.save(net.state_dict(), \"./vggish.pth\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}