{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "name": "stdout", "text": [ "A audioset/README.md\r\nA audioset/mel_features.py\r\nA audioset/vggish_inference_demo.py\r\nA audioset/vggish_input.py\r\nA audioset/vggish_params.py\r\nA audioset/vggish_postprocess.py\r\nA audioset/vggish_slim.py\r\nA audioset/vggish_smoke_test.py\r\n", "A audioset/vggish_train_demo.py\r\n", "Checked out revision 9495.\r\n", "Requirement already satisfied: numpy in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (1.16.3)\r\nRequirement already satisfied: scipy in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (1.2.1)\r\n", "Requirement already satisfied: resampy in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (0.2.1)\r\nRequirement already satisfied: tensorflow in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (1.13.1)\r\nRequirement already satisfied: six in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (1.12.0)\r\nRequirement already satisfied: soundfile in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (0.10.2)\r\nRequirement already satisfied: numpy>=1.10 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from resampy) (1.16.3)\r\nRequirement already satisfied: scipy>=0.13 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from resampy) (1.2.1)\r\n", "Requirement already satisfied: numba>=0.32 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from resampy) (0.43.1)\r\nRequirement already satisfied: gast>=0.2.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (0.2.2)\r\nRequirement already satisfied: termcolor>=1.1.0 in 
/Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (1.1.0)\r\nRequirement already satisfied: tensorflow-estimator<1.14.0rc0,>=1.13.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (1.13.0)\r\nRequirement already satisfied: wheel>=0.26 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (0.33.1)\r\nRequirement already satisfied: grpcio>=1.8.6 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (1.20.1)\r\nRequirement already satisfied: astor>=0.6.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (0.7.1)\r\nRequirement already satisfied: absl-py>=0.1.6 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (0.7.1)\r\nRequirement already satisfied: protobuf>=3.6.1 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (3.7.1)\r\nRequirement already satisfied: keras-applications>=1.0.6 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (1.0.7)\r\nRequirement already satisfied: keras-preprocessing>=1.0.5 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (1.0.9)\r\nRequirement already satisfied: tensorboard<1.14.0,>=1.13.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow) (1.13.1)\r\n", "Requirement already satisfied: cffi>=1.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from soundfile) (1.12.3)\r\nRequirement already satisfied: llvmlite>=0.28.0dev0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from numba>=0.32->resampy) (0.28.0)\r\nRequirement already satisfied: 
mock>=2.0.0 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorflow-estimator<1.14.0rc0,>=1.13.0->tensorflow) (3.0.5)\r\nRequirement already satisfied: setuptools in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from protobuf>=3.6.1->tensorflow) (41.0.1)\r\nRequirement already satisfied: h5py in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from keras-applications>=1.0.6->tensorflow) (2.9.0)\r\nRequirement already satisfied: werkzeug>=0.11.15 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorboard<1.14.0,>=1.13.0->tensorflow) (0.15.2)\r\nRequirement already satisfied: markdown>=2.6.8 in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from tensorboard<1.14.0,>=1.13.0->tensorflow) (3.1)\r\nRequirement already satisfied: pycparser in /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages (from cffi>=1.0->soundfile) (2.19)\r\n", " % Total % Received % Xferd Average Speed Time Time Time Current\r\n Dload Upload Total Spent Left Speed\r\n\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0", "\r 0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0", "\r 0 277M 0 3870 0 0 2314 0 34:56:43 0:00:01 34:56:42 2313", "\r 1 277M 1 4096k 0 0 1413k 0 0:03:21 0:00:02 0:03:19 1413k", "\r 4 277M 4 13.1M 0 0 4110k 0 0:01:09 0:00:03 0:01:06 4110k", "\r 13 277M 13 37.2M 0 0 8940k 0 0:00:31 0:00:04 0:00:27 8939k", "\r 25 277M 25 71.5M 0 0 13.5M 0 0:00:20 0:00:05 0:00:15 17.7M", "\r 34 277M 34 96.8M 0 0 15.4M 0 0:00:17 0:00:06 0:00:11 21.0M", "\r 43 277M 43 120M 0 0 15.8M 0 0:00:17 0:00:07 0:00:10 24.6M", "\r 48 277M 48 136M 0 0 16.4M 0 0:00:16 0:00:08 0:00:08 24.4M", "\r 60 277M 60 166M 0 0 17.9M 0 0:00:15 0:00:09 0:00:06 25.8M", "\r 71 277M 71 197M 0 0 19.2M 0 0:00:14 0:00:10 0:00:04 25.1M", "\r 76 277M 76 212M 0 0 18.8M 0 0:00:14 0:00:11 0:00:03 23.0M", "\r 83 277M 83 
232M 0 0 17.8M 0 0:00:15 0:00:12 0:00:03 20.8M", "\r 86 277M 86 ", " 240M 0 0 18.0M 0 0:00:15 0:00:13 0:00:02 20.8M", "\r 95 277M 95 264M 0 0 18.2M 0 0:00:15 0:00:14 0:00:01 18.6M", "\r100 277M 100 277M 0 0 18.6M 0 0:00:14 0:00:14 --:--:-- 17.3M\r\n", " % Total % Received % Xferd Average Speed Time Time Time Current\r\n Dload Upload Total Spent Left Speed\r\n\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0", "\r 1 73020 1 1284 0 0 3139 0 0:00:23 --:--:-- 0:00:23 3139\r100 73020 100 73020 0 0 163k 0 --:--:-- --:--:-- --:--:-- 163k\r\n" ], "output_type": "stream" } ], "source": [ "\"\"\"\n", "This notebook demonstrates how to replicate converting tensorflow\n", "weights from tensorflow's vggish to torchvggish\n", "\"\"\" \n", "\n", "# Download the audioset directory using subversion\n", "# !apt-get -qq install subversion # uncomment if on linux\n", "!svn checkout https://github.com/tensorflow/models/trunk/research/audioset\n", "\n", "# Download audioset requirements\n", "!pip install numpy scipy\n", "!pip install resampy tensorflow six soundfile\n", "\n", "# grab the VGGish model checkpoints & PCA params\n", "!curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt\n", "!curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "name": "stdout", "text": [ "\nWARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.\nFor more information, please see:\n * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n * https://github.com/tensorflow/addons\nIf you depend on functionality not listed there, please file an issue.\n\n\nTesting your install of VGGish\n\n", "Log Mel Spectrogram example: [[-4.47297436 -4.29457354 -4.14940631 ... -3.9747003 -3.94774997\n -3.78687669]\n [-4.48589533 -4.28825497 -4.139964 ... 
-3.98368686 -3.94976505\n -3.7951698 ]\n [-4.46158065 -4.29329706 -4.14905953 ... -3.96442484 -3.94895483\n -3.78619839]\n ...\n [-4.46152626 -4.29365061 -4.14848608 ... -3.96638113 -3.95057575\n -3.78538167]\n [-4.46152595 -4.2936572 -4.14848104 ... -3.96640507 -3.95059567\n -3.78537143]\n [-4.46152565 -4.29366386 -4.14847603 ... -3.96642906 -3.95061564\n -3.78536116]]\nWARNING:tensorflow:From /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\nInstructions for updating:\nColocations handled automatically by placer.\n", "WARNING:tensorflow:From /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages/tensorflow/contrib/layers/python/layers/layers.py:1624: flatten (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\nInstructions for updating:\nUse keras.layers.flatten instead.\n", "WARNING:tensorflow:From /Users/harrisontaylor/.conda/envs/audioset-experiments/lib/python3.7/site-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.\nInstructions for updating:\nUse standard file APIs to check for files with this prefix.\n", "INFO:tensorflow:Restoring parameters from vggish_model.ckpt\n", "VGGish embedding: [0. 0. 0. 0. 0. 0.\n 0. 0.16137293 0. 0. 0. 0.\n 0. 0. 0. 0. 0. 0.80695796\n 0. 0. 0. 0. 0. 0.\n 0. 0.36792755 0.03582409 0. 0. 0.\n 0. 0.38027024 0.1375593 0.9174708 0.8065634 0.\n 0. 0. 0. 0.04036281 0.7076243 0.\n 0.497839 0.24081808 0.21565434 0.88492286 1.19568 0.6706197\n 0.20779458 0.01639861 0.17471863 0. 0. 0.25100806\n 0. 0. 0.14607918 0. 0.39887053 0.30542105\n 0.12896761 0. 0. 0. 0. 0.\n 0.5385133 0. 0. 0.04941072 0.42527416 0.18537284\n 0. 0. 0.14753515 0. 0. 
0.69933873\n 0.45541188 0.05174822 0. 0.01992539 0. 0.\n 0.5181578 0.565576 0.6587975 0. 0. 0.41056332\n 0. 0. 0. 0.25765193 0.23232114 0.24026448\n 0. 0. 0. 0. 0. 0.26523757\n 0. 0.48460823 0. 0. 0.19325787 0.\n 0.20123348 0. 0.03368621 0. 0. 0.\n 0. 0.17836356 0.024749 0.06889972 0. 0.\n 0. 0.08246281 0. 0. 0. 0.\n 0. 0. ]\nPostprocessed VGGish embedding: [169 10 154 127 191 66 124 69 157 232 142 21 128 131 43 3 33 111\n 198 153 76 255 194 60 71 179 146 131 167 60 79 76 192 84 102 160\n 23 91 173 13 149 186 115 202 252 163 84 145 107 255 5 198 81 0\n 203 110 35 104 101 131 255 0 0 158 136 74 115 152 77 154 54 151\n 82 243 57 116 165 153 85 181 152 0 255 122 29 255 46 105 110 43\n 0 90 58 13 255 108 96 255 84 121 255 75 176 111 176 64 83 231\n 255 82 255 94 81 144 99 173 255 0 0 158 31 230 112 255 0 255\n 20 255]\n\nLooks Good To Me!\n\n" ], "output_type": "stream" } ], "source": [ "# Test install\n", "!mv audioset/* .\n", "from vggish_smoke_test import *" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "name": "stdout", "text": [ "INFO:tensorflow:Restoring parameters from vggish_model.ckpt\n", "vggish/conv1/weights:0\n\t(3, 3, 1, 64)\nvggish/conv1/biases:0\n\t(64,)\nvggish/conv2/weights:0\n\t(3, 3, 64, 128)\nvggish/conv2/biases:0\n\t(128,)\nvggish/conv3/conv3_1/weights:0\n\t(3, 3, 128, 256)\nvggish/conv3/conv3_1/biases:0\n\t(256,)\nvggish/conv3/conv3_2/weights:0\n\t(3, 3, 256, 256)\nvggish/conv3/conv3_2/biases:0\n\t(256,)\nvggish/conv4/conv4_1/weights:0\n\t(3, 3, 256, 512)\nvggish/conv4/conv4_1/biases:0\n\t(512,)\nvggish/conv4/conv4_2/weights:0\n\t(3, 3, 512, 512)\nvggish/conv4/conv4_2/biases:0\n\t(512,)\nvggish/fc1/fc1_1/weights:0\n\t(12288, 4096)\nvggish/fc1/fc1_1/biases:0\n\t(4096,)\nvggish/fc1/fc1_2/weights:0\n\t(4096, 4096)\nvggish/fc1/fc1_2/biases:0\n\t(4096,)\nvggish/fc2/weights:0\n\t(4096, 128)\nvggish/fc2/biases:0\n\t(128,)\nvalues written to vggish_dict\n" ], "output_type": "stream" 
} ], "source": [
 "import tensorflow as tf\n",
 "import vggish_slim\n",
 "\n",
 "vggish_dict = {}\n",
 "# load the model and get info \n",
 "with tf.Graph().as_default(), tf.Session() as sess:\n",
 "    vggish_slim.define_vggish_slim(training=True)\n",
 "    vggish_slim.load_vggish_slim_checkpoint(sess, \"vggish_model.ckpt\")\n",
 "\n",
 "    tvars = tf.trainable_variables()\n",
 "    tvars_vals = sess.run(tvars)\n",
 "\n",
 "    for var, val in zip(tvars, tvars_vals):\n",
 "        print(\"%s\" % (var.name))\n",
 "        print(\"\\t\" + str(var.shape))\n",
 "        vggish_dict[var.name] = val\n",
 "    print(\"values written to vggish_dict\")"
 ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [
 "# Define the torch model for VGGish and copy the checkpoint weights into it.\n",
 "\n",
 "import torch\n",
 "import torch.nn as nn\n",
 "import numpy as np\n",
 "\n",
 "# Architecture reference (from vggish_slim.define_vggish_slim):\n",
 "#   conv1(64) -> pool -> conv2(128) -> pool -> conv3_1/2(256) -> pool\n",
 "#   -> conv4_1/2(512) -> pool -> flatten -> fc1_1/2(4096) -> fc2(128)\n",
 "\n",
 "def to_torch_param(array):\n",
 "    \"\"\"Convert one TF checkpoint array into an nn.Parameter in PyTorch layout.\n",
 "\n",
 "    TF stores conv kernels as (H, W, in, out) and fc weights as (in, out);\n",
 "    PyTorch expects (out, in, H, W) and (out, in).  A blanket np.transpose\n",
 "    would also swap the H and W axes of every 3x3 conv kernel (the shapes\n",
 "    still match, so the bug would load silently), hence the explicit axis\n",
 "    permutation for the 4-D case.\n",
 "    \"\"\"\n",
 "    if array.ndim == 4:                      # conv kernel\n",
 "        array = array.transpose(3, 2, 0, 1)  # (H,W,in,out) -> (out,in,H,W)\n",
 "    elif array.ndim == 2:                    # fully-connected weight\n",
 "        array = array.transpose()            # (in,out) -> (out,in)\n",
 "    # 1-D biases need no rearranging\n",
 "    return nn.Parameter(torch.from_numpy(np.ascontiguousarray(array)))\n",
 "\n",
 "class VGGish(nn.Module):\n",
 "    \"\"\"PyTorch port of the VGGish feature extractor (128-D embeddings).\"\"\"\n",
 "\n",
 "    def __init__(self):\n",
 "        super(VGGish, self).__init__()\n",
 "        self.features = nn.Sequential(\n",
 "            nn.Conv2d(1, 64, 3, 1, 1),\n",
 "            nn.ReLU(inplace=True),\n",
 "            nn.MaxPool2d(2, 2),\n",
 "            nn.Conv2d(64, 128, 3, 1, 1),\n",
 "            nn.ReLU(inplace=True),\n",
 "            nn.MaxPool2d(2, 2),\n",
 "            nn.Conv2d(128, 256, 3, 1, 1),\n",
 "            nn.ReLU(inplace=True),\n",
 "            nn.Conv2d(256, 256, 3, 1, 1),\n",
 "            nn.ReLU(inplace=True),\n",
 "            nn.MaxPool2d(2, 2),\n",
 "            nn.Conv2d(256, 512, 3, 1, 1),\n",
 "            nn.ReLU(inplace=True),\n",
 "            nn.Conv2d(512, 512, 3, 1, 1),\n",
 "            nn.ReLU(inplace=True),\n",
 "            nn.MaxPool2d(2, 2))\n",
 "        self.embeddings = nn.Sequential(\n",
 "            nn.Linear(512 * 24, 4096),\n",
 "            nn.ReLU(inplace=True),\n",
 "            nn.Linear(4096, 4096),\n",
 "            nn.ReLU(inplace=True),\n",
 "            nn.Linear(4096, 128),\n",
 "            nn.ReLU(inplace=True))\n",
 "\n",
 "        # Copy the checkpoint weights in checkpoint-definition order (the\n",
 "        # dict built in the previous cell preserves insertion order).\n",
 "        params = iter(vggish_dict.values())\n",
 "        for seq in (self.features, self.embeddings):\n",
 "            for layer in seq:\n",
 "                if isinstance(layer, (nn.Conv2d, nn.Linear)):\n",
 "                    layer.weight = to_torch_param(next(params))\n",
 "                    layer.bias = to_torch_param(next(params))\n",
 "\n",
 "        # TF flattens the (N, 6, 4, 512) NHWC activation before fc1_1, while\n",
 "        # forward() below flattens the NCHW equivalent (N, 512, 6, 4), so the\n",
 "        # input ordering of the first Linear layer must be permuted to match.\n",
 "        fc1 = self.embeddings[0]\n",
 "        fc1_weight = (fc1.weight.data.view(4096, 6, 4, 512)\n",
 "                      .permute(0, 3, 1, 2).reshape(4096, 512 * 24).contiguous())\n",
 "        fc1.weight = nn.Parameter(fc1_weight)\n",
 "\n",
 "    def forward(self, x):\n",
 "        # x is a log-mel patch batch, (N, 1, 96, 64) per vggish_input framing\n",
 "        x = self.features(x)\n",
 "        x = x.view(x.size(0), -1)\n",
 "        x = self.embeddings(x)\n",
 "        return x\n",
 "\n",
 "net = VGGish()\n",
 "net.eval()\n",
 "\n",
 "# Save the converted weights to disk\n",
 "torch.save(net.state_dict(), \"./vggish.pth\")"
 ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" }, "pycharm": { "stem_cell": { "cell_type": "raw", "source": [], "metadata": { "collapsed": false } } } }, "nbformat": 4, "nbformat_minor": 1 }