Spaces:

onlycaps
/

audio_palette

Sleeping

App Files Community

manasch committed on Nov 28, 2023

Commit

17d2808

verified ·

1 Parent(s): c37043e

add pace model training notebook

Browse files

Files changed (3) hide show

.gitignore +2 -0
app.py +48 -10
notebooks/PaceModel.ipynb +584 -0

.gitignore CHANGED Viewed

@@ -3,3 +3,5 @@ __pycache__
 *.jpg
 *.png

 *.jpg
 *.png
+*.log

app.py CHANGED Viewed

@@ -18,9 +18,17 @@ class AudioPalette:
         self.image_captioning = ImageCaptioning()
     def generate(self, input_image: PIL.Image.Image):
-        generated_text = self.image_captioning.query(input_image)[0].get("generated_text")
         pace = self.pace_model.predict(input_image)
-        return pace + (" - " + generated_text if generated_text is not None else "")
 def main():
     model = AudioPalette()
@@ -33,14 +41,44 @@ def main():
             show_label=True,
             container=True
         ),
-        outputs=gr.Textbox(
-            lines=1,
-            placeholder="Pace of the image and the caption",
-            label="Caption and Pace",
-            show_label=True,
-            container=True,
-            type="text"
-        ),
         cache_examples=False,
         live=False,
         title="Audio Palette",

         self.image_captioning = ImageCaptioning()
     def generate(self, input_image: PIL.Image.Image):
         pace = self.pace_model.predict(input_image)
+        print("Pace Prediction Done")
+        generated_text = self.image_captioning.query(input_image)[0].get("generated_text")
+        print("Captioning Done")
+        generated_text = generated_text if generated_text is not None else ""
+        temp = pace + " - " + generated_text
+        outputs = [temp, pace, generated_text]
+        return outputs
 def main():
     model = AudioPalette()
             show_label=True,
             container=True
         ),
+        outputs=[
+            gr.Textbox(
+                lines=1,
+                placeholder="Pace of the image and the caption",
+                label="Caption and Pace",
+                show_label=True,
+                container=True,
+                type="text",
+                visible=True
+            ),
+            gr.Textbox(
+                lines=1,
+                placeholder="Pace of the image",
+                label="Pace",
+                show_label=True,
+                container=True,
+                type="text",
+                visible=False
+            ),
+            gr.Textbox(
+                lines=1,
+                placeholder="Caption for the image",
+                label="Caption",
+                show_label=True,
+                container=True,
+                type="text",
+                visible=False
+            ),
+            # gr.Audio(
+            #     label="Generated Audio",
+            #     show_label=True,
+            #     container=True,
+            #     visible=False,
+            #     format="wav",
+            #     autoplay=False,
+            #     show_download_button=True,
+            # )
+        ],
         cache_examples=False,
         live=False,
         title="Audio Palette",

notebooks/PaceModel.ipynb ADDED Viewed

	@@ -0,0 +1,584 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "import json\n",
+        "import shutil\n",
+        "from pathlib import Path\n",
+        "from keras.applications.resnet50 import ResNet50\n",
+        "\n",
+        "import cv2\n",
+        "import matplotlib.pyplot as plt\n",
+        "import pandas as pd\n",
+        "\n",
+        "from google.colab import files\n",
+        "from google.colab.patches import cv2_imshow"
+      ],
+      "metadata": {
+        "id": "wzZknIqDEwBg"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "kaggle_token = Path(\"/root/.kaggle/kaggle.json\")\n",
+        "if not kaggle_token.parent.exists():\n",
+        "    kaggle_token.parent.mkdir()\n",
+        "if not kaggle_token.exists():\n",
+        "    print(\"Upload token:\")\n",
+        "    files.upload()\n",
+        "    shutil.move((Path.cwd() / \"kaggle.json\").as_posix(), kaggle_token.resolve().as_posix())"
+      ],
+      "metadata": {
+        "id": "c0SGUkuUEy6n"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!chmod 600 /root/.kaggle/kaggle.json"
+      ],
+      "metadata": {
+        "id": "yufGnL24E0gf"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!kaggle d download srbhshinde/flickr8k-sau"
+      ],
+      "metadata": {
+        "id": "84kESXJrE7Zn"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!7z x flickr8k-sau.zip"
+      ],
+      "metadata": {
+        "id": "NOyPR6EnE80f"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "output = Path.cwd() / 'images'\n",
+        "if not output.exists():\n",
+        "    output.mkdir()\n",
+        "\n",
+        "fast = output / 'fast'\n",
+        "med = output / 'medium'\n",
+        "slow = output / 'slow'\n",
+        "\n",
+        "if not fast.exists():\n",
+        "    fast.mkdir()\n",
+        "\n",
+        "if not med.exists():\n",
+        "    med.mkdir()\n",
+        "\n",
+        "if not slow.exists():\n",
+        "    slow.mkdir()\n",
+        "\n",
+        "counter = 0\n",
+        "\n",
+        "with open(\"finalDataset.csv\") as f:\n",
+        "    f.readline()\n",
+        "    image_path = Path.cwd() / 'Flickr_Data' / 'Images'\n",
+        "    for line in f:\n",
+        "        idx, image_name, pace = line.strip().split(',')\n",
+        "        if pace == 'slow':\n",
+        "            shutil.copy2(image_path / image_name, slow)\n",
+        "        elif pace == 'fast':\n",
+        "            shutil.copy2(image_path / image_name, fast)\n",
+        "        else:\n",
+        "            shutil.copy2(image_path / image_name, med)\n",
+        "        counter += 1\n",
+        "\n",
+        "print(counter)"
+      ],
+      "metadata": {
+        "id": "dY034gfPE9wW"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "import numpy as np\n",
+        "import os\n",
+        "import PIL\n",
+        "import tensorflow as tf\n",
+        "from tensorflow import keras\n",
+        "from tensorflow.keras import layers\n",
+        "from tensorflow.python.keras.layers import Dense, Flatten\n",
+        "from tensorflow.keras.models import Sequential\n",
+        "from tensorflow.keras.optimizers import Adam"
+      ],
+      "metadata": {
+        "id": "G4uVyuXJGKDj"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "a2Zst5ytENY3"
+      },
+      "outputs": [],
+      "source": [
+        "import pathlib\n",
+        "data_dir = 'images/'\n",
+        "data_dir = pathlib.Path(data_dir)\n",
+        "bg = list(data_dir.glob('medium/*'))\n",
+        "print(bg[0])\n",
+        "PIL.Image.open(str(bg[0]))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "data_dir = 'images/'\n",
+        "img_height, img_width = 224,224\n",
+        "batch_size = 32\n",
+        "train_ds = tf.keras.preprocessing.image_dataset_from_directory(\n",
+        "    data_dir,\n",
+        "    validation_split = 0.2,\n",
+        "    subset = \"training\",\n",
+        "    seed = 345,\n",
+        "    label_mode = 'categorical',\n",
+        "    image_size = (img_height, img_width),\n",
+        "    batch_size = batch_size\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "xlGw3fN5Eb24"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "val_ds = tf.keras.preprocessing.image_dataset_from_directory(\n",
+        "    data_dir,\n",
+        "    validation_split = 0.2,\n",
+        "    subset = \"validation\",\n",
+        "    seed = 345,\n",
+        "    label_mode = 'categorical',\n",
+        "    image_size = (img_height, img_width),\n",
+        "    batch_size = batch_size\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "IN1O5n9WEd2n"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "class_names = train_ds.class_names\n",
+        "print(class_names)"
+      ],
+      "metadata": {
+        "id": "zVXI77J4Ehh3"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "resnet_model = Sequential()\n",
+        "\n",
+        "pretrained_model= ResNet50(\n",
+        "                   include_top=False,\n",
+        "                   input_shape=(224,224,3),\n",
+        "                   pooling='avg',classes=211,\n",
+        "                   weights='imagenet')\n",
+        "\n",
+        "for layer in pretrained_model.layers:\n",
+        "        layer.trainable=False\n",
+        "\n",
+        "resnet_model.add(pretrained_model)\n",
+        "resnet_model.add(Flatten())\n",
+        "resnet_model.add(Dense(1024, activation = 'relu'))\n",
+        "resnet_model.add(Dense(256, activation = 'relu'))\n",
+        "resnet_model.add(Dense(3, activation = 'softmax'))"
+      ],
+      "metadata": {
+        "id": "ZpR1kjgWEi3_"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "resnet_model.summary()"
+      ],
+      "metadata": {
+        "id": "2WGlO4VLEpSf"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "resnet_model.compile(optimizer=Adam(learning_rate=0.001),loss='categorical_crossentropy',metrics=['accuracy'])"
+      ],
+      "metadata": {
+        "id": "TPNjjBLqEqwu"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "epochs = 15\n",
+        "checkpoint_filepath = '/tmp/checkpoint'\n",
+        "model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(\n",
+        "    filepath=checkpoint_filepath,\n",
+        "    save_weights_only=True,\n",
+        "    monitor='val_accuracy',\n",
+        "    mode='max',\n",
+        "    save_best_only=True)\n",
+        "history = resnet_model.fit(\n",
+        "    train_ds,\n",
+        "    validation_data = val_ds,\n",
+        "    epochs = epochs,\n",
+        "    callbacks=[model_checkpoint_callback]\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "rc8VaaypEsJX"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "resnet_model.load_weights(checkpoint_filepath)"
+      ],
+      "metadata": {
+        "id": "tUCSqCs3JLBu"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "resnet_model.save('pace_model')"
+      ],
+      "metadata": {
+        "id": "HIsB7PHxfwM-"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "resnet_model.save_weights('pace_model_weights.h5')"
+      ],
+      "metadata": {
+        "id": "yhiPNdnkgU7C"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "resnet_model.load_weights('pace_model_weights.h5')"
+      ],
+      "metadata": {
+        "id": "IuXgP1oJhZz1"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import cv2\n",
+        "image=cv2.imread('danny.png')\n",
+        "image_resized= cv2.resize(image, (img_height,img_width))\n",
+        "image=np.expand_dims(image_resized,axis=0)\n",
+        "print(image.shape)"
+      ],
+      "metadata": {
+        "id": "YXGqeYevKlpR"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "PIL.Image.open('danny.png')"
+      ],
+      "metadata": {
+        "id": "ZP6ATt0CYe1c"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "pred=resnet_model.predict(image)\n",
+        "print(pred)"
+      ],
+      "metadata": {
+        "id": "i1qeeWVcK3av"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "output_class=class_names[np.argmax(pred)]\n",
+        "print(\"The predicted class is\", output_class)"
+      ],
+      "metadata": {
+        "id": "-FgwW4zDK47O"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import matplotlib.image as img\n",
+        "import matplotlib.pyplot as plt\n",
+        "from scipy.cluster.vq import whiten\n",
+        "from scipy.cluster.vq import kmeans\n",
+        "import pandas as pd\n",
+        "\n",
+        "batman_image = img.imread('danny.png')\n",
+        "\n",
+        "r = []\n",
+        "g = []\n",
+        "b = []\n",
+        "for row in batman_image:\n",
+        "    for temp_r, temp_g, temp_b, temp in row:\n",
+        "        r.append(temp_r)\n",
+        "        g.append(temp_g)\n",
+        "        b.append(temp_b)\n",
+        "\n",
+        "batman_df = pd.DataFrame({'red': r,\n",
+        "                          'green': g,\n",
+        "                          'blue': b})\n",
+        "\n",
+        "batman_df['scaled_color_red'] = whiten(batman_df['red'])\n",
+        "batman_df['scaled_color_blue'] = whiten(batman_df['blue'])\n",
+        "batman_df['scaled_color_green'] = whiten(batman_df['green'])\n",
+        "\n",
+        "cluster_centers, _ = kmeans(batman_df[['scaled_color_red',\n",
+        "                                       'scaled_color_blue',\n",
+        "                                       'scaled_color_green']], 3)\n",
+        "\n",
+        "dominant_colors = []\n",
+        "\n",
+        "red_std, green_std, blue_std = batman_df[['red',\n",
+        "                                          'green',\n",
+        "                                          'blue']].std()\n",
+        "\n",
+        "for cluster_center in cluster_centers:\n",
+        "    red_scaled, green_scaled, blue_scaled = cluster_center\n",
+        "    dominant_colors.append((\n",
+        "        red_scaled * red_std / 255,\n",
+        "        green_scaled * green_std / 255,\n",
+        "        blue_scaled * blue_std / 255\n",
+        "    ))\n",
+        "\n",
+        "plt.imshow([dominant_colors])\n",
+        "plt.show()"
+      ],
+      "metadata": {
+        "id": "pcUf1oNWpNWt"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import matplotlib.image as img\n",
+        "\n",
+        "# Read batman image and print dimensions\n",
+        "batman_image = img.imread('for.jpg')\n",
+        "print(batman_image.shape)"
+      ],
+      "metadata": {
+        "id": "r4eAlhkupdlS"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import pandas as pd\n",
+        "from scipy.cluster.vq import whiten\n",
+        "\n",
+        "# Store RGB values of all pixels in lists r, g and b\n",
+        "r = []\n",
+        "g = []\n",
+        "b = []\n",
+        "for row in batman_image:\n",
+        "    for temp_r, temp_g, temp_b in row:\n",
+        "        r.append(temp_r)\n",
+        "        g.append(temp_g)\n",
+        "        b.append(temp_b)\n",
+        "\n",
+        "# only printing the size of these lists\n",
+        "# as the content is too big\n",
+        "print(len(r))\n",
+        "print(len(g))\n",
+        "print(len(b))\n",
+        "\n",
+        "# Saving as DataFrame\n",
+        "batman_df = pd.DataFrame({'red': r,\n",
+        "                          'green': g,\n",
+        "                          'blue': b})\n",
+        "\n",
+        "# Scaling the values\n",
+        "batman_df['scaled_color_red'] = whiten(batman_df['red'])\n",
+        "batman_df['scaled_color_blue'] = whiten(batman_df['blue'])\n",
+        "batman_df['scaled_color_green'] = whiten(batman_df['green'])"
+      ],
+      "metadata": {
+        "id": "unc3TcVop2qn"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import seaborn as sns\n",
+        "distortions = []\n",
+        "num_clusters = range(1, 7)  # range of cluster sizes\n",
+        "\n",
+        "# Create a list of distortions from the kmeans function\n",
+        "for i in num_clusters:\n",
+        "    cluster_centers, distortion = kmeans(batman_df[['scaled_color_red',\n",
+        "                                                    'scaled_color_blue',\n",
+        "                                                    'scaled_color_green']], i)\n",
+        "    distortions.append(distortion)\n",
+        "\n",
+        "# Create a data frame with two lists, num_clusters and distortions\n",
+        "elbow_plot = pd.DataFrame({'num_clusters': num_clusters,\n",
+        "                           'distortions': distortions})\n",
+        "\n",
+        "# Create a line plot of num_clusters and distortions\n",
+        "sns.lineplot(x='num_clusters', y='distortions', data=elbow_plot)\n",
+        "plt.xticks(num_clusters)\n",
+        "plt.show()"
+      ],
+      "metadata": {
+        "id": "NE7I1771qAPK"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "cluster_centers, _ = kmeans(batman_df[['scaled_color_red',\n",
+        "                                       'scaled_color_blue',\n",
+        "                                       'scaled_color_green']], 3)\n",
+        "\n",
+        "dominant_colors = []\n",
+        "\n",
+        "# Get standard deviations of each color\n",
+        "red_std, green_std, blue_std = batman_df[['red',\n",
+        "                                          'green',\n",
+        "                                          'blue']].std()\n",
+        "\n",
+        "for cluster_center in cluster_centers:\n",
+        "    red_scaled, green_scaled, blue_scaled = cluster_center\n",
+        "\n",
+        "    # Convert each standardized value to scaled value\n",
+        "    dominant_colors.append((\n",
+        "        red_scaled * red_std / 255,\n",
+        "        green_scaled * green_std / 255,\n",
+        "        blue_scaled * blue_std / 255\n",
+        "    ))\n",
+        "\n",
+        "# Display colors of cluster centers\n",
+        "plt.imshow([dominant_colors])\n",
+        "plt.show()"
+      ],
+      "metadata": {
+        "id": "sfE-qoULqHQM"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from webcolors import rgb_to_name\n",
+        "\n",
+        "for i in dominant_colors:\n",
+        "  named_color = rgb_to_name(i, spec='css3')\n",
+        "  print(named_color)"
+      ],
+      "metadata": {
+        "id": "1U86FPKkqPPV"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "xolPh4bIqsU_"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}