codeShare
/

JupyterNotebooks

Model card Files Files and versions

xet

Community

codeShare committed on Sep 12, 2024

Commit

345c051

verified ·

1 Parent(s): b293fe3

Upload sd_token_similarity_calculator.ipynb

Browse files

Files changed (1) hide show

sd_token_similarity_calculator.ipynb +203 -152

sd_token_similarity_calculator.ipynb CHANGED Viewed

@@ -155,118 +155,29 @@
       ],
       "metadata": {
         "id": "Ch9puvwKH1s3",
-        "collapsed": true
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# @title 📝 Prompt similarity:  Order pre-made text_encodings\n",
-        "prompt = \" a fast car on the road \" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
-        "from transformers import AutoTokenizer\n",
-        "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
-        "from transformers import  CLIPProcessor, CLIPModel\n",
-        "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
-        "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
-        "\n",
-        "# Get text features for user input\n",
-        "inputs = tokenizer(text = prompt, padding=True, return_tensors=\"pt\")\n",
-        "text_features_A = model.get_text_features(**inputs)\n",
-        "text_features_A = text_features_A/text_features_A.norm(p=2, dim=-1, keepdim=True)\n",
-        "name_A = prompt\n",
-        "#------#\n",
-        "\n",
-        "# Load the .db file for prefix encodings\n",
-        "import shelve\n",
-        "_iters = -1\n",
-        "RANGE = NUM_PREFIX\n",
-        "NUM_PREFIX_LISTS = 1\n",
-        "dots = results_sim = torch.zeros(RANGE*NUM_PREFIX_LISTS)\n",
-        "for _PREFIX_ENC_VOCAB in PREFIX_ENC_VOCAB:\n",
-        "  _iters = _iters + 1\n",
-        "  d = shelve.open(_PREFIX_ENC_VOCAB)\n",
-        "  for _index in range(RANGE):\n",
-        "    index = _iters*RANGE + _index\n",
-        "    text_features = d[f'{_index}']\n",
-        "    text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
-        "    sim = torch.nn.functional.cosine_similarity(text_features, text_features_A)\n",
-        "    dots[index] = sim\n",
-        "  #----#\n",
-        "  d.close() #close the file\n",
-        "#------#\n",
-        "prefix_sorted, prefix_indices = torch.sort(dots,dim=0 , descending=True)\n",
-        "#------#\n",
-        "\n",
-        "# Load the .db file for prefix encodings\n",
-        "import shelve\n",
-        "_iters = -1\n",
-        "RANGE = NUM_SUFFIX\n",
-        "dots = results_sim = torch.zeros(RANGE*NUM_SUFFIX_LISTS)\n",
-        "for _SUFFIX_ENC_VOCAB in SUFFIX_ENC_VOCAB:\n",
-        "  _iters = _iters + 1\n",
-        "  d = shelve.open(_SUFFIX_ENC_VOCAB)\n",
-        "  for _index in range(RANGE):\n",
-        "    index = _iters*RANGE + _index\n",
-        "    text_features = d[f'{_index}']\n",
-        "    text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
-        "    sim = torch.nn.functional.cosine_similarity(text_features, text_features_A)\n",
-        "    dots[index] = sim\n",
-        "  #----#\n",
-        "  d.close() #close the file\n",
-        "#------#\n",
-        "suffix_sorted, suffix_indices = torch.sort(dots,dim=0 , descending=True)\n",
-        "#------#\n",
-        "\n",
-        "#Print the results\n",
-        "# title Show the 100 most similiar suffix and prefix text-encodings to the text encoding\n",
-        "RANGE = 30\n",
-        "_suffixes = '{'\n",
-        "_sims =  '{'\n",
-        "for index in range(RANGE):\n",
-        "  id = int(suffix_indices[index])\n",
-        "  ahead = \"from \"\n",
-        "  behind = \"\"\n",
-        "  if(id>NUM_SUFFIX*1):\n",
-        "    ahead = \"a \"\n",
-        "  if(id>NUM_SUFFIX*2):\n",
-        "    ahead = \"by \"\n",
-        "  if(id>NUM_SUFFIX*3):\n",
-        "    ahead = \"\"\n",
-        "    behind = \"like\"\n",
-        "  id = _modulus(id,NUM_SUFFIX)\n",
-        "  #------#\n",
-        "  sim = suffix_sorted[index].item()\n",
-        "  name = ahead + get_suffix(id) + behind\n",
-        "  if(get_suffix(id) == ' '): name = ahead + f'{id}' + behind\n",
-        "  _suffixes = _suffixes + name + '|'\n",
-        "  _sims = _sims + f'{round(sim*100,2)} %' + '|'\n",
-        "#------#\n",
-        "_suffixes = (_suffixes + '}').replace('|}', '}')\n",
-        "_sims = (_sims + '}').replace('|}', '}')\n",
-        "\n",
-        "print('most similiar suffix items to prompt : ' + _suffixes)\n",
-        "print('similarity % for suffix items : ' + _sims)\n",
-        "print('')\n",
-        "\n",
-        "#-------#\n",
-        "\n",
-        "_prefixes = '{'\n",
-        "for index in range(RANGE):\n",
-        "  id = f'{prefix_indices[index]}'\n",
-        "  #sim = prefix_sorted[index]\n",
-        "  name = get_prefix(id)\n",
-        "  _prefixes = _prefixes + name + '|'\n",
-        "#------#\n",
-        "_prefixes = (_prefixes + '}').replace('|}', '}')\n",
-        "print('most similiar prefix suffix to image : ' + _prefixes)\n"
-      ],
-      "metadata": {
-        "id": "xc-PbIYF428y"
       },
-      "execution_count": null,
-      "outputs": []
     },
     {
       "cell_type": "code",
@@ -421,6 +332,146 @@
       "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "markdown",
       "source": [
@@ -474,13 +525,13 @@
       ],
       "metadata": {
         "id": "ke6mZ1RZDOeB",
-        "outputId": "f98f9ea5-32d1-4cf7-b523-1c6b6e6792a2",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 1000
         }
       },
-      "execution_count": 2,
       "outputs": [
         {
           "output_type": "display_data",
@@ -497,6 +548,15 @@
     {
       "cell_type": "code",
       "source": [
         "\n",
         "from transformers import AutoTokenizer\n",
         "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
@@ -554,12 +614,14 @@
         "suffix_sorted, suffix_indices = torch.sort(dots,dim=0 , descending=True)\n",
         "#------#\n",
         "\n",
         "#Print the results\n",
         "# title Show the 100 most similiar suffix and prefix text-encodings to the text encoding\n",
-        "RANGE = 30\n",
         "_suffixes = '{'\n",
         "_sims =  '{'\n",
-        "for index in range(RANGE):\n",
         "  id = int(suffix_indices[index])\n",
         "  ahead = \"from \"\n",
         "  behind = \"\"\n",
@@ -576,62 +638,51 @@
         "  name = ahead + get_suffix(id) + behind\n",
         "  if(get_suffix(id) == ' '): name = ahead + f'{id}' + behind\n",
         "  _suffixes = _suffixes + name + '|'\n",
-        "  _sims = _sims + f'{round(sim*100,2)} %' + '|'\n",
         "#------#\n",
         "_suffixes = (_suffixes + '}').replace('|}', '}')\n",
         "_sims = (_sims + '}').replace('|}', '}')\n",
         "\n",
-        "print('most similiar suffix items to prompt : ' + _suffixes)\n",
-        "print('similarity % for suffix items : ' + _sims)\n",
-        "print('')\n",
         "\n",
         "#-------#\n",
         "\n",
         "_prefixes = '{'\n",
-        "for index in range(RANGE):\n",
         "  id = f'{prefix_indices[index]}'\n",
         "  #sim = prefix_sorted[index]\n",
         "  name = get_prefix(id)\n",
         "  _prefixes = _prefixes + name + '|'\n",
         "#------#\n",
         "_prefixes = (_prefixes + '}').replace('|}', '}')\n",
-        "print('most similiar prefix suffix to image : ' + _prefixes)\n"
-      ],
-      "metadata": {
-        "id": "rebogpoyOG8k"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# @title 🖼️ Show the 10 most similiar suffix and prefix text-encodings to the image encoding\n",
         "\n",
-        "_suffixes = '{'\n",
-        "for index in range(20):\n",
-        "  id = f'{suffix_indices[index]}'\n",
-        "  sim = suffix_sorted[index]\n",
-        "  name = get_suffix(id)\n",
-        "  _suffixes = _suffixes + name + '|'\n",
-        "#------#\n",
-        "_suffixes = (_suffixes + '}').replace('|}', '}')\n",
-        "print('most similiar suffix tokens to image : ' + _suffixes)\n",
         "\n",
-        "#-------#\n",
         "\n",
-        "_prefixes = '{'\n",
-        "for index in range(20):\n",
-        "  id = f'{prefix_indices[index]}'\n",
-        "  sim = prefix_sorted[index]\n",
-        "  name = get_prefix(id)\n",
-        "  _prefixes = _prefixes + name + '|'\n",
-        "#------#\n",
-        "_prefixes = (_prefixes + '}').replace('|}', '}')\n",
-        "print('most similiar prefix tokens to image : ' + _prefixes)\n"
       ],
       "metadata": {
-        "id": "eZqMUhP0qYaK"
       },
       "execution_count": null,
       "outputs": []

       ],
       "metadata": {
         "id": "Ch9puvwKH1s3",
+        "collapsed": true,
+        "outputId": "129b355e-9a4f-49d1-b641-3b675558f9b2",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
       },
+      "execution_count": 1,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Cloning into 'sd_tokens'...\n",
+            "remote: Enumerating objects: 99, done.\u001b[K\n",
+            "remote: Counting objects: 100% (96/96), done.\u001b[K\n",
+            "remote: Compressing objects: 100% (96/96), done.\u001b[K\n",
+            "remote: Total 99 (delta 34), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n",
+            "Unpacking objects: 100% (99/99), 1.35 MiB | 3.12 MiB/s, done.\n",
+            "Filtering content: 100% (22/22), 2.47 GiB | 39.37 MiB/s, done.\n",
+            "/content/sd_tokens\n"
+          ]
+        }
+      ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "outputs": []
     },
+    {
+      "cell_type": "code",
+      "source": [
+        "# @title 📝 Get Prompt text_encoding similarity to the pre-calc. text_encodings\n",
+        "prompt = \" a fast car on the road \" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
+        "list_size = 100 # @param {type:'number'}\n",
+        "start_at_index = 0 # @param {type:'number'}\n",
+        "print_Similarity = True # @param {type:\"boolean\"}\n",
+        "print_Suffix = True # @param {type:\"boolean\"}\n",
+        "print_Prefix = True # @param {type:\"boolean\"}\n",
+        "print_Descriptions = True # @param {type:\"boolean\"}\n",
+        "compact_Output = False # @param {type:\"boolean\"}\n",
+        "\n",
+        "from transformers import AutoTokenizer\n",
+        "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
+        "from transformers import  CLIPProcessor, CLIPModel\n",
+        "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
+        "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
+        "\n",
+        "# Get text features for user input\n",
+        "inputs = tokenizer(text = prompt, padding=True, return_tensors=\"pt\")\n",
+        "text_features_A = model.get_text_features(**inputs)\n",
+        "text_features_A = text_features_A/text_features_A.norm(p=2, dim=-1, keepdim=True)\n",
+        "name_A = prompt\n",
+        "#------#\n",
+        "\n",
+        "# Rank the pre-computed prefix text encodings by cosine similarity to the prompt encoding.\n",
+        "# Every shelve file listed in PREFIX_ENC_VOCAB is read under the string keys '0' .. f'{NUM_PREFIX-1}'.\n",
+        "# NOTE: shelve unpickles the stored values, so only open files that come from a trusted source.\n",
+        "import shelve\n",
+        "_iters = -1\n",
+        "RANGE = NUM_PREFIX\n",
+        "NUM_PREFIX_LISTS = 1\n",
+        "dots = results_sim = torch.zeros(RANGE*NUM_PREFIX_LISTS)\n",
+        "for _PREFIX_ENC_VOCAB in PREFIX_ENC_VOCAB:\n",
+        "  _iters = _iters + 1\n",
+        "  # The with-block closes the shelf even if a key lookup or the similarity call raises\n",
+        "  with shelve.open(_PREFIX_ENC_VOCAB) as d:\n",
+        "    for _index in range(RANGE):\n",
+        "      # Slot in the flat score vector: file number * RANGE + position inside the file\n",
+        "      index = _iters*RANGE + _index\n",
+        "      text_features = d[f'{_index}']\n",
+        "      text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
+        "      sim = torch.nn.functional.cosine_similarity(text_features, text_features_A)\n",
+        "      dots[index] = sim\n",
+        "  #----#\n",
+        "#------#\n",
+        "# prefix_sorted holds the scores in descending order, prefix_indices the matching slots in dots\n",
+        "prefix_sorted, prefix_indices = torch.sort(dots,dim=0 , descending=True)\n",
+        "#------#\n",
+        "\n",
+        "# Rank the pre-computed suffix text encodings by cosine similarity to the prompt encoding.\n",
+        "# Every shelve file listed in SUFFIX_ENC_VOCAB is read under the string keys '0' .. f'{NUM_SUFFIX-1}'.\n",
+        "# NOTE: shelve unpickles the stored values, so only open files that come from a trusted source.\n",
+        "import shelve\n",
+        "_iters = -1\n",
+        "RANGE = NUM_SUFFIX\n",
+        "dots = results_sim = torch.zeros(RANGE*NUM_SUFFIX_LISTS)\n",
+        "for _SUFFIX_ENC_VOCAB in SUFFIX_ENC_VOCAB:\n",
+        "  _iters = _iters + 1\n",
+        "  # The with-block closes the shelf even if a key lookup or the similarity call raises\n",
+        "  with shelve.open(_SUFFIX_ENC_VOCAB) as d:\n",
+        "    for _index in range(RANGE):\n",
+        "      # Slot in the flat score vector: file number * RANGE + position inside the file\n",
+        "      index = _iters*RANGE + _index\n",
+        "      text_features = d[f'{_index}']\n",
+        "      text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
+        "      sim = torch.nn.functional.cosine_similarity(text_features, text_features_A)\n",
+        "      dots[index] = sim\n",
+        "  #----#\n",
+        "#------#\n",
+        "# suffix_sorted holds the scores in descending order, suffix_indices the matching slots in dots\n",
+        "suffix_sorted, suffix_indices = torch.sort(dots,dim=0 , descending=True)\n",
+        "#------#\n",
+        "\n",
+        "# Print the results\n",
+        "# Build '{a|b|c}' listings of the suffix items ranked most similar to the prompt and of their\n",
+        "# similarity scores, covering ranks start_at_index .. start_at_index + list_size - 1.\n",
+        "RANGE = list_size\n",
+        "# Clamp the window so a large start_at_index/list_size cannot index past the end of the ranking\n",
+        "_first = max(start_at_index, 0)\n",
+        "_last = min(start_at_index + RANGE, len(suffix_indices))\n",
+        "_suffixes = '{'\n",
+        "_sims =  '{'\n",
+        "for index in range(_first, _last):\n",
+        "  id = int(suffix_indices[index])\n",
+        "  # ids are laid out as list_number*NUM_SUFFIX + item, one list per decoration:\n",
+        "  # list 0 -> 'from <item>', 1 -> 'a <item>', 2 -> 'by <item>', 3 and above -> '<item>like'.\n",
+        "  # The tests use >= so the first item of a list (id == k*NUM_SUFFIX) is not given the\n",
+        "  # decoration of the previous list.\n",
+        "  ahead = \"from \"\n",
+        "  behind = \"\"\n",
+        "  if(id>=NUM_SUFFIX*1):\n",
+        "    ahead = \"a \"\n",
+        "  if(id>=NUM_SUFFIX*2):\n",
+        "    ahead = \"by \"\n",
+        "  if(id>=NUM_SUFFIX*3):\n",
+        "    ahead = \"\"\n",
+        "    behind = \"like\"\n",
+        "  # _modulus is defined in another cell; presumably id % NUM_SUFFIX, i.e. the position inside the list\n",
+        "  id = _modulus(id,NUM_SUFFIX)\n",
+        "  #------#\n",
+        "  sim = suffix_sorted[index].item()\n",
+        "  name = ahead + get_suffix(id) + behind\n",
+        "  # Fall back to the numeric id when the vocab entry is a single blank\n",
+        "  if(get_suffix(id) == ' '): name = ahead + f'{id}' + behind\n",
+        "  _suffixes = _suffixes + name + '|'\n",
+        "  # sim is a cosine similarity, so scale it by 100 before labelling it with '%'\n",
+        "  _sims = _sims + f'{round(sim*100,2)} %' + '|'\n",
+        "#------#\n",
+        "# Close the listings and drop the trailing separator\n",
+        "_suffixes = (_suffixes + '}').replace('|}', '}')\n",
+        "_sims = (_sims + '}').replace('|}', '}')\n",
+        "#------#\n",
+        "\n",
+        "\n",
+        "# Apply the form toggles: a disabled listing is printed as an empty string\n",
+        "suffixes = _suffixes\n",
+        "sims = _sims\n",
+        "if(not print_Suffix): suffixes = ''\n",
+        "if(not print_Similarity): sims = ''\n",
+        "\n",
+        "# In compact mode nothing is printed here; the prefix section prints the combined listing\n",
+        "if(not compact_Output):\n",
+        "  if(print_Descriptions):\n",
+        "    print(f'The {start_at_index}-{start_at_index + RANGE} most similiar suffix items to prompt : ' + suffixes)\n",
+        "    print(f'The {start_at_index}-{start_at_index + RANGE} similarity % for suffix items : ' + sims)\n",
+        "    print('')\n",
+        "  else:\n",
+        "    print(suffixes)\n",
+        "#-------#\n",
+        "\n",
+        "# Build the '{a|b|c}' listing of the prefix items ranked most similar to the prompt,\n",
+        "# covering ranks start_at_index .. start_at_index + RANGE - 1.\n",
+        "# Clamp the window so a large start_at_index/list_size cannot index past the end of the ranking\n",
+        "_first = max(start_at_index, 0)\n",
+        "_last = min(start_at_index + RANGE, len(prefix_indices))\n",
+        "_prefixes = '{'\n",
+        "for index in range(_first, _last):\n",
+        "  # get_prefix is called with the id formatted as a string\n",
+        "  id = f'{prefix_indices[index]}'\n",
+        "  name = get_prefix(id)\n",
+        "  _prefixes = _prefixes + name + '|'\n",
+        "#------#\n",
+        "# Close the listing and drop the trailing separator\n",
+        "_prefixes = (_prefixes + '}').replace('|}', '}')\n",
+        "\n",
+        "\n",
+        "# Apply the form toggle: a disabled listing is printed as an empty string\n",
+        "prefixes = _prefixes\n",
+        "if(not print_Prefix): prefixes = ''\n",
+        "\n",
+        "if(print_Descriptions):\n",
+        "  print(f'The {start_at_index}-{start_at_index + RANGE} most similiar prefixes to prompt : ' + prefixes)\n",
+        "else:\n",
+        "  if(compact_Output):\n",
+        "    # Merge both listings into one '{...}' group; use the flag-filtered suffix listing so\n",
+        "    # print_Suffix is honoured in compact mode as well\n",
+        "    print((prefixes + suffixes).replace('}{', '|'))\n",
+        "  else:\n",
+        "    print(prefixes)\n",
+        "\n"
+      ],
+      "metadata": {
+        "id": "xc-PbIYF428y"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
     {
       "cell_type": "markdown",
       "source": [
       ],
       "metadata": {
         "id": "ke6mZ1RZDOeB",
+        "outputId": "9f9b5556-6fa7-4aed-e1bc-1704ab0af381",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 1000
         }
       },
+      "execution_count": 4,
       "outputs": [
         {
           "output_type": "display_data",
     {
       "cell_type": "code",
       "source": [
+        "# @title 🖼️ Get image_encoding similarity to the pre-calc. text_encodings\n",
+        "\n",
+        "list_size = 100 # @param {type:'number'}\n",
+        "start_at_index = 0 # @param {type:'number'}\n",
+        "print_Similarity = True # @param {type:\"boolean\"}\n",
+        "print_Suffix = True # @param {type:\"boolean\"}\n",
+        "print_Prefix = True # @param {type:\"boolean\"}\n",
+        "print_Descriptions = True # @param {type:\"boolean\"}\n",
+        "compact_Output = False # @param {type:\"boolean\"}\n",
         "\n",
         "from transformers import AutoTokenizer\n",
         "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
         "suffix_sorted, suffix_indices = torch.sort(dots,dim=0 , descending=True)\n",
         "#------#\n",
         "\n",
+        "\n",
         "#Print the results\n",
         "# title Show the 100 most similiar suffix and prefix text-encodings to the text encoding\n",
+        "RANGE = list_size\n",
         "_suffixes = '{'\n",
         "_sims =  '{'\n",
+        "for index in range(start_at_index + RANGE):\n",
+        "  if index < start_at_index : continue\n",
         "  id = int(suffix_indices[index])\n",
         "  ahead = \"from \"\n",
         "  behind = \"\"\n",
         "  name = ahead + get_suffix(id) + behind\n",
         "  if(get_suffix(id) == ' '): name = ahead + f'{id}' + behind\n",
         "  _suffixes = _suffixes + name + '|'\n",
+        "  _sims = _sims + f'{round(sim,2)} %' + '|'\n",
         "#------#\n",
         "_suffixes = (_suffixes + '}').replace('|}', '}')\n",
         "_sims = (_sims + '}').replace('|}', '}')\n",
+        "#------#\n",
         "\n",
+        "suffixes = _suffixes\n",
+        "sims = _sims\n",
+        "\n",
+        "if(not print_Suffix): suffixes = ''\n",
+        "if(not print_Similarity): sims = ''\n",
         "\n",
+        "if(not compact_Output):\n",
+        "  if(print_Descriptions):\n",
+        "    print(f'The {start_at_index}-{start_at_index + RANGE} most similiar suffix items to prompt : ' + suffixes)\n",
+        "    print(f'The {start_at_index}-{start_at_index + RANGE} similarity % for suffix items : ' + sims)\n",
+        "    print('')\n",
+        "  else:\n",
+        "    print(suffixes)\n",
         "#-------#\n",
         "\n",
         "_prefixes = '{'\n",
+        "for index in range(start_at_index + RANGE):\n",
+        "  if index < start_at_index : continue\n",
         "  id = f'{prefix_indices[index]}'\n",
         "  #sim = prefix_sorted[index]\n",
         "  name = get_prefix(id)\n",
         "  _prefixes = _prefixes + name + '|'\n",
         "#------#\n",
         "_prefixes = (_prefixes + '}').replace('|}', '}')\n",
         "\n",
         "\n",
+        "prefixes = _prefixes\n",
+        "if(not print_Prefix): prefixes = ''\n",
         "\n",
+        "if(print_Descriptions):\n",
+        "  print(f'The {start_at_index}-{start_at_index + RANGE} most similiar prefixes to prompt : ' + prefixes)\n",
+        "else:\n",
+        "  if(compact_Output):\n",
+        "    print((prefixes + _suffixes).replace('}{', '|'))\n",
+        "  else:\n",
+        "    print(prefixes)\n"
       ],
       "metadata": {
+        "id": "rebogpoyOG8k"
       },
       "execution_count": null,
       "outputs": []