Upload sd_token_similarity_calculator.ipynb
sd_token_similarity_calculator.ipynb
CHANGED
@@ -46,7 +46,8 @@
 "NUM_PREFIX = 13662\n",
 "NUM_SUFFIX = 32901\n",
 "\n",
-"
+"PREFIX_ENC_VOCAB = 'encoded_prefix_to_girl'\n",
+"SUFFIX_ENC_VOCAB = 'encoded_suffix'\n",
 "\n",
 "#Import the vocab.json\n",
 "import json\n",
@@ -117,6 +118,22 @@
 "  return ' ' #<---- return whitespace if other id like emojis etc.\n",
 "#--------#\n",
 "\n",
+"#get token from id (excluding tokens with special symbols)\n",
+"def get_suffix(id):\n",
+"  _id = f'{id}'\n",
+"  if int(id) <= NUM_SUFFIX:\n",
+"    return suffix[_id]\n",
+"  return ' ' #<---- return whitespace if out of bounds\n",
+"#--------#\n",
+"\n",
+"#get token from id (excluding tokens with special symbols)\n",
+"def get_prefix(id):\n",
+"  _id = f'{id}'\n",
+"  if int(id) <= NUM_PREFIX:\n",
+"    return prefix[_id]\n",
+"  return ' ' #<---- return whitespace if out of bounds\n",
+"#--------#\n",
+"\n",
 "#print(get_token(35894))\n"
 ],
 "metadata": {
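Unescaped from the notebook JSON, the two helpers added in this hunk are plain bounds-checked dictionary lookups. A minimal sketch, assuming `prefix` and `suffix` are the id-keyed dicts loaded from the vocab .json files earlier in the notebook:

    # Sketch of the added helpers, outside the JSON escaping.
    # Assumes `prefix`/`suffix` map stringified ids to token names.
    def get_suffix(id):
        _id = f'{id}'
        if int(id) <= NUM_SUFFIX:
            return suffix[_id]
        return ' '  # whitespace if out of bounds

    def get_prefix(id):
        _id = f'{id}'
        if int(id) <= NUM_PREFIX:
            return prefix[_id]
        return ' '  # whitespace if out of bounds
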
@@ -135,7 +152,7 @@
 "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
 "\n",
 "# @markdown Write name of token to match against\n",
-"token_name = \"
+"token_name = \"prs \" # @param {type:'string',\"placeholder\":\"leave empty for random value token\"}\n",
 "\n",
 "prompt = token_name\n",
 "# @markdown (optional) Mix the token with something else\n",
@@ -308,7 +325,7 @@
 "#Get image\n",
 "# You can use \"http://images.cocodataset.org/val2017/000000039769.jpg\" for testing\n",
 "image_url = \"\" # @param {\"type\":\"string\",\"placeholder\":\"leave empty for local upload (scroll down to see it)\"}\n",
-"colab_image_path = \"\" # @param {\"type\":\"string\",\"placeholder\": \"eval. as '/content/sd_tokens/' + **your input**\"}\n",
+"colab_image_path = \"imperial.png\" # @param {\"type\":\"string\",\"placeholder\": \"eval. as '/content/sd_tokens/' + **your input**\"}\n",
 "# @markdown --------------------------\n",
 "\n",
 "image_path = \"\"\n",
@@ -332,6 +349,8 @@
 "else:\n",
 "  image_A = Image.open(requests.get(image_url, stream=True).raw)\n",
 "#------#\n",
+"from google.colab.patches import cv2_imshow\n",
+"cv2_imshow(image_A)\n",
 "\n"
 ],
 "metadata": {
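One caveat with the preview added here: `cv2_imshow` renders NumPy arrays in BGR channel order, while `image_A` is a PIL image, so the call may need a conversion first. A hedged sketch of that fix (not part of the commit):

    # Sketch: convert the PIL image to a BGR NumPy array for cv2_imshow.
    import numpy as np
    from google.colab.patches import cv2_imshow

    image_bgr = np.array(image_A.convert('RGB'))[:, :, ::-1]  # RGB -> BGR
    cv2_imshow(image_bgr)
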
@@ -340,6 +359,89 @@
 "execution_count": null,
 "outputs": []
 },
+{
+"cell_type": "code",
+"source": [
+"# @title Order pre-made text_encodings to image similarity\n",
+"from transformers import AutoTokenizer\n",
+"tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
+"from transformers import CLIPProcessor, CLIPModel\n",
+"processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
+"model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
+"\n",
+"# Get image features\n",
+"inputs = processor(images=image_A, return_tensors=\"pt\")\n",
+"image_features = model.get_image_features(**inputs)\n",
+"image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)\n",
+"name_A = \"the image\"\n",
+"\n",
+"# Load the .db file for prefix encodings\n",
+"import shelve\n",
+"d = shelve.open(PREFIX_ENC_VOCAB)\n",
+"dots = results_sim = torch.zeros(NUM_PREFIX)\n",
+"for index in range(NUM_PREFIX):\n",
+"  text_features = d[f'{index}']\n",
+"  logit_scale = model.logit_scale.exp()\n",
+"  torch.matmul(text_features, image_features.t()) * logit_scale\n",
+"  sim = torch.nn.functional.cosine_similarity(text_features, image_features) * logit_scale\n",
+"  dots[index] = sim\n",
+"#----#\n",
+"prefix_sorted, prefix_indices = torch.sort(dots,dim=0 , descending=True)\n",
+"d.close() #close the file\n",
+"\n",
+"# Load the .db file for suffix encodings\n",
+"import shelve\n",
+"d = shelve.open(SUFFIX_ENC_VOCAB)\n",
+"dots = results_sim = torch.zeros(NUM_SUFFIX)\n",
+"for index in range(NUM_SUFFIX):\n",
+"  text_features = d[f'{index}']\n",
+"  logit_scale = model.logit_scale.exp()\n",
+"  torch.matmul(text_features, image_features.t()) * logit_scale\n",
+"  sim = torch.nn.functional.cosine_similarity(text_features, image_features) * logit_scale\n",
+"  dots[index] = sim\n",
+"#----#\n",
+"suffix_sorted, suffix_indices = torch.sort(dots,dim=0 , descending=True)\n",
+"d.close() #close the file"
+],
+"metadata": {
+"id": "gaOB8rsOneIa"
+},
+"execution_count": null,
+"outputs": []
+},
+{
+"cell_type": "code",
+"source": [
+"# @title Show the 10 most similiar suffix and prefix text-encodings to the image encoding\n",
+"\n",
+"_suffixes = '{'\n",
+"for index in range(20):\n",
+"  id = f'{suffix_indices[index]}'\n",
+"  sim = suffix_sorted[index]\n",
+"  name = get_suffix(id)\n",
+"  _suffixes = _suffixes + name + '|'\n",
+"#------#\n",
+"_suffixes = (_suffixes + '}').replace('|}', '}')\n",
+"print('most similiar suffix tokens to image : ' + _suffixes)\n",
+"\n",
+"#-------#\n",
+"\n",
+"_prefixes = '{'\n",
+"for index in range(20):\n",
+"  id = f'{prefix_indices[index]}'\n",
+"  sim = prefix_sorted[index]\n",
+"  name = get_prefix(id)\n",
+"  _prefixes = _prefixes + name + '|'\n",
+"#------#\n",
+"_prefixes = (_prefixes + '}').replace('|}', '}')\n",
+"print('most similiar prefix tokens to image : ' + _prefixes)\n"
+],
+"metadata": {
+"id": "eZqMUhP0qYaK"
+},
+"execution_count": null,
+"outputs": []
+},
 {
 "cell_type": "code",
 "source": [
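Unescaped, the first new cell scores every stored text encoding against the image encoding by cosine similarity and sorts descending. Two observations: the bare `torch.matmul(...)` line discards its result, and multiplying by `logit_scale` rescales every score identically, so neither affects the ordering; also, the second cell is titled "10 most similiar" but iterates 20 indices. A condensed sketch of the ranking logic (assuming `torch` is imported earlier in the notebook and `image_features` is already L2-normalized):

    # Sketch of the ranking loop from the added cell, as a helper.
    import shelve
    import torch

    def rank_encodings(db_name, count, image_features):
        dots = torch.zeros(count)
        with shelve.open(db_name) as d:
            for index in range(count):
                text_features = d[f'{index}']
                # Ordering is driven purely by cosine similarity;
                # a constant logit_scale factor would not change it.
                dots[index] = torch.nn.functional.cosine_similarity(
                    text_features, image_features)
        return torch.sort(dots, dim=0, descending=True)

    prefix_sorted, prefix_indices = rank_encodings(PREFIX_ENC_VOCAB, NUM_PREFIX, image_features)
    suffix_sorted, suffix_indices = rank_encodings(SUFFIX_ENC_VOCAB, NUM_SUFFIX, image_features)
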
@@ -718,6 +820,35 @@
 "id": "hyK423TQCRup"
 }
 },
+{
+"cell_type": "code",
+"source": [
+"# @title Make your own text_encodings .db file for later use (rate is roughly 1K encodings per minute, so plan accordingly)\n",
+"from transformers import AutoTokenizer\n",
+"tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
+"from transformers import CLIPProcessor, CLIPModel\n",
+"processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
+"model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
+"\n",
+"# Save results as .db file\n",
+"import shelve\n",
+"d = shelve.open('my_text_encodings')\n",
+"for index in range(NUM_PREFIX):\n",
+"  inputs = tokenizer(text = get_prefix(index)+'girl ', padding=True, return_tensors=\"pt\")\n",
+"  text_features = model.get_text_features(**inputs)\n",
+"  d[f'{index}'] = text_features\n",
+"#----#\n",
+"\n",
+"d.close() #close the file\n",
+"\n",
+""
+],
+"metadata": {
+"id": "9ZiTsF9jV0TV"
+},
+"execution_count": 10,
+"outputs": []
+},
 {
 "cell_type": "markdown",
 "source": [
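The encoder pass in this builder cell is the slow part (hence the "1K encodings per minute" note in the title): it runs `get_text_features` once per prefix with autograd enabled. A hedged sketch of the same loop with `torch.no_grad()` added to skip gradient bookkeeping (the `'girl '` mix-in and the `my_text_encodings` name follow the commit; the `no_grad` wrapper is an assumption):

    # Sketch of the .db builder with autograd bookkeeping disabled.
    import shelve
    import torch

    with shelve.open('my_text_encodings') as d, torch.no_grad():
        for index in range(NUM_PREFIX):
            inputs = tokenizer(text=get_prefix(index) + 'girl ',
                               padding=True, return_tensors="pt")
            d[f'{index}'] = model.get_text_features(**inputs)
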
@@ -929,7 +1060,9 @@
 "\n",
 "//---//\n",
 "\n",
-"https://codeandlife.com/2023/01/26/mastering-the-huggingface-clip-model-how-to-extract-embeddings-and-calculate-similarity-for-text-and-images
+"https://codeandlife.com/2023/01/26/mastering-the-huggingface-clip-model-how-to-extract-embeddings-and-calculate-similarity-for-text-and-images/\n",
+"\n",
+"https://arxiv.org/pdf/2303.03032"
 ],
 "metadata": {
 "id": "njeJx_nSSA8H"