CoreML MobileClip S0

Browse files

Files changed (8) hide show

ImageEncoder_mobileclip_s0.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
ImageEncoder_mobileclip_s0.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
ImageEncoder_mobileclip_s0.mlpackage/Manifest.json +18 -0
LICENSE +46 -0
PyTorch2CoreML-mobileclip.ipynb +620 -0
TextEncoder_mobileclip_s0.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
TextEncoder_mobileclip_s0.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
TextEncoder_mobileclip_s0.mlpackage/Manifest.json +18 -0

ImageEncoder_mobileclip_s0.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ec56c63c97cc32d8d2884fd8a9c61175f5797997462096513e6cf5dc60af626
+size 150531

ImageEncoder_mobileclip_s0.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a484a869abb2fc6e1ac37975c7801e5524c44bd71936fe2da799e9dd6accd4a
+size 22717696

ImageEncoder_mobileclip_s0.mlpackage/Manifest.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+    "fileFormatVersion": "1.0.0",
+    "itemInfoEntries": {
+        "CED15CCF-4EDF-46F6-B043-0B8D502F3F13": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Weights",
+            "name": "weights",
+            "path": "com.apple.CoreML/weights"
+        },
+        "F5132FC6-F83D-47D8-AAF2-1056EF407E07": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
+        }
+    },
+    "rootModelIdentifier": "F5132FC6-F83D-47D8-AAF2-1056EF407E07"
+}

LICENSE ADDED Viewed

	@@ -0,0 +1,46 @@

+Copyright (C) 2024 Apple Inc. All Rights Reserved.
+IMPORTANT:  This Apple software is supplied to you by Apple
+Inc. ("Apple") in consideration of your agreement to the following
+terms, and your use, installation, modification or redistribution of
+this Apple software constitutes acceptance of these terms.  If you do
+not agree with these terms, please do not use, install, modify or
+redistribute this Apple software.
+In consideration of your agreement to abide by the following terms, and
+subject to these terms, Apple grants you a personal, non-exclusive
+license, under Apple's copyrights in this original Apple software (the
+"Apple Software"), to use, reproduce, modify and redistribute the Apple
+Software, with or without modifications, in source and/or binary forms;
+provided that if you redistribute the Apple Software in its entirety and
+without modifications, you must retain this notice and the following
+text and disclaimers in all such redistributions of the Apple Software.
+Neither the name, trademarks, service marks or logos of Apple Inc. may
+be used to endorse or promote products derived from the Apple Software
+without specific prior written permission from Apple.  Except as
+expressly stated in this notice, no other rights or licenses, express or
+implied, are granted by Apple herein, including but not limited to any
+patent rights that may be infringed by your derivative works or by other
+works in which the Apple Software may be incorporated.
+The Apple Software is provided by Apple on an "AS IS" basis.  APPLE
+MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
+THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
+OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
+OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
+MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
+AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
+STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-------------------------------------------------------------------------------
+SOFTWARE DISTRIBUTED WITH ML-MobileCLIP:
+The ML-MobileCLIP software includes a number of subcomponents with separate
+copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
+-------------------------------------------------------------------------------

PyTorch2CoreML-mobileclip.ipynb ADDED Viewed

	@@ -0,0 +1,620 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "1e99de7a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2024-06-20 13:18:56--  https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s0.pt\n",
+      "Resolving docs-assets.developer.apple.com (docs-assets.developer.apple.com)... 17.253.73.203, 17.253.73.201\n",
+      "Connecting to docs-assets.developer.apple.com (docs-assets.developer.apple.com)|17.253.73.203|:443... connected.\n",
+      "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n",
+      "\n",
+      "    The file is already fully retrieved; nothing to do.\n",
+      "\n",
+      "--2024-06-20 13:18:58--  https://raw.githubusercontent.com/apple/ml-mobileclip/main/mobileclip/configs/mobileclip_s0.json\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+      "HTTP request sent, awaiting response... 416 Range Not Satisfiable\n",
+      "\n",
+      "    The file is already fully retrieved; nothing to do.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "!pip install -q git+https://github.com/apple/ml-mobileclip\n",
+    "!mkdir -p checkpoints\n",
+    "!wget --continue https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s0.pt -P checkpoints\n",
+    "!wget --continue https://raw.githubusercontent.com/apple/ml-mobileclip/main/mobileclip/configs/mobileclip_s0.json  -P checkpoints\n",
+    "!pip install -q --upgrade coremltools"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "801db364",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "scikit-learn version 1.2.2 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "import coremltools as ct\n",
+    "import mobileclip\n",
+    "import numpy as np\n",
+    "from PIL import Image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "26f7dcff",
+   "metadata": {},
+   "source": [
+    "# 1. Export TextEncoder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8f89976b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/anaconda3/envs/py30/lib/python3.10/site-packages/mobileclip/modules/common/transformer.py:125: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  if seq_len != self.num_embeddings:\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "\n",
+    "#device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "device = \"cpu\"\n",
+    "model, _, preprocess = mobileclip.create_model_and_transforms('mobileclip_s0', pretrained='./checkpoints/mobileclip_s0.pt')\n",
+    "tokenizer = mobileclip.get_tokenizer('mobileclip_s0')\n",
+    "\n",
+    "model=model.to(device)\n",
+    "model = model.eval()\n",
+    "\n",
+    "text_encoder = model.text_encoder\n",
+    "example_input = tokenizer(\"a photo of a cat\", return_tensors=\"pt\")\n",
+    "traced_model = torch.jit.trace(text_encoder, example_input)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "a727c3d1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([1, 77])"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "example_input.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "a38a3ca0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# https://github.com/apple/ml-mobileclip/blob/main/mobileclip/configs/mobileclip_s0.json\n",
+    "max_seq_length = 77"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "c87abd71",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Converting PyTorch Frontend ==> MIL Ops:  27%|██▋       | 110/402 [00:00<00:00, 687.59 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!\n",
+      "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 401/402 [00:00<00:00, 1694.77 ops/s]\n",
+      "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 172.42 passes/s]\n",
+      "Running MIL default pipeline: 100%|██████████| 78/78 [00:02<00:00, 31.32 passes/s] \n",
+      "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 219.77 passes/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "text_encoder_model = ct.convert(\n",
+    "            traced_model,\n",
+    "            convert_to=\"mlprogram\",\n",
+    "            minimum_deployment_target=ct.target.iOS16,\n",
+    "            inputs=[ct.TensorType(name=\"prompt\",\n",
+    "                                 shape=[1,max_seq_length],\n",
+    "                                 dtype=np.int32)],\n",
+    "            outputs=[ct.TensorType(name=\"embOutput\", dtype=np.float32)],\n",
+    "        )\n",
+    "text_encoder_model.save(\"TextEncoder_mobileclip_s0.mlpackage\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "617e4e6b",
+   "metadata": {},
+   "source": [
+    "## Validate export precision"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "fd6af02a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Tokenized text:  tensor([49406,   320,  1125,   539,   320,  2368, 49407,     0,     0,     0],\n",
+      "       dtype=torch.int32)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the model\n",
+    "te_ml_model = ct.models.MLModel('TextEncoder_mobileclip_s0.mlpackage')\n",
+    "\n",
+    "# Choose a tokenizer, here we use the clip tokenizer\n",
+    "text = tokenizer(\"a photo of a cat\").to(torch.int32)\n",
+    "text = text[:,:max_seq_length]\n",
+    "print(\"Tokenized text: \", text[0, :10])\n",
+    "\n",
+    "# # Or use CLIPTokenizerFast\n",
+    "# text = tokenizer(\"a photo of a cat\", return_tensors=\"pt\", padding=\"max_length\", max_length=max_seq_length)\n",
+    "# text = text.data['input_ids'].to(torch.int32)\n",
+    "\n",
+    "orig_features = text_encoder(text)\n",
+    "predictions = te_ml_model.predict({'prompt': text})\n",
+    "out = traced_model(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "c29d0a98",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Original PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n",
+      ">>> tensor([ 0.1062,  0.3889,  0.2455,  0.2906,  0.3474, -0.0871,  0.0244, -0.1012,\n",
+      "         0.4056, -0.0591], grad_fn=<SliceBackward0>)\n",
+      "Traced PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n",
+      ">>> tensor([ 0.1062,  0.3889,  0.2455,  0.2906,  0.3474, -0.0871,  0.0244, -0.1012,\n",
+      "         0.4056, -0.0591], grad_fn=<SliceBackward0>)\n",
+      "\n",
+      "CoreML TextEncoder ckpt out for \"a photo of a cat\":\n",
+      ">>> [ 0.10631     0.388583    0.24500522  0.29059237  0.3471204  -0.0872687\n",
+      "  0.024912   -0.10095407  0.4052309  -0.05918849]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Original PyTorch TextEncoder ckpt out for \\\"a photo of a cat\\\":\\n>>>\", orig_features[0, :10])\n",
+    "print(\"Traced PyTorch TextEncoder ckpt out for \\\"a photo of a cat\\\":\\n>>>\", out[0, :10])\n",
+    "print(\"\\nCoreML TextEncoder ckpt out for \\\"a photo of a cat\\\":\\n>>>\", predictions['embOutput'][0, :10])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c0d9c70",
+   "metadata": {},
+   "source": [
+    "You can see that there is some loss in precision, but it is still acceptable."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ca182b4a",
+   "metadata": {},
+   "source": [
+    "# 2. Export ImageEncoder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "68521589",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([1, 3, 256, 256])\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/tm/mkjhhwzd5hb8y3tkrr72_zcw0000gq/T/ipykernel_43113/694208471.py:4: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
+      "  example_input = torch.tensor(preprocess(img))\n"
+     ]
+    }
+   ],
+   "source": [
+    "image_encoder = model.image_encoder\n",
+    "\n",
+    "img = Image.open(\"./sample_images/IMG_4085.jpeg\")\n",
+    "example_input = torch.tensor(preprocess(img))\n",
+    "#reshape to 1,3,256,256\n",
+    "example_input = example_input.unsqueeze(0)\n",
+    "print(example_input.shape)\n",
+    "traced_model = torch.jit.trace(image_encoder, example_input)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "6817c413",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Original PyTorch ImageEncoder ckpt out for jpg:\n",
+      ">>> tensor([ 0.0180,  0.0550,  0.0086,  0.0529,  0.0514,  0.0155, -0.0660,  0.1181,\n",
+      "         0.0274, -0.0218], grad_fn=<SliceBackward0>)\n"
+     ]
+    }
+   ],
+   "source": [
+    "example_output = image_encoder(example_input)\n",
+    "print(\"Original PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", example_output[0, :10])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "123c9b1c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD\n",
+    "image_mean = IMAGENET_DEFAULT_MEAN\n",
+    "image_std = IMAGENET_DEFAULT_STD"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "8f66a99c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torchvision.transforms as transforms\n",
+    "\n",
+    "class Wrapper(torch.nn.Module):\n",
+    "    def __init__(self, model):\n",
+    "        super().__init__()\n",
+    "        self.model = model\n",
+    "        _means = IMAGENET_DEFAULT_MEAN\n",
+    "        _stds = IMAGENET_DEFAULT_STD\n",
+    "        self.stds = torch.tensor(_stds).half()[:,None,None]\n",
+    "        self.means = torch.tensor(_means).half()[:,None,None]\n",
+    "\n",
+    "    transform_model = torch.nn.Sequential(\n",
+    "        transforms.Normalize(mean=image_mean,\n",
+    "                             std=image_std)\n",
+    "                             )\n",
+    "\n",
+    "    def forward(self, input):        \n",
+    "        input = input/255.0\n",
+    "        intput = self.transform_model(input)\n",
+    "        output = self.model(input)        \n",
+    "        return output\n",
+    "\n",
+    "# Instantiate the Wrapper model, passing the traced PyTorch image encoder\n",
+    "wrapped_model = Wrapper(traced_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "b3da3350",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
+      ">>> tensor([ 0.0180,  0.0501,  0.0073,  0.0510,  0.0515,  0.0164, -0.0680,  0.1125,\n",
+      "         0.0306, -0.0220])\n",
+      "Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
+      ">>> tensor([ 0.0180,  0.0501,  0.0073,  0.0510,  0.0515,  0.0164, -0.0680,  0.1125,\n",
+      "         0.0306, -0.0220])\n"
+     ]
+    }
+   ],
+   "source": [
+    "i = np.asarray(img.resize((256, 256)))\n",
+    "i = i.astype(\"float32\")\n",
+    "i = np.transpose(i, (2, 0, 1))\n",
+    "i = np.expand_dims(i, 0)\n",
+    "i = torch.from_numpy(i)\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    out = wrapped_model(i)\n",
+    "\n",
+    "print(\"wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", out[0, :10])\n",
+    "\n",
+    "traced_model = torch.jit.trace(wrapped_model, i)\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    out = traced_model(i)\n",
+    "\n",
+    "print(\"Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", out[0, :10])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "304ae7b0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Model is not in eval mode. Consider calling '.eval()' on your model prior to conversion\n",
+      "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 723/724 [00:00<00:00, 3783.41 ops/s]\n",
+      "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 69.84 passes/s]\n",
+      "Running MIL default pipeline: 100%|██████████| 78/78 [00:02<00:00, 30.22 passes/s]\n",
+      "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 71.49 passes/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "image_input = ct.ImageType(name=\"colorImage\", shape=i.shape)\n",
+    "image_encoder_model = ct.converters.convert(\n",
+    "    traced_model,\n",
+    "    convert_to=\"mlprogram\",\n",
+    "    inputs=[image_input],\n",
+    "    outputs=[ct.TensorType(name=\"embOutput\", dtype=np.float32)],\n",
+    "    minimum_deployment_target=ct.target.iOS16,\n",
+    ")\n",
+    "image_encoder_model.save(\"ImageEncoder_mobileclip_s0.mlpackage\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f3c5008e",
+   "metadata": {},
+   "source": [
+    "## Validate export"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "759bb57d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/tm/mkjhhwzd5hb8y3tkrr72_zcw0000gq/T/ipykernel_43113/3839791618.py:5: DeprecationWarning: BICUBIC is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BICUBIC instead.\n",
+      "  imgPIL = imgPIL.resize((256, 256), Image.BICUBIC)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
+      ">>> tensor([ 0.0180,  0.0501,  0.0073,  0.0510,  0.0515,  0.0164, -0.0680,  0.1125,\n",
+      "         0.0306, -0.0220], grad_fn=<SliceBackward0>)\n",
+      "\n",
+      "CoreML ImageEncoder ckpt out for jpg:\n",
+      ">>> [ 0.01794434  0.04956055  0.0073967   0.05114746  0.05157471  0.01622009\n",
+      " -0.0680542   0.11236572  0.03044128 -0.02180481]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torchvision.transforms as transforms\n",
+    "\n",
+    "ie_ml_model = ct.models.MLModel('ImageEncoder_mobileclip_s0.mlpackage')\n",
+    "imgPIL = Image.open(\"./sample_images/IMG_4085.jpeg\")\n",
+    "imgPIL = imgPIL.resize((256, 256), Image.BICUBIC)\n",
+    "\n",
+    "img_np = np.asarray(imgPIL).astype(np.float32) # (256, 256, 3)\n",
+    "img_np = img_np[np.newaxis, :, :, :] # (1, 256, 256, 3)\n",
+    "img_np = np.transpose(img_np, [0, 3, 1, 2]) # (1, 3, 256, 256)\n",
+    "torch_tensor_input = torch.from_numpy(img_np)\n",
+    "\n",
+    "predictions = ie_ml_model.predict({'colorImage': imgPIL})\n",
+    "out = wrapped_model(torch_tensor_input)\n",
+    "print(\"Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", out[0, :10])\n",
+    "print(\"\\nCoreML ImageEncoder ckpt out for jpg:\\n>>>\", predictions['embOutput'][0, :10])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "a71abf7b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "There are 9 images in the dataset, each has a feature of shape torch.Size([512])\n",
+      "\n",
+      "\n",
+      "Text: a photo of a dog\n",
+      "Most similar images:\n",
+      "IMG_4061.jpeg                            50.45%\n",
+      "IMG_2134.jpeg                            45.32%\n",
+      "21-09-07_1153.jpeg                       3.20%\n",
+      "IMG_0519.jpeg                            1.01%\n",
+      "IMG_4085.jpeg                            0.01%\n",
+      "\n",
+      "\n",
+      "Text: a dog\n",
+      "Most similar images:\n",
+      "IMG_2134.jpeg                            85.73%\n",
+      "IMG_4061.jpeg                            12.42%\n",
+      "21-09-07_1153.jpeg                       1.19%\n",
+      "IMG_0519.jpeg                            0.65%\n",
+      "IMG_4085.jpeg                            0.00%\n",
+      "\n",
+      "\n",
+      "Text: dogs\n",
+      "Most similar images:\n",
+      "IMG_0519.jpeg                            79.85%\n",
+      "IMG_2134.jpeg                            16.58%\n",
+      "IMG_4061.jpeg                            3.17%\n",
+      "21-09-07_1153.jpeg                       0.20%\n",
+      "IMG_6172.jpeg                            0.12%\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import pickle\n",
+    "\n",
+    "path = r\"./sample_images\"\n",
+    "# this list holds all the image filenames\n",
+    "images = []\n",
+    "\n",
+    "def image_resize(image):\n",
+    "    image = image.resize((256, 256), Image.BICUBIC)\n",
+    "    return image\n",
+    "\n",
+    "# creates a ScandirIterator aliased as files\n",
+    "with os.scandir(path) as files:\n",
+    "  # loops through each file in the directory\n",
+    "    for file in files:\n",
+    "        if file.name.endswith('.jpeg'):\n",
+    "          # adds only the .jpeg image files to the images list\n",
+    "            images.append(file.name)\n",
+    "\n",
+    "def extract_features(path, images):\n",
+    "    num_images = len(images)\n",
+    "    images_features = []\n",
+    "    counter = 0\n",
+    "    for i in range(0, num_images):\n",
+    "        images_preprocess = image_resize(Image.open(os.path.join(path,images[i])).convert(\"RGB\"))        \n",
+    "        print(i)\n",
+    "        cur_features = ie_ml_model.predict({'colorImage': images_preprocess})\n",
+    "        cur_features = torch.tensor(cur_features['embOutput']).float().to(device)\n",
+    "        cur_features /= cur_features.norm(dim=-1, keepdim=True)\n",
+    "        images_features.append(cur_features)\n",
+    "\n",
+    "    images_features = torch.cat(images_features)\n",
+    "    print(\"Features shape {}\".format(images_features.shape))\n",
+    "    return images_features.cpu().numpy()\n",
+    "   \n",
+    "data = {}\n",
+    "p = r\"./ml_mobileclip_s0_features.pkl\"\n",
+    "\n",
+    "# check if the pickled file exists\n",
+    "if os.path.exists(p):\n",
+    "    with open(p,'rb') as file:\n",
+    "        data = pickle.load(file)\n",
+    "else:\n",
+    "    print(\"Extracting features\")\n",
+    "    images_features = extract_features(path, images)\n",
+    "    for i in range(len(images_features)):\n",
+    "        data[images[i]] = images_features[i]\n",
+    "\n",
+    "    with open(p,'wb') as file:\n",
+    "        pickle.dump(data,file)\n",
+    "          \n",
+    " \n",
+    "# get a list of the filenames\n",
+    "filenames = np.array(list(data.keys()))\n",
+    "\n",
+    "# get a list of just the features\n",
+    "feat = np.array(list(data.values()))\n",
+    "feat = torch.tensor(feat).float().to(device)\n",
+    "\n",
+    "# reshape so that there are n samples of 512-dimensional feature vectors\n",
+    "#feat = feat.reshape(-1,512)\n",
+    "\n",
+    "print(f\"There are {len(filenames)} images in the dataset, each has a feature of shape {feat[0].shape}\")\n",
+    "\n",
+    "text_input = [\"a photo of a dog\", \"a dog\", \"dogs\"]\n",
+    "#text = tokenizer(\"a photo of a cat\").to(torch.int32)\n",
+    "texts_input_tokenized = tokenizer(text_input).to(torch.int32)\n",
+    "texts_input_tokenized = texts_input_tokenized[:,:max_seq_length]\n",
+    "\n",
+    "for i in range(len(text_input)):\n",
+    "    text_input_tokenized = [texts_input_tokenized[i]]\n",
+    "    text_features = te_ml_model.predict({'prompt': text_input_tokenized})\n",
+    "    text_features = torch.tensor(text_features['embOutput']).float().to(device)\n",
+    "    text_features /= text_features.norm(dim=-1, keepdim=True)\n",
+    "    # calculate the similarity between the text features and the image features\n",
+    "    similarity = (100.0 * text_features @ feat.T).softmax(dim=-1)\n",
+    "    print(\"\\n\")\n",
+    "    print(f\"Text: {text_input[i]}\")\n",
+    "    values, indices = similarity[0].topk(5)\n",
+    "    print(\"Most similar images:\")\n",
+    "    for value, index in zip(values, indices):\n",
+    "        print(f\"{filenames[index]:<40} {100 * value.item():.2f}%\")    \n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

TextEncoder_mobileclip_s0.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f999dca8d10f1a0a1ca95a09e8a169e59f6c16ed5eb76f67a26e0bcfec9e10a
+size 55887

TextEncoder_mobileclip_s0.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:18ea166cc91e2b6b657f8a34edf873696b2c0ab6dac7831d9853e16c8a6a36bf
+size 84871616

TextEncoder_mobileclip_s0.mlpackage/Manifest.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+    "fileFormatVersion": "1.0.0",
+    "itemInfoEntries": {
+        "2EC2DF70-CC93-4AFF-BD0A-F7B24DD88BBE": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
+        },
+        "F8DAD87B-2BE0-42E2-AEE2-B5BD6A3FDF88": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Weights",
+            "name": "weights",
+            "path": "com.apple.CoreML/weights"
+        }
+    },
+    "rootModelIdentifier": "2EC2DF70-CC93-4AFF-BD0A-F7B24DD88BBE"
+}