Commit ded605e by johnmalek312 · Parent(s): 9b2871c

broken change start of batching

Files changed (2):
  1. moondream2/moondream.py +37 -14
  2. ollama.ipynb +217 -481
moondream2/moondream.py CHANGED
@@ -43,9 +43,19 @@ class EncodedImage:
 
 class KVCache(nn.Module):
 
-    def __init__(self, n_heads, n_kv_heads, max_context, dim, device, dtype):
+    def __init__(self,
+                 n_heads,
+                 n_kv_heads,
+                 max_context,
+                 dim,
+                 batch_size: int = 1,
+                 device=None,
+                 dtype=None):
         super().__init__()
-        cache_shape = (1, n_kv_heads, max_context, dim // n_heads)
+        cache_shape = (batch_size,
+                       n_kv_heads,
+                       max_context,
+                       dim // n_heads)
         self.register_buffer(
             "k_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
         )
@@ -132,6 +142,7 @@ class MoondreamModel(nn.Module):
                 c.n_kv_heads,
                 c.max_context,
                 c.dim,
+                batch_size=2,
                 device=self.device,
                 dtype=self.vision.pos_emb.dtype,
             )
@@ -190,9 +201,11 @@ class MoondreamModel(nn.Module):
 
         return self._vis_proj(global_features, reconstructed)
 
-    def encode_image(self, image: Union[Image.Image, EncodedImage]) -> EncodedImage:
+    def encode_image(self, image: Union[Image.Image, EncodedImage, torch.Tensor]) -> EncodedImage:
         if isinstance(image, EncodedImage):
             return image
+        elif isinstance(image, torch.Tensor):
+            pass
         elif not isinstance(image, Image.Image):
             raise ValueError("image must be a PIL Image or EncodedImage")
 
@@ -202,12 +215,17 @@ class MoondreamModel(nn.Module):
 
         bos = torch.tensor([[self.config.tokenizer.bos_id]], device=self.device)
 
-        img_emb = self._run_vision_encoder(image)
+        if isinstance(image, Image.Image):
+            img_emb = self._run_vision_encoder(image)
+        else:
+            img_emb = image
+
         bos_emb = text_encoder(
             bos,
             self.text,
         )
-        inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
+        bos_emb = bos_emb.expand(img_emb.size(0), -1, -1)
+        inputs_embeds = torch.cat([bos_emb, img_emb], dim=1)
         mask = self.attn_mask[:, :, 0 : inputs_embeds.size(1), :]
         pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.int32, device=self.device)
         self._prefill(inputs_embeds, mask, pos_ids)
@@ -293,23 +311,28 @@ class MoondreamModel(nn.Module):
 
     def point(
         self,
-        image: Union[Image.Image, EncodedImage],
-        object: str,
+        image: Union[Image.Image, EncodedImage, torch.Tensor],
+        object: list[str],
         settings: Optional[ObjectSamplingSettings] = None,
     ):
         if self.config.tokenizer.templates["point"] is None:
             raise NotImplementedError("Model does not support pointing.")
+        # set the pad token to the eos token
+        self.tokenizer.pad_token = self.tokenizer.eos_token
 
         image = self.encode_image(image)
 
-        prompt_tokens = torch.tensor(
-            [
+        # input batch tokenized and padded
+        prompt_tokens = [
             self.config.tokenizer.templates["point"]["prefix"]
-            + self.tokenizer.encode(" " + object).ids
+            + self.tokenizer.encode(" " + obj).ids
             + self.config.tokenizer.templates["point"]["suffix"]
-            ],
-            device=self.device,
-        )
+            for obj in object
+        ]
+        # padding with eos token to the same length as the longest sequence
+        tokens_batch = self.tokenizer.pad(prompt_tokens, padding="longest", return_tensors="pt")
+        prompt_tokens = tokens_batch.input_ids.to(self.device)
+
 
         _, hidden, next_token, pos = self._prefill_prompt(
             prompt_tokens, image.pos, temperature=0, top_p=0
@@ -327,5 +350,5 @@ class MoondreamModel(nn.Module):
 
         return {"points": objects}
 
-    def forward(self, image: Union[Image.Image, EncodedImage], prompt: str, settings: Optional[ObjectSamplingSettings] = None):
+    def forward(self, image: Union[Image.Image, EncodedImage, torch.Tensor], prompt: str, settings: Optional[ObjectSamplingSettings] = None):
         return self.point(image, prompt, settings)
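The core of this change is the new batch dimension on the KV-cache buffers plus the matching expand of the BOS embedding before prefill. A shape-level sketch of the intended layout, assuming moondream2's usual sizes (dim=2048, 32 KV heads, head_dim=64, 729 image patch embeddings); the names and values below are illustrative, not the repo's code:

import torch

# One cache row per sequence in the batch; head_dim = dim // n_heads = 64.
batch_size, n_kv_heads, max_context, head_dim = 2, 32, 2048, 64
k_cache = torch.zeros(batch_size, n_kv_heads, max_context, head_dim)

# A batch of two pre-encoded images (729 patch embeddings each) plus BOS:
img_emb = torch.randn(batch_size, 729, 2048)
bos_emb = torch.randn(1, 1, 2048).expand(batch_size, -1, -1)  # broadcast BOS across the batch
inputs_embeds = torch.cat([bos_emb, img_emb], dim=1)          # (2, 730, 2048)

# Prefill then writes keys/values for positions [0, T) into every cache row at once:
T = inputs_embeds.size(1)
k_cache[:, :, :T, :] = torch.randn(batch_size, n_kv_heads, T, head_dim)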
 
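The commit message flags this as a broken work in progress, and one visible sticking point is the padding call: `self.tokenizer` in this file is a raw `tokenizers.Tokenizer`, which has neither a `pad_token` attribute nor a `pad()` method; those belong to the `transformers` tokenizer classes that the notebook below experiments with. A minimal manual-padding sketch under that assumption; `pad_to_longest` and `eos_id` are hypothetical names, not part of the repo:

import torch

def pad_to_longest(prompt_tokens, eos_id, device):
    # prompt_tokens: list of token-id lists (hypothetical helper).
    # Pad every row with eos_id up to the longest row and record
    # which positions hold real tokens.
    max_len = max(len(t) for t in prompt_tokens)
    input_ids = torch.full((len(prompt_tokens), max_len), eos_id, dtype=torch.long)
    attention_mask = torch.zeros(len(prompt_tokens), max_len, dtype=torch.long)
    for i, toks in enumerate(prompt_tokens):
        input_ids[i, : len(toks)] = torch.tensor(toks, dtype=torch.long)
        attention_mask[i, : len(toks)] = 1
    return input_ids.to(device), attention_mask.to(device)

Alternatively, `tokenizers.Tokenizer` supports `enable_padding(...)` together with `encode_batch`, which would avoid the manual loop.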
ollama.ipynb CHANGED
@@ -4,554 +4,290 @@
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
-  "outputs": [],
-  "source": []
+  "outputs": [
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "ed2ab48877fe47178e9e521fae619346",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "b348b72e4f4b412f949efee9dd3da8d2",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "c3c9cb29e5184e6ba7d0e698209b1dbe",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "d7af8afa19f04ebda69594a35b75ebbd",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "b9d9a927b1f446b2818dbb997608fd93",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "77f52404773949c5b6e792eb2b5259dd",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   }
+  ],
+  "source": [
+   "from transformers import AutoTokenizer\n",
+   "\n",
+   "tokenizer = AutoTokenizer.from_pretrained(\"vikhyatk/moondream2\")\n"
+  ]
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
-   "import torch\n",
-   "import torch.nn as nn\n",
+   "texts = [\n",
+   "    \"This is a short text.\",\n",
+   "    \"This is a much longer text that will determine the padding length.\",\n",
+   "    \"Medium length text here.\"\n",
+   "]\n",
+   "tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
-   "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+   "# Pad to the longest sequence in the batch\n",
+   "encoded = tokenizer(\n",
+   "    texts,\n",
+   "    padding=True,           # or padding=\"longest\"\n",
+   "    return_tensors=\"pt\",    # or \"tf\" for TensorFlow\n",
+   "    model_max_length=512,\n",
    "\n",
-   "class RotaryEmbeddingInPlace(nn.Module):\n",
-   "    def __init__(self, head_dim: int, max_seq_len: int, theta: float = 10000.0):\n",
-   "        super().__init__()\n",
-   "        # Match RotaryEmbedding exactly\n",
-   "        self.rot_dim = head_dim // 2  # Only half of head_dim is rotated\n",
-   "        \n",
-   "        # Frequency calculation - match RotaryEmbedding exactly\n",
-   "        freqs = 1.0 / (theta ** (torch.arange(0, self.rot_dim, 2).float() / self.rot_dim))\n",
-   "        t = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)\n",
-   "        freqs = t * freqs.unsqueeze(0)\n",
-   "        \n",
-   "\n",
-   "        freqs_cis = torch.exp(1j * freqs)\n",
-   "        cos_vals = freqs_cis.real\n",
-   "        sin_vals = freqs_cis.imag\n",
-   "\n",
-   "        self.register_buffer('cos_cache', cos_vals, persistent=False)\n",
-   "        self.register_buffer('sin_cache', sin_vals, persistent=False)\n",
-   "        \n",
-   "    def apply(self, x: torch.Tensor) -> torch.Tensor:\n",
-   "        \"\"\"\n",
-   "        WARNING: This modifies the input tensor in-place for maximum speed!\n",
-   "        If you need the original tensor, make a copy before calling this.\n",
-   "        \n",
-   "        Must match RotaryEmbedding output exactly.\n",
-   "        \"\"\"\n",
-   "        seq_len = x.shape[1]\n",
-   "        d = self.rot_dim // 2\n",
-   "        \n",
-   "        # Get cos/sin with same broadcasting as RotaryEmbedding\n",
-   "        cos = self.cos_cache[:seq_len].unsqueeze(0).unsqueeze(2)\n",
-   "        sin = self.sin_cache[:seq_len].unsqueeze(0).unsqueeze(2)\n",
-   "        \n",
-   "        # Split rotated part into real/imaginary components\n",
-   "        xq_r = x[..., :d]     # First half of rot_dim\n",
-   "        xq_i = x[..., d:d*2]  # Second half of rot_dim\n",
-   "        \n",
-   "        # Apply rotation\n",
-   "        xq_out_r = xq_r * cos - xq_i * sin\n",
-   "        xq_out_i = xq_r * sin + xq_i * cos\n",
-   "        \n",
-   "        # Vectorized interleaving using torch.stack and view\n",
-   "        # Stack creates [d, ..., 2] then view as [..., d*2]\n",
-   "        x[..., :self.rot_dim] = torch.stack([xq_out_r, xq_out_i], dim=-1).view(*x.shape[:-1], self.rot_dim)\n",
-   "        \n",
-   "        # x_pass part (x[..., self.rot_dim:]) remains unchanged automatically\n",
-   "        \n",
-   "        return x\n"
+   ")\n"
   ]
  },
 {
  "cell_type": "code",
-  "execution_count": 2,
+  "execution_count": 27,
  "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "transformers.tokenization_utils_base.BatchEncoding"
+     ]
+    },
+    "execution_count": 27,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
  "source": [
-   "dim_per_head = 64\n",
-   "n_heads = 32\n",
-   "max_context = 2048\n",
-   "\n",
-   "freq_dim = dim_per_head // 2\n",
-   "\n",
-   "torch.manual_seed(42)\n",
-   "\n",
-   "tensor = torch.rand(1, 730, n_heads, dim_per_head)\n",
-   "tensor = tensor.to(device)\n"
+   "type(encoded)"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 3,
+  "execution_count": 28,
  "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "{'input_ids': tensor([[ 1212,   318,   257,  1790,  2420,    13, 50256, 50256, 50256, 50256,\n",
+      "         50256, 50256, 50256],\n",
+      "        [ 1212,   318,   257,   881,  2392,  2420,   326,   481,  5004,   262,\n",
+      "         24511,  4129,    13],\n",
+      "        [31205,  4129,  2420,   994,    13, 50256, 50256, 50256, 50256, 50256,\n",
+      "         50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+      "        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
+      "        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}"
+     ]
+    },
+    "execution_count": 28,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
  "source": [
-   "fast_rope = RotaryEmbeddingInPlace(dim_per_head, max_context)\n",
-   "fast_rope.to(device)\n",
-   "fast_rtensor = fast_rope.apply(tensor)\n",
-   "\n",
-   "\n"
+   "encoded"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 9,
  "metadata": {},
-  "outputs": [],
-  "source": []
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "torch.Size([3, 13])"
+     ]
+    },
+    "execution_count": 9,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "encoded.input_ids.shape"
+  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 4,
+  "execution_count": 30,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
-     "tensor([0.8823, 0.8854, 0.9150, 0.5739, 0.3829, 0.2666, 0.9593, 0.6274, 0.3904,\n",
-     "        0.2696, 0.6009, 0.4414, 0.2566, 0.2969, 0.7936], device='cuda:0')"
+     "tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0])"
     ]
    },
-   "execution_count": 4,
+   "execution_count": 30,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
-   "fast_rtensor.flatten()[:15]"
+   "encoded.attention_mask[0] * encoded.attention_mask[0].T"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 20,
  "metadata": {},
  "outputs": [],
-  "source": []
- },
- {
-  "cell_type": "markdown",
-  "metadata": {},
  "source": [
-   "tensor([0.8823, 0.8854, 0.9150, 0.5739, 0.3829, 0.2666, 0.9593, 0.6274, 0.3904,\n",
-   "        0.2696, 0.6009, 0.4414, 0.2566, 0.2969, 0.7936])"
+   "mask = encoded.attention_mask[0].clone().reshape(-1, 1)\n",
+   "\n"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 6,
+  "execution_count": 19,
  "metadata": {},
-  "outputs": [],
-  "source": []
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0])"
+     ]
+    },
+    "execution_count": 19,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "encoded.attention_mask[0].T"
+  ]
 },
 {
  "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 22,
  "metadata": {},
  "outputs": [
   {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "✓ OpenCV available\n",
-    "=== Comprehensive Benchmark with Tensor Difference Analysis ===\n",
-    "CUDA available: True\n",
-    "PyVIPS available: True\n",
-    "\n",
-    "================================================================================\n",
-    "Testing 1080p (1920x1080)\n",
-    "================================================================================\n",
-    "\n",
-    "Function        Min (ms)   Avg (ms)   Speedup   \n",
-    "--------------------------------------------------\n",
-    "Original        16.3       16.7       1.00x     \n",
-    "Optimized       8.9        9.4        1.77x     \n",
-    "Ultra Fast      9.2        9.5        1.75x     \n",
-    "\n",
-    "🔍 TENSOR DIFFERENCE ANALYSIS\n",
-    "==================================================\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Optimized ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.208008\n",
-    "Mean absolute difference: 0.181336\n",
-    "Std of differences: 0.153313\n",
-    "Pixels with any difference: 98.44% (3797773/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 1.56% (60095/3857868)\n",
-    "  Within 1e-05: 1.56% (60095/3857868)\n",
-    "  Within 1e-04: 1.56% (60095/3857868)\n",
-    "  Within 1e-03: 1.56% (60095/3857868)\n",
-    "  Within 1e-02: 4.68% (180528/3857868)\n",
-    "  Within 1e-01: 36.82% (1420591/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.207520, mean=0.288220\n",
-    "  Crop 1: max=1.160156, mean=0.167923\n",
-    "  Crop 2: max=1.208008, mean=0.167772\n",
-    "  Crop 3: max=1.208008, mean=0.168140\n",
-    "  Crop 4: max=1.176270, mean=0.168022\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.208008\n",
-    "Mean absolute difference: 0.181336\n",
-    "Std of differences: 0.153313\n",
-    "Pixels with any difference: 98.44% (3797773/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 1.56% (60095/3857868)\n",
-    "  Within 1e-05: 1.56% (60095/3857868)\n",
-    "  Within 1e-04: 1.56% (60095/3857868)\n",
-    "  Within 1e-03: 1.56% (60095/3857868)\n",
-    "  Within 1e-02: 4.68% (180528/3857868)\n",
-    "  Within 1e-01: 36.82% (1420591/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.207520, mean=0.288220\n",
-    "  Crop 1: max=1.160156, mean=0.167923\n",
-    "  Crop 2: max=1.208008, mean=0.167772\n",
-    "  Crop 3: max=1.208008, mean=0.168140\n",
-    "  Crop 4: max=1.176270, mean=0.168022\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 0.000000\n",
-    "Mean absolute difference: 0.000000\n",
-    "Std of differences: 0.000000\n",
-    "Pixels with any difference: 0.00% (0/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 100.00% (3857868/3857868)\n",
-    "  Within 1e-05: 100.00% (3857868/3857868)\n",
-    "  Within 1e-04: 100.00% (3857868/3857868)\n",
-    "  Within 1e-03: 100.00% (3857868/3857868)\n",
-    "  Within 1e-02: 100.00% (3857868/3857868)\n",
-    "  Within 1e-01: 100.00% (3857868/3857868)\n",
-    "✅ Tensors are essentially identical (max diff < 1e-5)\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=0.000000, mean=0.000000\n",
-    "  Crop 1: max=0.000000, mean=0.000000\n",
-    "  Crop 2: max=0.000000, mean=0.000000\n",
-    "  Crop 3: max=0.000000, mean=0.000000\n",
-    "  Crop 4: max=0.000000, mean=0.000000\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "================================================================================\n",
-    "Testing 4K (3840x2160)\n",
-    "================================================================================\n",
-    "\n",
-    "Function        Min (ms)   Avg (ms)   Speedup   \n",
-    "--------------------------------------------------\n",
-    "Original        55.0       57.2       1.00x     \n",
-    "Optimized       30.8       33.4       1.71x     \n",
-    "Ultra Fast      32.3       36.5       1.57x     \n",
-    "\n",
-    "🔍 TENSOR DIFFERENCE ANALYSIS\n",
-    "==================================================\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Optimized ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.278320\n",
-    "Mean absolute difference: 0.280527\n",
-    "Std of differences: 0.198947\n",
-    "Pixels with any difference: 99.16% (3825385/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 0.84% (32483/3857868)\n",
-    "  Within 1e-05: 0.84% (32483/3857868)\n",
-    "  Within 1e-04: 0.84% (32483/3857868)\n",
-    "  Within 1e-03: 0.84% (32483/3857868)\n",
-    "  Within 1e-02: 2.53% (97553/3857868)\n",
-    "  Within 1e-01: 20.93% (807398/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.105957, mean=0.310640\n",
-    "  Crop 1: max=1.262695, mean=0.276606\n",
-    "  Crop 2: max=1.262695, mean=0.276472\n",
-    "  Crop 3: max=1.278320, mean=0.276858\n",
-    "  Crop 4: max=1.231934, mean=0.276985\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.278320\n",
-    "Mean absolute difference: 0.280527\n",
-    "Std of differences: 0.198947\n",
-    "Pixels with any difference: 99.16% (3825385/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 0.84% (32483/3857868)\n",
-    "  Within 1e-05: 0.84% (32483/3857868)\n",
-    "  Within 1e-04: 0.84% (32483/3857868)\n",
-    "  Within 1e-03: 0.84% (32483/3857868)\n",
-    "  Within 1e-02: 2.53% (97553/3857868)\n",
-    "  Within 1e-01: 20.93% (807398/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.105957, mean=0.310640\n",
-    "  Crop 1: max=1.262695, mean=0.276606\n",
-    "  Crop 2: max=1.262695, mean=0.276472\n",
-    "  Crop 3: max=1.278320, mean=0.276858\n",
-    "  Crop 4: max=1.231934, mean=0.276985\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 0.000000\n",
-    "Mean absolute difference: 0.000000\n",
-    "Std of differences: 0.000000\n",
-    "Pixels with any difference: 0.00% (0/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 100.00% (3857868/3857868)\n",
-    "  Within 1e-05: 100.00% (3857868/3857868)\n",
-    "  Within 1e-04: 100.00% (3857868/3857868)\n",
-    "  Within 1e-03: 100.00% (3857868/3857868)\n",
-    "  Within 1e-02: 100.00% (3857868/3857868)\n",
-    "  Within 1e-01: 100.00% (3857868/3857868)\n",
-    "✅ Tensors are essentially identical (max diff < 1e-5)\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=0.000000, mean=0.000000\n",
-    "  Crop 1: max=0.000000, mean=0.000000\n",
-    "  Crop 2: max=0.000000, mean=0.000000\n",
-    "  Crop 3: max=0.000000, mean=0.000000\n",
-    "  Crop 4: max=0.000000, mean=0.000000\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "💡 Tip: Run with '--speed-only' flag for faster benchmarking without tensor analysis\n"
-   ]
+   "data": {
+    "text/plain": [
+     "torch.Size([13, 1])"
+    ]
+   },
+   "execution_count": 22,
+   "metadata": {},
+   "output_type": "execute_result"
  }
 ],
-  "source": []
+  "source": [
+   "mask.shape"
+  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": 23,
  "metadata": {},
  "outputs": [
   {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "✓ OpenCV available\n",
-    "=== Comprehensive Benchmark with Tensor Difference Analysis ===\n",
-    "CUDA available: True\n",
-    "PyVIPS available: True\n",
-    "\n",
-    "================================================================================\n",
-    "Testing 1080p (1920x1080)\n",
-    "================================================================================\n",
-    "\n",
-    "Function        Min (ms)   Avg (ms)   Speedup   \n",
-    "--------------------------------------------------\n",
-    "Original        15.6       16.8       1.00x     \n",
-    "Optimized       8.8        9.2        1.82x     \n",
-    "Ultra Fast      9.4        9.6        1.76x     \n",
-    "\n",
-    "🔍 TENSOR DIFFERENCE ANALYSIS\n",
-    "==================================================\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Optimized ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.208008\n",
-    "Mean absolute difference: 0.181336\n",
-    "Std of differences: 0.153313\n",
-    "Pixels with any difference: 98.44% (3797773/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 1.56% (60095/3857868)\n",
-    "  Within 1e-05: 1.56% (60095/3857868)\n",
-    "  Within 1e-04: 1.56% (60095/3857868)\n",
-    "  Within 1e-03: 1.56% (60095/3857868)\n",
-    "  Within 1e-02: 4.68% (180528/3857868)\n",
-    "  Within 1e-01: 36.82% (1420591/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.207520, mean=0.288220\n",
-    "  Crop 1: max=1.160156, mean=0.167923\n",
-    "  Crop 2: max=1.208008, mean=0.167772\n",
-    "  Crop 3: max=1.208008, mean=0.168140\n",
-    "  Crop 4: max=1.176270, mean=0.168022\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.208008\n",
-    "Mean absolute difference: 0.181336\n",
-    "Std of differences: 0.153313\n",
-    "Pixels with any difference: 98.44% (3797773/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 1.56% (60095/3857868)\n",
-    "  Within 1e-05: 1.56% (60095/3857868)\n",
-    "  Within 1e-04: 1.56% (60095/3857868)\n",
-    "  Within 1e-03: 1.56% (60095/3857868)\n",
-    "  Within 1e-02: 4.68% (180528/3857868)\n",
-    "  Within 1e-01: 36.82% (1420591/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.207520, mean=0.288220\n",
-    "  Crop 1: max=1.160156, mean=0.167923\n",
-    "  Crop 2: max=1.208008, mean=0.167772\n",
-    "  Crop 3: max=1.208008, mean=0.168140\n",
-    "  Crop 4: max=1.176270, mean=0.168022\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 0.000000\n",
-    "Mean absolute difference: 0.000000\n",
-    "Std of differences: 0.000000\n",
-    "Pixels with any difference: 0.00% (0/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 100.00% (3857868/3857868)\n",
-    "  Within 1e-05: 100.00% (3857868/3857868)\n",
-    "  Within 1e-04: 100.00% (3857868/3857868)\n",
-    "  Within 1e-03: 100.00% (3857868/3857868)\n",
-    "  Within 1e-02: 100.00% (3857868/3857868)\n",
-    "  Within 1e-01: 100.00% (3857868/3857868)\n",
-    "✅ Tensors are essentially identical (max diff < 1e-5)\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=0.000000, mean=0.000000\n",
-    "  Crop 1: max=0.000000, mean=0.000000\n",
-    "  Crop 2: max=0.000000, mean=0.000000\n",
-    "  Crop 3: max=0.000000, mean=0.000000\n",
-    "  Crop 4: max=0.000000, mean=0.000000\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "================================================================================\n",
-    "Testing 4K (3840x2160)\n",
-    "================================================================================\n",
-    "\n",
-    "Function        Min (ms)   Avg (ms)   Speedup   \n",
-    "--------------------------------------------------\n",
-    "Original        46.9       51.5       1.00x     \n",
-    "Optimized       34.3       35.6       1.45x     \n",
-    "Ultra Fast      30.5       31.9       1.61x     \n",
-    "\n",
-    "🔍 TENSOR DIFFERENCE ANALYSIS\n",
-    "==================================================\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Optimized ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.278320\n",
-    "Mean absolute difference: 0.280527\n",
-    "Std of differences: 0.198947\n",
-    "Pixels with any difference: 99.16% (3825385/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 0.84% (32483/3857868)\n",
-    "  Within 1e-05: 0.84% (32483/3857868)\n",
-    "  Within 1e-04: 0.84% (32483/3857868)\n",
-    "  Within 1e-03: 0.84% (32483/3857868)\n",
-    "  Within 1e-02: 2.53% (97553/3857868)\n",
-    "  Within 1e-01: 20.93% (807398/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.105957, mean=0.310640\n",
-    "  Crop 1: max=1.262695, mean=0.276606\n",
-    "  Crop 2: max=1.262695, mean=0.276472\n",
-    "  Crop 3: max=1.278320, mean=0.276858\n",
-    "  Crop 4: max=1.231934, mean=0.276985\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.278320\n",
-    "Mean absolute difference: 0.280527\n",
-    "Std of differences: 0.198947\n",
-    "Pixels with any difference: 99.16% (3825385/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 0.84% (32483/3857868)\n",
-    "  Within 1e-05: 0.84% (32483/3857868)\n",
-    "  Within 1e-04: 0.84% (32483/3857868)\n",
-    "  Within 1e-03: 0.84% (32483/3857868)\n",
-    "  Within 1e-02: 2.53% (97553/3857868)\n",
-    "  Within 1e-01: 20.93% (807398/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.105957, mean=0.310640\n",
-    "  Crop 1: max=1.262695, mean=0.276606\n",
-    "  Crop 2: max=1.262695, mean=0.276472\n",
-    "  Crop 3: max=1.278320, mean=0.276858\n",
-    "  Crop 4: max=1.231934, mean=0.276985\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 0.000000\n",
-    "Mean absolute difference: 0.000000\n",
-    "Std of differences: 0.000000\n",
-    "Pixels with any difference: 0.00% (0/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 100.00% (3857868/3857868)\n",
-    "  Within 1e-05: 100.00% (3857868/3857868)\n",
-    "  Within 1e-04: 100.00% (3857868/3857868)\n",
-    "  Within 1e-03: 100.00% (3857868/3857868)\n",
-    "  Within 1e-02: 100.00% (3857868/3857868)\n",
-    "  Within 1e-01: 100.00% (3857868/3857868)\n",
-    "✅ Tensors are essentially identical (max diff < 1e-5)\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=0.000000, mean=0.000000\n",
-    "  Crop 1: max=0.000000, mean=0.000000\n",
-    "  Crop 2: max=0.000000, mean=0.000000\n",
-    "  Crop 3: max=0.000000, mean=0.000000\n",
-    "  Crop 4: max=0.000000, mean=0.000000\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "💡 Tip: Run with '--speed-only' flag for faster benchmarking without tensor analysis\n"
-   ]
+   "data": {
+    "text/plain": [
+     "tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])"
+    ]
+   },
+   "execution_count": 23,
+   "metadata": {},
+   "output_type": "execute_result"
  }
 ],
-  "source": []
+  "source": [
+   "real = mask @ mask.T\n",
+   "real"
+  ]
 },
 {
  "cell_type": "code",
@@ -577,7 +313,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.12.9"
+  "version": "3.13.3"
  }
 },
 "nbformat": 4,
 