Commit ded605e by johnmalek312 · Parent(s): 9b2871c

broken change start of batching

Files changed (2):
  1. moondream2/moondream.py +37 -14
  2. ollama.ipynb +217 -481
moondream2/moondream.py CHANGED
@@ -43,9 +43,19 @@ class EncodedImage:
 
 class KVCache(nn.Module):
 
-    def __init__(self, n_heads, n_kv_heads, max_context, dim, device, dtype):
+    def __init__(self,
+                 n_heads,
+                 n_kv_heads,
+                 max_context,
+                 dim,
+                 batch_size: int = 1,
+                 device=None,
+                 dtype=None):
         super().__init__()
-        cache_shape = (1, n_kv_heads, max_context, dim // n_heads)
+        cache_shape = (batch_size,
+                       n_kv_heads,
+                       max_context,
+                       dim // n_heads)
         self.register_buffer(
             "k_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
         )
@@ -132,6 +142,7 @@ class MoondreamModel(nn.Module):
                 c.n_kv_heads,
                 c.max_context,
                 c.dim,
+                batch_size=2,
                 device=self.device,
                 dtype=self.vision.pos_emb.dtype,
             )
@@ -190,9 +201,11 @@ class MoondreamModel(nn.Module):
 
         return self._vis_proj(global_features, reconstructed)
 
-    def encode_image(self, image: Union[Image.Image, EncodedImage]) -> EncodedImage:
+    def encode_image(self, image: Union[Image.Image, EncodedImage, torch.Tensor]) -> EncodedImage:
         if isinstance(image, EncodedImage):
             return image
+        elif isinstance(image, torch.Tensor):
+            pass
         elif not isinstance(image, Image.Image):
             raise ValueError("image must be a PIL Image or EncodedImage")
 
@@ -202,12 +215,17 @@ class MoondreamModel(nn.Module):
 
         bos = torch.tensor([[self.config.tokenizer.bos_id]], device=self.device)
 
-        img_emb = self._run_vision_encoder(image)
+        if isinstance(image, Image.Image):
+            img_emb = self._run_vision_encoder(image)
+        else:
+            img_emb = image
+
         bos_emb = text_encoder(
             bos,
             self.text,
         )
-        inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
+        bos_emb = bos_emb.expand(img_emb.size(0), -1, -1)
+        inputs_embeds = torch.cat([bos_emb, img_emb], dim=1)
         mask = self.attn_mask[:, :, 0 : inputs_embeds.size(1), :]
         pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.int32, device=self.device)
         self._prefill(inputs_embeds, mask, pos_ids)
@@ -293,23 +311,28 @@ class MoondreamModel(nn.Module):
 
     def point(
         self,
-        image: Union[Image.Image, EncodedImage],
-        object: str,
+        image: Union[Image.Image, EncodedImage, torch.Tensor],
+        object: list[str],
         settings: Optional[ObjectSamplingSettings] = None,
     ):
         if self.config.tokenizer.templates["point"] is None:
             raise NotImplementedError("Model does not support pointing.")
+        # set the pad token to the eos token
+        self.tokenizer.pad_token = self.tokenizer.eos_token
 
         image = self.encode_image(image)
 
-        prompt_tokens = torch.tensor(
-            [
+        # input batch tokenized and padded
+        prompt_tokens = [
             self.config.tokenizer.templates["point"]["prefix"]
-            + self.tokenizer.encode(" " + object).ids
+            + self.tokenizer.encode(" " + obj).ids
             + self.config.tokenizer.templates["point"]["suffix"]
-            ],
-            device=self.device,
-        )
+            for obj in object
+        ]
+        # padding with eos token to the same length as the longest sequence
+        tokens_batch = self.tokenizer.pad(prompt_tokens, padding="longest", return_tensors="pt")
+        prompt_tokens = tokens_batch.input_ids.to(self.device)
+
 
         _, hidden, next_token, pos = self._prefill_prompt(
             prompt_tokens, image.pos, temperature=0, top_p=0
@@ -327,5 +350,5 @@ class MoondreamModel(nn.Module):
 
         return {"points": objects}
 
-    def forward(self, image: Union[Image.Image, EncodedImage], prompt: str, settings: Optional[ObjectSamplingSettings] = None):
+    def forward(self, image: Union[Image.Image, EncodedImage, torch.Tensor], prompt: str, settings: Optional[ObjectSamplingSettings] = None):
         return self.point(image, prompt, settings)
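The core of this change is the new batch dimension on the KV-cache buffers plus the matching expand of the BOS embedding before prefill. A shape-level sketch of the intended layout, assuming moondream2's usual sizes (dim=2048, 32 KV heads, head_dim=64, 729 image patch embeddings); the names and values below are illustrative, not the repo's code:

import torch

# One cache row per sequence in the batch; head_dim = dim // n_heads = 64.
batch_size, n_kv_heads, max_context, head_dim = 2, 32, 2048, 64
k_cache = torch.zeros(batch_size, n_kv_heads, max_context, head_dim)

# A batch of two pre-encoded images (729 patch embeddings each) plus BOS:
img_emb = torch.randn(batch_size, 729, 2048)
bos_emb = torch.randn(1, 1, 2048).expand(batch_size, -1, -1)  # broadcast BOS across the batch
inputs_embeds = torch.cat([bos_emb, img_emb], dim=1)          # (2, 730, 2048)

# Prefill then writes keys/values for positions [0, T) into every cache row at once:
T = inputs_embeds.size(1)
k_cache[:, :, :T, :] = torch.randn(batch_size, n_kv_heads, T, head_dim)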
 
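The commit message flags this as a broken work in progress, and one visible sticking point is the padding call: `self.tokenizer` in this file is a raw `tokenizers.Tokenizer`, which has neither a `pad_token` attribute nor a `pad()` method; those belong to the `transformers` tokenizer classes that the notebook below experiments with. A minimal manual-padding sketch under that assumption; `pad_to_longest` and `eos_id` are hypothetical names, not part of the repo:

import torch

def pad_to_longest(prompt_tokens, eos_id, device):
    # prompt_tokens: list of token-id lists (hypothetical helper).
    # Pad every row with eos_id up to the longest row and record
    # which positions hold real tokens.
    max_len = max(len(t) for t in prompt_tokens)
    input_ids = torch.full((len(prompt_tokens), max_len), eos_id, dtype=torch.long)
    attention_mask = torch.zeros(len(prompt_tokens), max_len, dtype=torch.long)
    for i, toks in enumerate(prompt_tokens):
        input_ids[i, : len(toks)] = torch.tensor(toks, dtype=torch.long)
        attention_mask[i, : len(toks)] = 1
    return input_ids.to(device), attention_mask.to(device)

Alternatively, `tokenizers.Tokenizer` supports `enable_padding(...)` together with `encode_batch`, which would avoid the manual loop.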
ollama.ipynb CHANGED
@@ -4,554 +4,290 @@
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
-  "outputs": [],
-  "source": []
+  "outputs": [
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "ed2ab48877fe47178e9e521fae619346",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "b348b72e4f4b412f949efee9dd3da8d2",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "c3c9cb29e5184e6ba7d0e698209b1dbe",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "d7af8afa19f04ebda69594a35b75ebbd",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "b9d9a927b1f446b2818dbb997608fd93",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
+   {
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "77f52404773949c5b6e792eb2b5259dd",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   }
+  ],
+  "source": [
+   "from transformers import AutoTokenizer\n",
+   "\n",
+   "tokenizer = AutoTokenizer.from_pretrained(\"vikhyatk/moondream2\")\n"
+  ]
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
-   "import torch\n",
-   "import torch.nn as nn\n",
+   "texts = [\n",
+   "    \"This is a short text.\",\n",
+   "    \"This is a much longer text that will determine the padding length.\",\n",
+   "    \"Medium length text here.\"\n",
+   "]\n",
+   "tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
-   "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+   "# Pad to the longest sequence in the batch\n",
+   "encoded = tokenizer(\n",
+   "    texts,\n",
+   "    padding=True,           # or padding=\"longest\"\n",
+   "    return_tensors=\"pt\",    # or \"tf\" for TensorFlow\n",
+   "    model_max_length=512,\n",
    "\n",
-   "class RotaryEmbeddingInPlace(nn.Module):\n",
-   "    def __init__(self, head_dim: int, max_seq_len: int, theta: float = 10000.0):\n",
-   "        super().__init__()\n",
-   "        # Match RotaryEmbedding exactly\n",
-   "        self.rot_dim = head_dim // 2  # Only half of head_dim is rotated\n",
-   "        \n",
-   "        # Frequency calculation - match RotaryEmbedding exactly\n",
-   "        freqs = 1.0 / (theta ** (torch.arange(0, self.rot_dim, 2).float() / self.rot_dim))\n",
-   "        t = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)\n",
-   "        freqs = t * freqs.unsqueeze(0)\n",
-   "        \n",
-   "\n",
-   "        freqs_cis = torch.exp(1j * freqs)\n",
-   "        cos_vals = freqs_cis.real\n",
-   "        sin_vals = freqs_cis.imag\n",
-   "\n",
-   "        self.register_buffer('cos_cache', cos_vals, persistent=False)\n",
-   "        self.register_buffer('sin_cache', sin_vals, persistent=False)\n",
-   "        \n",
-   "    def apply(self, x: torch.Tensor) -> torch.Tensor:\n",
-   "        \"\"\"\n",
-   "        WARNING: This modifies the input tensor in-place for maximum speed!\n",
-   "        If you need the original tensor, make a copy before calling this.\n",
-   "        \n",
-   "        Must match RotaryEmbedding output exactly.\n",
-   "        \"\"\"\n",
-   "        seq_len = x.shape[1]\n",
-   "        d = self.rot_dim // 2\n",
-   "        \n",
-   "        # Get cos/sin with same broadcasting as RotaryEmbedding\n",
-   "        cos = self.cos_cache[:seq_len].unsqueeze(0).unsqueeze(2)\n",
-   "        sin = self.sin_cache[:seq_len].unsqueeze(0).unsqueeze(2)\n",
-   "        \n",
-   "        # Split rotated part into real/imaginary components\n",
-   "        xq_r = x[..., :d]     # First half of rot_dim\n",
-   "        xq_i = x[..., d:d*2]  # Second half of rot_dim\n",
-   "        \n",
-   "        # Apply rotation\n",
-   "        xq_out_r = xq_r * cos - xq_i * sin\n",
-   "        xq_out_i = xq_r * sin + xq_i * cos\n",
-   "        \n",
-   "        # Vectorized interleaving using torch.stack and view\n",
-   "        # Stack creates [d, ..., 2] then view as [..., d*2]\n",
-   "        x[..., :self.rot_dim] = torch.stack([xq_out_r, xq_out_i], dim=-1).view(*x.shape[:-1], self.rot_dim)\n",
-   "        \n",
-   "        # x_pass part (x[..., self.rot_dim:]) remains unchanged automatically\n",
-   "        \n",
-   "        return x\n"
+   ")\n"
   ]
  },
 {
  "cell_type": "code",
-  "execution_count": 2,
+  "execution_count": 27,
  "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "transformers.tokenization_utils_base.BatchEncoding"
+     ]
+    },
+    "execution_count": 27,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
  "source": [
-   "dim_per_head = 64\n",
-   "n_heads = 32\n",
-   "max_context = 2048\n",
-   "\n",
-   "freq_dim = dim_per_head // 2\n",
-   "\n",
-   "torch.manual_seed(42)\n",
-   "\n",
-   "tensor = torch.rand(1, 730, n_heads, dim_per_head)\n",
-   "tensor = tensor.to(device)\n"
+   "type(encoded)"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 3,
+  "execution_count": 28,
  "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "{'input_ids': tensor([[ 1212,   318,   257,  1790,  2420,    13, 50256, 50256, 50256, 50256,\n",
+      "         50256, 50256, 50256],\n",
+      "        [ 1212,   318,   257,   881,  2392,  2420,   326,   481,  5004,   262,\n",
+      "         24511,  4129,    13],\n",
+      "        [31205,  4129,  2420,   994,    13, 50256, 50256, 50256, 50256, 50256,\n",
+      "         50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+      "        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
+      "        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}"
+     ]
+    },
+    "execution_count": 28,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
  "source": [
-   "fast_rope = RotaryEmbeddingInPlace(dim_per_head, max_context)\n",
-   "fast_rope.to(device)\n",
-   "fast_rtensor = fast_rope.apply(tensor)\n",
-   "\n",
-   "\n"
+   "encoded"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 9,
  "metadata": {},
-  "outputs": [],
-  "source": []
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "torch.Size([3, 13])"
+     ]
+    },
+    "execution_count": 9,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "encoded.input_ids.shape"
+  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 4,
+  "execution_count": 30,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
-     "tensor([0.8823, 0.8854, 0.9150, 0.5739, 0.3829, 0.2666, 0.9593, 0.6274, 0.3904,\n",
-     "        0.2696, 0.6009, 0.4414, 0.2566, 0.2969, 0.7936], device='cuda:0')"
+     "tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0])"
     ]
    },
-   "execution_count": 4,
+   "execution_count": 30,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
-   "fast_rtensor.flatten()[:15]"
+   "encoded.attention_mask[0] * encoded.attention_mask[0].T"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 20,
  "metadata": {},
  "outputs": [],
-  "source": []
- },
- {
-  "cell_type": "markdown",
-  "metadata": {},
  "source": [
-   "tensor([0.8823, 0.8854, 0.9150, 0.5739, 0.3829, 0.2666, 0.9593, 0.6274, 0.3904,\n",
-   "        0.2696, 0.6009, 0.4414, 0.2566, 0.2969, 0.7936])"
+   "mask = encoded.attention_mask[0].clone().reshape(-1, 1)\n",
+   "\n"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 6,
+  "execution_count": 19,
  "metadata": {},
-  "outputs": [],
-  "source": []
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0])"
+     ]
+    },
+    "execution_count": 19,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "encoded.attention_mask[0].T"
+  ]
 },
 {
  "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 22,
  "metadata": {},
  "outputs": [
   {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "✓ OpenCV available\n",
-    "=== Comprehensive Benchmark with Tensor Difference Analysis ===\n",
-    "CUDA available: True\n",
-    "PyVIPS available: True\n",
-    "\n",
-    "================================================================================\n",
-    "Testing 1080p (1920x1080)\n",
-    "================================================================================\n",
-    "\n",
-    "Function        Min (ms)   Avg (ms)   Speedup   \n",
-    "--------------------------------------------------\n",
-    "Original        16.3       16.7       1.00x     \n",
-    "Optimized       8.9        9.4        1.77x     \n",
-    "Ultra Fast      9.2        9.5        1.75x     \n",
-    "\n",
-    "🔍 TENSOR DIFFERENCE ANALYSIS\n",
-    "==================================================\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Optimized ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.208008\n",
-    "Mean absolute difference: 0.181336\n",
-    "Std of differences: 0.153313\n",
-    "Pixels with any difference: 98.44% (3797773/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 1.56% (60095/3857868)\n",
-    "  Within 1e-05: 1.56% (60095/3857868)\n",
-    "  Within 1e-04: 1.56% (60095/3857868)\n",
-    "  Within 1e-03: 1.56% (60095/3857868)\n",
-    "  Within 1e-02: 4.68% (180528/3857868)\n",
-    "  Within 1e-01: 36.82% (1420591/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.207520, mean=0.288220\n",
-    "  Crop 1: max=1.160156, mean=0.167923\n",
-    "  Crop 2: max=1.208008, mean=0.167772\n",
-    "  Crop 3: max=1.208008, mean=0.168140\n",
-    "  Crop 4: max=1.176270, mean=0.168022\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.208008\n",
-    "Mean absolute difference: 0.181336\n",
-    "Std of differences: 0.153313\n",
-    "Pixels with any difference: 98.44% (3797773/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 1.56% (60095/3857868)\n",
-    "  Within 1e-05: 1.56% (60095/3857868)\n",
-    "  Within 1e-04: 1.56% (60095/3857868)\n",
-    "  Within 1e-03: 1.56% (60095/3857868)\n",
-    "  Within 1e-02: 4.68% (180528/3857868)\n",
-    "  Within 1e-01: 36.82% (1420591/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.207520, mean=0.288220\n",
-    "  Crop 1: max=1.160156, mean=0.167923\n",
-    "  Crop 2: max=1.208008, mean=0.167772\n",
-    "  Crop 3: max=1.208008, mean=0.168140\n",
-    "  Crop 4: max=1.176270, mean=0.168022\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 0.000000\n",
-    "Mean absolute difference: 0.000000\n",
-    "Std of differences: 0.000000\n",
-    "Pixels with any difference: 0.00% (0/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 100.00% (3857868/3857868)\n",
-    "  Within 1e-05: 100.00% (3857868/3857868)\n",
-    "  Within 1e-04: 100.00% (3857868/3857868)\n",
-    "  Within 1e-03: 100.00% (3857868/3857868)\n",
-    "  Within 1e-02: 100.00% (3857868/3857868)\n",
-    "  Within 1e-01: 100.00% (3857868/3857868)\n",
-    "✅ Tensors are essentially identical (max diff < 1e-5)\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=0.000000, mean=0.000000\n",
-    "  Crop 1: max=0.000000, mean=0.000000\n",
-    "  Crop 2: max=0.000000, mean=0.000000\n",
-    "  Crop 3: max=0.000000, mean=0.000000\n",
-    "  Crop 4: max=0.000000, mean=0.000000\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "================================================================================\n",
-    "Testing 4K (3840x2160)\n",
-    "================================================================================\n",
-    "\n",
-    "Function        Min (ms)   Avg (ms)   Speedup   \n",
-    "--------------------------------------------------\n",
-    "Original        55.0       57.2       1.00x     \n",
-    "Optimized       30.8       33.4       1.71x     \n",
-    "Ultra Fast      32.3       36.5       1.57x     \n",
-    "\n",
-    "🔍 TENSOR DIFFERENCE ANALYSIS\n",
-    "==================================================\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Optimized ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.278320\n",
-    "Mean absolute difference: 0.280527\n",
-    "Std of differences: 0.198947\n",
-    "Pixels with any difference: 99.16% (3825385/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 0.84% (32483/3857868)\n",
-    "  Within 1e-05: 0.84% (32483/3857868)\n",
-    "  Within 1e-04: 0.84% (32483/3857868)\n",
-    "  Within 1e-03: 0.84% (32483/3857868)\n",
-    "  Within 1e-02: 2.53% (97553/3857868)\n",
-    "  Within 1e-01: 20.93% (807398/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.105957, mean=0.310640\n",
-    "  Crop 1: max=1.262695, mean=0.276606\n",
-    "  Crop 2: max=1.262695, mean=0.276472\n",
-    "  Crop 3: max=1.278320, mean=0.276858\n",
-    "  Crop 4: max=1.231934, mean=0.276985\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.278320\n",
-    "Mean absolute difference: 0.280527\n",
-    "Std of differences: 0.198947\n",
-    "Pixels with any difference: 99.16% (3825385/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 0.84% (32483/3857868)\n",
-    "  Within 1e-05: 0.84% (32483/3857868)\n",
-    "  Within 1e-04: 0.84% (32483/3857868)\n",
-    "  Within 1e-03: 0.84% (32483/3857868)\n",
-    "  Within 1e-02: 2.53% (97553/3857868)\n",
-    "  Within 1e-01: 20.93% (807398/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.105957, mean=0.310640\n",
-    "  Crop 1: max=1.262695, mean=0.276606\n",
-    "  Crop 2: max=1.262695, mean=0.276472\n",
-    "  Crop 3: max=1.278320, mean=0.276858\n",
-    "  Crop 4: max=1.231934, mean=0.276985\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 0.000000\n",
-    "Mean absolute difference: 0.000000\n",
-    "Std of differences: 0.000000\n",
-    "Pixels with any difference: 0.00% (0/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 100.00% (3857868/3857868)\n",
-    "  Within 1e-05: 100.00% (3857868/3857868)\n",
-    "  Within 1e-04: 100.00% (3857868/3857868)\n",
-    "  Within 1e-03: 100.00% (3857868/3857868)\n",
-    "  Within 1e-02: 100.00% (3857868/3857868)\n",
-    "  Within 1e-01: 100.00% (3857868/3857868)\n",
-    "✅ Tensors are essentially identical (max diff < 1e-5)\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=0.000000, mean=0.000000\n",
-    "  Crop 1: max=0.000000, mean=0.000000\n",
-    "  Crop 2: max=0.000000, mean=0.000000\n",
-    "  Crop 3: max=0.000000, mean=0.000000\n",
-    "  Crop 4: max=0.000000, mean=0.000000\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "💡 Tip: Run with '--speed-only' flag for faster benchmarking without tensor analysis\n"
-   ]
+   "data": {
+    "text/plain": [
+     "torch.Size([13, 1])"
+    ]
+   },
+   "execution_count": 22,
+   "metadata": {},
+   "output_type": "execute_result"
  }
 ],
-  "source": []
+  "source": [
+   "mask.shape"
+  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": 23,
  "metadata": {},
  "outputs": [
   {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "✓ OpenCV available\n",
-    "=== Comprehensive Benchmark with Tensor Difference Analysis ===\n",
-    "CUDA available: True\n",
-    "PyVIPS available: True\n",
-    "\n",
-    "================================================================================\n",
-    "Testing 1080p (1920x1080)\n",
-    "================================================================================\n",
-    "\n",
-    "Function        Min (ms)   Avg (ms)   Speedup   \n",
-    "--------------------------------------------------\n",
-    "Original        15.6       16.8       1.00x     \n",
-    "Optimized       8.8        9.2        1.82x     \n",
-    "Ultra Fast      9.4        9.6        1.76x     \n",
-    "\n",
-    "🔍 TENSOR DIFFERENCE ANALYSIS\n",
-    "==================================================\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Optimized ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.208008\n",
-    "Mean absolute difference: 0.181336\n",
-    "Std of differences: 0.153313\n",
-    "Pixels with any difference: 98.44% (3797773/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 1.56% (60095/3857868)\n",
-    "  Within 1e-05: 1.56% (60095/3857868)\n",
-    "  Within 1e-04: 1.56% (60095/3857868)\n",
-    "  Within 1e-03: 1.56% (60095/3857868)\n",
-    "  Within 1e-02: 4.68% (180528/3857868)\n",
-    "  Within 1e-01: 36.82% (1420591/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.207520, mean=0.288220\n",
-    "  Crop 1: max=1.160156, mean=0.167923\n",
-    "  Crop 2: max=1.208008, mean=0.167772\n",
-    "  Crop 3: max=1.208008, mean=0.168140\n",
-    "  Crop 4: max=1.176270, mean=0.168022\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.208008\n",
-    "Mean absolute difference: 0.181336\n",
-    "Std of differences: 0.153313\n",
-    "Pixels with any difference: 98.44% (3797773/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 1.56% (60095/3857868)\n",
-    "  Within 1e-05: 1.56% (60095/3857868)\n",
-    "  Within 1e-04: 1.56% (60095/3857868)\n",
-    "  Within 1e-03: 1.56% (60095/3857868)\n",
-    "  Within 1e-02: 4.68% (180528/3857868)\n",
-    "  Within 1e-01: 36.82% (1420591/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.207520, mean=0.288220\n",
-    "  Crop 1: max=1.160156, mean=0.167923\n",
-    "  Crop 2: max=1.208008, mean=0.167772\n",
-    "  Crop 3: max=1.208008, mean=0.168140\n",
-    "  Crop 4: max=1.176270, mean=0.168022\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 0.000000\n",
-    "Mean absolute difference: 0.000000\n",
-    "Std of differences: 0.000000\n",
-    "Pixels with any difference: 0.00% (0/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 100.00% (3857868/3857868)\n",
-    "  Within 1e-05: 100.00% (3857868/3857868)\n",
-    "  Within 1e-04: 100.00% (3857868/3857868)\n",
-    "  Within 1e-03: 100.00% (3857868/3857868)\n",
-    "  Within 1e-02: 100.00% (3857868/3857868)\n",
-    "  Within 1e-01: 100.00% (3857868/3857868)\n",
-    "✅ Tensors are essentially identical (max diff < 1e-5)\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=0.000000, mean=0.000000\n",
-    "  Crop 1: max=0.000000, mean=0.000000\n",
-    "  Crop 2: max=0.000000, mean=0.000000\n",
-    "  Crop 3: max=0.000000, mean=0.000000\n",
-    "  Crop 4: max=0.000000, mean=0.000000\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "================================================================================\n",
-    "Testing 4K (3840x2160)\n",
-    "================================================================================\n",
-    "\n",
-    "Function        Min (ms)   Avg (ms)   Speedup   \n",
-    "--------------------------------------------------\n",
-    "Original        46.9       51.5       1.00x     \n",
-    "Optimized       34.3       35.6       1.45x     \n",
-    "Ultra Fast      30.5       31.9       1.61x     \n",
-    "\n",
-    "🔍 TENSOR DIFFERENCE ANALYSIS\n",
-    "==================================================\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Optimized ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.278320\n",
-    "Mean absolute difference: 0.280527\n",
-    "Std of differences: 0.198947\n",
-    "Pixels with any difference: 99.16% (3825385/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 0.84% (32483/3857868)\n",
-    "  Within 1e-05: 0.84% (32483/3857868)\n",
-    "  Within 1e-04: 0.84% (32483/3857868)\n",
-    "  Within 1e-03: 0.84% (32483/3857868)\n",
-    "  Within 1e-02: 2.53% (97553/3857868)\n",
-    "  Within 1e-01: 20.93% (807398/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.105957, mean=0.310640\n",
-    "  Crop 1: max=1.262695, mean=0.276606\n",
-    "  Crop 2: max=1.262695, mean=0.276472\n",
-    "  Crop 3: max=1.278320, mean=0.276858\n",
-    "  Crop 4: max=1.231934, mean=0.276985\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 1.278320\n",
-    "Mean absolute difference: 0.280527\n",
-    "Std of differences: 0.198947\n",
-    "Pixels with any difference: 99.16% (3825385/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 0.84% (32483/3857868)\n",
-    "  Within 1e-05: 0.84% (32483/3857868)\n",
-    "  Within 1e-04: 0.84% (32483/3857868)\n",
-    "  Within 1e-03: 0.84% (32483/3857868)\n",
-    "  Within 1e-02: 2.53% (97553/3857868)\n",
-    "  Within 1e-01: 20.93% (807398/3857868)\n",
-    "❌ Tensors have significant differences\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=1.105957, mean=0.310640\n",
-    "  Crop 1: max=1.262695, mean=0.276606\n",
-    "  Crop 2: max=1.262695, mean=0.276472\n",
-    "  Crop 3: max=1.278320, mean=0.276858\n",
-    "  Crop 4: max=1.231934, mean=0.276985\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "✓ Tiling match: (2, 4)\n",
-    "\n",
-    "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
-    "✓ Shape match: torch.Size([9, 3, 378, 378])\n",
-    "Max absolute difference: 0.000000\n",
-    "Mean absolute difference: 0.000000\n",
-    "Std of differences: 0.000000\n",
-    "Pixels with any difference: 0.00% (0/3857868)\n",
-    "\n",
-    "Tolerance analysis:\n",
-    "  Within 1e-06: 100.00% (3857868/3857868)\n",
-    "  Within 1e-05: 100.00% (3857868/3857868)\n",
-    "  Within 1e-04: 100.00% (3857868/3857868)\n",
-    "  Within 1e-03: 100.00% (3857868/3857868)\n",
-    "  Within 1e-02: 100.00% (3857868/3857868)\n",
-    "  Within 1e-01: 100.00% (3857868/3857868)\n",
-    "✅ Tensors are essentially identical (max diff < 1e-5)\n",
-    "\n",
-    "Per-crop analysis (9 crops):\n",
-    "  Crop 0: max=0.000000, mean=0.000000\n",
-    "  Crop 1: max=0.000000, mean=0.000000\n",
-    "  Crop 2: max=0.000000, mean=0.000000\n",
-    "  Crop 3: max=0.000000, mean=0.000000\n",
-    "  Crop 4: max=0.000000, mean=0.000000\n",
-    "  ... and 4 more crops\n",
-    "\n",
-    "💡 Tip: Run with '--speed-only' flag for faster benchmarking without tensor analysis\n"
-   ]
+   "data": {
+    "text/plain": [
+     "tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+     "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])"
+    ]
+   },
+   "execution_count": 23,
+   "metadata": {},
+   "output_type": "execute_result"
  }
 ],
-  "source": []
+  "source": [
+   "real = mask @ mask.T\n",
+   "real"
+  ]
 },
 {
  "cell_type": "code",
@@ -577,7 +313,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.12.9"
+  "version": "3.13.3"
  }
 },
 "nbformat": 4,
 