bounty committed on
Commit
9b2871c
Β·
·
1 Parent(s): 505474b
Files changed (5) hide show
  1. benchmark_results.png +3 -0
  2. moondream2/moondream.py +4 -5
  3. notes.ipynb +0 -61
  4. ollama.ipynb +392 -201
  5. requirements.txt +3 -0
benchmark_results.png ADDED

Git LFS Details

  • SHA256: 44a4a7477d9616a35b237654e6887258f38d88aeafc0b95ab114d108b8fc3e03
  • Pointer size: 130 Bytes
  • Size of remote file: 80.1 kB
moondream2/moondream.py CHANGED
@@ -11,8 +11,7 @@ from .config import MoondreamConfig
11
  from .image_crops import reconstruct_from_crops
12
  from .vision import vision_encoder, vision_projection, prepare_crops, build_vision_model
13
  from .text import build_text_model, text_encoder, lm_head, text_decoder
14
- from .region import decode_coordinate, encode_coordinate, decode_size, encode_size
15
- from .utils import remove_outlier_points
16
  import os
17
  from .rope import RotaryEmbedding
18
  TextSamplingSettings = TypedDict(
@@ -210,7 +209,7 @@ class MoondreamModel(nn.Module):
210
  )
211
  inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
212
  mask = self.attn_mask[:, :, 0 : inputs_embeds.size(1), :]
213
- pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.long, device=self.device)
214
  self._prefill(inputs_embeds, mask, pos_ids)
215
 
216
  return EncodedImage(
@@ -235,7 +234,7 @@ class MoondreamModel(nn.Module):
235
  prompt_emb = text_encoder(prompt_tokens, self.text)
236
  torch._dynamo.mark_dynamic(prompt_emb, 1)
237
  mask = self.attn_mask[:, :, pos : pos + prompt_emb.size(1), :]
238
- pos_ids = torch.arange(pos, pos + prompt_emb.size(1), dtype=torch.long, device=self.device)
239
  hidden = self._prefill(prompt_emb, mask, pos_ids)
240
  logits = lm_head(hidden, self.text)
241
 
@@ -259,7 +258,7 @@ class MoondreamModel(nn.Module):
259
  out = []
260
  mask = torch.zeros(1, 1, 2048, device=self.device, dtype=torch.bool)
261
  mask[:, :, :pos] = 1
262
- pos_ids = torch.tensor([pos], device=self.device, dtype=torch.long)
263
 
264
  with torch.inference_mode():
265
  while (
 
11
  from .image_crops import reconstruct_from_crops
12
  from .vision import vision_encoder, vision_projection, prepare_crops, build_vision_model
13
  from .text import build_text_model, text_encoder, lm_head, text_decoder
14
+ from .region import decode_coordinate, encode_coordinate
 
15
  import os
16
  from .rope import RotaryEmbedding
17
  TextSamplingSettings = TypedDict(
 
209
  )
210
  inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
211
  mask = self.attn_mask[:, :, 0 : inputs_embeds.size(1), :]
212
+ pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.int32, device=self.device)
213
  self._prefill(inputs_embeds, mask, pos_ids)
214
 
215
  return EncodedImage(
 
234
  prompt_emb = text_encoder(prompt_tokens, self.text)
235
  torch._dynamo.mark_dynamic(prompt_emb, 1)
236
  mask = self.attn_mask[:, :, pos : pos + prompt_emb.size(1), :]
237
+ pos_ids = torch.arange(pos, pos + prompt_emb.size(1), dtype=torch.int32, device=self.device)
238
  hidden = self._prefill(prompt_emb, mask, pos_ids)
239
  logits = lm_head(hidden, self.text)
240
 
 
258
  out = []
259
  mask = torch.zeros(1, 1, 2048, device=self.device, dtype=torch.bool)
260
  mask[:, :, :pos] = 1
261
+ pos_ids = torch.tensor([pos], device=self.device, dtype=torch.int32)
262
 
263
  with torch.inference_mode():
264
  while (
notes.ipynb CHANGED
@@ -29,72 +29,11 @@
29
  "\n"
30
  ]
31
  },
32
- {
33
- "cell_type": "code",
34
- "execution_count": 1,
35
- "metadata": {},
36
- "outputs": [
37
- {
38
- "name": "stderr",
39
- "output_type": "stream",
40
- "text": [
41
- "WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work\n",
42
- "W0612 18:34:05.382000 19960 Lib\\site-packages\\torch\\distributed\\elastic\\multiprocessing\\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.\n"
43
- ]
44
- }
45
- ],
46
- "source": [
47
- "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
48
- "from PIL import Image\n",
49
- "\n",
50
- "model = AutoModelForCausalLM.from_pretrained(\n",
51
- " \"vikhyatk/moondream2\",\n",
52
- " revision=\"2025-04-14\",\n",
53
- " trust_remote_code=True,\n",
54
- " # Uncomment to run on GPU.\n",
55
- " device_map={\"\": \"cuda\"}\n",
56
- ")"
57
- ]
58
- },
59
- {
60
- "cell_type": "code",
61
- "execution_count": 2,
62
- "metadata": {},
63
- "outputs": [
64
- {
65
- "name": "stdout",
66
- "output_type": "stream",
67
- "text": [
68
- "model size: 3680.163MB\n"
69
- ]
70
- }
71
- ],
72
- "source": [
73
- "param_size = 0\n",
74
- "for param in model.parameters():\n",
75
- " param_size += param.nelement() * param.element_size()\n",
76
- "buffer_size = 0\n",
77
- "for buffer in model.buffers():\n",
78
- " buffer_size += buffer.nelement() * buffer.element_size()\n",
79
- "\n",
80
- "size_all_mb = (param_size + buffer_size) / 1024**2\n",
81
- "print('model size: {:.3f}MB'.format(size_all_mb))"
82
- ]
83
- },
84
  {
85
  "cell_type": "code",
86
  "execution_count": 2,
87
  "metadata": {},
88
  "outputs": [],
89
- "source": [
90
- "import torch"
91
- ]
92
- },
93
- {
94
- "cell_type": "code",
95
- "execution_count": 3,
96
- "metadata": {},
97
- "outputs": [],
98
  "source": [
99
  "from PIL import Image\n",
100
  "with torch.inference_mode():\n",
 
29
  "\n"
30
  ]
31
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  {
33
  "cell_type": "code",
34
  "execution_count": 2,
35
  "metadata": {},
36
  "outputs": [],
 
 
 
 
 
 
 
 
 
37
  "source": [
38
  "from PIL import Image\n",
39
  "with torch.inference_mode():\n",
ollama.ipynb CHANGED
@@ -151,216 +151,407 @@
151
  },
152
  {
153
  "cell_type": "code",
154
- "execution_count": 2,
155
  "metadata": {},
156
  "outputs": [
157
  {
158
  "name": "stdout",
159
  "output_type": "stream",
160
  "text": [
161
- "Benchmarking with tensor shape: torch.Size([1, 730, 32, 64])\n",
162
- "Device: cuda:0\n",
163
- "Warmup iterations: 10\n",
164
- "Benchmark iterations: 100\n",
 
 
 
 
 
 
165
  "--------------------------------------------------\n",
166
- "Warming up regular rope...\n",
167
- "Benchmarking regular rope...\n",
168
- "Warming up fast rope...\n",
169
- "Benchmarking fast rope...\n",
170
- "\n",
171
- "============================================================\n",
172
- "BENCHMARK RESULTS\n",
173
- "============================================================\n",
174
- "\n",
175
- "Regular Rope:\n",
176
- " Mean: 0.338 ms\n",
177
- " Median: 0.335 ms\n",
178
- " Std: 0.009 ms\n",
179
- " Min: 0.330 ms\n",
180
- " Max: 0.385 ms\n",
181
- "\n",
182
- "Fast Rope (In-place):\n",
183
- " Mean: 0.267 ms\n",
184
- " Median: 0.265 ms\n",
185
- " Std: 0.005 ms\n",
186
- " Min: 0.261 ms\n",
187
- " Max: 0.285 ms\n",
188
- "\n",
189
- "Speedup: 1.27x\n",
190
- "Fast rope is 1.27x faster\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  ]
192
  }
193
  ],
194
- "source": [
195
- "import torch\n",
196
- "import time\n",
197
- "import statistics\n",
198
- "from typing import List, Tuple\n",
199
- "\n",
200
- "def benchmark_rope_functions(\n",
201
- " rope, \n",
202
- " fast_rope, \n",
203
- " tensor: torch.Tensor, \n",
204
- " num_warmup: int = 10,\n",
205
- " num_iterations: int = 100\n",
206
- ") -> Tuple[float, float, List[float], List[float]]:\n",
207
- " \"\"\"\n",
208
- " Benchmark two rope functions, accounting for in-place modification.\n",
209
- " \n",
210
- " Args:\n",
211
- " rope: Regular RotaryEmbedding instance\n",
212
- " fast_rope: RotaryEmbeddingInPlace instance\n",
213
- " tensor: Input tensor to benchmark with\n",
214
- " num_warmup: Number of warmup iterations\n",
215
- " num_iterations: Number of benchmark iterations\n",
216
- " \n",
217
- " Returns:\n",
218
- " Tuple of (regular_avg_time, fast_avg_time, regular_times, fast_times)\n",
219
- " \"\"\"\n",
220
- " \n",
221
- " # Ensure we're on the right device and in eval mode if applicable\n",
222
- " device = tensor.device\n",
223
- " \n",
224
- " # Pre-allocate tensor copies to avoid allocation overhead during timing\n",
225
- " tensor_copies = []\n",
226
- " for _ in range(num_warmup + num_iterations):\n",
227
- " tensor_copies.append(tensor.clone().detach())\n",
228
- " \n",
229
- " print(f\"Benchmarking with tensor shape: {tensor.shape}\")\n",
230
- " print(f\"Device: {device}\")\n",
231
- " print(f\"Warmup iterations: {num_warmup}\")\n",
232
- " print(f\"Benchmark iterations: {num_iterations}\")\n",
233
- " print(\"-\" * 50)\n",
234
- " \n",
235
- " # Warmup phase for regular rope\n",
236
- " print(\"Warming up regular rope...\")\n",
237
- " for i in range(num_warmup):\n",
238
- " _ = rope.apply(tensor)\n",
239
- " if device.type == 'cuda':\n",
240
- " torch.cuda.synchronize()\n",
241
- " \n",
242
- " # Benchmark regular rope\n",
243
- " print(\"Benchmarking regular rope...\")\n",
244
- " regular_times = []\n",
245
- " for i in range(num_iterations):\n",
246
- " if device.type == 'cuda':\n",
247
- " torch.cuda.synchronize()\n",
248
- " \n",
249
- " start_time = time.perf_counter()\n",
250
- " result = rope.apply(tensor)\n",
251
- " \n",
252
- " if device.type == 'cuda':\n",
253
- " torch.cuda.synchronize()\n",
254
- " \n",
255
- " end_time = time.perf_counter()\n",
256
- " regular_times.append((end_time - start_time) * 1000) # Convert to milliseconds\n",
257
- " \n",
258
- " # Warmup phase for fast rope (in-place)\n",
259
- " print(\"Warming up fast rope...\")\n",
260
- " for i in range(num_warmup):\n",
261
- " test_tensor = tensor_copies[i].clone() # Use a copy for warmup\n",
262
- " _ = fast_rope.apply(test_tensor)\n",
263
- " if device.type == 'cuda':\n",
264
- " torch.cuda.synchronize()\n",
265
- " \n",
266
- " # Benchmark fast rope (in-place)\n",
267
- " print(\"Benchmarking fast rope...\")\n",
268
- " fast_times = []\n",
269
- " copy_idx = num_warmup # Start from after warmup copies\n",
270
- " \n",
271
- " for i in range(num_iterations):\n",
272
- " # Use pre-allocated copy\n",
273
- " tensor_copy = tensor_copies[copy_idx + i]\n",
274
- " \n",
275
- " if device.type == 'cuda':\n",
276
- " torch.cuda.synchronize()\n",
277
- " \n",
278
- " # Time only the apply operation, not the copy\n",
279
- " start_time = time.perf_counter()\n",
280
- " result = fast_rope.apply(tensor_copy)\n",
281
- " \n",
282
- " if device.type == 'cuda':\n",
283
- " torch.cuda.synchronize()\n",
284
- " \n",
285
- " end_time = time.perf_counter()\n",
286
- " fast_times.append((end_time - start_time) * 1000) # Convert to milliseconds\n",
287
- " \n",
288
- " # Calculate statistics\n",
289
- " regular_avg = statistics.mean(regular_times)\n",
290
- " fast_avg = statistics.mean(fast_times)\n",
291
- " \n",
292
- " return regular_avg, fast_avg, regular_times, fast_times\n",
293
- "\n",
294
- "def print_benchmark_results(regular_avg: float, fast_avg: float, \n",
295
- " regular_times: List[float], fast_times: List[float]):\n",
296
- " \"\"\"Print detailed benchmark results.\"\"\"\n",
297
- " \n",
298
- " regular_median = statistics.median(regular_times)\n",
299
- " regular_std = statistics.stdev(regular_times) if len(regular_times) > 1 else 0\n",
300
- " regular_min = min(regular_times)\n",
301
- " regular_max = max(regular_times)\n",
302
- " \n",
303
- " fast_median = statistics.median(fast_times)\n",
304
- " fast_std = statistics.stdev(fast_times) if len(fast_times) > 1 else 0\n",
305
- " fast_min = min(fast_times)\n",
306
- " fast_max = max(fast_times)\n",
307
- " \n",
308
- " speedup = regular_avg / fast_avg if fast_avg > 0 else float('inf')\n",
309
- " \n",
310
- " print(\"\\n\" + \"=\" * 60)\n",
311
- " print(\"BENCHMARK RESULTS\")\n",
312
- " print(\"=\" * 60)\n",
313
- " \n",
314
- " print(f\"\\nRegular Rope:\")\n",
315
- " print(f\" Mean: {regular_avg:.3f} ms\")\n",
316
- " print(f\" Median: {regular_median:.3f} ms\")\n",
317
- " print(f\" Std: {regular_std:.3f} ms\")\n",
318
- " print(f\" Min: {regular_min:.3f} ms\")\n",
319
- " print(f\" Max: {regular_max:.3f} ms\")\n",
320
- " \n",
321
- " print(f\"\\nFast Rope (In-place):\")\n",
322
- " print(f\" Mean: {fast_avg:.3f} ms\")\n",
323
- " print(f\" Median: {fast_median:.3f} ms\")\n",
324
- " print(f\" Std: {fast_std:.3f} ms\")\n",
325
- " print(f\" Min: {fast_min:.3f} ms\")\n",
326
- " print(f\" Max: {fast_max:.3f} ms\")\n",
327
- " \n",
328
- " print(f\"\\nSpeedup: {speedup:.2f}x\")\n",
329
- " if speedup > 1:\n",
330
- " print(f\"Fast rope is {speedup:.2f}x faster\")\n",
331
- " else:\n",
332
- " print(f\"Regular rope is {1/speedup:.2f}x faster\")\n",
333
- "\n",
334
- "# Example usage\n",
335
- "def run_benchmark():\n",
336
- " \"\"\"\n",
337
- " Example of how to use the benchmark functions.\n",
338
- " Replace with your actual RotaryEmbedding classes.\n",
339
- " \"\"\"\n",
340
- " \n",
341
- " # Example parameters - adjust these to match your setup\n",
342
- " dim_per_head = 64\n",
343
- " n_heads = 32\n",
344
- " max_context = 2048\n",
345
- "\n",
346
- " freq_dim = dim_per_head // 2\n",
347
- "\n",
348
- " torch.manual_seed(42)\n",
349
- "\n",
350
- " \n",
351
- " # Create your rope instances\n",
352
- " rope = RotaryEmbedding(dim_per_head, max_context)\n",
353
- " fast_rope = RotaryEmbeddingInPlace(dim_per_head, max_context)\n",
354
- " \n",
355
- " # Create test tensor - adjust shape to match your use case\n",
356
- " tensor = torch.rand(1, 730, n_heads, dim_per_head, device=device)\n",
357
- "\n",
358
- " regular_avg, fast_avg, regular_times, fast_times = benchmark_rope_functions(rope, fast_rope, tensor)\n",
359
- " print_benchmark_results(regular_avg, fast_avg, regular_times, fast_times)\n",
360
- "\n",
361
- "if __name__ == \"__main__\":\n",
362
- " run_benchmark()"
363
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  },
365
  {
366
  "cell_type": "code",
@@ -386,7 +577,7 @@
386
  "name": "python",
387
  "nbconvert_exporter": "python",
388
  "pygments_lexer": "ipython3",
389
- "version": "3.13.3"
390
  }
391
  },
392
  "nbformat": 4,
 
151
  },
152
  {
153
  "cell_type": "code",
154
+ "execution_count": null,
155
  "metadata": {},
156
  "outputs": [
157
  {
158
  "name": "stdout",
159
  "output_type": "stream",
160
  "text": [
161
+ "βœ“ OpenCV available\n",
162
+ "=== Comprehensive Benchmark with Tensor Difference Analysis ===\n",
163
+ "CUDA available: True\n",
164
+ "PyVIPS available: True\n",
165
+ "\n",
166
+ "================================================================================\n",
167
+ "Testing 1080p (1920x1080)\n",
168
+ "================================================================================\n",
169
+ "\n",
170
+ "Function Min (ms) Avg (ms) Speedup \n",
171
  "--------------------------------------------------\n",
172
+ "Original 16.3 16.7 1.00x \n",
173
+ "Optimized 8.9 9.4 1.77x \n",
174
+ "Ultra Fast 9.2 9.5 1.75x \n",
175
+ "\n",
176
+ "πŸ” TENSOR DIFFERENCE ANALYSIS\n",
177
+ "==================================================\n",
178
+ "\n",
179
+ "βœ“ Tiling match: (2, 4)\n",
180
+ "\n",
181
+ "--- Tensor Difference Analysis: Original vs Optimized ---\n",
182
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
183
+ "Max absolute difference: 1.208008\n",
184
+ "Mean absolute difference: 0.181336\n",
185
+ "Std of differences: 0.153313\n",
186
+ "Pixels with any difference: 98.44% (3797773/3857868)\n",
187
+ "\n",
188
+ "Tolerance analysis:\n",
189
+ " Within 1e-06: 1.56% (60095/3857868)\n",
190
+ " Within 1e-05: 1.56% (60095/3857868)\n",
191
+ " Within 1e-04: 1.56% (60095/3857868)\n",
192
+ " Within 1e-03: 1.56% (60095/3857868)\n",
193
+ " Within 1e-02: 4.68% (180528/3857868)\n",
194
+ " Within 1e-01: 36.82% (1420591/3857868)\n",
195
+ "❌ Tensors have significant differences\n",
196
+ "\n",
197
+ "Per-crop analysis (9 crops):\n",
198
+ " Crop 0: max=1.207520, mean=0.288220\n",
199
+ " Crop 1: max=1.160156, mean=0.167923\n",
200
+ " Crop 2: max=1.208008, mean=0.167772\n",
201
+ " Crop 3: max=1.208008, mean=0.168140\n",
202
+ " Crop 4: max=1.176270, mean=0.168022\n",
203
+ " ... and 4 more crops\n",
204
+ "\n",
205
+ "βœ“ Tiling match: (2, 4)\n",
206
+ "\n",
207
+ "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
208
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
209
+ "Max absolute difference: 1.208008\n",
210
+ "Mean absolute difference: 0.181336\n",
211
+ "Std of differences: 0.153313\n",
212
+ "Pixels with any difference: 98.44% (3797773/3857868)\n",
213
+ "\n",
214
+ "Tolerance analysis:\n",
215
+ " Within 1e-06: 1.56% (60095/3857868)\n",
216
+ " Within 1e-05: 1.56% (60095/3857868)\n",
217
+ " Within 1e-04: 1.56% (60095/3857868)\n",
218
+ " Within 1e-03: 1.56% (60095/3857868)\n",
219
+ " Within 1e-02: 4.68% (180528/3857868)\n",
220
+ " Within 1e-01: 36.82% (1420591/3857868)\n",
221
+ "❌ Tensors have significant differences\n",
222
+ "\n",
223
+ "Per-crop analysis (9 crops):\n",
224
+ " Crop 0: max=1.207520, mean=0.288220\n",
225
+ " Crop 1: max=1.160156, mean=0.167923\n",
226
+ " Crop 2: max=1.208008, mean=0.167772\n",
227
+ " Crop 3: max=1.208008, mean=0.168140\n",
228
+ " Crop 4: max=1.176270, mean=0.168022\n",
229
+ " ... and 4 more crops\n",
230
+ "\n",
231
+ "βœ“ Tiling match: (2, 4)\n",
232
+ "\n",
233
+ "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
234
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
235
+ "Max absolute difference: 0.000000\n",
236
+ "Mean absolute difference: 0.000000\n",
237
+ "Std of differences: 0.000000\n",
238
+ "Pixels with any difference: 0.00% (0/3857868)\n",
239
+ "\n",
240
+ "Tolerance analysis:\n",
241
+ " Within 1e-06: 100.00% (3857868/3857868)\n",
242
+ " Within 1e-05: 100.00% (3857868/3857868)\n",
243
+ " Within 1e-04: 100.00% (3857868/3857868)\n",
244
+ " Within 1e-03: 100.00% (3857868/3857868)\n",
245
+ " Within 1e-02: 100.00% (3857868/3857868)\n",
246
+ " Within 1e-01: 100.00% (3857868/3857868)\n",
247
+ "βœ… Tensors are essentially identical (max diff < 1e-5)\n",
248
+ "\n",
249
+ "Per-crop analysis (9 crops):\n",
250
+ " Crop 0: max=0.000000, mean=0.000000\n",
251
+ " Crop 1: max=0.000000, mean=0.000000\n",
252
+ " Crop 2: max=0.000000, mean=0.000000\n",
253
+ " Crop 3: max=0.000000, mean=0.000000\n",
254
+ " Crop 4: max=0.000000, mean=0.000000\n",
255
+ " ... and 4 more crops\n",
256
+ "\n",
257
+ "================================================================================\n",
258
+ "Testing 4K (3840x2160)\n",
259
+ "================================================================================\n",
260
+ "\n",
261
+ "Function Min (ms) Avg (ms) Speedup \n",
262
+ "--------------------------------------------------\n",
263
+ "Original 55.0 57.2 1.00x \n",
264
+ "Optimized 30.8 33.4 1.71x \n",
265
+ "Ultra Fast 32.3 36.5 1.57x \n",
266
+ "\n",
267
+ "πŸ” TENSOR DIFFERENCE ANALYSIS\n",
268
+ "==================================================\n",
269
+ "\n",
270
+ "βœ“ Tiling match: (2, 4)\n",
271
+ "\n",
272
+ "--- Tensor Difference Analysis: Original vs Optimized ---\n",
273
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
274
+ "Max absolute difference: 1.278320\n",
275
+ "Mean absolute difference: 0.280527\n",
276
+ "Std of differences: 0.198947\n",
277
+ "Pixels with any difference: 99.16% (3825385/3857868)\n",
278
+ "\n",
279
+ "Tolerance analysis:\n",
280
+ " Within 1e-06: 0.84% (32483/3857868)\n",
281
+ " Within 1e-05: 0.84% (32483/3857868)\n",
282
+ " Within 1e-04: 0.84% (32483/3857868)\n",
283
+ " Within 1e-03: 0.84% (32483/3857868)\n",
284
+ " Within 1e-02: 2.53% (97553/3857868)\n",
285
+ " Within 1e-01: 20.93% (807398/3857868)\n",
286
+ "❌ Tensors have significant differences\n",
287
+ "\n",
288
+ "Per-crop analysis (9 crops):\n",
289
+ " Crop 0: max=1.105957, mean=0.310640\n",
290
+ " Crop 1: max=1.262695, mean=0.276606\n",
291
+ " Crop 2: max=1.262695, mean=0.276472\n",
292
+ " Crop 3: max=1.278320, mean=0.276858\n",
293
+ " Crop 4: max=1.231934, mean=0.276985\n",
294
+ " ... and 4 more crops\n",
295
+ "\n",
296
+ "βœ“ Tiling match: (2, 4)\n",
297
+ "\n",
298
+ "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
299
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
300
+ "Max absolute difference: 1.278320\n",
301
+ "Mean absolute difference: 0.280527\n",
302
+ "Std of differences: 0.198947\n",
303
+ "Pixels with any difference: 99.16% (3825385/3857868)\n",
304
+ "\n",
305
+ "Tolerance analysis:\n",
306
+ " Within 1e-06: 0.84% (32483/3857868)\n",
307
+ " Within 1e-05: 0.84% (32483/3857868)\n",
308
+ " Within 1e-04: 0.84% (32483/3857868)\n",
309
+ " Within 1e-03: 0.84% (32483/3857868)\n",
310
+ " Within 1e-02: 2.53% (97553/3857868)\n",
311
+ " Within 1e-01: 20.93% (807398/3857868)\n",
312
+ "❌ Tensors have significant differences\n",
313
+ "\n",
314
+ "Per-crop analysis (9 crops):\n",
315
+ " Crop 0: max=1.105957, mean=0.310640\n",
316
+ " Crop 1: max=1.262695, mean=0.276606\n",
317
+ " Crop 2: max=1.262695, mean=0.276472\n",
318
+ " Crop 3: max=1.278320, mean=0.276858\n",
319
+ " Crop 4: max=1.231934, mean=0.276985\n",
320
+ " ... and 4 more crops\n",
321
+ "\n",
322
+ "βœ“ Tiling match: (2, 4)\n",
323
+ "\n",
324
+ "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
325
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
326
+ "Max absolute difference: 0.000000\n",
327
+ "Mean absolute difference: 0.000000\n",
328
+ "Std of differences: 0.000000\n",
329
+ "Pixels with any difference: 0.00% (0/3857868)\n",
330
+ "\n",
331
+ "Tolerance analysis:\n",
332
+ " Within 1e-06: 100.00% (3857868/3857868)\n",
333
+ " Within 1e-05: 100.00% (3857868/3857868)\n",
334
+ " Within 1e-04: 100.00% (3857868/3857868)\n",
335
+ " Within 1e-03: 100.00% (3857868/3857868)\n",
336
+ " Within 1e-02: 100.00% (3857868/3857868)\n",
337
+ " Within 1e-01: 100.00% (3857868/3857868)\n",
338
+ "βœ… Tensors are essentially identical (max diff < 1e-5)\n",
339
+ "\n",
340
+ "Per-crop analysis (9 crops):\n",
341
+ " Crop 0: max=0.000000, mean=0.000000\n",
342
+ " Crop 1: max=0.000000, mean=0.000000\n",
343
+ " Crop 2: max=0.000000, mean=0.000000\n",
344
+ " Crop 3: max=0.000000, mean=0.000000\n",
345
+ " Crop 4: max=0.000000, mean=0.000000\n",
346
+ " ... and 4 more crops\n",
347
+ "\n",
348
+ "πŸ’‘ Tip: Run with '--speed-only' flag for faster benchmarking without tensor analysis\n"
349
  ]
350
  }
351
  ],
352
+ "source": []
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": 1,
357
+ "metadata": {},
358
+ "outputs": [
359
+ {
360
+ "name": "stdout",
361
+ "output_type": "stream",
362
+ "text": [
363
+ "βœ“ OpenCV available\n",
364
+ "=== Comprehensive Benchmark with Tensor Difference Analysis ===\n",
365
+ "CUDA available: True\n",
366
+ "PyVIPS available: True\n",
367
+ "\n",
368
+ "================================================================================\n",
369
+ "Testing 1080p (1920x1080)\n",
370
+ "================================================================================\n",
371
+ "\n",
372
+ "Function Min (ms) Avg (ms) Speedup \n",
373
+ "--------------------------------------------------\n",
374
+ "Original 15.6 16.8 1.00x \n",
375
+ "Optimized 8.8 9.2 1.82x \n",
376
+ "Ultra Fast 9.4 9.6 1.76x \n",
377
+ "\n",
378
+ "πŸ” TENSOR DIFFERENCE ANALYSIS\n",
379
+ "==================================================\n",
380
+ "\n",
381
+ "βœ“ Tiling match: (2, 4)\n",
382
+ "\n",
383
+ "--- Tensor Difference Analysis: Original vs Optimized ---\n",
384
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
385
+ "Max absolute difference: 1.208008\n",
386
+ "Mean absolute difference: 0.181336\n",
387
+ "Std of differences: 0.153313\n",
388
+ "Pixels with any difference: 98.44% (3797773/3857868)\n",
389
+ "\n",
390
+ "Tolerance analysis:\n",
391
+ " Within 1e-06: 1.56% (60095/3857868)\n",
392
+ " Within 1e-05: 1.56% (60095/3857868)\n",
393
+ " Within 1e-04: 1.56% (60095/3857868)\n",
394
+ " Within 1e-03: 1.56% (60095/3857868)\n",
395
+ " Within 1e-02: 4.68% (180528/3857868)\n",
396
+ " Within 1e-01: 36.82% (1420591/3857868)\n",
397
+ "❌ Tensors have significant differences\n",
398
+ "\n",
399
+ "Per-crop analysis (9 crops):\n",
400
+ " Crop 0: max=1.207520, mean=0.288220\n",
401
+ " Crop 1: max=1.160156, mean=0.167923\n",
402
+ " Crop 2: max=1.208008, mean=0.167772\n",
403
+ " Crop 3: max=1.208008, mean=0.168140\n",
404
+ " Crop 4: max=1.176270, mean=0.168022\n",
405
+ " ... and 4 more crops\n",
406
+ "\n",
407
+ "βœ“ Tiling match: (2, 4)\n",
408
+ "\n",
409
+ "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
410
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
411
+ "Max absolute difference: 1.208008\n",
412
+ "Mean absolute difference: 0.181336\n",
413
+ "Std of differences: 0.153313\n",
414
+ "Pixels with any difference: 98.44% (3797773/3857868)\n",
415
+ "\n",
416
+ "Tolerance analysis:\n",
417
+ " Within 1e-06: 1.56% (60095/3857868)\n",
418
+ " Within 1e-05: 1.56% (60095/3857868)\n",
419
+ " Within 1e-04: 1.56% (60095/3857868)\n",
420
+ " Within 1e-03: 1.56% (60095/3857868)\n",
421
+ " Within 1e-02: 4.68% (180528/3857868)\n",
422
+ " Within 1e-01: 36.82% (1420591/3857868)\n",
423
+ "❌ Tensors have significant differences\n",
424
+ "\n",
425
+ "Per-crop analysis (9 crops):\n",
426
+ " Crop 0: max=1.207520, mean=0.288220\n",
427
+ " Crop 1: max=1.160156, mean=0.167923\n",
428
+ " Crop 2: max=1.208008, mean=0.167772\n",
429
+ " Crop 3: max=1.208008, mean=0.168140\n",
430
+ " Crop 4: max=1.176270, mean=0.168022\n",
431
+ " ... and 4 more crops\n",
432
+ "\n",
433
+ "βœ“ Tiling match: (2, 4)\n",
434
+ "\n",
435
+ "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
436
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
437
+ "Max absolute difference: 0.000000\n",
438
+ "Mean absolute difference: 0.000000\n",
439
+ "Std of differences: 0.000000\n",
440
+ "Pixels with any difference: 0.00% (0/3857868)\n",
441
+ "\n",
442
+ "Tolerance analysis:\n",
443
+ " Within 1e-06: 100.00% (3857868/3857868)\n",
444
+ " Within 1e-05: 100.00% (3857868/3857868)\n",
445
+ " Within 1e-04: 100.00% (3857868/3857868)\n",
446
+ " Within 1e-03: 100.00% (3857868/3857868)\n",
447
+ " Within 1e-02: 100.00% (3857868/3857868)\n",
448
+ " Within 1e-01: 100.00% (3857868/3857868)\n",
449
+ "βœ… Tensors are essentially identical (max diff < 1e-5)\n",
450
+ "\n",
451
+ "Per-crop analysis (9 crops):\n",
452
+ " Crop 0: max=0.000000, mean=0.000000\n",
453
+ " Crop 1: max=0.000000, mean=0.000000\n",
454
+ " Crop 2: max=0.000000, mean=0.000000\n",
455
+ " Crop 3: max=0.000000, mean=0.000000\n",
456
+ " Crop 4: max=0.000000, mean=0.000000\n",
457
+ " ... and 4 more crops\n",
458
+ "\n",
459
+ "================================================================================\n",
460
+ "Testing 4K (3840x2160)\n",
461
+ "================================================================================\n",
462
+ "\n",
463
+ "Function Min (ms) Avg (ms) Speedup \n",
464
+ "--------------------------------------------------\n",
465
+ "Original 46.9 51.5 1.00x \n",
466
+ "Optimized 34.3 35.6 1.45x \n",
467
+ "Ultra Fast 30.5 31.9 1.61x \n",
468
+ "\n",
469
+ "πŸ” TENSOR DIFFERENCE ANALYSIS\n",
470
+ "==================================================\n",
471
+ "\n",
472
+ "βœ“ Tiling match: (2, 4)\n",
473
+ "\n",
474
+ "--- Tensor Difference Analysis: Original vs Optimized ---\n",
475
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
476
+ "Max absolute difference: 1.278320\n",
477
+ "Mean absolute difference: 0.280527\n",
478
+ "Std of differences: 0.198947\n",
479
+ "Pixels with any difference: 99.16% (3825385/3857868)\n",
480
+ "\n",
481
+ "Tolerance analysis:\n",
482
+ " Within 1e-06: 0.84% (32483/3857868)\n",
483
+ " Within 1e-05: 0.84% (32483/3857868)\n",
484
+ " Within 1e-04: 0.84% (32483/3857868)\n",
485
+ " Within 1e-03: 0.84% (32483/3857868)\n",
486
+ " Within 1e-02: 2.53% (97553/3857868)\n",
487
+ " Within 1e-01: 20.93% (807398/3857868)\n",
488
+ "❌ Tensors have significant differences\n",
489
+ "\n",
490
+ "Per-crop analysis (9 crops):\n",
491
+ " Crop 0: max=1.105957, mean=0.310640\n",
492
+ " Crop 1: max=1.262695, mean=0.276606\n",
493
+ " Crop 2: max=1.262695, mean=0.276472\n",
494
+ " Crop 3: max=1.278320, mean=0.276858\n",
495
+ " Crop 4: max=1.231934, mean=0.276985\n",
496
+ " ... and 4 more crops\n",
497
+ "\n",
498
+ "βœ“ Tiling match: (2, 4)\n",
499
+ "\n",
500
+ "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
501
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
502
+ "Max absolute difference: 1.278320\n",
503
+ "Mean absolute difference: 0.280527\n",
504
+ "Std of differences: 0.198947\n",
505
+ "Pixels with any difference: 99.16% (3825385/3857868)\n",
506
+ "\n",
507
+ "Tolerance analysis:\n",
508
+ " Within 1e-06: 0.84% (32483/3857868)\n",
509
+ " Within 1e-05: 0.84% (32483/3857868)\n",
510
+ " Within 1e-04: 0.84% (32483/3857868)\n",
511
+ " Within 1e-03: 0.84% (32483/3857868)\n",
512
+ " Within 1e-02: 2.53% (97553/3857868)\n",
513
+ " Within 1e-01: 20.93% (807398/3857868)\n",
514
+ "❌ Tensors have significant differences\n",
515
+ "\n",
516
+ "Per-crop analysis (9 crops):\n",
517
+ " Crop 0: max=1.105957, mean=0.310640\n",
518
+ " Crop 1: max=1.262695, mean=0.276606\n",
519
+ " Crop 2: max=1.262695, mean=0.276472\n",
520
+ " Crop 3: max=1.278320, mean=0.276858\n",
521
+ " Crop 4: max=1.231934, mean=0.276985\n",
522
+ " ... and 4 more crops\n",
523
+ "\n",
524
+ "βœ“ Tiling match: (2, 4)\n",
525
+ "\n",
526
+ "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
527
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
528
+ "Max absolute difference: 0.000000\n",
529
+ "Mean absolute difference: 0.000000\n",
530
+ "Std of differences: 0.000000\n",
531
+ "Pixels with any difference: 0.00% (0/3857868)\n",
532
+ "\n",
533
+ "Tolerance analysis:\n",
534
+ " Within 1e-06: 100.00% (3857868/3857868)\n",
535
+ " Within 1e-05: 100.00% (3857868/3857868)\n",
536
+ " Within 1e-04: 100.00% (3857868/3857868)\n",
537
+ " Within 1e-03: 100.00% (3857868/3857868)\n",
538
+ " Within 1e-02: 100.00% (3857868/3857868)\n",
539
+ " Within 1e-01: 100.00% (3857868/3857868)\n",
540
+ "βœ… Tensors are essentially identical (max diff < 1e-5)\n",
541
+ "\n",
542
+ "Per-crop analysis (9 crops):\n",
543
+ " Crop 0: max=0.000000, mean=0.000000\n",
544
+ " Crop 1: max=0.000000, mean=0.000000\n",
545
+ " Crop 2: max=0.000000, mean=0.000000\n",
546
+ " Crop 3: max=0.000000, mean=0.000000\n",
547
+ " Crop 4: max=0.000000, mean=0.000000\n",
548
+ " ... and 4 more crops\n",
549
+ "\n",
550
+ "πŸ’‘ Tip: Run with '--speed-only' flag for faster benchmarking without tensor analysis\n"
551
+ ]
552
+ }
553
+ ],
554
+ "source": []
555
  },
556
  {
557
  "cell_type": "code",
 
577
  "name": "python",
578
  "nbconvert_exporter": "python",
579
  "pygments_lexer": "ipython3",
580
+ "version": "3.12.9"
581
  }
582
  },
583
  "nbformat": 4,
requirements.txt CHANGED
@@ -1 +1,4 @@
1
  torch==2.7.0+cu128 torchvision==0.22.0+cu128 torchaudio==2.7.0+cu128 --index-url https://download.pytorch.org/whl/cu128
 
 
 
 
1
  torch==2.7.0+cu128 torchvision==0.22.0+cu128 torchaudio==2.7.0+cu128 --index-url https://download.pytorch.org/whl/cu128
2
+
3
+
4
+ opencv-python