bounty committed on
Commit
9b2871c
Β·
·
1 Parent(s): 505474b
Files changed (5) hide show
  1. benchmark_results.png +3 -0
  2. moondream2/moondream.py +4 -5
  3. notes.ipynb +0 -61
  4. ollama.ipynb +392 -201
  5. requirements.txt +3 -0
benchmark_results.png ADDED

Git LFS Details

  • SHA256: 44a4a7477d9616a35b237654e6887258f38d88aeafc0b95ab114d108b8fc3e03
  • Pointer size: 130 Bytes
  • Size of remote file: 80.1 kB
moondream2/moondream.py CHANGED
@@ -11,8 +11,7 @@ from .config import MoondreamConfig
11
  from .image_crops import reconstruct_from_crops
12
  from .vision import vision_encoder, vision_projection, prepare_crops, build_vision_model
13
  from .text import build_text_model, text_encoder, lm_head, text_decoder
14
- from .region import decode_coordinate, encode_coordinate, decode_size, encode_size
15
- from .utils import remove_outlier_points
16
  import os
17
  from .rope import RotaryEmbedding
18
  TextSamplingSettings = TypedDict(
@@ -210,7 +209,7 @@ class MoondreamModel(nn.Module):
210
  )
211
  inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
212
  mask = self.attn_mask[:, :, 0 : inputs_embeds.size(1), :]
213
- pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.long, device=self.device)
214
  self._prefill(inputs_embeds, mask, pos_ids)
215
 
216
  return EncodedImage(
@@ -235,7 +234,7 @@ class MoondreamModel(nn.Module):
235
  prompt_emb = text_encoder(prompt_tokens, self.text)
236
  torch._dynamo.mark_dynamic(prompt_emb, 1)
237
  mask = self.attn_mask[:, :, pos : pos + prompt_emb.size(1), :]
238
- pos_ids = torch.arange(pos, pos + prompt_emb.size(1), dtype=torch.long, device=self.device)
239
  hidden = self._prefill(prompt_emb, mask, pos_ids)
240
  logits = lm_head(hidden, self.text)
241
 
@@ -259,7 +258,7 @@ class MoondreamModel(nn.Module):
259
  out = []
260
  mask = torch.zeros(1, 1, 2048, device=self.device, dtype=torch.bool)
261
  mask[:, :, :pos] = 1
262
- pos_ids = torch.tensor([pos], device=self.device, dtype=torch.long)
263
 
264
  with torch.inference_mode():
265
  while (
 
11
  from .image_crops import reconstruct_from_crops
12
  from .vision import vision_encoder, vision_projection, prepare_crops, build_vision_model
13
  from .text import build_text_model, text_encoder, lm_head, text_decoder
14
+ from .region import decode_coordinate, encode_coordinate
 
15
  import os
16
  from .rope import RotaryEmbedding
17
  TextSamplingSettings = TypedDict(
 
209
  )
210
  inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
211
  mask = self.attn_mask[:, :, 0 : inputs_embeds.size(1), :]
212
+ pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.int32, device=self.device)
213
  self._prefill(inputs_embeds, mask, pos_ids)
214
 
215
  return EncodedImage(
 
234
  prompt_emb = text_encoder(prompt_tokens, self.text)
235
  torch._dynamo.mark_dynamic(prompt_emb, 1)
236
  mask = self.attn_mask[:, :, pos : pos + prompt_emb.size(1), :]
237
+ pos_ids = torch.arange(pos, pos + prompt_emb.size(1), dtype=torch.int32, device=self.device)
238
  hidden = self._prefill(prompt_emb, mask, pos_ids)
239
  logits = lm_head(hidden, self.text)
240
 
 
258
  out = []
259
  mask = torch.zeros(1, 1, 2048, device=self.device, dtype=torch.bool)
260
  mask[:, :, :pos] = 1
261
+ pos_ids = torch.tensor([pos], device=self.device, dtype=torch.int32)
262
 
263
  with torch.inference_mode():
264
  while (
notes.ipynb CHANGED
@@ -29,72 +29,11 @@
29
  "\n"
30
  ]
31
  },
32
- {
33
- "cell_type": "code",
34
- "execution_count": 1,
35
- "metadata": {},
36
- "outputs": [
37
- {
38
- "name": "stderr",
39
- "output_type": "stream",
40
- "text": [
41
- "WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work\n",
42
- "W0612 18:34:05.382000 19960 Lib\\site-packages\\torch\\distributed\\elastic\\multiprocessing\\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.\n"
43
- ]
44
- }
45
- ],
46
- "source": [
47
- "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
48
- "from PIL import Image\n",
49
- "\n",
50
- "model = AutoModelForCausalLM.from_pretrained(\n",
51
- " \"vikhyatk/moondream2\",\n",
52
- " revision=\"2025-04-14\",\n",
53
- " trust_remote_code=True,\n",
54
- " # Uncomment to run on GPU.\n",
55
- " device_map={\"\": \"cuda\"}\n",
56
- ")"
57
- ]
58
- },
59
- {
60
- "cell_type": "code",
61
- "execution_count": 2,
62
- "metadata": {},
63
- "outputs": [
64
- {
65
- "name": "stdout",
66
- "output_type": "stream",
67
- "text": [
68
- "model size: 3680.163MB\n"
69
- ]
70
- }
71
- ],
72
- "source": [
73
- "param_size = 0\n",
74
- "for param in model.parameters():\n",
75
- " param_size += param.nelement() * param.element_size()\n",
76
- "buffer_size = 0\n",
77
- "for buffer in model.buffers():\n",
78
- " buffer_size += buffer.nelement() * buffer.element_size()\n",
79
- "\n",
80
- "size_all_mb = (param_size + buffer_size) / 1024**2\n",
81
- "print('model size: {:.3f}MB'.format(size_all_mb))"
82
- ]
83
- },
84
  {
85
  "cell_type": "code",
86
  "execution_count": 2,
87
  "metadata": {},
88
  "outputs": [],
89
- "source": [
90
- "import torch"
91
- ]
92
- },
93
- {
94
- "cell_type": "code",
95
- "execution_count": 3,
96
- "metadata": {},
97
- "outputs": [],
98
  "source": [
99
  "from PIL import Image\n",
100
  "with torch.inference_mode():\n",
 
29
  "\n"
30
  ]
31
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  {
33
  "cell_type": "code",
34
  "execution_count": 2,
35
  "metadata": {},
36
  "outputs": [],
 
 
 
 
 
 
 
 
 
37
  "source": [
38
  "from PIL import Image\n",
39
  "with torch.inference_mode():\n",
ollama.ipynb CHANGED
@@ -151,216 +151,407 @@
151
  },
152
  {
153
  "cell_type": "code",
154
- "execution_count": 2,
155
  "metadata": {},
156
  "outputs": [
157
  {
158
  "name": "stdout",
159
  "output_type": "stream",
160
  "text": [
161
- "Benchmarking with tensor shape: torch.Size([1, 730, 32, 64])\n",
162
- "Device: cuda:0\n",
163
- "Warmup iterations: 10\n",
164
- "Benchmark iterations: 100\n",
 
 
 
 
 
 
165
  "--------------------------------------------------\n",
166
- "Warming up regular rope...\n",
167
- "Benchmarking regular rope...\n",
168
- "Warming up fast rope...\n",
169
- "Benchmarking fast rope...\n",
170
- "\n",
171
- "============================================================\n",
172
- "BENCHMARK RESULTS\n",
173
- "============================================================\n",
174
- "\n",
175
- "Regular Rope:\n",
176
- " Mean: 0.338 ms\n",
177
- " Median: 0.335 ms\n",
178
- " Std: 0.009 ms\n",
179
- " Min: 0.330 ms\n",
180
- " Max: 0.385 ms\n",
181
- "\n",
182
- "Fast Rope (In-place):\n",
183
- " Mean: 0.267 ms\n",
184
- " Median: 0.265 ms\n",
185
- " Std: 0.005 ms\n",
186
- " Min: 0.261 ms\n",
187
- " Max: 0.285 ms\n",
188
- "\n",
189
- "Speedup: 1.27x\n",
190
- "Fast rope is 1.27x faster\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  ]
192
  }
193
  ],
194
- "source": [
195
- "import torch\n",
196
- "import time\n",
197
- "import statistics\n",
198
- "from typing import List, Tuple\n",
199
- "\n",
200
- "def benchmark_rope_functions(\n",
201
- " rope, \n",
202
- " fast_rope, \n",
203
- " tensor: torch.Tensor, \n",
204
- " num_warmup: int = 10,\n",
205
- " num_iterations: int = 100\n",
206
- ") -> Tuple[float, float, List[float], List[float]]:\n",
207
- " \"\"\"\n",
208
- " Benchmark two rope functions, accounting for in-place modification.\n",
209
- " \n",
210
- " Args:\n",
211
- " rope: Regular RotaryEmbedding instance\n",
212
- " fast_rope: RotaryEmbeddingInPlace instance\n",
213
- " tensor: Input tensor to benchmark with\n",
214
- " num_warmup: Number of warmup iterations\n",
215
- " num_iterations: Number of benchmark iterations\n",
216
- " \n",
217
- " Returns:\n",
218
- " Tuple of (regular_avg_time, fast_avg_time, regular_times, fast_times)\n",
219
- " \"\"\"\n",
220
- " \n",
221
- " # Ensure we're on the right device and in eval mode if applicable\n",
222
- " device = tensor.device\n",
223
- " \n",
224
- " # Pre-allocate tensor copies to avoid allocation overhead during timing\n",
225
- " tensor_copies = []\n",
226
- " for _ in range(num_warmup + num_iterations):\n",
227
- " tensor_copies.append(tensor.clone().detach())\n",
228
- " \n",
229
- " print(f\"Benchmarking with tensor shape: {tensor.shape}\")\n",
230
- " print(f\"Device: {device}\")\n",
231
- " print(f\"Warmup iterations: {num_warmup}\")\n",
232
- " print(f\"Benchmark iterations: {num_iterations}\")\n",
233
- " print(\"-\" * 50)\n",
234
- " \n",
235
- " # Warmup phase for regular rope\n",
236
- " print(\"Warming up regular rope...\")\n",
237
- " for i in range(num_warmup):\n",
238
- " _ = rope.apply(tensor)\n",
239
- " if device.type == 'cuda':\n",
240
- " torch.cuda.synchronize()\n",
241
- " \n",
242
- " # Benchmark regular rope\n",
243
- " print(\"Benchmarking regular rope...\")\n",
244
- " regular_times = []\n",
245
- " for i in range(num_iterations):\n",
246
- " if device.type == 'cuda':\n",
247
- " torch.cuda.synchronize()\n",
248
- " \n",
249
- " start_time = time.perf_counter()\n",
250
- " result = rope.apply(tensor)\n",
251
- " \n",
252
- " if device.type == 'cuda':\n",
253
- " torch.cuda.synchronize()\n",
254
- " \n",
255
- " end_time = time.perf_counter()\n",
256
- " regular_times.append((end_time - start_time) * 1000) # Convert to milliseconds\n",
257
- " \n",
258
- " # Warmup phase for fast rope (in-place)\n",
259
- " print(\"Warming up fast rope...\")\n",
260
- " for i in range(num_warmup):\n",
261
- " test_tensor = tensor_copies[i].clone() # Use a copy for warmup\n",
262
- " _ = fast_rope.apply(test_tensor)\n",
263
- " if device.type == 'cuda':\n",
264
- " torch.cuda.synchronize()\n",
265
- " \n",
266
- " # Benchmark fast rope (in-place)\n",
267
- " print(\"Benchmarking fast rope...\")\n",
268
- " fast_times = []\n",
269
- " copy_idx = num_warmup # Start from after warmup copies\n",
270
- " \n",
271
- " for i in range(num_iterations):\n",
272
- " # Use pre-allocated copy\n",
273
- " tensor_copy = tensor_copies[copy_idx + i]\n",
274
- " \n",
275
- " if device.type == 'cuda':\n",
276
- " torch.cuda.synchronize()\n",
277
- " \n",
278
- " # Time only the apply operation, not the copy\n",
279
- " start_time = time.perf_counter()\n",
280
- " result = fast_rope.apply(tensor_copy)\n",
281
- " \n",
282
- " if device.type == 'cuda':\n",
283
- " torch.cuda.synchronize()\n",
284
- " \n",
285
- " end_time = time.perf_counter()\n",
286
- " fast_times.append((end_time - start_time) * 1000) # Convert to milliseconds\n",
287
- " \n",
288
- " # Calculate statistics\n",
289
- " regular_avg = statistics.mean(regular_times)\n",
290
- " fast_avg = statistics.mean(fast_times)\n",
291
- " \n",
292
- " return regular_avg, fast_avg, regular_times, fast_times\n",
293
- "\n",
294
- "def print_benchmark_results(regular_avg: float, fast_avg: float, \n",
295
- " regular_times: List[float], fast_times: List[float]):\n",
296
- " \"\"\"Print detailed benchmark results.\"\"\"\n",
297
- " \n",
298
- " regular_median = statistics.median(regular_times)\n",
299
- " regular_std = statistics.stdev(regular_times) if len(regular_times) > 1 else 0\n",
300
- " regular_min = min(regular_times)\n",
301
- " regular_max = max(regular_times)\n",
302
- " \n",
303
- " fast_median = statistics.median(fast_times)\n",
304
- " fast_std = statistics.stdev(fast_times) if len(fast_times) > 1 else 0\n",
305
- " fast_min = min(fast_times)\n",
306
- " fast_max = max(fast_times)\n",
307
- " \n",
308
- " speedup = regular_avg / fast_avg if fast_avg > 0 else float('inf')\n",
309
- " \n",
310
- " print(\"\\n\" + \"=\" * 60)\n",
311
- " print(\"BENCHMARK RESULTS\")\n",
312
- " print(\"=\" * 60)\n",
313
- " \n",
314
- " print(f\"\\nRegular Rope:\")\n",
315
- " print(f\" Mean: {regular_avg:.3f} ms\")\n",
316
- " print(f\" Median: {regular_median:.3f} ms\")\n",
317
- " print(f\" Std: {regular_std:.3f} ms\")\n",
318
- " print(f\" Min: {regular_min:.3f} ms\")\n",
319
- " print(f\" Max: {regular_max:.3f} ms\")\n",
320
- " \n",
321
- " print(f\"\\nFast Rope (In-place):\")\n",
322
- " print(f\" Mean: {fast_avg:.3f} ms\")\n",
323
- " print(f\" Median: {fast_median:.3f} ms\")\n",
324
- " print(f\" Std: {fast_std:.3f} ms\")\n",
325
- " print(f\" Min: {fast_min:.3f} ms\")\n",
326
- " print(f\" Max: {fast_max:.3f} ms\")\n",
327
- " \n",
328
- " print(f\"\\nSpeedup: {speedup:.2f}x\")\n",
329
- " if speedup > 1:\n",
330
- " print(f\"Fast rope is {speedup:.2f}x faster\")\n",
331
- " else:\n",
332
- " print(f\"Regular rope is {1/speedup:.2f}x faster\")\n",
333
- "\n",
334
- "# Example usage\n",
335
- "def run_benchmark():\n",
336
- " \"\"\"\n",
337
- " Example of how to use the benchmark functions.\n",
338
- " Replace with your actual RotaryEmbedding classes.\n",
339
- " \"\"\"\n",
340
- " \n",
341
- " # Example parameters - adjust these to match your setup\n",
342
- " dim_per_head = 64\n",
343
- " n_heads = 32\n",
344
- " max_context = 2048\n",
345
- "\n",
346
- " freq_dim = dim_per_head // 2\n",
347
- "\n",
348
- " torch.manual_seed(42)\n",
349
- "\n",
350
- " \n",
351
- " # Create your rope instances\n",
352
- " rope = RotaryEmbedding(dim_per_head, max_context)\n",
353
- " fast_rope = RotaryEmbeddingInPlace(dim_per_head, max_context)\n",
354
- " \n",
355
- " # Create test tensor - adjust shape to match your use case\n",
356
- " tensor = torch.rand(1, 730, n_heads, dim_per_head, device=device)\n",
357
- "\n",
358
- " regular_avg, fast_avg, regular_times, fast_times = benchmark_rope_functions(rope, fast_rope, tensor)\n",
359
- " print_benchmark_results(regular_avg, fast_avg, regular_times, fast_times)\n",
360
- "\n",
361
- "if __name__ == \"__main__\":\n",
362
- " run_benchmark()"
363
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  },
365
  {
366
  "cell_type": "code",
@@ -386,7 +577,7 @@
386
  "name": "python",
387
  "nbconvert_exporter": "python",
388
  "pygments_lexer": "ipython3",
389
- "version": "3.13.3"
390
  }
391
  },
392
  "nbformat": 4,
 
151
  },
152
  {
153
  "cell_type": "code",
154
+ "execution_count": null,
155
  "metadata": {},
156
  "outputs": [
157
  {
158
  "name": "stdout",
159
  "output_type": "stream",
160
  "text": [
161
+ "βœ“ OpenCV available\n",
162
+ "=== Comprehensive Benchmark with Tensor Difference Analysis ===\n",
163
+ "CUDA available: True\n",
164
+ "PyVIPS available: True\n",
165
+ "\n",
166
+ "================================================================================\n",
167
+ "Testing 1080p (1920x1080)\n",
168
+ "================================================================================\n",
169
+ "\n",
170
+ "Function Min (ms) Avg (ms) Speedup \n",
171
  "--------------------------------------------------\n",
172
+ "Original 16.3 16.7 1.00x \n",
173
+ "Optimized 8.9 9.4 1.77x \n",
174
+ "Ultra Fast 9.2 9.5 1.75x \n",
175
+ "\n",
176
+ "πŸ” TENSOR DIFFERENCE ANALYSIS\n",
177
+ "==================================================\n",
178
+ "\n",
179
+ "βœ“ Tiling match: (2, 4)\n",
180
+ "\n",
181
+ "--- Tensor Difference Analysis: Original vs Optimized ---\n",
182
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
183
+ "Max absolute difference: 1.208008\n",
184
+ "Mean absolute difference: 0.181336\n",
185
+ "Std of differences: 0.153313\n",
186
+ "Pixels with any difference: 98.44% (3797773/3857868)\n",
187
+ "\n",
188
+ "Tolerance analysis:\n",
189
+ " Within 1e-06: 1.56% (60095/3857868)\n",
190
+ " Within 1e-05: 1.56% (60095/3857868)\n",
191
+ " Within 1e-04: 1.56% (60095/3857868)\n",
192
+ " Within 1e-03: 1.56% (60095/3857868)\n",
193
+ " Within 1e-02: 4.68% (180528/3857868)\n",
194
+ " Within 1e-01: 36.82% (1420591/3857868)\n",
195
+ "❌ Tensors have significant differences\n",
196
+ "\n",
197
+ "Per-crop analysis (9 crops):\n",
198
+ " Crop 0: max=1.207520, mean=0.288220\n",
199
+ " Crop 1: max=1.160156, mean=0.167923\n",
200
+ " Crop 2: max=1.208008, mean=0.167772\n",
201
+ " Crop 3: max=1.208008, mean=0.168140\n",
202
+ " Crop 4: max=1.176270, mean=0.168022\n",
203
+ " ... and 4 more crops\n",
204
+ "\n",
205
+ "βœ“ Tiling match: (2, 4)\n",
206
+ "\n",
207
+ "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
208
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
209
+ "Max absolute difference: 1.208008\n",
210
+ "Mean absolute difference: 0.181336\n",
211
+ "Std of differences: 0.153313\n",
212
+ "Pixels with any difference: 98.44% (3797773/3857868)\n",
213
+ "\n",
214
+ "Tolerance analysis:\n",
215
+ " Within 1e-06: 1.56% (60095/3857868)\n",
216
+ " Within 1e-05: 1.56% (60095/3857868)\n",
217
+ " Within 1e-04: 1.56% (60095/3857868)\n",
218
+ " Within 1e-03: 1.56% (60095/3857868)\n",
219
+ " Within 1e-02: 4.68% (180528/3857868)\n",
220
+ " Within 1e-01: 36.82% (1420591/3857868)\n",
221
+ "❌ Tensors have significant differences\n",
222
+ "\n",
223
+ "Per-crop analysis (9 crops):\n",
224
+ " Crop 0: max=1.207520, mean=0.288220\n",
225
+ " Crop 1: max=1.160156, mean=0.167923\n",
226
+ " Crop 2: max=1.208008, mean=0.167772\n",
227
+ " Crop 3: max=1.208008, mean=0.168140\n",
228
+ " Crop 4: max=1.176270, mean=0.168022\n",
229
+ " ... and 4 more crops\n",
230
+ "\n",
231
+ "βœ“ Tiling match: (2, 4)\n",
232
+ "\n",
233
+ "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
234
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
235
+ "Max absolute difference: 0.000000\n",
236
+ "Mean absolute difference: 0.000000\n",
237
+ "Std of differences: 0.000000\n",
238
+ "Pixels with any difference: 0.00% (0/3857868)\n",
239
+ "\n",
240
+ "Tolerance analysis:\n",
241
+ " Within 1e-06: 100.00% (3857868/3857868)\n",
242
+ " Within 1e-05: 100.00% (3857868/3857868)\n",
243
+ " Within 1e-04: 100.00% (3857868/3857868)\n",
244
+ " Within 1e-03: 100.00% (3857868/3857868)\n",
245
+ " Within 1e-02: 100.00% (3857868/3857868)\n",
246
+ " Within 1e-01: 100.00% (3857868/3857868)\n",
247
+ "βœ… Tensors are essentially identical (max diff < 1e-5)\n",
248
+ "\n",
249
+ "Per-crop analysis (9 crops):\n",
250
+ " Crop 0: max=0.000000, mean=0.000000\n",
251
+ " Crop 1: max=0.000000, mean=0.000000\n",
252
+ " Crop 2: max=0.000000, mean=0.000000\n",
253
+ " Crop 3: max=0.000000, mean=0.000000\n",
254
+ " Crop 4: max=0.000000, mean=0.000000\n",
255
+ " ... and 4 more crops\n",
256
+ "\n",
257
+ "================================================================================\n",
258
+ "Testing 4K (3840x2160)\n",
259
+ "================================================================================\n",
260
+ "\n",
261
+ "Function Min (ms) Avg (ms) Speedup \n",
262
+ "--------------------------------------------------\n",
263
+ "Original 55.0 57.2 1.00x \n",
264
+ "Optimized 30.8 33.4 1.71x \n",
265
+ "Ultra Fast 32.3 36.5 1.57x \n",
266
+ "\n",
267
+ "πŸ” TENSOR DIFFERENCE ANALYSIS\n",
268
+ "==================================================\n",
269
+ "\n",
270
+ "βœ“ Tiling match: (2, 4)\n",
271
+ "\n",
272
+ "--- Tensor Difference Analysis: Original vs Optimized ---\n",
273
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
274
+ "Max absolute difference: 1.278320\n",
275
+ "Mean absolute difference: 0.280527\n",
276
+ "Std of differences: 0.198947\n",
277
+ "Pixels with any difference: 99.16% (3825385/3857868)\n",
278
+ "\n",
279
+ "Tolerance analysis:\n",
280
+ " Within 1e-06: 0.84% (32483/3857868)\n",
281
+ " Within 1e-05: 0.84% (32483/3857868)\n",
282
+ " Within 1e-04: 0.84% (32483/3857868)\n",
283
+ " Within 1e-03: 0.84% (32483/3857868)\n",
284
+ " Within 1e-02: 2.53% (97553/3857868)\n",
285
+ " Within 1e-01: 20.93% (807398/3857868)\n",
286
+ "❌ Tensors have significant differences\n",
287
+ "\n",
288
+ "Per-crop analysis (9 crops):\n",
289
+ " Crop 0: max=1.105957, mean=0.310640\n",
290
+ " Crop 1: max=1.262695, mean=0.276606\n",
291
+ " Crop 2: max=1.262695, mean=0.276472\n",
292
+ " Crop 3: max=1.278320, mean=0.276858\n",
293
+ " Crop 4: max=1.231934, mean=0.276985\n",
294
+ " ... and 4 more crops\n",
295
+ "\n",
296
+ "βœ“ Tiling match: (2, 4)\n",
297
+ "\n",
298
+ "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
299
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
300
+ "Max absolute difference: 1.278320\n",
301
+ "Mean absolute difference: 0.280527\n",
302
+ "Std of differences: 0.198947\n",
303
+ "Pixels with any difference: 99.16% (3825385/3857868)\n",
304
+ "\n",
305
+ "Tolerance analysis:\n",
306
+ " Within 1e-06: 0.84% (32483/3857868)\n",
307
+ " Within 1e-05: 0.84% (32483/3857868)\n",
308
+ " Within 1e-04: 0.84% (32483/3857868)\n",
309
+ " Within 1e-03: 0.84% (32483/3857868)\n",
310
+ " Within 1e-02: 2.53% (97553/3857868)\n",
311
+ " Within 1e-01: 20.93% (807398/3857868)\n",
312
+ "❌ Tensors have significant differences\n",
313
+ "\n",
314
+ "Per-crop analysis (9 crops):\n",
315
+ " Crop 0: max=1.105957, mean=0.310640\n",
316
+ " Crop 1: max=1.262695, mean=0.276606\n",
317
+ " Crop 2: max=1.262695, mean=0.276472\n",
318
+ " Crop 3: max=1.278320, mean=0.276858\n",
319
+ " Crop 4: max=1.231934, mean=0.276985\n",
320
+ " ... and 4 more crops\n",
321
+ "\n",
322
+ "βœ“ Tiling match: (2, 4)\n",
323
+ "\n",
324
+ "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
325
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
326
+ "Max absolute difference: 0.000000\n",
327
+ "Mean absolute difference: 0.000000\n",
328
+ "Std of differences: 0.000000\n",
329
+ "Pixels with any difference: 0.00% (0/3857868)\n",
330
+ "\n",
331
+ "Tolerance analysis:\n",
332
+ " Within 1e-06: 100.00% (3857868/3857868)\n",
333
+ " Within 1e-05: 100.00% (3857868/3857868)\n",
334
+ " Within 1e-04: 100.00% (3857868/3857868)\n",
335
+ " Within 1e-03: 100.00% (3857868/3857868)\n",
336
+ " Within 1e-02: 100.00% (3857868/3857868)\n",
337
+ " Within 1e-01: 100.00% (3857868/3857868)\n",
338
+ "βœ… Tensors are essentially identical (max diff < 1e-5)\n",
339
+ "\n",
340
+ "Per-crop analysis (9 crops):\n",
341
+ " Crop 0: max=0.000000, mean=0.000000\n",
342
+ " Crop 1: max=0.000000, mean=0.000000\n",
343
+ " Crop 2: max=0.000000, mean=0.000000\n",
344
+ " Crop 3: max=0.000000, mean=0.000000\n",
345
+ " Crop 4: max=0.000000, mean=0.000000\n",
346
+ " ... and 4 more crops\n",
347
+ "\n",
348
+ "πŸ’‘ Tip: Run with '--speed-only' flag for faster benchmarking without tensor analysis\n"
349
  ]
350
  }
351
  ],
352
+ "source": []
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": 1,
357
+ "metadata": {},
358
+ "outputs": [
359
+ {
360
+ "name": "stdout",
361
+ "output_type": "stream",
362
+ "text": [
363
+ "βœ“ OpenCV available\n",
364
+ "=== Comprehensive Benchmark with Tensor Difference Analysis ===\n",
365
+ "CUDA available: True\n",
366
+ "PyVIPS available: True\n",
367
+ "\n",
368
+ "================================================================================\n",
369
+ "Testing 1080p (1920x1080)\n",
370
+ "================================================================================\n",
371
+ "\n",
372
+ "Function Min (ms) Avg (ms) Speedup \n",
373
+ "--------------------------------------------------\n",
374
+ "Original 15.6 16.8 1.00x \n",
375
+ "Optimized 8.8 9.2 1.82x \n",
376
+ "Ultra Fast 9.4 9.6 1.76x \n",
377
+ "\n",
378
+ "πŸ” TENSOR DIFFERENCE ANALYSIS\n",
379
+ "==================================================\n",
380
+ "\n",
381
+ "βœ“ Tiling match: (2, 4)\n",
382
+ "\n",
383
+ "--- Tensor Difference Analysis: Original vs Optimized ---\n",
384
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
385
+ "Max absolute difference: 1.208008\n",
386
+ "Mean absolute difference: 0.181336\n",
387
+ "Std of differences: 0.153313\n",
388
+ "Pixels with any difference: 98.44% (3797773/3857868)\n",
389
+ "\n",
390
+ "Tolerance analysis:\n",
391
+ " Within 1e-06: 1.56% (60095/3857868)\n",
392
+ " Within 1e-05: 1.56% (60095/3857868)\n",
393
+ " Within 1e-04: 1.56% (60095/3857868)\n",
394
+ " Within 1e-03: 1.56% (60095/3857868)\n",
395
+ " Within 1e-02: 4.68% (180528/3857868)\n",
396
+ " Within 1e-01: 36.82% (1420591/3857868)\n",
397
+ "❌ Tensors have significant differences\n",
398
+ "\n",
399
+ "Per-crop analysis (9 crops):\n",
400
+ " Crop 0: max=1.207520, mean=0.288220\n",
401
+ " Crop 1: max=1.160156, mean=0.167923\n",
402
+ " Crop 2: max=1.208008, mean=0.167772\n",
403
+ " Crop 3: max=1.208008, mean=0.168140\n",
404
+ " Crop 4: max=1.176270, mean=0.168022\n",
405
+ " ... and 4 more crops\n",
406
+ "\n",
407
+ "βœ“ Tiling match: (2, 4)\n",
408
+ "\n",
409
+ "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
410
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
411
+ "Max absolute difference: 1.208008\n",
412
+ "Mean absolute difference: 0.181336\n",
413
+ "Std of differences: 0.153313\n",
414
+ "Pixels with any difference: 98.44% (3797773/3857868)\n",
415
+ "\n",
416
+ "Tolerance analysis:\n",
417
+ " Within 1e-06: 1.56% (60095/3857868)\n",
418
+ " Within 1e-05: 1.56% (60095/3857868)\n",
419
+ " Within 1e-04: 1.56% (60095/3857868)\n",
420
+ " Within 1e-03: 1.56% (60095/3857868)\n",
421
+ " Within 1e-02: 4.68% (180528/3857868)\n",
422
+ " Within 1e-01: 36.82% (1420591/3857868)\n",
423
+ "❌ Tensors have significant differences\n",
424
+ "\n",
425
+ "Per-crop analysis (9 crops):\n",
426
+ " Crop 0: max=1.207520, mean=0.288220\n",
427
+ " Crop 1: max=1.160156, mean=0.167923\n",
428
+ " Crop 2: max=1.208008, mean=0.167772\n",
429
+ " Crop 3: max=1.208008, mean=0.168140\n",
430
+ " Crop 4: max=1.176270, mean=0.168022\n",
431
+ " ... and 4 more crops\n",
432
+ "\n",
433
+ "βœ“ Tiling match: (2, 4)\n",
434
+ "\n",
435
+ "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
436
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
437
+ "Max absolute difference: 0.000000\n",
438
+ "Mean absolute difference: 0.000000\n",
439
+ "Std of differences: 0.000000\n",
440
+ "Pixels with any difference: 0.00% (0/3857868)\n",
441
+ "\n",
442
+ "Tolerance analysis:\n",
443
+ " Within 1e-06: 100.00% (3857868/3857868)\n",
444
+ " Within 1e-05: 100.00% (3857868/3857868)\n",
445
+ " Within 1e-04: 100.00% (3857868/3857868)\n",
446
+ " Within 1e-03: 100.00% (3857868/3857868)\n",
447
+ " Within 1e-02: 100.00% (3857868/3857868)\n",
448
+ " Within 1e-01: 100.00% (3857868/3857868)\n",
449
+ "βœ… Tensors are essentially identical (max diff < 1e-5)\n",
450
+ "\n",
451
+ "Per-crop analysis (9 crops):\n",
452
+ " Crop 0: max=0.000000, mean=0.000000\n",
453
+ " Crop 1: max=0.000000, mean=0.000000\n",
454
+ " Crop 2: max=0.000000, mean=0.000000\n",
455
+ " Crop 3: max=0.000000, mean=0.000000\n",
456
+ " Crop 4: max=0.000000, mean=0.000000\n",
457
+ " ... and 4 more crops\n",
458
+ "\n",
459
+ "================================================================================\n",
460
+ "Testing 4K (3840x2160)\n",
461
+ "================================================================================\n",
462
+ "\n",
463
+ "Function Min (ms) Avg (ms) Speedup \n",
464
+ "--------------------------------------------------\n",
465
+ "Original 46.9 51.5 1.00x \n",
466
+ "Optimized 34.3 35.6 1.45x \n",
467
+ "Ultra Fast 30.5 31.9 1.61x \n",
468
+ "\n",
469
+ "πŸ” TENSOR DIFFERENCE ANALYSIS\n",
470
+ "==================================================\n",
471
+ "\n",
472
+ "βœ“ Tiling match: (2, 4)\n",
473
+ "\n",
474
+ "--- Tensor Difference Analysis: Original vs Optimized ---\n",
475
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
476
+ "Max absolute difference: 1.278320\n",
477
+ "Mean absolute difference: 0.280527\n",
478
+ "Std of differences: 0.198947\n",
479
+ "Pixels with any difference: 99.16% (3825385/3857868)\n",
480
+ "\n",
481
+ "Tolerance analysis:\n",
482
+ " Within 1e-06: 0.84% (32483/3857868)\n",
483
+ " Within 1e-05: 0.84% (32483/3857868)\n",
484
+ " Within 1e-04: 0.84% (32483/3857868)\n",
485
+ " Within 1e-03: 0.84% (32483/3857868)\n",
486
+ " Within 1e-02: 2.53% (97553/3857868)\n",
487
+ " Within 1e-01: 20.93% (807398/3857868)\n",
488
+ "❌ Tensors have significant differences\n",
489
+ "\n",
490
+ "Per-crop analysis (9 crops):\n",
491
+ " Crop 0: max=1.105957, mean=0.310640\n",
492
+ " Crop 1: max=1.262695, mean=0.276606\n",
493
+ " Crop 2: max=1.262695, mean=0.276472\n",
494
+ " Crop 3: max=1.278320, mean=0.276858\n",
495
+ " Crop 4: max=1.231934, mean=0.276985\n",
496
+ " ... and 4 more crops\n",
497
+ "\n",
498
+ "βœ“ Tiling match: (2, 4)\n",
499
+ "\n",
500
+ "--- Tensor Difference Analysis: Original vs Ultra Fast ---\n",
501
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
502
+ "Max absolute difference: 1.278320\n",
503
+ "Mean absolute difference: 0.280527\n",
504
+ "Std of differences: 0.198947\n",
505
+ "Pixels with any difference: 99.16% (3825385/3857868)\n",
506
+ "\n",
507
+ "Tolerance analysis:\n",
508
+ " Within 1e-06: 0.84% (32483/3857868)\n",
509
+ " Within 1e-05: 0.84% (32483/3857868)\n",
510
+ " Within 1e-04: 0.84% (32483/3857868)\n",
511
+ " Within 1e-03: 0.84% (32483/3857868)\n",
512
+ " Within 1e-02: 2.53% (97553/3857868)\n",
513
+ " Within 1e-01: 20.93% (807398/3857868)\n",
514
+ "❌ Tensors have significant differences\n",
515
+ "\n",
516
+ "Per-crop analysis (9 crops):\n",
517
+ " Crop 0: max=1.105957, mean=0.310640\n",
518
+ " Crop 1: max=1.262695, mean=0.276606\n",
519
+ " Crop 2: max=1.262695, mean=0.276472\n",
520
+ " Crop 3: max=1.278320, mean=0.276858\n",
521
+ " Crop 4: max=1.231934, mean=0.276985\n",
522
+ " ... and 4 more crops\n",
523
+ "\n",
524
+ "βœ“ Tiling match: (2, 4)\n",
525
+ "\n",
526
+ "--- Tensor Difference Analysis: Optimized vs Ultra Fast ---\n",
527
+ "βœ“ Shape match: torch.Size([9, 3, 378, 378])\n",
528
+ "Max absolute difference: 0.000000\n",
529
+ "Mean absolute difference: 0.000000\n",
530
+ "Std of differences: 0.000000\n",
531
+ "Pixels with any difference: 0.00% (0/3857868)\n",
532
+ "\n",
533
+ "Tolerance analysis:\n",
534
+ " Within 1e-06: 100.00% (3857868/3857868)\n",
535
+ " Within 1e-05: 100.00% (3857868/3857868)\n",
536
+ " Within 1e-04: 100.00% (3857868/3857868)\n",
537
+ " Within 1e-03: 100.00% (3857868/3857868)\n",
538
+ " Within 1e-02: 100.00% (3857868/3857868)\n",
539
+ " Within 1e-01: 100.00% (3857868/3857868)\n",
540
+ "βœ… Tensors are essentially identical (max diff < 1e-5)\n",
541
+ "\n",
542
+ "Per-crop analysis (9 crops):\n",
543
+ " Crop 0: max=0.000000, mean=0.000000\n",
544
+ " Crop 1: max=0.000000, mean=0.000000\n",
545
+ " Crop 2: max=0.000000, mean=0.000000\n",
546
+ " Crop 3: max=0.000000, mean=0.000000\n",
547
+ " Crop 4: max=0.000000, mean=0.000000\n",
548
+ " ... and 4 more crops\n",
549
+ "\n",
550
+ "πŸ’‘ Tip: Run with '--speed-only' flag for faster benchmarking without tensor analysis\n"
551
+ ]
552
+ }
553
+ ],
554
+ "source": []
555
  },
556
  {
557
  "cell_type": "code",
 
577
  "name": "python",
578
  "nbconvert_exporter": "python",
579
  "pygments_lexer": "ipython3",
580
+ "version": "3.12.9"
581
  }
582
  },
583
  "nbformat": 4,
requirements.txt CHANGED
@@ -1 +1,4 @@
1
  torch==2.7.0+cu128 torchvision==0.22.0+cu128 torchaudio==2.7.0+cu128 --index-url https://download.pytorch.org/whl/cu128
 
 
 
 
1
  torch==2.7.0+cu128 torchvision==0.22.0+cu128 torchaudio==2.7.0+cu128 --index-url https://download.pytorch.org/whl/cu128
2
+
3
+
4
+ opencv-python