johnmalek312 committed
Commit 10101c1 · Parent(s): 4b70204
moondream2/config.py CHANGED
@@ -8,7 +8,7 @@ class TextConfig:
     ff_dim: int = 8192
     n_layers: int = 24
     vocab_size: int = 51200
-    max_context: int = 1000
+    max_context: int = 2048
     n_heads: int = 32
     n_kv_heads: int = 32
     prefix_attn: int = 730
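Note: besides bounding generation length, `max_context` now sizes the cos/sin tables precomputed by the new `RotaryEmbedding` in rope.py (further down in this commit). A back-of-envelope sketch of that footprint, assuming the TextConfig values above and head_dim = dim // n_heads = 2048 // 32 = 64 (my arithmetic, not code from the repo):

```python
# RotaryEmbedding registers cos/sin buffers of shape (max_context, rot_dim // 2).
max_context, head_dim = 2048, 64
rot_dim = head_dim // 2                       # only half the head dim is rotated
entries_per_table = max_context * (rot_dim // 2)
total_bytes = 2 * entries_per_table * 4       # two float32 tables (cos and sin)
print(f"{total_bytes / 1024:.0f} KiB")        # ~256 KiB, so doubling max_context is cheap here
```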
moondream2/hf_moondream.py DELETED
@@ -1,132 +0,0 @@
-from transformers import PreTrainedModel, PretrainedConfig
-
-from .config import MoondreamConfig
-from .moondream import MoondreamModel
-
-# Files sometimes don't get loaded without these...
-from .image_crops import *
-from .vision import *
-from .text import *
-from .region import *
-from .utils import *
-
-
-def extract_question(text):
-    prefix = "<image>\n\nQuestion: "
-    suffix = "\n\nAnswer:"
-
-    if text.startswith(prefix) and text.endswith(suffix):
-        return text[len(prefix) : -len(suffix)]
-    else:
-        return None
-
-
-class HfConfig(PretrainedConfig):
-    _auto_class = "AutoConfig"
-    model_type = "moondream1"
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.config = {}
-
-
-class HfMoondream(PreTrainedModel):
-    _supports_gradient_checkpointing = True
-    _auto_class = "AutoModelForCausalLM"
-    config_class = HfConfig
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.model = MoondreamModel(
-            MoondreamConfig.from_dict(config.config), setup_caches=False
-        )
-        self.model._setup_caches()
-
-    @property
-    def encode_image(self):
-        return self.model.encode_image
-
-    @property
-    def query(self):
-        return self.model.query
-
-    @property
-    def caption(self):
-        return self.model.caption
-
-    @property
-    def detect(self):
-        return self.model.detect
-
-    @property
-    def point(self):
-        return self.model.point
-
-    @property
-    def detect_gaze(self):
-        return self.model.detect_gaze
-
-    def answer_question(
-        self,
-        image_embeds,
-        question,
-        tokenizer=None,
-        chat_history="",
-        result_queue=None,
-        max_new_tokens=256,
-        **kwargs
-    ):
-        answer = self.query(image_embeds, question)["answer"].strip()
-
-        if result_queue is not None:
-            result_queue.put(answer)
-        return answer
-
-    def batch_answer(self, images, prompts, tokenizer=None, **kwargs):
-        answers = []
-        for image, prompt in zip(images, prompts):
-            answers.append(self.query(image, prompt)["answer"].strip())
-        return answers
-
-    def _unsupported_exception(self):
-        raise NotImplementedError(
-            "This method is not supported in the latest version of moondream. "
-            "Consider upgrading to the updated API spec, or alternately pin "
-            "to 'revision=2024-08-26'."
-        )
-
-    def generate(self, image_embeds, prompt, tokenizer, max_new_tokens=128, **kwargs):
-        """
-        Function definition remains unchanged for backwards compatibility.
-        Be aware that tokenizer, max_new_takens, and kwargs are ignored.
-        """
-        prompt_extracted = extract_question(prompt)
-        if prompt_extracted is not None:
-            answer = self.model.query(
-                image=image_embeds, question=prompt_extracted, stream=False
-            )["answer"]
-        else:
-            image_embeds = self.encode_image(image_embeds)
-            prompt_tokens = torch.tensor(
-                [self.model.tokenizer.encode(prompt).ids],
-                device=self.device,
-            )
-
-            def generator():
-                for token in self.model._generate_text(
-                    prompt_tokens,
-                    image_embeds.kv_cache,
-                    image_embeds.pos,
-                    max_new_tokens,
-                ):
-                    yield token
-
-            answer = "".join(list(generator()))
-
-        return [answer]
-
-    def get_input_embeddings(self):
-        return super().get_input_embeddings()
-
-    def input_embeds(self, *args, **kwargs):
-        self._unsupported_exception()
moondream2/layers.py CHANGED
@@ -16,7 +16,7 @@ class LinearWeights:


 def linear(x: torch.Tensor, w: LinearWeights) -> torch.Tensor:
-    return F.linear(x, w.weight, w.bias)
+    return F.linear(x, w.weight(), w.bias())


 @dataclass
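This accessor change is what dynamic quantization requires: `quantize_dynamic` (used in notes.ipynb below) swaps each `nn.Linear` for a dynamically quantized Linear, on which `weight` and `bias` are methods rather than attributes. A minimal standalone sketch of that behavior (my example, not repo code):

```python
import torch
import torch.nn as nn
from torch.quantization import quantize_dynamic

# Quantize a plain Linear; the wrapper replaces it with a quantized module.
qlin = quantize_dynamic(nn.Sequential(nn.Linear(4, 4)), {nn.Linear}, dtype=torch.qint8)[0]
print(qlin)           # DynamicQuantizedLinear(in_features=4, out_features=4, ...)
print(qlin.weight())  # weight is a method here; returns the packed qint8 tensor
```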
moondream2/moondream.py CHANGED
@@ -14,7 +14,7 @@ from .text import build_text_model, text_encoder, lm_head, text_decoder
 from .region import decode_coordinate, encode_coordinate, decode_size, encode_size
 from .utils import remove_outlier_points
 import os
-from torchtune.modules import RotaryPositionalEmbeddings
+from .rope import RotaryEmbedding
 TextSamplingSettings = TypedDict(
     "TextSamplingSettings",
     {
@@ -40,26 +40,6 @@ DEFAULT_MAX_OBJECTS = 50
 @dataclass(frozen=True)
 class EncodedImage:
     pos: int
-    caches: List[Tuple[torch.Tensor, torch.Tensor]]
-
-
-class KVCache(nn.Module):
-
-    def __init__(self, n_heads, n_kv_heads, max_context, dim, device, dtype):
-        super().__init__()
-        cache_shape = (1, n_kv_heads, max_context, dim // n_heads)
-        self.register_buffer(
-            "k_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
-        )
-        self.register_buffer(
-            "v_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
-        )
-
-    def update(self, pos_ids, k, v):
-        kout, vout = self.k_cache, self.v_cache
-        kout[:, :, pos_ids, :] = k
-        vout[:, :, pos_ids, :] = v
-        return kout, vout


 class MoondreamModel(nn.Module):
@@ -70,11 +50,7 @@ class MoondreamModel(nn.Module):
         self.tokenizer = Tokenizer.from_file(os.path.join(current_dir, "tokenizer.json"))
         self.vision = build_vision_model(config.vision, dtype)
         self.text = build_text_model(config.text, dtype)
-
-        self.rotary_emb = RotaryPositionalEmbeddings(
-            config.text.dim // config.text.n_heads,
-            config.text.max_context
-        )
+        self.rope = RotaryEmbedding(config.text.dim // config.text.n_heads, config.text.max_context)

         # Region Model
         self.region = nn.ModuleDict(
@@ -128,21 +104,6 @@ class MoondreamModel(nn.Module):
         attn_mask[..., :prefix_attn_len, :prefix_attn_len] = 1
         self.register_buffer("attn_mask", attn_mask, persistent=False)

-        # Initialize KV caches.
-        if setup_caches:
-            self._setup_caches()
-
-    def _setup_caches(self):
-        c = self.config.text
-        for b in self.text.blocks:
-            b.kv_cache = KVCache(
-                c.n_heads,
-                c.n_kv_heads,
-                c.max_context,
-                c.dim,
-                device=self.device,
-                dtype=self.vision.pos_emb.dtype,
-            )
     @property
     def device(self):
         return self.vision.pos_emb.device
@@ -154,12 +115,12 @@ class MoondreamModel(nn.Module):
         return vision_projection(g, r, self.vision, self.config.vision)

     def _prefill(self, x: torch.Tensor, attn_mask: torch.Tensor, pos_ids: torch.Tensor):
-        return text_decoder(x, self.text, attn_mask, pos_ids, self.config.text, self.rotary_emb)
+        return text_decoder(x, self.text, attn_mask, self.config.text, self.rope)

     def _decode_one_tok(
         self, x: torch.Tensor, attn_mask: torch.Tensor, pos_ids: torch.Tensor
     ):
-        hidden = text_decoder(x, self.text, attn_mask, pos_ids, self.config.text, self.rotary_emb)
+        hidden = text_decoder(x, self.text, attn_mask, self.config.text, self.rope)
         logits = lm_head(hidden, self.text)
         return logits, hidden

@@ -217,14 +178,7 @@ class MoondreamModel(nn.Module):
         self._prefill(inputs_embeds, mask, pos_ids)

         return EncodedImage(
-            pos=inputs_embeds.size(1),
-            caches=[
-                (
-                    b.kv_cache.k_cache[:, :, : inputs_embeds.size(1), :].clone(),
-                    b.kv_cache.v_cache[:, :, : inputs_embeds.size(1), :].clone(),
-                )
-                for b in self.text.blocks
-            ],
+            pos=inputs_embeds.size(1)
         )

     def _apply_top_p(self, probs: torch.Tensor, top_p: float):
@@ -345,11 +299,6 @@ class MoondreamModel(nn.Module):

         return generator(next_token, pos)

-    def load_encoded_image(self, encoded_image: EncodedImage):
-        for b, (k, v) in zip(self.text.blocks, encoded_image.caches):
-            b.kv_cache.k_cache[:, :, : k.size(2), :] = k
-            b.kv_cache.v_cache[:, :, : v.size(2), :] = v
-
     def _generate_points(
         self,
         hidden: torch.Tensor,
@@ -441,7 +390,6 @@ class MoondreamModel(nn.Module):
             raise NotImplementedError("Model does not support pointing.")

         image = self.encode_image(image)
-        self.load_encoded_image(image)

         prompt_tokens = torch.tensor(
             [
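The deleted `KVCache` allocated two `(1, n_kv_heads, max_context, head_dim)` buffers per text block up front, which is a fixed chunk of memory on the ~4 GiB GPU that OOMs in this commit's notebook. A sketch of the arithmetic at the new `max_context` of 2048, using the TextConfig values (my illustration, not code from the repo):

```python
# Static KV-cache footprint the old code would allocate at fp16:
n_layers, n_kv_heads, max_context, head_dim = 24, 32, 2048, 64
bytes_per_elem = 2  # float16
total = n_layers * 2 * n_kv_heads * max_context * head_dim * bytes_per_elem
print(f"{total / 1024**2:.0f} MiB")  # ~384 MiB for k_cache + v_cache across all blocks
```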
moondream2/rope.py CHANGED
@@ -1,89 +1,53 @@
-# Ethically sourced from https://github.com/xjdr-alt/entropix
-
 import torch
-import time
-
-def precompute_freqs_cis(
-    dim: int,
-    end: int,
-    theta: float = 10000.0,
-    use_scaled: bool = False,
-    dtype: torch.dtype = torch.float32,
-) -> torch.Tensor:
-    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=dtype)[: (dim // 2)] / dim))
-    t = torch.arange(end, dtype=dtype).unsqueeze(1)
-    freqs = t * freqs.unsqueeze(0)
-    freqs = torch.exp(1j * freqs)
-    return torch.stack([freqs.real, freqs.imag], dim=-1)
-
-def func1(x):
-    #print(x)
-    pass
-
-def func2(x):
-    #print(x)
-    pass
-
-def func3(x):
-    #print(x)
-    pass
-
-def func4(x):
-    #print(x)
-    pass
-
-def func5(x):
-    #print(x)
-    pass
-
-def func6(x):
-    #print(x)
-    pass
-
-def func7(x):
-    #print(x)
-    pass
-
-def func8(x):
-    #print(x)
-    pass
-
-def func9(x):
-    #print(x)
-    pass
-
-def func10(x):
-    #print(x)
-    pass
-
-def func11(x):
-    #print(x)
-    pass
-
-def apply_rotary_emb(
-    x: torch.Tensor,
-    position_ids: torch.Tensor,
-    num_heads: int,
-    rot_dim: int = 32,
-    interleave: bool = False,
-) -> torch.Tensor:
-    assert rot_dim == freqs_cis.shape[-2] * 2
-    assert num_heads == x.shape[1]
-    x_rot, x_pass = x[..., :rot_dim], x[..., rot_dim:]
-
-    d_q = x_rot.shape[-1] // 2
-    xq_r, xq_i = x_rot[..., :d_q], x_rot[..., d_q:]
-
-    # Get the cosine component from freqs_cis
-    cos_component = freqs_cis[..., 0]
-    # Index with position_ids
-    cos_indexed = cos_component[position_ids, :]
-    # Add two dimensions at the beginning
-    freqs_cos = cos_indexed.unsqueeze(0).unsqueeze(0)
-    freqs_sin = freqs_cis[..., 1][position_ids, :].unsqueeze(0).unsqueeze(0)
-
-    # Complex multiplication: (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
-    xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin
-    xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos
-    xq_out = torch.stack((xq_out_r, xq_out_i), dim=-1).flatten(-2)
-    return torch.cat([xq_out.to(x.dtype), x_pass], dim=-1)
+import torch.nn as nn
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+class RotaryEmbedding(nn.Module):
+    def __init__(self, head_dim: int, max_seq_len: int, theta: float = 10000.0):
+        super().__init__()
+        # Match RotaryEmbedding exactly
+        self.rot_dim = head_dim // 2  # Only half of head_dim is rotated
+
+        # Frequency calculation - match RotaryEmbedding exactly
+        freqs = 1.0 / (theta ** (torch.arange(0, self.rot_dim, 2).float() / self.rot_dim))
+        t = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
+        freqs = t * freqs.unsqueeze(0)
+
+        freqs_cis = torch.exp(1j * freqs)
+        cos_vals = freqs_cis.real
+        sin_vals = freqs_cis.imag
+
+        self.register_buffer('cos_cache', cos_vals, persistent=False)
+        self.register_buffer('sin_cache', sin_vals, persistent=False)
+
+    def apply(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        WARNING: This modifies the input tensor in-place for maximum speed!
+        If you need the original tensor, make a copy before calling this.
+
+        Must match RotaryEmbedding output exactly.
+        """
+        seq_len = x.shape[1]
+        d = self.rot_dim // 2
+
+        # Get cos/sin with same broadcasting as RotaryEmbedding
+        cos = self.cos_cache[:seq_len].unsqueeze(0).unsqueeze(2)
+        sin = self.sin_cache[:seq_len].unsqueeze(0).unsqueeze(2)
+
+        # Split rotated part into real/imaginary components
+        xq_r = x[..., :d]     # First half of rot_dim
+        xq_i = x[..., d:d*2]  # Second half of rot_dim
+
+        # Apply rotation
+        xq_out_r = xq_r * cos - xq_i * sin
+        xq_out_i = xq_r * sin + xq_i * cos
+
+        # Vectorized interleaving using torch.stack and view
+        # Stack creates [..., d, 2] then view as [..., d*2]
+        x[..., :self.rot_dim] = torch.stack([xq_out_r, xq_out_i], dim=-1).view(*x.shape[:-1], self.rot_dim)
+
+        # x_pass part (x[..., self.rot_dim:]) remains unchanged automatically
+
+        return x
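A quick usage sketch of the new module (mine, not from the repo); the `(batch, seq, heads, head_dim)` layout is inferred from the `rope.apply(q.permute(0, 2, 1, 3))` calls in text.py below, and `apply()` mutates its argument in place:

```python
import torch
from moondream2.rope import RotaryEmbedding

rope = RotaryEmbedding(head_dim=64, max_seq_len=2048)
q = torch.rand(1, 730, 32, 64)   # (batch, seq_len, n_heads, head_dim)
q_rot = rope.apply(q.clone())    # clone() because apply() writes into its input
assert q_rot.shape == (1, 730, 32, 64)
# Only the first rot_dim (= head_dim // 2 = 32) channels are rotated;
# the rest pass through untouched.
assert torch.equal(q_rot[..., 32:], q[..., 32:])
```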
moondream2/text.py CHANGED
@@ -1,12 +1,15 @@
 import torch
 import torch.nn as nn
-
+from typing import TYPE_CHECKING
 from torch.nn import functional as F

 from .layers import layer_norm, mlp
-from .rope import apply_rotary_emb, precompute_freqs_cis
 from .config import TextConfig

+# type checking imports if typechecking
+if TYPE_CHECKING:
+    from .rope import RotaryEmbedding
+

 def text_encoder(input_ids: torch.Tensor, w: nn.Module):
     return F.embedding(input_ids, w.wte)
@@ -15,12 +18,9 @@ def text_encoder(input_ids: torch.Tensor, w: nn.Module):
 def attn(
     x: torch.Tensor,
     w: nn.Module,
-    kv_cache: nn.Module,
     attn_mask: torch.Tensor,
     n_heads: int,
-    position_ids: torch.Tensor,
-    rotary_emb: nn.Module,
-    do_apply_rotary_emb: bool = True,
+    rope: "RotaryEmbedding"
 ):
     bsz, q_len, d_model = x.shape
     head_dim = d_model // n_heads
@@ -37,11 +37,8 @@ def attn(
     # 3. Unpack/Split along the first dimension (which now separates Q, K, V)
     q, k, v = qkv_permuted[0], qkv_permuted[1], qkv_permuted[2]

-    q = rotary_emb(q.permute(0, 2, 1, 3))
-    k = rotary_emb(k.permute(0, 2, 1, 3))
-
-    if kv_cache is not None:
-        k, v = kv_cache.update(position_ids, k, v)
+    q = rope.apply(q.permute(0, 2, 1, 3))
+    k = rope.apply(k.permute(0, 2, 1, 3))

     out = F.scaled_dot_product_attention(
         q, k, v, attn_mask=attn_mask
@@ -55,9 +52,8 @@ def text_decoder(
     x: torch.Tensor,
     w: nn.Module,
     attn_mask: torch.Tensor,
-    position_ids: torch.Tensor,
     config: TextConfig,
-    rotary_emb: nn.Module
+    rope: "RotaryEmbedding"

 ):
     for i, block in enumerate(w.blocks):
@@ -65,11 +61,9 @@ def text_decoder(
         l_attn = attn(
             l_in,
             block.attn,
-            kv_cache=block.kv_cache,
             attn_mask=attn_mask,
             n_heads=config.n_heads,
-            rotary_emb=rotary_emb,
-            position_ids=position_ids,
+            rope=rope,
         )
         l_mlp = mlp(l_in, block.mlp)
         x = x + l_attn + l_mlp
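The `TYPE_CHECKING` guard keeps `RotaryEmbedding` visible to type checkers while keeping the import off the runtime path (commonly done to break import cycles or trim startup cost): `TYPE_CHECKING` is `False` at runtime and the quoted annotation is never evaluated. A self-contained sketch of the idiom, with `Fraction` standing in for the repo's `RotaryEmbedding`:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Executed only by static analyzers, never at runtime.
    from fractions import Fraction  # stand-in for `from .rope import RotaryEmbedding`

def halve(x: "Fraction") -> "Fraction":
    # Quoted annotations are stored as strings, so this runs without the import.
    return x / 2

print(halve.__annotations__)  # {'x': 'Fraction', 'return': 'Fraction'}
```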
moondream2/vision.py CHANGED
@@ -34,9 +34,10 @@ def prepare_crops(
     )
     all_crops = overlap_crops["crops"]
     all_crops = np.transpose(all_crops, (0, 3, 1, 2))
+    all_crops = torch.from_numpy(all_crops).to(dtype=torch.float16)
     all_crops = (
-        torch.from_numpy(all_crops)
-        .to(device=device, dtype=torch.float16)
+        all_crops
+        .to(device=device)
         .div_(255.0)
         .sub_(0.5)
         .div_(0.5)
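Functionally the chain is unchanged: pixels are cast to float16 (now on the host, before the device copy) and the in-place `div_/sub_/div_` maps 8-bit values into [-1, 1]. A quick check of that normalization (illustrative only):

```python
import torch

px = torch.tensor([0.0, 127.5, 255.0], dtype=torch.float16)
print(px.div_(255.0).sub_(0.5).div_(0.5))  # tensor([-1., 0., 1.], dtype=torch.float16)
```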
notes.ipynb CHANGED
@@ -8,7 +8,7 @@
   {
    "data": {
     "text/plain": [
-     "True"
+     "20"
     ]
    },
    "execution_count": 2,
@@ -18,7 +18,17 @@
   ],
   "source": [
    "import torch\n",
-   "torch.cuda.is_available()"
+   "import gc\n",
+   "# ... your code that uses GPU tensors ...\n",
+   "# For example:\n",
+   "# x = torch.randn(1000, 1000, device='cuda')\n",
+   "# del x  # or x = None\n",
+   "\n",
+   "# This is the key command:\n",
+   "torch.cuda.empty_cache()\n",
+   "\n",
+   "# Optionally, run Python's garbage collector too\n",
+   "gc.collect()"
   ]
  },
 {
@@ -54,28 +64,30 @@
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
-  "outputs": [
-   {
-    "ename": "",
-    "evalue": "",
-    "output_type": "error",
-    "traceback": [
-     "Running cells with 'venv12 (Python 3.12.10)' requires the ipykernel package.\n",
-     "Run the following command to install 'ipykernel' into the Python environment. \n",
-     "Command: '/home/pixel/Desktop/moondream/venv12/bin/python3.12 -m pip install ipykernel -U --force-reinstall'"
-    ]
-   }
-  ],
+  "outputs": [],
   "source": [
    "import torch"
   ]
  },
 {
   "cell_type": "code",
-  "execution_count": 2,
+  "execution_count": 1,
   "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "name": "stderr",
+    "output_type": "stream",
+    "text": [
+     "/home/pixel/Desktop/moondream/venv/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+     "  from .autonotebook import tqdm as notebook_tqdm\n"
+    ]
+   }
+  ],
   "source": [
+   "# auto reload jupyter notebook\n",
+   "%load_ext autoreload\n",
+   "%autoreload 2\n",
+   "\n",
    "import os\n",
    "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
    "\n",
@@ -83,51 +95,263 @@
   "from moondream2.config import MoondreamConfig\n",
   "from moondream2.moondream import MoondreamModel\n",
   "import torch.profiler\n",
-   "\n",
-   "config = MoondreamConfig()\n",
-   "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
-   "model = MoondreamModel(config, setup_caches=False)\n",
-   "from safetensors.torch import load_model\n",
-   "weights_path = \"moondream2/model2.safetensors\"  # Path to your local weights file\n",
-   "state_dict = load_model(model, weights_path)\n",
-   "model._setup_caches()"
+   "with torch.inference_mode():\n",
+   "    config = MoondreamConfig()\n",
+   "    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+   "    model = MoondreamModel(config, setup_caches=False)\n",
+   "    from safetensors.torch import load_model\n",
+   "    weights_path = \"moondream2/model.safetensors\"  # Path to your local weights file\n",
+   "    state_dict = load_model(model, weights_path)\n"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": 2,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "import torch.nn as nn\n",
+   "from torch.quantization import quantize_dynamic\n",
+   "\n",
+   "model_quantized = quantize_dynamic(\n",
+   "    model,\n",
+   "    {nn.Linear},  # Only quantize these layer types\n",
+   "    dtype=torch.qint8\n",
+   ")\n"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 6,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "real = model.get_submodule(\"vision\").proj_mlp.get_submodule(\"fc1\")"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 8,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "tensor([[ 0.0198,  0.0356, -0.0158,  ...,  0.0119,  0.0487,  0.0448],\n",
+      "        [ 0.0198,  0.0277,  0.0725,  ...,  0.0079,  0.0343,  0.0435],\n",
+      "        [-0.0422,  0.0356, -0.0263,  ..., -0.0145, -0.0250,  0.0184],\n",
+      "        ...,\n",
+      "        [-0.0356, -0.0474, -0.0237,  ..., -0.0198,  0.0277,  0.0263],\n",
+      "        [ 0.0013,  0.0263,  0.0395,  ..., -0.0422,  0.0329, -0.0316],\n",
+      "        [ 0.0303, -0.0527, -0.0356,  ...,  0.0382, -0.0171, -0.0171]],\n",
+      "       size=(8192, 2304), dtype=torch.qint8,\n",
+      "       quantization_scheme=torch.per_tensor_affine, scale=0.0013174019986763597,\n",
+      "       zero_point=0)"
+     ]
+    },
+    "execution_count": 8,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "real."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 9,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "vision = model_quantized.get_submodule(\"vision\")"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 10,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "fcc = vision.proj_mlp.get_submodule(\"fc1\")"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "tensor([[ 0.0198,  0.0356, -0.0158,  ...,  0.0119,  0.0487,  0.0448],\n",
+      "        [ 0.0198,  0.0277,  0.0725,  ...,  0.0079,  0.0343,  0.0435],\n",
+      "        [-0.0422,  0.0356, -0.0263,  ..., -0.0145, -0.0250,  0.0184],\n",
+      "        ...,\n",
+      "        [-0.0356, -0.0474, -0.0237,  ..., -0.0198,  0.0277,  0.0263],\n",
+      "        [ 0.0013,  0.0263,  0.0395,  ..., -0.0422,  0.0329, -0.0316],\n",
+      "        [ 0.0303, -0.0527, -0.0356,  ...,  0.0382, -0.0171, -0.0171]],\n",
+      "       size=(8192, 2304), dtype=torch.qint8,\n",
+      "       quantization_scheme=torch.per_tensor_affine, scale=0.0013174019986763597,\n",
+      "       zero_point=0)"
+     ]
+    },
+    "execution_count": 13,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "fcc.weight()"
+  ]
+ },
 {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
-  "outputs": [
-   {
-    "ename": "OutOfMemoryError",
-    "evalue": "CUDA out of memory. Tried to allocate 200.00 MiB. GPU 0 has a total capacity of 3.69 GiB of which 129.50 MiB is free. Including non-PyTorch memory, this process has 3.56 GiB memory in use. Of the allocated memory 3.47 GiB is allocated by PyTorch, and 13.58 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",
-    "output_type": "error",
-    "traceback": [
-     "---------------------------------------------------------------------------",
-     "OutOfMemoryError                          Traceback (most recent call last)",
-     "Cell In[3], line 1\n----> 1 model.to(device)",
-     "File ~/Desktop/moondream/venv/lib/python3.13/site-packages/torch/nn/modules/module.py:1355, in Module.to(self, *args, **kwargs)",
-     "File ~/Desktop/moondream/venv/lib/python3.13/site-packages/torch/nn/modules/module.py:915, in Module._apply(self, fn, recurse)",
-     "File ~/Desktop/moondream/venv/lib/python3.13/site-packages/torch/nn/modules/module.py:942, in Module._apply(self, fn, recurse)",
-     "File ~/Desktop/moondream/venv/lib/python3.13/site-packages/torch/nn/modules/module.py:1341, in Module.to.<locals>.convert(t)",
-     "OutOfMemoryError: CUDA out of memory. Tried to allocate 200.00 MiB. GPU 0 has a total capacity of 3.69 GiB of which 129.50 MiB is free."
-    ]
-   }
-  ],
+  "outputs": [],
   "source": [
-   "model.to(device)"
+   "\n",
+   "model = model.to(device)"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": 5,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "MoondreamModel(\n",
+      "  (vision): ModuleDict(\n",
+      "    (patch_emb): DynamicQuantizedLinear(in_features=588, out_features=1152, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "    (blocks): ModuleList(\n",
+      "      (0-26): 27 x ModuleDict(\n",
+      "        (ln1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): ModuleDict(\n",
+      "          (qkv): DynamicQuantizedLinear(in_features=1152, out_features=3456, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "          (proj): DynamicQuantizedLinear(in_features=1152, out_features=1152, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "        )\n",
+      "        (ln2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): ModuleDict(\n",
+      "          (fc1): DynamicQuantizedLinear(in_features=1152, out_features=4304, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "          (fc2): DynamicQuantizedLinear(in_features=4304, out_features=1152, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "    (post_ln): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
+      "    (proj_mlp): ModuleDict(\n",
+      "      (fc1): DynamicQuantizedLinear(in_features=2304, out_features=8192, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "      (fc2): DynamicQuantizedLinear(in_features=8192, out_features=2048, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "    )\n",
+      "  )\n",
+      "  (text): ModuleDict(\n",
+      "    (blocks): ModuleList(\n",
+      "      (0-23): 24 x ModuleDict(\n",
+      "        (ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): ModuleDict(\n",
+      "          (qkv): DynamicQuantizedLinear(in_features=2048, out_features=6144, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "          (proj): DynamicQuantizedLinear(in_features=2048, out_features=2048, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "        )\n",
+      "        (mlp): ModuleDict(\n",
+      "          (fc1): DynamicQuantizedLinear(in_features=2048, out_features=8192, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "          (fc2): DynamicQuantizedLinear(in_features=8192, out_features=2048, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "    (post_ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
+      "    (lm_head): DynamicQuantizedLinear(in_features=2048, out_features=51200, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "  )\n",
+      "  (rotary_emb): RotaryPositionalEmbeddings()\n",
+      "  (region): ModuleDict(\n",
+      "    (coord_encoder): DynamicQuantizedLinear(in_features=256, out_features=2048, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "    (coord_decoder): ModuleDict(\n",
+      "      (fc1): DynamicQuantizedLinear(in_features=2048, out_features=8192, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "      (fc2): DynamicQuantizedLinear(in_features=8192, out_features=1024, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "    )\n",
+      "    (size_encoder): DynamicQuantizedLinear(in_features=512, out_features=2048, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "    (size_decoder): ModuleDict(\n",
+      "      (fc1): DynamicQuantizedLinear(in_features=2048, out_features=8192, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "      (fc2): DynamicQuantizedLinear(in_features=8192, out_features=2048, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "    )\n",
+      "  )\n",
+      ")"
+     ]
+    },
+    "execution_count": 5,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "model"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 4,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "model size: 203.238MB\n"
+    ]
+   }
+  ],
+  "source": [
+   "param_size = 0\n",
+   "for param in model.parameters():\n",
+   "    param_size += param.nelement() * param.element_size()\n",
+   "buffer_size = 0\n",
+   "for buffer in model.buffers():\n",
+   "    buffer_size += buffer.nelement() * buffer.element_size()\n",
+   "\n",
+   "size_all_mb = (param_size + buffer_size) / 1024**2\n",
+   "print('model size: {:.3f}MB'.format(size_all_mb))"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 3,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "model = model_quantized.to(device)"
+  ]
+ },
 {
   "cell_type": "code",
-  "execution_count": 8,
+  "execution_count": 4,
   "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "ename": "RuntimeError",
+    "evalue": "self and mat2 must have the same dtype, but got Half and QInt8",
+    "output_type": "error",
+    "traceback": [
+     "---------------------------------------------------------------------------",
+     "RuntimeError                              Traceback (most recent call last)",
+     "Cell In[4], line 5\n----> 5 points = model.point(image, query)[\"points\"]",
+     "File ~/Desktop/moondream/moondream2/moondream.py:396, in MoondreamModel.point(self, image, object, settings)\n--> 396 image = self.encode_image(image)",
+     "File ~/Desktop/moondream/moondream2/moondream.py:174, in MoondreamModel.encode_image(self, image)\n--> 174 img_emb = self._run_vision_encoder(image)",
+     "File ~/Desktop/moondream/moondream2/moondream.py:143, in MoondreamModel._run_vision_encoder(self, image)\n--> 143 outputs = self._vis_enc(all_crops)",
+     "File ~/Desktop/moondream/moondream2/moondream.py:116, in MoondreamModel._vis_enc(self, x)\n--> 116 return vision_encoder(x, self.vision, self.config.vision)",
+     "File ~/Desktop/moondream/moondream2/vision.py:71, in vision_encoder(input_BCHW, w, config)\n---> 71 x = linear(x, w.patch_emb)",
+     "File ~/Desktop/moondream/moondream2/layers.py:19, in linear(x, w)\n---> 19 return F.linear(x, w.weight(), w.bias())",
+     "RuntimeError: self and mat2 must have the same dtype, but got Half and QInt8"
+    ]
+   }
+  ],
   "source": [
    "from PIL import Image\n",
-   "image = Image.open(\"example.png\")\n",
-   "query = \"home icon at the bottom of the screen is visible\"\n",
-   "points = model.point(image, query)[\"points\"]\n"
+   "with torch.inference_mode():\n",
+   "    image = Image.open(\"example.png\")\n",
+   "    query = \"home icon at the bottom of the screen is visible\"\n",
+   "    points = model.point(image, query)[\"points\"]\n"
   ]
  },
@@ -303,96 +527,36 @@
  },
 {
   "cell_type": "code",
-  "execution_count": 1,
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
-   "\n",
-   "model = AutoModelForCausalLM.from_pretrained(\n",
-   "    \"vikhyatk/moondream2\",\n",
-   "    revision=\"2025-04-14\",\n",
-   "    trust_remote_code=True,\n",
-   "    cache_dir=\"moondream\",\n",
-   "    # Uncomment to run on GPU.\n",
-   "    device_map={\"\": \"cuda\"}\n",
-   ")"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": 7,
+  "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-      "HfMoondream(\n",
-      "  (model): MoondreamModel(\n",
-      "    (vision): ModuleDict(\n",
-      "      (patch_emb): Linear(in_features=588, out_features=1152, bias=True)\n",
-      "      (blocks): ModuleList(\n",
-      "        (0-26): 27 x ModuleDict(\n",
-      "          (ln1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
-      "          (attn): ModuleDict(\n",
-      "            (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n",
-      "            (proj): Linear(in_features=1152, out_features=1152, bias=True)\n",
-      "          )\n",
-      "          (ln2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
-      "          (mlp): ModuleDict(\n",
-      "            (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n",
-      "            (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n",
-      "          )\n",
-      "        )\n",
-      "      )\n",
-      "      (post_ln): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
-      "      (proj_mlp): ModuleDict(\n",
-      "        (fc1): Linear(in_features=2304, out_features=8192, bias=True)\n",
-      "        (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n",
-      "      )\n",
-      "    )\n",
-      "    (text): ModuleDict(\n",
-      "      (blocks): ModuleList(\n",
-      "        (0-23): 24 x ModuleDict(\n",
-      "          (ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-      "          (attn): ModuleDict(\n",
-      "            (qkv): Linear(in_features=2048, out_features=6144, bias=True)\n",
-      "            (proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-      "          )\n",
-      "          (mlp): ModuleDict(\n",
-      "            (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n",
-      "            (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n",
-      "          )\n",
-      "        )\n",
-      "      )\n",
-      "      (post_ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-      "      (lm_head): Linear(in_features=2048, out_features=51200, bias=True)\n",
-      "    )\n",
-      "    (region): ModuleDict(\n",
-      "      (coord_encoder): Linear(in_features=256, out_features=2048, bias=True)\n",
-      "      (coord_decoder): ModuleDict(\n",
-      "        (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n",
-      "        (fc2): Linear(in_features=8192, out_features=1024, bias=True)\n",
-      "      )\n",
-      "      (size_encoder): Linear(in_features=512, out_features=2048, bias=True)\n",
-      "      (size_decoder): ModuleDict(\n",
-      "        (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n",
-      "        (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n",
-      "      )\n",
-      "    )\n",
-      "  )\n",
-      ")"
+      "Linear(in_features=10, out_features=10, bias=True)"
      ]
     },
-    "execution_count": 7,
+    "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-   "model.point()"
+   "import torch\n",
+   "import torch.nn as nn\n",
+   "\n",
+   "linear = nn.Linear(10, 10, dtype=torch.float16)\n",
+   "\n",
+   "linear"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": []
+ },
 {
   "cell_type": "code",
   "execution_count": null,
ollama.ipynb CHANGED
@@ -1,166 +1,365 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
- "from transformers import AutoConfig\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  "\n",
11
- "config = AutoConfig.from_pretrained(\"vikhyatk/moondream2\", trust_remote_code=True)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  ]
13
  },
14
  {
15
  "cell_type": "code",
16
  "execution_count": 2,
17
  "metadata": {},
18
- "outputs": [
19
- {
20
- "data": {
21
- "text/plain": [
22
- "{'return_dict': True,\n",
23
- " 'output_hidden_states': False,\n",
24
- " 'output_attentions': False,\n",
25
- " 'torchscript': False,\n",
26
- " 'torch_dtype': 'float16',\n",
27
- " 'use_bfloat16': False,\n",
28
- " 'tf_legacy_loss': False,\n",
29
- " 'pruned_heads': {},\n",
30
- " 'tie_word_embeddings': True,\n",
31
- " 'chunk_size_feed_forward': 0,\n",
32
- " 'is_encoder_decoder': False,\n",
33
- " 'is_decoder': False,\n",
34
- " 'cross_attention_hidden_size': None,\n",
35
- " 'add_cross_attention': False,\n",
36
- " 'tie_encoder_decoder': False,\n",
37
- " 'max_length': 20,\n",
38
- " 'min_length': 0,\n",
39
- " 'do_sample': False,\n",
40
- " 'early_stopping': False,\n",
41
- " 'num_beams': 1,\n",
42
- " 'num_beam_groups': 1,\n",
43
- " 'diversity_penalty': 0.0,\n",
44
- " 'temperature': 1.0,\n",
45
- " 'top_k': 50,\n",
46
- " 'top_p': 1.0,\n",
47
- " 'typical_p': 1.0,\n",
48
- " 'repetition_penalty': 1.0,\n",
49
- " 'length_penalty': 1.0,\n",
50
- " 'no_repeat_ngram_size': 0,\n",
51
- " 'encoder_no_repeat_ngram_size': 0,\n",
52
- " 'bad_words_ids': None,\n",
53
- " 'num_return_sequences': 1,\n",
54
- " 'output_scores': False,\n",
55
- " 'return_dict_in_generate': False,\n",
56
- " 'forced_bos_token_id': None,\n",
57
- " 'forced_eos_token_id': None,\n",
58
- " 'remove_invalid_values': False,\n",
59
- " 'exponential_decay_length_penalty': None,\n",
60
- " 'suppress_tokens': None,\n",
61
- " 'begin_suppress_tokens': None,\n",
62
- " 'architectures': ['HfMoondream'],\n",
63
- " 'finetuning_task': None,\n",
64
- " 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'},\n",
65
- " 'label2id': {'LABEL_0': 0, 'LABEL_1': 1},\n",
66
- " 'tokenizer_class': None,\n",
67
- " 'prefix': None,\n",
68
- " 'bos_token_id': None,\n",
69
- " 'pad_token_id': None,\n",
70
- " 'eos_token_id': None,\n",
71
- " 'sep_token_id': None,\n",
72
- " 'decoder_start_token_id': None,\n",
73
- " 'task_specific_params': None,\n",
74
- " 'problem_type': None,\n",
75
- " '_name_or_path': 'vikhyatk/moondream2',\n",
76
- " '_attn_implementation_autoset': False,\n",
77
- " 'transformers_version': '4.49.0',\n",
78
- " 'auto_map': {'AutoConfig': 'vikhyatk/moondream2--hf_moondream.HfConfig',\n",
79
- " 'AutoModelForCausalLM': 'vikhyatk/moondream2--hf_moondream.HfMoondream'},\n",
80
- " 'config': {},\n",
81
- " 'model_type': 'moondream1'}"
82
- ]
83
- },
84
- "execution_count": 2,
85
- "metadata": {},
86
- "output_type": "execute_result"
87
- }
88
- ],
89
  "source": [
90
- "config.to_dict()\n"
 
 
 
 
 
 
 
 
 
91
  ]
92
  },
93
  {
94
  "cell_type": "code",
95
- "execution_count": 4,
96
  "metadata": {},
97
  "outputs": [],
98
  "source": [
99
- "from moondream2.hf_moondream import HfConfig\n",
100
- "# serialize config.json to data variable\n",
101
- "import json\n",
102
- "with open(\"moondream2/config.json\", \"r\") as f:\n",
103
- " data = json.load(f)\n",
104
- "configo = HfConfig(**data)\n"
105
  ]
106
  },
107
  {
108
  "cell_type": "code",
109
- "execution_count": 5,
 
 
 
 
 
 
 
110
  "metadata": {},
111
  "outputs": [
112
  {
113
- "name": "stdout",
114
- "output_type": "stream",
115
- "text": [
116
- "_name_or_path : vikhyatk/moondream2 != \n",
117
- "auto_map : {'AutoConfig': 'vikhyatk/moondream2--hf_moondream.HfConfig', 'AutoModelForCausalLM': 'vikhyatk/moondream2--hf_moondream.HfMoondream'} != {'AutoConfig': 'hf_moondream.HfConfig', 'AutoModelForCausalLM': 'hf_moondream.HfMoondream'}\n"
118
- ]
 
 
 
119
  }
120
  ],
121
  "source": [
122
- "# compare dicts of config.to_dict() and configo.to_dict() \n",
123
- "\n",
124
- "# make it check values of the dicts\n",
125
- "\n",
126
- "for key, value in config.to_dict().items():\n",
127
- " if key not in configo.to_dict():\n",
128
- " print(key + \" : \" + str(value) + \" not in hf_config\")\n",
129
- " elif value != configo.to_dict()[key]:\n",
130
- " print(key+ \" : \"+str(value)+\" != \"+str(configo.to_dict()[key]))\n",
131
- "\n",
132
- "for key, value in configo.to_dict().items():\n",
133
- " if key not in config.to_dict():\n",
134
- " print(key + \" : \" + str(value) + \" not in from_pretrained\")\n",
135
- "\n",
136
- "hparams = config.to_dict()\n"
137
  ]
138
  },
139
  {
140
  "cell_type": "code",
141
- "execution_count": 24,
142
  "metadata": {},
143
  "outputs": [],
 
 
 
 
 
144
  "source": [
145
- "text_config = hparams.get(\"text_config\", {})"
 
146
  ]
147
  },
148
  {
149
  "cell_type": "code",
150
- "execution_count": 26,
151
  "metadata": {},
152
  "outputs": [],
153
- "source": [
154
- "text_config.get(\"architectures\")\n"
155
- ]
156
  },
157
  {
158
  "cell_type": "code",
159
- "execution_count": 27,
160
  "metadata": {},
161
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  "source": [
163
- "hparams.get(\"vision_config\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  ]
165
  },
166
  {
@@ -187,7 +386,7 @@
187
  "name": "python",
188
  "nbconvert_exporter": "python",
189
  "pygments_lexer": "ipython3",
190
- "version": "3.12.4"
191
  }
192
  },
193
  "nbformat": 4,
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": []
9
+ },
10
  {
11
  "cell_type": "code",
12
  "execution_count": 1,
13
  "metadata": {},
14
  "outputs": [],
15
  "source": [
16
+ "import torch\n",
17
+ "import torch.nn as nn\n",
18
+ "\n",
19
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
20
+ "\n",
21
+ "class RotaryEmbeddingInPlace(nn.Module):\n",
22
+ " def __init__(self, head_dim: int, max_seq_len: int, theta: float = 10000.0):\n",
23
+ " super().__init__()\n",
24
+ " # Match RotaryEmbedding exactly\n",
25
+ " self.rot_dim = head_dim // 2 # Only half of head_dim is rotated\n",
26
+ " \n",
27
+ " # Frequency calculation - match RotaryEmbedding exactly\n",
28
+ " freqs = 1.0 / (theta ** (torch.arange(0, self.rot_dim, 2).float() / self.rot_dim))\n",
29
+ " t = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)\n",
30
+ " freqs = t * freqs.unsqueeze(0)\n",
31
+ " \n",
32
+ "\n",
33
+ " freqs_cis = torch.exp(1j * freqs)\n",
34
+ " cos_vals = freqs_cis.real\n",
35
+ " sin_vals = freqs_cis.imag\n",
36
  "\n",
37
+ " self.register_buffer('cos_cache', cos_vals, persistent=False)\n",
38
+ " self.register_buffer('sin_cache', sin_vals, persistent=False)\n",
39
+ " \n",
40
+ " def apply(self, x: torch.Tensor) -> torch.Tensor:\n",
41
+ " \"\"\"\n",
42
+ " WARNING: This modifies the input tensor in-place for maximum speed!\n",
43
+ " If you need the original tensor, make a copy before calling this.\n",
44
+ " \n",
45
+ " Must match RotaryEmbedding output exactly.\n",
46
+ " \"\"\"\n",
47
+ " seq_len = x.shape[1]\n",
48
+ " d = self.rot_dim // 2\n",
49
+ " \n",
50
+ " # Get cos/sin with same broadcasting as RotaryEmbedding\n",
51
+ " cos = self.cos_cache[:seq_len].unsqueeze(0).unsqueeze(2)\n",
52
+ " sin = self.sin_cache[:seq_len].unsqueeze(0).unsqueeze(2)\n",
53
+ " \n",
54
+ " # Split rotated part into real/imaginary components\n",
55
+ " xq_r = x[..., :d] # First half of rot_dim\n",
56
+ " xq_i = x[..., d:d*2] # Second half of rot_dim\n",
57
+ " \n",
58
+ " # Apply rotation\n",
59
+ " xq_out_r = xq_r * cos - xq_i * sin\n",
60
+ " xq_out_i = xq_r * sin + xq_i * cos\n",
61
+ " \n",
62
+ " # Vectorized interleaving using torch.stack and view\n",
63
+ " # Stack creates [d, ..., 2] then view as [..., d*2]\n",
64
+ " x[..., :self.rot_dim] = torch.stack([xq_out_r, xq_out_i], dim=-1).view(*x.shape[:-1], self.rot_dim)\n",
65
+ " \n",
66
+ " # x_pass part (x[..., self.rot_dim:]) remains unchanged automatically\n",
67
+ " \n",
68
+ " return x\n"
69
  ]
70
  },
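The comments in the cell above say the class must match `RotaryEmbedding` exactly, but that reference class is not part of this diff. For context, a minimal out-of-place sketch that is consistent with the in-place code (an assumption, not the actual moondream2 implementation):

```python
import torch
import torch.nn as nn

# Assumed reference implementation: same cos/sin cache, but returns a new
# tensor instead of overwriting the rotated half of the input.
class RotaryEmbedding(nn.Module):
    def __init__(self, head_dim: int, max_seq_len: int, theta: float = 10000.0):
        super().__init__()
        self.rot_dim = head_dim // 2
        freqs = 1.0 / (theta ** (torch.arange(0, self.rot_dim, 2).float() / self.rot_dim))
        t = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
        freqs_cis = torch.exp(1j * (t * freqs.unsqueeze(0)))
        self.register_buffer("cos_cache", freqs_cis.real, persistent=False)
        self.register_buffer("sin_cache", freqs_cis.imag, persistent=False)

    def apply(self, x: torch.Tensor) -> torch.Tensor:
        seq_len = x.shape[1]
        d = self.rot_dim // 2
        cos = self.cos_cache[:seq_len].unsqueeze(0).unsqueeze(2)
        sin = self.sin_cache[:seq_len].unsqueeze(0).unsqueeze(2)
        xq_r, xq_i = x[..., :d], x[..., d : d * 2]
        rotated = torch.stack(
            [xq_r * cos - xq_i * sin, xq_r * sin + xq_i * cos], dim=-1
        ).view(*x.shape[:-1], self.rot_dim)
        # Out-of-place: concatenate the rotated half with the pass-through part.
        return torch.cat([rotated, x[..., self.rot_dim:]], dim=-1)
```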
71
  {
72
  "cell_type": "code",
73
  "execution_count": 2,
74
  "metadata": {},
75
+ "outputs": [],
76
  "source": [
77
+ "dim_per_head = 64\n",
78
+ "n_heads = 32\n",
79
+ "max_context = 2048\n",
80
+ "\n",
81
+ "freq_dim = dim_per_head // 2\n",
82
+ "\n",
83
+ "torch.manual_seed(42)\n",
84
+ "\n",
85
+ "tensor = torch.rand(1, 730, n_heads, dim_per_head)\n",
86
+ "tensor = tensor.to(device)\n"
87
  ]
88
  },
89
  {
90
  "cell_type": "code",
91
+ "execution_count": 3,
92
  "metadata": {},
93
  "outputs": [],
94
  "source": [
95
+ "fast_rope = RotaryEmbeddingInPlace(dim_per_head, max_context)\n",
96
+ "fast_rope.to(device)\n",
97
+ "fast_rtensor = fast_rope.apply(tensor)\n",
98
+ "\n",
99
+ "\n"
100
  ]
101
  },
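Because `apply()` mutates its argument, `fast_rtensor` above is the same storage as `tensor` rather than a copy; the original random values are gone after this cell. A quick way to see the aliasing:

```python
# In-place rotation returns its own input, so both names share one buffer.
assert fast_rtensor.data_ptr() == tensor.data_ptr()
```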
102
  {
103
  "cell_type": "code",
104
+ "execution_count": null,
105
+ "metadata": {},
106
+ "outputs": [],
107
+ "source": []
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 4,
112
  "metadata": {},
113
  "outputs": [
114
  {
115
+ "data": {
116
+ "text/plain": [
117
+ "tensor([0.8823, 0.8854, 0.9150, 0.5739, 0.3829, 0.2666, 0.9593, 0.6274, 0.3904,\n",
118
+ " 0.2696, 0.6009, 0.4414, 0.2566, 0.2969, 0.7936], device='cuda:0')"
119
+ ]
120
+ },
121
+ "execution_count": 4,
122
+ "metadata": {},
123
+ "output_type": "execute_result"
124
  }
125
  ],
126
  "source": [
127
+ "fast_rtensor.flatten()[:15]"
128
  ]
129
  },
130
  {
131
  "cell_type": "code",
132
+ "execution_count": null,
133
  "metadata": {},
134
  "outputs": [],
135
+ "source": []
136
+ },
137
+ {
138
+ "cell_type": "markdown",
139
+ "metadata": {},
140
  "source": [
141
+ "tensor([0.8823, 0.8854, 0.9150, 0.5739, 0.3829, 0.2666, 0.9593, 0.6274, 0.3904,\n",
142
+ " 0.2696, 0.6009, 0.4414, 0.2566, 0.2969, 0.7936])"
143
  ]
144
  },
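The markdown cell above records the expected first fifteen values for seed 42, so the in-place output can be eyeballed against the reference implementation. A programmatic check, assuming the `RotaryEmbedding` sketch given earlier, could be:

```python
# Both implementations are applied to clones of the same input, so exact
# numerical agreement validates the in-place rewrite.
ref_rope = RotaryEmbedding(dim_per_head, max_context).to(device)
out_ref = ref_rope.apply(tensor.clone())
out_fast = fast_rope.apply(tensor.clone())  # clone: apply() mutates its input
assert torch.allclose(out_ref, out_fast, atol=1e-6)
```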
145
  {
146
  "cell_type": "code",
147
+ "execution_count": 6,
148
  "metadata": {},
149
  "outputs": [],
150
+ "source": []
151
  },
152
  {
153
  "cell_type": "code",
154
+ "execution_count": 2,
155
  "metadata": {},
156
+ "outputs": [
157
+ {
158
+ "name": "stdout",
159
+ "output_type": "stream",
160
+ "text": [
161
+ "Benchmarking with tensor shape: torch.Size([1, 730, 32, 64])\n",
162
+ "Device: cuda:0\n",
163
+ "Warmup iterations: 10\n",
164
+ "Benchmark iterations: 100\n",
165
+ "--------------------------------------------------\n",
166
+ "Warming up regular rope...\n",
167
+ "Benchmarking regular rope...\n",
168
+ "Warming up fast rope...\n",
169
+ "Benchmarking fast rope...\n",
170
+ "\n",
171
+ "============================================================\n",
172
+ "BENCHMARK RESULTS\n",
173
+ "============================================================\n",
174
+ "\n",
175
+ "Regular Rope:\n",
176
+ " Mean: 0.338 ms\n",
177
+ " Median: 0.335 ms\n",
178
+ " Std: 0.009 ms\n",
179
+ " Min: 0.330 ms\n",
180
+ " Max: 0.385 ms\n",
181
+ "\n",
182
+ "Fast Rope (In-place):\n",
183
+ " Mean: 0.267 ms\n",
184
+ " Median: 0.265 ms\n",
185
+ " Std: 0.005 ms\n",
186
+ " Min: 0.261 ms\n",
187
+ " Max: 0.285 ms\n",
188
+ "\n",
189
+ "Speedup: 1.27x\n",
190
+ "Fast rope is 1.27x faster\n"
191
+ ]
192
+ }
193
+ ],
194
  "source": [
195
+ "import torch\n",
196
+ "import time\n",
197
+ "import statistics\n",
198
+ "from typing import List, Tuple\n",
199
+ "\n",
200
+ "def benchmark_rope_functions(\n",
201
+ " rope, \n",
202
+ " fast_rope, \n",
203
+ " tensor: torch.Tensor, \n",
204
+ " num_warmup: int = 10,\n",
205
+ " num_iterations: int = 100\n",
206
+ ") -> Tuple[float, float, List[float], List[float]]:\n",
207
+ " \"\"\"\n",
208
+ " Benchmark two rope functions, accounting for in-place modification.\n",
209
+ " \n",
210
+ " Args:\n",
211
+ " rope: Regular RotaryEmbedding instance\n",
212
+ " fast_rope: RotaryEmbeddingInPlace instance\n",
213
+ " tensor: Input tensor to benchmark with\n",
214
+ " num_warmup: Number of warmup iterations\n",
215
+ " num_iterations: Number of benchmark iterations\n",
216
+ " \n",
217
+ " Returns:\n",
218
+ " Tuple of (regular_avg_time, fast_avg_time, regular_times, fast_times)\n",
219
+ " \"\"\"\n",
220
+ " \n",
221
+ " # Ensure we're on the right device and in eval mode if applicable\n",
222
+ " device = tensor.device\n",
223
+ " \n",
224
+ " # Pre-allocate tensor copies to avoid allocation overhead during timing\n",
225
+ " tensor_copies = []\n",
226
+ " for _ in range(num_warmup + num_iterations):\n",
227
+ " tensor_copies.append(tensor.clone().detach())\n",
228
+ " \n",
229
+ " print(f\"Benchmarking with tensor shape: {tensor.shape}\")\n",
230
+ " print(f\"Device: {device}\")\n",
231
+ " print(f\"Warmup iterations: {num_warmup}\")\n",
232
+ " print(f\"Benchmark iterations: {num_iterations}\")\n",
233
+ " print(\"-\" * 50)\n",
234
+ " \n",
235
+ " # Warmup phase for regular rope\n",
236
+ " print(\"Warming up regular rope...\")\n",
237
+ " for i in range(num_warmup):\n",
238
+ " _ = rope.apply(tensor)\n",
239
+ " if device.type == 'cuda':\n",
240
+ " torch.cuda.synchronize()\n",
241
+ " \n",
242
+ " # Benchmark regular rope\n",
243
+ " print(\"Benchmarking regular rope...\")\n",
244
+ " regular_times = []\n",
245
+ " for i in range(num_iterations):\n",
246
+ " if device.type == 'cuda':\n",
247
+ " torch.cuda.synchronize()\n",
248
+ " \n",
249
+ " start_time = time.perf_counter()\n",
250
+ " result = rope.apply(tensor)\n",
251
+ " \n",
252
+ " if device.type == 'cuda':\n",
253
+ " torch.cuda.synchronize()\n",
254
+ " \n",
255
+ " end_time = time.perf_counter()\n",
256
+ " regular_times.append((end_time - start_time) * 1000) # Convert to milliseconds\n",
257
+ " \n",
258
+ " # Warmup phase for fast rope (in-place)\n",
259
+ " print(\"Warming up fast rope...\")\n",
260
+ " for i in range(num_warmup):\n",
261
+ " test_tensor = tensor_copies[i].clone() # Use a copy for warmup\n",
262
+ " _ = fast_rope.apply(test_tensor)\n",
263
+ " if device.type == 'cuda':\n",
264
+ " torch.cuda.synchronize()\n",
265
+ " \n",
266
+ " # Benchmark fast rope (in-place)\n",
267
+ " print(\"Benchmarking fast rope...\")\n",
268
+ " fast_times = []\n",
269
+ " copy_idx = num_warmup # Start from after warmup copies\n",
270
+ " \n",
271
+ " for i in range(num_iterations):\n",
272
+ " # Use pre-allocated copy\n",
273
+ " tensor_copy = tensor_copies[copy_idx + i]\n",
274
+ " \n",
275
+ " if device.type == 'cuda':\n",
276
+ " torch.cuda.synchronize()\n",
277
+ " \n",
278
+ " # Time only the apply operation, not the copy\n",
279
+ " start_time = time.perf_counter()\n",
280
+ " result = fast_rope.apply(tensor_copy)\n",
281
+ " \n",
282
+ " if device.type == 'cuda':\n",
283
+ " torch.cuda.synchronize()\n",
284
+ " \n",
285
+ " end_time = time.perf_counter()\n",
286
+ " fast_times.append((end_time - start_time) * 1000) # Convert to milliseconds\n",
287
+ " \n",
288
+ " # Calculate statistics\n",
289
+ " regular_avg = statistics.mean(regular_times)\n",
290
+ " fast_avg = statistics.mean(fast_times)\n",
291
+ " \n",
292
+ " return regular_avg, fast_avg, regular_times, fast_times\n",
293
+ "\n",
294
+ "def print_benchmark_results(regular_avg: float, fast_avg: float, \n",
295
+ " regular_times: List[float], fast_times: List[float]):\n",
296
+ " \"\"\"Print detailed benchmark results.\"\"\"\n",
297
+ " \n",
298
+ " regular_median = statistics.median(regular_times)\n",
299
+ " regular_std = statistics.stdev(regular_times) if len(regular_times) > 1 else 0\n",
300
+ " regular_min = min(regular_times)\n",
301
+ " regular_max = max(regular_times)\n",
302
+ " \n",
303
+ " fast_median = statistics.median(fast_times)\n",
304
+ " fast_std = statistics.stdev(fast_times) if len(fast_times) > 1 else 0\n",
305
+ " fast_min = min(fast_times)\n",
306
+ " fast_max = max(fast_times)\n",
307
+ " \n",
308
+ " speedup = regular_avg / fast_avg if fast_avg > 0 else float('inf')\n",
309
+ " \n",
310
+ " print(\"\\n\" + \"=\" * 60)\n",
311
+ " print(\"BENCHMARK RESULTS\")\n",
312
+ " print(\"=\" * 60)\n",
313
+ " \n",
314
+ " print(f\"\\nRegular Rope:\")\n",
315
+ " print(f\" Mean: {regular_avg:.3f} ms\")\n",
316
+ " print(f\" Median: {regular_median:.3f} ms\")\n",
317
+ " print(f\" Std: {regular_std:.3f} ms\")\n",
318
+ " print(f\" Min: {regular_min:.3f} ms\")\n",
319
+ " print(f\" Max: {regular_max:.3f} ms\")\n",
320
+ " \n",
321
+ " print(f\"\\nFast Rope (In-place):\")\n",
322
+ " print(f\" Mean: {fast_avg:.3f} ms\")\n",
323
+ " print(f\" Median: {fast_median:.3f} ms\")\n",
324
+ " print(f\" Std: {fast_std:.3f} ms\")\n",
325
+ " print(f\" Min: {fast_min:.3f} ms\")\n",
326
+ " print(f\" Max: {fast_max:.3f} ms\")\n",
327
+ " \n",
328
+ " print(f\"\\nSpeedup: {speedup:.2f}x\")\n",
329
+ " if speedup > 1:\n",
330
+ " print(f\"Fast rope is {speedup:.2f}x faster\")\n",
331
+ " else:\n",
332
+ " print(f\"Regular rope is {1/speedup:.2f}x faster\")\n",
333
+ "\n",
334
+ "# Example usage\n",
335
+ "def run_benchmark():\n",
336
+ " \"\"\"\n",
337
+ " Example of how to use the benchmark functions.\n",
338
+ " Replace with your actual RotaryEmbedding classes.\n",
339
+ " \"\"\"\n",
340
+ " \n",
341
+ " # Example parameters - adjust these to match your setup\n",
342
+ " dim_per_head = 64\n",
343
+ " n_heads = 32\n",
344
+ " max_context = 2048\n",
345
+ "\n",
346
+ " freq_dim = dim_per_head // 2\n",
347
+ "\n",
348
+ " torch.manual_seed(42)\n",
349
+ "\n",
350
+ " \n",
351
+ " # Create your rope instances\n",
352
+ " rope = RotaryEmbedding(dim_per_head, max_context)\n",
353
+ " fast_rope = RotaryEmbeddingInPlace(dim_per_head, max_context)\n",
354
+ " \n",
355
+ " # Create test tensor - adjust shape to match your use case\n",
356
+ " tensor = torch.rand(1, 730, n_heads, dim_per_head, device=device)\n",
357
+ "\n",
358
+ " regular_avg, fast_avg, regular_times, fast_times = benchmark_rope_functions(rope, fast_rope, tensor)\n",
359
+ " print_benchmark_results(regular_avg, fast_avg, regular_times, fast_times)\n",
360
+ "\n",
361
+ "if __name__ == \"__main__\":\n",
362
+ " run_benchmark()"
363
  ]
364
  },
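The hand-rolled harness above manages warmup, CUDA synchronization, and pre-allocated input copies itself. A shorter alternative is `torch.utils.benchmark`, sketched below; note that, unlike the harness above, the clone here is inside the timed statement:

```python
import torch.utils.benchmark as benchmark

# Timer handles warmup and CUDA synchronization internally.
timer = benchmark.Timer(
    stmt="fast_rope.apply(x.clone())",  # clone so each run sees fresh input
    globals={"fast_rope": fast_rope, "x": tensor},
)
print(timer.timeit(100))
```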
365
  {
386
  "name": "python",
387
  "nbconvert_exporter": "python",
388
  "pygments_lexer": "ipython3",
389
+ "version": "3.13.3"
390
  }
391
  },
392
  "nbformat": 4,