johnmalek312 commited on
Commit
083e56e
·
1 Parent(s): 1e6a1e7

Replace custom torch RoPE (apply_rotary_emb / freqs_cis) with torchtune RotaryPositionalEmbeddings — previous torch RoPE implementation was broken

Browse files
.gitignore CHANGED
@@ -13,4 +13,6 @@ __pycache__/
13
  # venv
14
  venv/
15
 
16
- *.safetensors
 
 
 
13
  # venv
14
  venv/
15
 
16
+ *.safetensors
17
+
18
+ log/
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
moondream2/config.py CHANGED
@@ -8,7 +8,7 @@ class TextConfig:
8
  ff_dim: int = 8192
9
  n_layers: int = 24
10
  vocab_size: int = 51200
11
- max_context: int = 2048
12
  n_heads: int = 32
13
  n_kv_heads: int = 32
14
  prefix_attn: int = 730
 
8
  ff_dim: int = 8192
9
  n_layers: int = 24
10
  vocab_size: int = 51200
11
+ max_context: int = 1000
12
  n_heads: int = 32
13
  n_kv_heads: int = 32
14
  prefix_attn: int = 730
moondream2/moondream.py CHANGED
@@ -14,7 +14,7 @@ from .text import build_text_model, text_encoder, lm_head, text_decoder
14
  from .region import decode_coordinate, encode_coordinate, decode_size, encode_size
15
  from .utils import remove_outlier_points
16
  import os
17
-
18
  TextSamplingSettings = TypedDict(
19
  "TextSamplingSettings",
20
  {
@@ -71,6 +71,11 @@ class MoondreamModel(nn.Module):
71
  self.vision = build_vision_model(config.vision, dtype)
72
  self.text = build_text_model(config.text, dtype)
73
 
 
 
 
 
 
74
  # Region Model
75
  self.region = nn.ModuleDict(
76
  {
@@ -149,12 +154,12 @@ class MoondreamModel(nn.Module):
149
  return vision_projection(g, r, self.vision, self.config.vision)
150
 
151
  def _prefill(self, x: torch.Tensor, attn_mask: torch.Tensor, pos_ids: torch.Tensor):
152
- return text_decoder(x, self.text, attn_mask, pos_ids, self.config.text)
153
 
154
  def _decode_one_tok(
155
  self, x: torch.Tensor, attn_mask: torch.Tensor, pos_ids: torch.Tensor
156
  ):
157
- hidden = text_decoder(x, self.text, attn_mask, pos_ids, self.config.text)
158
  logits = lm_head(hidden, self.text)
159
  return logits, hidden
160
 
 
14
  from .region import decode_coordinate, encode_coordinate, decode_size, encode_size
15
  from .utils import remove_outlier_points
16
  import os
17
+ from torchtune.modules import RotaryPositionalEmbeddings
18
  TextSamplingSettings = TypedDict(
19
  "TextSamplingSettings",
20
  {
 
71
  self.vision = build_vision_model(config.vision, dtype)
72
  self.text = build_text_model(config.text, dtype)
73
 
74
+ self.rotary_emb = RotaryPositionalEmbeddings(
75
+ config.text.dim // config.text.n_heads,
76
+ config.text.max_context
77
+ )
78
+
79
  # Region Model
80
  self.region = nn.ModuleDict(
81
  {
 
154
  return vision_projection(g, r, self.vision, self.config.vision)
155
 
156
  def _prefill(self, x: torch.Tensor, attn_mask: torch.Tensor, pos_ids: torch.Tensor):
157
+ return text_decoder(x, self.text, attn_mask, pos_ids, self.config.text, self.rotary_emb)
158
 
159
  def _decode_one_tok(
160
  self, x: torch.Tensor, attn_mask: torch.Tensor, pos_ids: torch.Tensor
161
  ):
162
+ hidden = text_decoder(x, self.text, attn_mask, pos_ids, self.config.text, self.rotary_emb)
163
  logits = lm_head(hidden, self.text)
164
  return logits, hidden
165
 
moondream2/rope.py CHANGED
@@ -62,7 +62,6 @@ def func11(x):
62
 
63
  def apply_rotary_emb(
64
  x: torch.Tensor,
65
- freqs_cis: torch.Tensor,
66
  position_ids: torch.Tensor,
67
  num_heads: int,
68
  rot_dim: int = 32,
 
62
 
63
  def apply_rotary_emb(
64
  x: torch.Tensor,
 
65
  position_ids: torch.Tensor,
66
  num_heads: int,
67
  rot_dim: int = 32,
moondream2/text.py CHANGED
@@ -15,11 +15,11 @@ def text_encoder(input_ids: torch.Tensor, w: nn.Module):
15
  def attn(
16
  x: torch.Tensor,
17
  w: nn.Module,
18
- freqs_cis: torch.Tensor,
19
  kv_cache: nn.Module,
20
  attn_mask: torch.Tensor,
21
  n_heads: int,
22
  position_ids: torch.Tensor,
 
23
  do_apply_rotary_emb: bool = True,
24
  ):
25
  bsz, q_len, d_model = x.shape
@@ -37,8 +37,8 @@ def attn(
37
  # 3. Unpack/Split along the first dimension (which now separates Q, K, V)
38
  q, k, v = qkv_permuted[0], qkv_permuted[1], qkv_permuted[2]
39
 
40
- q = apply_rotary_emb(q, freqs_cis, position_ids, n_heads)
41
- k = apply_rotary_emb(k, freqs_cis, position_ids, n_heads)
42
 
43
  if kv_cache is not None:
44
  k, v = kv_cache.update(position_ids, k, v)
@@ -57,16 +57,18 @@ def text_decoder(
57
  attn_mask: torch.Tensor,
58
  position_ids: torch.Tensor,
59
  config: TextConfig,
 
 
60
  ):
61
  for i, block in enumerate(w.blocks):
62
  l_in = layer_norm(x, block.ln)
63
  l_attn = attn(
64
  l_in,
65
  block.attn,
66
- freqs_cis=w.freqs_cis,
67
  kv_cache=block.kv_cache,
68
  attn_mask=attn_mask,
69
  n_heads=config.n_heads,
 
70
  position_ids=position_ids,
71
  )
72
  l_mlp = mlp(l_in, block.mlp)
@@ -120,10 +122,6 @@ def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:
120
  }
121
  )
122
  text.wte = nn.Parameter(torch.empty(config.vocab_size, config.dim, dtype=dtype))
123
- text.register_buffer(
124
- "freqs_cis",
125
- precompute_freqs_cis(config.dim // (2 * config.n_heads), config.max_context),
126
- persistent=False,
127
- )
128
 
129
  return text
 
15
  def attn(
16
  x: torch.Tensor,
17
  w: nn.Module,
 
18
  kv_cache: nn.Module,
19
  attn_mask: torch.Tensor,
20
  n_heads: int,
21
  position_ids: torch.Tensor,
22
+ rotary_emb: nn.Module,
23
  do_apply_rotary_emb: bool = True,
24
  ):
25
  bsz, q_len, d_model = x.shape
 
37
  # 3. Unpack/Split along the first dimension (which now separates Q, K, V)
38
  q, k, v = qkv_permuted[0], qkv_permuted[1], qkv_permuted[2]
39
 
40
+ q = rotary_emb(q.permute(0, 2, 1, 3))
41
+ k = rotary_emb(k.permute(0, 2, 1, 3))
42
 
43
  if kv_cache is not None:
44
  k, v = kv_cache.update(position_ids, k, v)
 
57
  attn_mask: torch.Tensor,
58
  position_ids: torch.Tensor,
59
  config: TextConfig,
60
+ rotary_emb: nn.Module
61
+
62
  ):
63
  for i, block in enumerate(w.blocks):
64
  l_in = layer_norm(x, block.ln)
65
  l_attn = attn(
66
  l_in,
67
  block.attn,
 
68
  kv_cache=block.kv_cache,
69
  attn_mask=attn_mask,
70
  n_heads=config.n_heads,
71
+ rotary_emb=rotary_emb,
72
  position_ids=position_ids,
73
  )
74
  l_mlp = mlp(l_in, block.mlp)
 
122
  }
123
  )
124
  text.wte = nn.Parameter(torch.empty(config.vocab_size, config.dim, dtype=dtype))
125
+
 
 
 
 
126
 
127
  return text
notes.ipynb CHANGED
@@ -54,7 +54,18 @@
54
  "cell_type": "code",
55
  "execution_count": 1,
56
  "metadata": {},
57
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
58
  "source": [
59
  "import torch"
60
  ]
@@ -398,7 +409,7 @@
398
  ],
399
  "metadata": {
400
  "kernelspec": {
401
- "display_name": "venv",
402
  "language": "python",
403
  "name": "python3"
404
  },
@@ -412,7 +423,7 @@
412
  "name": "python",
413
  "nbconvert_exporter": "python",
414
  "pygments_lexer": "ipython3",
415
- "version": "3.13.3"
416
  }
417
  },
418
  "nbformat": 4,
 
54
  "cell_type": "code",
55
  "execution_count": 1,
56
  "metadata": {},
57
+ "outputs": [
58
+ {
59
+ "ename": "",
60
+ "evalue": "",
61
+ "output_type": "error",
62
+ "traceback": [
63
+ "\u001b[1;31mRunning cells with 'venv12 (Python 3.12.10)' requires the ipykernel package.\n",
64
+ "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n",
65
+ "\u001b[1;31mCommand: '/home/pixel/Desktop/moondream/venv12/bin/python3.12 -m pip install ipykernel -U --force-reinstall'"
66
+ ]
67
+ }
68
+ ],
69
  "source": [
70
  "import torch"
71
  ]
 
409
  ],
410
  "metadata": {
411
  "kernelspec": {
412
+ "display_name": "venv12",
413
  "language": "python",
414
  "name": "python3"
415
  },
 
423
  "name": "python",
424
  "nbconvert_exporter": "python",
425
  "pygments_lexer": "ipython3",
426
+ "version": "3.12.10"
427
  }
428
  },
429
  "nbformat": 4,
requirements.txt ADDED
@@ -0,0 +4 @@
 
 
1
+ --index-url https://download.pytorch.org/whl/cu128
2
+ torch==2.7.0+cu128
3
+ torchvision==0.22.0+cu128
4
+ torchaudio==2.7.0+cu128