johnmalek312 committed
Commit 10101c1 · Parent(s): 4b70204
moondream2/config.py CHANGED
@@ -8,7 +8,7 @@ class TextConfig:
     ff_dim: int = 8192
     n_layers: int = 24
     vocab_size: int = 51200
-    max_context: int = 1000
+    max_context: int = 2048
     n_heads: int = 32
     n_kv_heads: int = 32
     prefix_attn: int = 730
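Note: besides bounding generation length, `max_context` now sizes the cos/sin tables precomputed by the new `RotaryEmbedding` in rope.py (further down in this commit). A back-of-envelope sketch of that footprint, assuming the TextConfig values above and head_dim = dim // n_heads = 2048 // 32 = 64 (my arithmetic, not code from the repo):

```python
# RotaryEmbedding registers cos/sin buffers of shape (max_context, rot_dim // 2).
max_context, head_dim = 2048, 64
rot_dim = head_dim // 2                       # only half the head dim is rotated
entries_per_table = max_context * (rot_dim // 2)
total_bytes = 2 * entries_per_table * 4       # two float32 tables (cos and sin)
print(f"{total_bytes / 1024:.0f} KiB")        # ~256 KiB, so doubling max_context is cheap here
```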
moondream2/hf_moondream.py DELETED
@@ -1,132 +0,0 @@
-from transformers import PreTrainedModel, PretrainedConfig
-
-from .config import MoondreamConfig
-from .moondream import MoondreamModel
-
-# Files sometimes don't get loaded without these...
-from .image_crops import *
-from .vision import *
-from .text import *
-from .region import *
-from .utils import *
-
-
-def extract_question(text):
-    prefix = "<image>\n\nQuestion: "
-    suffix = "\n\nAnswer:"
-
-    if text.startswith(prefix) and text.endswith(suffix):
-        return text[len(prefix) : -len(suffix)]
-    else:
-        return None
-
-
-class HfConfig(PretrainedConfig):
-    _auto_class = "AutoConfig"
-    model_type = "moondream1"
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.config = {}
-
-
-class HfMoondream(PreTrainedModel):
-    _supports_gradient_checkpointing = True
-    _auto_class = "AutoModelForCausalLM"
-    config_class = HfConfig
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.model = MoondreamModel(
-            MoondreamConfig.from_dict(config.config), setup_caches=False
-        )
-        self.model._setup_caches()
-
-    @property
-    def encode_image(self):
-        return self.model.encode_image
-
-    @property
-    def query(self):
-        return self.model.query
-
-    @property
-    def caption(self):
-        return self.model.caption
-
-    @property
-    def detect(self):
-        return self.model.detect
-
-    @property
-    def point(self):
-        return self.model.point
-
-    @property
-    def detect_gaze(self):
-        return self.model.detect_gaze
-
-    def answer_question(
-        self,
-        image_embeds,
-        question,
-        tokenizer=None,
-        chat_history="",
-        result_queue=None,
-        max_new_tokens=256,
-        **kwargs
-    ):
-        answer = self.query(image_embeds, question)["answer"].strip()
-
-        if result_queue is not None:
-            result_queue.put(answer)
-        return answer
-
-    def batch_answer(self, images, prompts, tokenizer=None, **kwargs):
-        answers = []
-        for image, prompt in zip(images, prompts):
-            answers.append(self.query(image, prompt)["answer"].strip())
-        return answers
-
-    def _unsupported_exception(self):
-        raise NotImplementedError(
-            "This method is not supported in the latest version of moondream. "
-            "Consider upgrading to the updated API spec, or alternately pin "
-            "to 'revision=2024-08-26'."
-        )
-
-    def generate(self, image_embeds, prompt, tokenizer, max_new_tokens=128, **kwargs):
-        """
-        Function definition remains unchanged for backwards compatibility.
-        Be aware that tokenizer, max_new_takens, and kwargs are ignored.
-        """
-        prompt_extracted = extract_question(prompt)
-        if prompt_extracted is not None:
-            answer = self.model.query(
-                image=image_embeds, question=prompt_extracted, stream=False
-            )["answer"]
-        else:
-            image_embeds = self.encode_image(image_embeds)
-            prompt_tokens = torch.tensor(
-                [self.model.tokenizer.encode(prompt).ids],
-                device=self.device,
-            )
-
-            def generator():
-                for token in self.model._generate_text(
-                    prompt_tokens,
-                    image_embeds.kv_cache,
-                    image_embeds.pos,
-                    max_new_tokens,
-                ):
-                    yield token
-
-            answer = "".join(list(generator()))
-
-        return [answer]
-
-    def get_input_embeddings(self):
-        return super().get_input_embeddings()
-
-    def input_embeds(self, *args, **kwargs):
-        self._unsupported_exception()
moondream2/layers.py CHANGED
@@ -16,7 +16,7 @@ class LinearWeights:


 def linear(x: torch.Tensor, w: LinearWeights) -> torch.Tensor:
-    return F.linear(x, w.weight, w.bias)
+    return F.linear(x, w.weight(), w.bias())


 @dataclass
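This accessor change is what dynamic quantization requires: `quantize_dynamic` (used in notes.ipynb below) swaps each `nn.Linear` for a dynamically quantized Linear, on which `weight` and `bias` are methods rather than attributes. A minimal standalone sketch of that behavior (my example, not repo code):

```python
import torch
import torch.nn as nn
from torch.quantization import quantize_dynamic

# Quantize a plain Linear; the wrapper replaces it with a quantized module.
qlin = quantize_dynamic(nn.Sequential(nn.Linear(4, 4)), {nn.Linear}, dtype=torch.qint8)[0]
print(qlin)           # DynamicQuantizedLinear(in_features=4, out_features=4, ...)
print(qlin.weight())  # weight is a method here; returns the packed qint8 tensor
```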
moondream2/moondream.py CHANGED
@@ -14,7 +14,7 @@ from .text import build_text_model, text_encoder, lm_head, text_decoder
 from .region import decode_coordinate, encode_coordinate, decode_size, encode_size
 from .utils import remove_outlier_points
 import os
-from torchtune.modules import RotaryPositionalEmbeddings
+from .rope import RotaryEmbedding
 TextSamplingSettings = TypedDict(
     "TextSamplingSettings",
     {
@@ -40,26 +40,6 @@ DEFAULT_MAX_OBJECTS = 50
 @dataclass(frozen=True)
 class EncodedImage:
     pos: int
-    caches: List[Tuple[torch.Tensor, torch.Tensor]]
-
-
-class KVCache(nn.Module):
-
-    def __init__(self, n_heads, n_kv_heads, max_context, dim, device, dtype):
-        super().__init__()
-        cache_shape = (1, n_kv_heads, max_context, dim // n_heads)
-        self.register_buffer(
-            "k_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
-        )
-        self.register_buffer(
-            "v_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
-        )
-
-    def update(self, pos_ids, k, v):
-        kout, vout = self.k_cache, self.v_cache
-        kout[:, :, pos_ids, :] = k
-        vout[:, :, pos_ids, :] = v
-        return kout, vout


 class MoondreamModel(nn.Module):
@@ -70,11 +50,7 @@ class MoondreamModel(nn.Module):
         self.tokenizer = Tokenizer.from_file(os.path.join(current_dir, "tokenizer.json"))
         self.vision = build_vision_model(config.vision, dtype)
         self.text = build_text_model(config.text, dtype)
-
-        self.rotary_emb = RotaryPositionalEmbeddings(
-            config.text.dim // config.text.n_heads,
-            config.text.max_context
-        )
+        self.rope = RotaryEmbedding(config.text.dim // config.text.n_heads, config.text.max_context)

         # Region Model
         self.region = nn.ModuleDict(
@@ -128,21 +104,6 @@ class MoondreamModel(nn.Module):
         attn_mask[..., :prefix_attn_len, :prefix_attn_len] = 1
         self.register_buffer("attn_mask", attn_mask, persistent=False)

-        # Initialize KV caches.
-        if setup_caches:
-            self._setup_caches()
-
-    def _setup_caches(self):
-        c = self.config.text
-        for b in self.text.blocks:
-            b.kv_cache = KVCache(
-                c.n_heads,
-                c.n_kv_heads,
-                c.max_context,
-                c.dim,
-                device=self.device,
-                dtype=self.vision.pos_emb.dtype,
-            )
     @property
     def device(self):
         return self.vision.pos_emb.device
@@ -154,12 +115,12 @@ class MoondreamModel(nn.Module):
         return vision_projection(g, r, self.vision, self.config.vision)

     def _prefill(self, x: torch.Tensor, attn_mask: torch.Tensor, pos_ids: torch.Tensor):
-        return text_decoder(x, self.text, attn_mask, pos_ids, self.config.text, self.rotary_emb)
+        return text_decoder(x, self.text, attn_mask, self.config.text, self.rope)

     def _decode_one_tok(
         self, x: torch.Tensor, attn_mask: torch.Tensor, pos_ids: torch.Tensor
     ):
-        hidden = text_decoder(x, self.text, attn_mask, pos_ids, self.config.text, self.rotary_emb)
+        hidden = text_decoder(x, self.text, attn_mask, self.config.text, self.rope)
         logits = lm_head(hidden, self.text)
         return logits, hidden

@@ -217,14 +178,7 @@ class MoondreamModel(nn.Module):
         self._prefill(inputs_embeds, mask, pos_ids)

         return EncodedImage(
-            pos=inputs_embeds.size(1),
-            caches=[
-                (
-                    b.kv_cache.k_cache[:, :, : inputs_embeds.size(1), :].clone(),
-                    b.kv_cache.v_cache[:, :, : inputs_embeds.size(1), :].clone(),
-                )
-                for b in self.text.blocks
-            ],
+            pos=inputs_embeds.size(1)
         )

     def _apply_top_p(self, probs: torch.Tensor, top_p: float):
@@ -345,11 +299,6 @@ class MoondreamModel(nn.Module):

         return generator(next_token, pos)

-    def load_encoded_image(self, encoded_image: EncodedImage):
-        for b, (k, v) in zip(self.text.blocks, encoded_image.caches):
-            b.kv_cache.k_cache[:, :, : k.size(2), :] = k
-            b.kv_cache.v_cache[:, :, : v.size(2), :] = v
-
     def _generate_points(
         self,
         hidden: torch.Tensor,
@@ -441,7 +390,6 @@ class MoondreamModel(nn.Module):
             raise NotImplementedError("Model does not support pointing.")

         image = self.encode_image(image)
-        self.load_encoded_image(image)

         prompt_tokens = torch.tensor(
             [
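The deleted `KVCache` allocated two `(1, n_kv_heads, max_context, head_dim)` buffers per text block up front, which is a fixed chunk of memory on the ~4 GiB GPU that OOMs in this commit's notebook. A sketch of the arithmetic at the new `max_context` of 2048, using the TextConfig values (my illustration, not code from the repo):

```python
# Static KV-cache footprint the old code would allocate at fp16:
n_layers, n_kv_heads, max_context, head_dim = 24, 32, 2048, 64
bytes_per_elem = 2  # float16
total = n_layers * 2 * n_kv_heads * max_context * head_dim * bytes_per_elem
print(f"{total / 1024**2:.0f} MiB")  # ~384 MiB for k_cache + v_cache across all blocks
```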
moondream2/rope.py CHANGED
@@ -1,89 +1,53 @@
-# Ethically sourced from https://github.com/xjdr-alt/entropix
-
 import torch
-import time
-
-def precompute_freqs_cis(
-    dim: int,
-    end: int,
-    theta: float = 10000.0,
-    use_scaled: bool = False,
-    dtype: torch.dtype = torch.float32,
-) -> torch.Tensor:
-    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=dtype)[: (dim // 2)] / dim))
-    t = torch.arange(end, dtype=dtype).unsqueeze(1)
-    freqs = t * freqs.unsqueeze(0)
-    freqs = torch.exp(1j * freqs)
-    return torch.stack([freqs.real, freqs.imag], dim=-1)
-
-def func1(x):
-    #print(x)
-    pass
-
-def func2(x):
-    #print(x)
-    pass
-
-def func3(x):
-    #print(x)
-    pass
-
-def func4(x):
-    #print(x)
-    pass
-
-def func5(x):
-    #print(x)
-    pass
-
-def func6(x):
-    #print(x)
-    pass
-
-def func7(x):
-    #print(x)
-    pass
-
-def func8(x):
-    #print(x)
-    pass
-
-def func9(x):
-    #print(x)
-    pass
-
-def func10(x):
-    #print(x)
-    pass
-
-def func11(x):
-    #print(x)
-    pass
-
-def apply_rotary_emb(
-    x: torch.Tensor,
-    position_ids: torch.Tensor,
-    num_heads: int,
-    rot_dim: int = 32,
-    interleave: bool = False,
-) -> torch.Tensor:
-    assert rot_dim == freqs_cis.shape[-2] * 2
-    assert num_heads == x.shape[1]
-    x_rot, x_pass = x[..., :rot_dim], x[..., rot_dim:]
-
-    d_q = x_rot.shape[-1] // 2
-    xq_r, xq_i = x_rot[..., :d_q], x_rot[..., d_q:]
-
-    # Get the cosine component from freqs_cis
-    cos_component = freqs_cis[..., 0]
-    # Index with position_ids
-    cos_indexed = cos_component[position_ids, :]
-    # Add two dimensions at the beginning
-    freqs_cos = cos_indexed.unsqueeze(0).unsqueeze(0)
-    freqs_sin = freqs_cis[..., 1][position_ids, :].unsqueeze(0).unsqueeze(0)
-
-    # Complex multiplication: (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
-    xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin
-    xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos
-    xq_out = torch.stack((xq_out_r, xq_out_i), dim=-1).flatten(-2)
-    return torch.cat([xq_out.to(x.dtype), x_pass], dim=-1)
+import torch.nn as nn
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+class RotaryEmbedding(nn.Module):
+    def __init__(self, head_dim: int, max_seq_len: int, theta: float = 10000.0):
+        super().__init__()
+        # Match RotaryEmbedding exactly
+        self.rot_dim = head_dim // 2  # Only half of head_dim is rotated
+
+        # Frequency calculation - match RotaryEmbedding exactly
+        freqs = 1.0 / (theta ** (torch.arange(0, self.rot_dim, 2).float() / self.rot_dim))
+        t = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
+        freqs = t * freqs.unsqueeze(0)
+
+        freqs_cis = torch.exp(1j * freqs)
+        cos_vals = freqs_cis.real
+        sin_vals = freqs_cis.imag
+
+        self.register_buffer('cos_cache', cos_vals, persistent=False)
+        self.register_buffer('sin_cache', sin_vals, persistent=False)
+
+    def apply(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        WARNING: This modifies the input tensor in-place for maximum speed!
+        If you need the original tensor, make a copy before calling this.
+
+        Must match RotaryEmbedding output exactly.
+        """
+        seq_len = x.shape[1]
+        d = self.rot_dim // 2
+
+        # Get cos/sin with same broadcasting as RotaryEmbedding
+        cos = self.cos_cache[:seq_len].unsqueeze(0).unsqueeze(2)
+        sin = self.sin_cache[:seq_len].unsqueeze(0).unsqueeze(2)
+
+        # Split rotated part into real/imaginary components
+        xq_r = x[..., :d]     # First half of rot_dim
+        xq_i = x[..., d:d*2]  # Second half of rot_dim
+
+        # Apply rotation
+        xq_out_r = xq_r * cos - xq_i * sin
+        xq_out_i = xq_r * sin + xq_i * cos
+
+        # Vectorized interleaving using torch.stack and view
+        # Stack creates [..., d, 2] then view as [..., d*2]
+        x[..., :self.rot_dim] = torch.stack([xq_out_r, xq_out_i], dim=-1).view(*x.shape[:-1], self.rot_dim)
+
+        # x_pass part (x[..., self.rot_dim:]) remains unchanged automatically
+
+        return x
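A quick usage sketch of the new module (mine, not from the repo); the `(batch, seq, heads, head_dim)` layout is inferred from the `rope.apply(q.permute(0, 2, 1, 3))` calls in text.py below, and `apply()` mutates its argument in place:

```python
import torch
from moondream2.rope import RotaryEmbedding

rope = RotaryEmbedding(head_dim=64, max_seq_len=2048)
q = torch.rand(1, 730, 32, 64)   # (batch, seq_len, n_heads, head_dim)
q_rot = rope.apply(q.clone())    # clone() because apply() writes into its input
assert q_rot.shape == (1, 730, 32, 64)
# Only the first rot_dim (= head_dim // 2 = 32) channels are rotated;
# the rest pass through untouched.
assert torch.equal(q_rot[..., 32:], q[..., 32:])
```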
moondream2/text.py CHANGED
@@ -1,12 +1,15 @@
 import torch
 import torch.nn as nn
-
+from typing import TYPE_CHECKING
 from torch.nn import functional as F

 from .layers import layer_norm, mlp
-from .rope import apply_rotary_emb, precompute_freqs_cis
 from .config import TextConfig

+# type checking imports if typechecking
+if TYPE_CHECKING:
+    from .rope import RotaryEmbedding
+

 def text_encoder(input_ids: torch.Tensor, w: nn.Module):
     return F.embedding(input_ids, w.wte)
@@ -15,12 +18,9 @@ def text_encoder(input_ids: torch.Tensor, w: nn.Module):
 def attn(
     x: torch.Tensor,
     w: nn.Module,
-    kv_cache: nn.Module,
     attn_mask: torch.Tensor,
     n_heads: int,
-    position_ids: torch.Tensor,
-    rotary_emb: nn.Module,
-    do_apply_rotary_emb: bool = True,
+    rope: "RotaryEmbedding"
 ):
     bsz, q_len, d_model = x.shape
     head_dim = d_model // n_heads
@@ -37,11 +37,8 @@ def attn(
     # 3. Unpack/Split along the first dimension (which now separates Q, K, V)
     q, k, v = qkv_permuted[0], qkv_permuted[1], qkv_permuted[2]

-    q = rotary_emb(q.permute(0, 2, 1, 3))
-    k = rotary_emb(k.permute(0, 2, 1, 3))
-
-    if kv_cache is not None:
-        k, v = kv_cache.update(position_ids, k, v)
+    q = rope.apply(q.permute(0, 2, 1, 3))
+    k = rope.apply(k.permute(0, 2, 1, 3))

     out = F.scaled_dot_product_attention(
         q, k, v, attn_mask=attn_mask
@@ -55,9 +52,8 @@ def text_decoder(
     x: torch.Tensor,
     w: nn.Module,
     attn_mask: torch.Tensor,
-    position_ids: torch.Tensor,
     config: TextConfig,
-    rotary_emb: nn.Module
+    rope: "RotaryEmbedding"

 ):
     for i, block in enumerate(w.blocks):
@@ -65,11 +61,9 @@ def text_decoder(
         l_attn = attn(
             l_in,
             block.attn,
-            kv_cache=block.kv_cache,
             attn_mask=attn_mask,
             n_heads=config.n_heads,
-            rotary_emb=rotary_emb,
-            position_ids=position_ids,
+            rope=rope,
         )
         l_mlp = mlp(l_in, block.mlp)
         x = x + l_attn + l_mlp
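The `TYPE_CHECKING` guard keeps `RotaryEmbedding` visible to type checkers while keeping the import off the runtime path (commonly done to break import cycles or trim startup cost): `TYPE_CHECKING` is `False` at runtime and the quoted annotation is never evaluated. A self-contained sketch of the idiom, with `Fraction` standing in for the repo's `RotaryEmbedding`:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Executed only by static analyzers, never at runtime.
    from fractions import Fraction  # stand-in for `from .rope import RotaryEmbedding`

def halve(x: "Fraction") -> "Fraction":
    # Quoted annotations are stored as strings, so this runs without the import.
    return x / 2

print(halve.__annotations__)  # {'x': 'Fraction', 'return': 'Fraction'}
```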
moondream2/vision.py CHANGED
@@ -34,9 +34,10 @@ def prepare_crops(
     )
     all_crops = overlap_crops["crops"]
     all_crops = np.transpose(all_crops, (0, 3, 1, 2))
+    all_crops = torch.from_numpy(all_crops).to(dtype=torch.float16)
     all_crops = (
-        torch.from_numpy(all_crops)
-        .to(device=device, dtype=torch.float16)
+        all_crops
+        .to(device=device)
         .div_(255.0)
         .sub_(0.5)
         .div_(0.5)
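Functionally the chain is unchanged: pixels are cast to float16 (now on the host, before the device copy) and the in-place `div_/sub_/div_` maps 8-bit values into [-1, 1]. A quick check of that normalization (illustrative only):

```python
import torch

px = torch.tensor([0.0, 127.5, 255.0], dtype=torch.float16)
print(px.div_(255.0).sub_(0.5).div_(0.5))  # tensor([-1., 0., 1.], dtype=torch.float16)
```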
notes.ipynb CHANGED
@@ -8,7 +8,7 @@
   {
    "data": {
     "text/plain": [
-     "True"
+     "20"
     ]
    },
    "execution_count": 2,
@@ -18,7 +18,17 @@
   ],
   "source": [
    "import torch\n",
-   "torch.cuda.is_available()"
+   "import gc\n",
+   "# ... your code that uses GPU tensors ...\n",
+   "# For example:\n",
+   "# x = torch.randn(1000, 1000, device='cuda')\n",
+   "# del x  # or x = None\n",
+   "\n",
+   "# This is the key command:\n",
+   "torch.cuda.empty_cache()\n",
+   "\n",
+   "# Optionally, run Python's garbage collector too\n",
+   "gc.collect()"
   ]
  },
 {
@@ -54,28 +64,30 @@
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
-  "outputs": [
-   {
-    "ename": "",
-    "evalue": "",
-    "output_type": "error",
-    "traceback": [
-     "Running cells with 'venv12 (Python 3.12.10)' requires the ipykernel package.\n",
-     "Run the following command to install 'ipykernel' into the Python environment. \n",
-     "Command: '/home/pixel/Desktop/moondream/venv12/bin/python3.12 -m pip install ipykernel -U --force-reinstall'"
-    ]
-   }
-  ],
+  "outputs": [],
   "source": [
    "import torch"
   ]
  },
 {
   "cell_type": "code",
-  "execution_count": 2,
+  "execution_count": 1,
   "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "name": "stderr",
+    "output_type": "stream",
+    "text": [
+     "/home/pixel/Desktop/moondream/venv/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+     "  from .autonotebook import tqdm as notebook_tqdm\n"
+    ]
+   }
+  ],
   "source": [
+   "# auto reload jupyter notebook\n",
+   "%load_ext autoreload\n",
+   "%autoreload 2\n",
+   "\n",
    "import os\n",
    "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
    "\n",
@@ -83,51 +95,263 @@
   "from moondream2.config import MoondreamConfig\n",
   "from moondream2.moondream import MoondreamModel\n",
   "import torch.profiler\n",
-   "\n",
-   "config = MoondreamConfig()\n",
-   "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
-   "model = MoondreamModel(config, setup_caches=False)\n",
-   "from safetensors.torch import load_model\n",
-   "weights_path = \"moondream2/model2.safetensors\"  # Path to your local weights file\n",
-   "state_dict = load_model(model, weights_path)\n",
-   "model._setup_caches()"
+   "with torch.inference_mode():\n",
+   "    config = MoondreamConfig()\n",
+   "    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+   "    model = MoondreamModel(config, setup_caches=False)\n",
+   "    from safetensors.torch import load_model\n",
+   "    weights_path = \"moondream2/model.safetensors\"  # Path to your local weights file\n",
+   "    state_dict = load_model(model, weights_path)\n"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": 2,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "import torch.nn as nn\n",
+   "from torch.quantization import quantize_dynamic\n",
+   "\n",
+   "model_quantized = quantize_dynamic(\n",
+   "    model,\n",
+   "    {nn.Linear},  # Only quantize these layer types\n",
+   "    dtype=torch.qint8\n",
+   ")\n"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 6,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "real = model.get_submodule(\"vision\").proj_mlp.get_submodule(\"fc1\")"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 8,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "tensor([[ 0.0198,  0.0356, -0.0158,  ...,  0.0119,  0.0487,  0.0448],\n",
+      "        [ 0.0198,  0.0277,  0.0725,  ...,  0.0079,  0.0343,  0.0435],\n",
+      "        [-0.0422,  0.0356, -0.0263,  ..., -0.0145, -0.0250,  0.0184],\n",
+      "        ...,\n",
+      "        [-0.0356, -0.0474, -0.0237,  ..., -0.0198,  0.0277,  0.0263],\n",
+      "        [ 0.0013,  0.0263,  0.0395,  ..., -0.0422,  0.0329, -0.0316],\n",
+      "        [ 0.0303, -0.0527, -0.0356,  ...,  0.0382, -0.0171, -0.0171]],\n",
+      "       size=(8192, 2304), dtype=torch.qint8,\n",
+      "       quantization_scheme=torch.per_tensor_affine, scale=0.0013174019986763597,\n",
+      "       zero_point=0)"
+     ]
+    },
+    "execution_count": 8,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "real."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 9,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "vision = model_quantized.get_submodule(\"vision\")"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 10,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "fcc = vision.proj_mlp.get_submodule(\"fc1\")"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "tensor([[ 0.0198,  0.0356, -0.0158,  ...,  0.0119,  0.0487,  0.0448],\n",
+      "        [ 0.0198,  0.0277,  0.0725,  ...,  0.0079,  0.0343,  0.0435],\n",
+      "        [-0.0422,  0.0356, -0.0263,  ..., -0.0145, -0.0250,  0.0184],\n",
+      "        ...,\n",
+      "        [-0.0356, -0.0474, -0.0237,  ..., -0.0198,  0.0277,  0.0263],\n",
+      "        [ 0.0013,  0.0263,  0.0395,  ..., -0.0422,  0.0329, -0.0316],\n",
+      "        [ 0.0303, -0.0527, -0.0356,  ...,  0.0382, -0.0171, -0.0171]],\n",
+      "       size=(8192, 2304), dtype=torch.qint8,\n",
+      "       quantization_scheme=torch.per_tensor_affine, scale=0.0013174019986763597,\n",
+      "       zero_point=0)"
+     ]
+    },
+    "execution_count": 13,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "fcc.weight()"
+  ]
+ },
 {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
-  "outputs": [
-   {
-    "ename": "OutOfMemoryError",
-    "evalue": "CUDA out of memory. Tried to allocate 200.00 MiB. GPU 0 has a total capacity of 3.69 GiB of which 129.50 MiB is free. Including non-PyTorch memory, this process has 3.56 GiB memory in use. Of the allocated memory 3.47 GiB is allocated by PyTorch, and 13.58 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",
-    "output_type": "error",
-    "traceback": [
-     "---------------------------------------------------------------------------",
-     "OutOfMemoryError                          Traceback (most recent call last)",
-     "Cell In[3], line 1\n----> 1 model.to(device)",
-     "File ~/Desktop/moondream/venv/lib/python3.13/site-packages/torch/nn/modules/module.py:1355, in Module.to(self, *args, **kwargs)",
-     "File ~/Desktop/moondream/venv/lib/python3.13/site-packages/torch/nn/modules/module.py:915, in Module._apply(self, fn, recurse)",
-     "File ~/Desktop/moondream/venv/lib/python3.13/site-packages/torch/nn/modules/module.py:942, in Module._apply(self, fn, recurse)",
-     "File ~/Desktop/moondream/venv/lib/python3.13/site-packages/torch/nn/modules/module.py:1341, in Module.to.<locals>.convert(t)",
-     "OutOfMemoryError: CUDA out of memory. Tried to allocate 200.00 MiB. GPU 0 has a total capacity of 3.69 GiB of which 129.50 MiB is free."
-    ]
-   }
-  ],
+  "outputs": [],
   "source": [
-   "model.to(device)"
+   "\n",
+   "model = model.to(device)"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": 5,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "MoondreamModel(\n",
+      "  (vision): ModuleDict(\n",
+      "    (patch_emb): DynamicQuantizedLinear(in_features=588, out_features=1152, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "    (blocks): ModuleList(\n",
+      "      (0-26): 27 x ModuleDict(\n",
+      "        (ln1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): ModuleDict(\n",
+      "          (qkv): DynamicQuantizedLinear(in_features=1152, out_features=3456, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "          (proj): DynamicQuantizedLinear(in_features=1152, out_features=1152, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "        )\n",
+      "        (ln2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): ModuleDict(\n",
+      "          (fc1): DynamicQuantizedLinear(in_features=1152, out_features=4304, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "          (fc2): DynamicQuantizedLinear(in_features=4304, out_features=1152, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "    (post_ln): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
+      "    (proj_mlp): ModuleDict(\n",
+      "      (fc1): DynamicQuantizedLinear(in_features=2304, out_features=8192, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "      (fc2): DynamicQuantizedLinear(in_features=8192, out_features=2048, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "    )\n",
+      "  )\n",
+      "  (text): ModuleDict(\n",
+      "    (blocks): ModuleList(\n",
+      "      (0-23): 24 x ModuleDict(\n",
+      "        (ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): ModuleDict(\n",
+      "          (qkv): DynamicQuantizedLinear(in_features=2048, out_features=6144, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "          (proj): DynamicQuantizedLinear(in_features=2048, out_features=2048, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "        )\n",
+      "        (mlp): ModuleDict(\n",
+      "          (fc1): DynamicQuantizedLinear(in_features=2048, out_features=8192, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "          (fc2): DynamicQuantizedLinear(in_features=8192, out_features=2048, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "    (post_ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
+      "    (lm_head): DynamicQuantizedLinear(in_features=2048, out_features=51200, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "  )\n",
+      "  (rotary_emb): RotaryPositionalEmbeddings()\n",
+      "  (region): ModuleDict(\n",
+      "    (coord_encoder): DynamicQuantizedLinear(in_features=256, out_features=2048, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "    (coord_decoder): ModuleDict(\n",
+      "      (fc1): DynamicQuantizedLinear(in_features=2048, out_features=8192, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "      (fc2): DynamicQuantizedLinear(in_features=8192, out_features=1024, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "    )\n",
+      "    (size_encoder): DynamicQuantizedLinear(in_features=512, out_features=2048, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "    (size_decoder): ModuleDict(\n",
+      "      (fc1): DynamicQuantizedLinear(in_features=2048, out_features=8192, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "      (fc2): DynamicQuantizedLinear(in_features=8192, out_features=2048, dtype=torch.qint8, qscheme=torch.per_tensor_affine)\n",
+      "    )\n",
+      "  )\n",
+      ")"
+     ]
+    },
+    "execution_count": 5,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "model"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 4,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "model size: 203.238MB\n"
+    ]
+   }
+  ],
+  "source": [
+   "param_size = 0\n",
+   "for param in model.parameters():\n",
+   "    param_size += param.nelement() * param.element_size()\n",
+   "buffer_size = 0\n",
+   "for buffer in model.buffers():\n",
+   "    buffer_size += buffer.nelement() * buffer.element_size()\n",
+   "\n",
+   "size_all_mb = (param_size + buffer_size) / 1024**2\n",
+   "print('model size: {:.3f}MB'.format(size_all_mb))"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 3,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "model = model_quantized.to(device)"
+  ]
+ },
 {
   "cell_type": "code",
-  "execution_count": 8,
+  "execution_count": 4,
   "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "ename": "RuntimeError",
+    "evalue": "self and mat2 must have the same dtype, but got Half and QInt8",
+    "output_type": "error",
+    "traceback": [
+     "---------------------------------------------------------------------------",
+     "RuntimeError                              Traceback (most recent call last)",
+     "Cell In[4], line 5\n----> 5 points = model.point(image, query)[\"points\"]",
+     "File ~/Desktop/moondream/moondream2/moondream.py:396, in MoondreamModel.point(self, image, object, settings)\n--> 396 image = self.encode_image(image)",
+     "File ~/Desktop/moondream/moondream2/moondream.py:174, in MoondreamModel.encode_image(self, image)\n--> 174 img_emb = self._run_vision_encoder(image)",
+     "File ~/Desktop/moondream/moondream2/moondream.py:143, in MoondreamModel._run_vision_encoder(self, image)\n--> 143 outputs = self._vis_enc(all_crops)",
+     "File ~/Desktop/moondream/moondream2/moondream.py:116, in MoondreamModel._vis_enc(self, x)\n--> 116 return vision_encoder(x, self.vision, self.config.vision)",
+     "File ~/Desktop/moondream/moondream2/vision.py:71, in vision_encoder(input_BCHW, w, config)\n---> 71 x = linear(x, w.patch_emb)",
+     "File ~/Desktop/moondream/moondream2/layers.py:19, in linear(x, w)\n---> 19 return F.linear(x, w.weight(), w.bias())",
+     "RuntimeError: self and mat2 must have the same dtype, but got Half and QInt8"
+    ]
+   }
+  ],
   "source": [
    "from PIL import Image\n",
-   "image = Image.open(\"example.png\")\n",
-   "query = \"home icon at the bottom of the screen is visible\"\n",
-   "points = model.point(image, query)[\"points\"]\n"
+   "with torch.inference_mode():\n",
+   "    image = Image.open(\"example.png\")\n",
+   "    query = \"home icon at the bottom of the screen is visible\"\n",
+   "    points = model.point(image, query)[\"points\"]\n"
   ]
  },
@@ -303,96 +527,36 @@
  },
 {
   "cell_type": "code",
-  "execution_count": 1,
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
-   "\n",
-   "model = AutoModelForCausalLM.from_pretrained(\n",
-   "    \"vikhyatk/moondream2\",\n",
-   "    revision=\"2025-04-14\",\n",
-   "    trust_remote_code=True,\n",
-   "    cache_dir=\"moondream\",\n",
-   "    # Uncomment to run on GPU.\n",
-   "    device_map={\"\": \"cuda\"}\n",
-   ")"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": 7,
+  "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-      "HfMoondream(\n",
-      "  (model): MoondreamModel(\n",
-      "    (vision): ModuleDict(\n",
-      "      (patch_emb): Linear(in_features=588, out_features=1152, bias=True)\n",
-      "      (blocks): ModuleList(\n",
-      "        (0-26): 27 x ModuleDict(\n",
-      "          (ln1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
-      "          (attn): ModuleDict(\n",
-      "            (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n",
-      "            (proj): Linear(in_features=1152, out_features=1152, bias=True)\n",
-      "          )\n",
-      "          (ln2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
-      "          (mlp): ModuleDict(\n",
-      "            (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n",
-      "            (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n",
-      "          )\n",
-      "        )\n",
-      "      )\n",
-      "      (post_ln): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n",
-      "      (proj_mlp): ModuleDict(\n",
-      "        (fc1): Linear(in_features=2304, out_features=8192, bias=True)\n",
-      "        (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n",
-      "      )\n",
-      "    )\n",
-      "    (text): ModuleDict(\n",
-      "      (blocks): ModuleList(\n",
-      "        (0-23): 24 x ModuleDict(\n",
-      "          (ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-      "          (attn): ModuleDict(\n",
-      "            (qkv): Linear(in_features=2048, out_features=6144, bias=True)\n",
-      "            (proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
-      "          )\n",
-      "          (mlp): ModuleDict(\n",
-      "            (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n",
-      "            (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n",
-      "          )\n",
-      "        )\n",
-      "      )\n",
-      "      (post_ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n",
-      "      (lm_head): Linear(in_features=2048, out_features=51200, bias=True)\n",
-      "    )\n",
-      "    (region): ModuleDict(\n",
-      "      (coord_encoder): Linear(in_features=256, out_features=2048, bias=True)\n",
-      "      (coord_decoder): ModuleDict(\n",
-      "        (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n",
-      "        (fc2): Linear(in_features=8192, out_features=1024, bias=True)\n",
-      "      )\n",
-      "      (size_encoder): Linear(in_features=512, out_features=2048, bias=True)\n",
-      "      (size_decoder): ModuleDict(\n",
-      "        (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n",
-      "        (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n",
-      "      )\n",
-      "    )\n",
-      "  )\n",
-      ")"
+      "Linear(in_features=10, out_features=10, bias=True)"
      ]
     },
-    "execution_count": 7,
+    "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-   "model.point()"
+   "import torch\n",
+   "import torch.nn as nn\n",
+   "\n",
+   "linear = nn.Linear(10, 10, dtype=torch.float16)\n",
+   "\n",
+   "linear"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": []
+ },
 {
   "cell_type": "code",
   "execution_count": null,
ollama.ipynb CHANGED
@@ -1,166 +1,365 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
- "from transformers import AutoConfig\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  "\n",
11
- "config = AutoConfig.from_pretrained(\"vikhyatk/moondream2\", trust_remote_code=True)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  ]
13
  },
14
  {
15
  "cell_type": "code",
16
  "execution_count": 2,
17
  "metadata": {},
18
- "outputs": [
19
- {
20
- "data": {
21
- "text/plain": [
22
- "{'return_dict': True,\n",
23
- " 'output_hidden_states': False,\n",
24
- " 'output_attentions': False,\n",
25
- " 'torchscript': False,\n",
26
- " 'torch_dtype': 'float16',\n",
27
- " 'use_bfloat16': False,\n",
28
- " 'tf_legacy_loss': False,\n",
29
- " 'pruned_heads': {},\n",
30
- " 'tie_word_embeddings': True,\n",
31
- " 'chunk_size_feed_forward': 0,\n",
32
- " 'is_encoder_decoder': False,\n",
33
- " 'is_decoder': False,\n",
34
- " 'cross_attention_hidden_size': None,\n",
35
- " 'add_cross_attention': False,\n",
36
- " 'tie_encoder_decoder': False,\n",
37
- " 'max_length': 20,\n",
38
- " 'min_length': 0,\n",
39
- " 'do_sample': False,\n",
40
- " 'early_stopping': False,\n",
41
- " 'num_beams': 1,\n",
42
- " 'num_beam_groups': 1,\n",
43
- " 'diversity_penalty': 0.0,\n",
44
- " 'temperature': 1.0,\n",
45
- " 'top_k': 50,\n",
46
- " 'top_p': 1.0,\n",
47
- " 'typical_p': 1.0,\n",
48
- " 'repetition_penalty': 1.0,\n",
49
- " 'length_penalty': 1.0,\n",
50
- " 'no_repeat_ngram_size': 0,\n",
51
- " 'encoder_no_repeat_ngram_size': 0,\n",
52
- " 'bad_words_ids': None,\n",
53
- " 'num_return_sequences': 1,\n",
54
- " 'output_scores': False,\n",
55
- " 'return_dict_in_generate': False,\n",
56
- " 'forced_bos_token_id': None,\n",
57
- " 'forced_eos_token_id': None,\n",
58
- " 'remove_invalid_values': False,\n",
59
- " 'exponential_decay_length_penalty': None,\n",
60
- " 'suppress_tokens': None,\n",
61
- " 'begin_suppress_tokens': None,\n",
62
- " 'architectures': ['HfMoondream'],\n",
63
- " 'finetuning_task': None,\n",
64
- " 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'},\n",
65
- " 'label2id': {'LABEL_0': 0, 'LABEL_1': 1},\n",
66
- " 'tokenizer_class': None,\n",
67
- " 'prefix': None,\n",
68
- " 'bos_token_id': None,\n",
69
- " 'pad_token_id': None,\n",
70
- " 'eos_token_id': None,\n",
71
- " 'sep_token_id': None,\n",
72
- " 'decoder_start_token_id': None,\n",
73
- " 'task_specific_params': None,\n",
74
- " 'problem_type': None,\n",
75
- " '_name_or_path': 'vikhyatk/moondream2',\n",
76
- " '_attn_implementation_autoset': False,\n",
77
- " 'transformers_version': '4.49.0',\n",
78
- " 'auto_map': {'AutoConfig': 'vikhyatk/moondream2--hf_moondream.HfConfig',\n",
79
- " 'AutoModelForCausalLM': 'vikhyatk/moondream2--hf_moondream.HfMoondream'},\n",
80
- " 'config': {},\n",
81
- " 'model_type': 'moondream1'}"
82
- ]
83
- },
84
- "execution_count": 2,
85
- "metadata": {},
86
- "output_type": "execute_result"
87
- }
88
- ],
89
  "source": [
90
- "config.to_dict()\n"
 
 
 
 
 
 
 
 
 
91
  ]
92
  },
93
  {
94
  "cell_type": "code",
95
- "execution_count": 4,
96
  "metadata": {},
97
  "outputs": [],
98
  "source": [
99
- "from moondream2.hf_moondream import HfConfig\n",
100
- "# serialize config.json to data variable\n",
101
- "import json\n",
102
- "with open(\"moondream2/config.json\", \"r\") as f:\n",
103
- " data = json.load(f)\n",
104
- "configo = HfConfig(**data)\n"
105
  ]
106
  },
107
  {
108
  "cell_type": "code",
109
- "execution_count": 5,
 
 
 
 
 
 
 
110
  "metadata": {},
111
  "outputs": [
112
  {
113
- "name": "stdout",
114
- "output_type": "stream",
115
- "text": [
116
- "_name_or_path : vikhyatk/moondream2 != \n",
117
- "auto_map : {'AutoConfig': 'vikhyatk/moondream2--hf_moondream.HfConfig', 'AutoModelForCausalLM': 'vikhyatk/moondream2--hf_moondream.HfMoondream'} != {'AutoConfig': 'hf_moondream.HfConfig', 'AutoModelForCausalLM': 'hf_moondream.HfMoondream'}\n"
118
- ]
 
 
 
119
  }
120
  ],
121
  "source": [
122
- "# compare dicts of config.to_dict() and configo.to_dict() \n",
123
- "\n",
124
- "# make it check values of the dicts\n",
125
- "\n",
126
- "for key, value in config.to_dict().items():\n",
127
- " if key not in configo.to_dict():\n",
128
- " print(key + \" : \" + str(value) + \" not in hf_config\")\n",
129
- " elif value != configo.to_dict()[key]:\n",
130
- " print(key+ \" : \"+str(value)+\" != \"+str(configo.to_dict()[key]))\n",
131
- "\n",
132
- "for key, value in configo.to_dict().items():\n",
133
- " if key not in config.to_dict():\n",
134
- " print(key + \" : \" + str(value) + \" not in from_pretrained\")\n",
135
- "\n",
136
- "hparams = config.to_dict()\n"
137
  ]
138
  },
139
  {
140
  "cell_type": "code",
141
- "execution_count": 24,
142
  "metadata": {},
143
  "outputs": [],
 
 
 
 
 
144
  "source": [
145
- "text_config = hparams.get(\"text_config\", {})"
 
146
  ]
147
  },
148
  {
149
  "cell_type": "code",
150
- "execution_count": 26,
151
  "metadata": {},
152
  "outputs": [],
153
- "source": [
154
- "text_config.get(\"architectures\")\n"
155
- ]
156
  },
157
  {
158
  "cell_type": "code",
159
- "execution_count": 27,
160
  "metadata": {},
161
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  "source": [
163
- "hparams.get(\"vision_config\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  ]
165
  },
166
  {
@@ -187,7 +386,7 @@
187
  "name": "python",
188
  "nbconvert_exporter": "python",
189
  "pygments_lexer": "ipython3",
190
- "version": "3.12.4"
191
  }
192
  },
193
  "nbformat": 4,
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": []
9
+ },
10
  {
11
  "cell_type": "code",
12
  "execution_count": 1,
13
  "metadata": {},
14
  "outputs": [],
15
  "source": [
16
+ "import torch\n",
17
+ "import torch.nn as nn\n",
18
+ "\n",
19
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
20
+ "\n",
21
+ "class RotaryEmbeddingInPlace(nn.Module):\n",
22
+ " def __init__(self, head_dim: int, max_seq_len: int, theta: float = 10000.0):\n",
23
+ " super().__init__()\n",
24
+ " # Match RotaryEmbedding exactly\n",
25
+ " self.rot_dim = head_dim // 2 # Only half of head_dim is rotated\n",
26
+ " \n",
27
+ " # Frequency calculation - match RotaryEmbedding exactly\n",
28
+ " freqs = 1.0 / (theta ** (torch.arange(0, self.rot_dim, 2).float() / self.rot_dim))\n",
29
+ " t = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)\n",
30
+ " freqs = t * freqs.unsqueeze(0)\n",
31
+ " \n",
32
+ "\n",
33
+ " freqs_cis = torch.exp(1j * freqs)\n",
34
+ " cos_vals = freqs_cis.real\n",
35
+ " sin_vals = freqs_cis.imag\n",
36
  "\n",
37
+ " self.register_buffer('cos_cache', cos_vals, persistent=False)\n",
38
+ " self.register_buffer('sin_cache', sin_vals, persistent=False)\n",
39
+ " \n",
40
+ " def apply(self, x: torch.Tensor) -> torch.Tensor:\n",
41
+ " \"\"\"\n",
42
+ " WARNING: This modifies the input tensor in-place for maximum speed!\n",
43
+ " If you need the original tensor, make a copy before calling this.\n",
44
+ " \n",
45
+ " Must match RotaryEmbedding output exactly.\n",
46
+ " \"\"\"\n",
47
+ " seq_len = x.shape[1]\n",
48
+ " d = self.rot_dim // 2\n",
49
+ " \n",
50
+ " # Get cos/sin with same broadcasting as RotaryEmbedding\n",
51
+ " cos = self.cos_cache[:seq_len].unsqueeze(0).unsqueeze(2)\n",
52
+ " sin = self.sin_cache[:seq_len].unsqueeze(0).unsqueeze(2)\n",
53
+ " \n",
54
+ " # Split rotated part into real/imaginary components\n",
55
+ " xq_r = x[..., :d] # First half of rot_dim\n",
56
+ " xq_i = x[..., d:d*2] # Second half of rot_dim\n",
57
+ " \n",
58
+ " # Apply rotation\n",
59
+ " xq_out_r = xq_r * cos - xq_i * sin\n",
60
+ " xq_out_i = xq_r * sin + xq_i * cos\n",
61
+ " \n",
62
+ " # Vectorized interleaving using torch.stack and view\n",
63
+ " # Stack creates [d, ..., 2] then view as [..., d*2]\n",
64
+ " x[..., :self.rot_dim] = torch.stack([xq_out_r, xq_out_i], dim=-1).view(*x.shape[:-1], self.rot_dim)\n",
65
+ " \n",
66
+ " # x_pass part (x[..., self.rot_dim:]) remains unchanged automatically\n",
67
+ " \n",
68
+ " return x\n"
69
  ]
70
  },
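The comments in the cell above say the class must match `RotaryEmbedding` exactly, but that reference class is not part of this diff. For context, a minimal out-of-place sketch that is consistent with the in-place code (an assumption, not the actual moondream2 implementation):

```python
import torch
import torch.nn as nn

# Assumed reference implementation: same cos/sin cache, but returns a new
# tensor instead of overwriting the rotated half of the input.
class RotaryEmbedding(nn.Module):
    def __init__(self, head_dim: int, max_seq_len: int, theta: float = 10000.0):
        super().__init__()
        self.rot_dim = head_dim // 2
        freqs = 1.0 / (theta ** (torch.arange(0, self.rot_dim, 2).float() / self.rot_dim))
        t = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
        freqs_cis = torch.exp(1j * (t * freqs.unsqueeze(0)))
        self.register_buffer("cos_cache", freqs_cis.real, persistent=False)
        self.register_buffer("sin_cache", freqs_cis.imag, persistent=False)

    def apply(self, x: torch.Tensor) -> torch.Tensor:
        seq_len = x.shape[1]
        d = self.rot_dim // 2
        cos = self.cos_cache[:seq_len].unsqueeze(0).unsqueeze(2)
        sin = self.sin_cache[:seq_len].unsqueeze(0).unsqueeze(2)
        xq_r, xq_i = x[..., :d], x[..., d : d * 2]
        rotated = torch.stack(
            [xq_r * cos - xq_i * sin, xq_r * sin + xq_i * cos], dim=-1
        ).view(*x.shape[:-1], self.rot_dim)
        # Out-of-place: concatenate the rotated half with the pass-through part.
        return torch.cat([rotated, x[..., self.rot_dim:]], dim=-1)
```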
71
  {
72
  "cell_type": "code",
73
  "execution_count": 2,
74
  "metadata": {},
75
+ "outputs": [],
76
  "source": [
77
+ "dim_per_head = 64\n",
78
+ "n_heads = 32\n",
79
+ "max_context = 2048\n",
80
+ "\n",
81
+ "freq_dim = dim_per_head // 2\n",
82
+ "\n",
83
+ "torch.manual_seed(42)\n",
84
+ "\n",
85
+ "tensor = torch.rand(1, 730, n_heads, dim_per_head)\n",
86
+ "tensor = tensor.to(device)\n"
87
  ]
88
  },
89
  {
90
  "cell_type": "code",
91
+ "execution_count": 3,
92
  "metadata": {},
93
  "outputs": [],
94
  "source": [
95
+ "fast_rope = RotaryEmbeddingInPlace(dim_per_head, max_context)\n",
96
+ "fast_rope.to(device)\n",
97
+ "fast_rtensor = fast_rope.apply(tensor)\n",
98
+ "\n",
99
+ "\n"
100
  ]
101
  },
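Because `apply()` mutates its argument, `fast_rtensor` above is the same storage as `tensor` rather than a copy; the original random values are gone after this cell. A quick way to see the aliasing:

```python
# In-place rotation returns its own input, so both names share one buffer.
assert fast_rtensor.data_ptr() == tensor.data_ptr()
```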
102
  {
103
  "cell_type": "code",
104
+ "execution_count": null,
105
+ "metadata": {},
106
+ "outputs": [],
107
+ "source": []
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 4,
112
  "metadata": {},
113
  "outputs": [
114
  {
115
+ "data": {
116
+ "text/plain": [
117
+ "tensor([0.8823, 0.8854, 0.9150, 0.5739, 0.3829, 0.2666, 0.9593, 0.6274, 0.3904,\n",
118
+ " 0.2696, 0.6009, 0.4414, 0.2566, 0.2969, 0.7936], device='cuda:0')"
119
+ ]
120
+ },
121
+ "execution_count": 4,
122
+ "metadata": {},
123
+ "output_type": "execute_result"
124
  }
125
  ],
126
  "source": [
127
+ "fast_rtensor.flatten()[:15]"
128
  ]
129
  },
130
  {
131
  "cell_type": "code",
132
+ "execution_count": null,
133
  "metadata": {},
134
  "outputs": [],
135
+ "source": []
136
+ },
137
+ {
138
+ "cell_type": "markdown",
139
+ "metadata": {},
140
  "source": [
141
+ "tensor([0.8823, 0.8854, 0.9150, 0.5739, 0.3829, 0.2666, 0.9593, 0.6274, 0.3904,\n",
142
+ " 0.2696, 0.6009, 0.4414, 0.2566, 0.2969, 0.7936])"
143
  ]
144
  },
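The markdown cell above records the expected first fifteen values for seed 42, so the in-place output can be eyeballed against the reference implementation. A programmatic check, assuming the `RotaryEmbedding` sketch given earlier, could be:

```python
# Both implementations are applied to clones of the same input, so exact
# numerical agreement validates the in-place rewrite.
ref_rope = RotaryEmbedding(dim_per_head, max_context).to(device)
out_ref = ref_rope.apply(tensor.clone())
out_fast = fast_rope.apply(tensor.clone())  # clone: apply() mutates its input
assert torch.allclose(out_ref, out_fast, atol=1e-6)
```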
145
  {
146
  "cell_type": "code",
147
+ "execution_count": 6,
148
  "metadata": {},
149
  "outputs": [],
150
+ "source": []
151
  },
152
  {
153
  "cell_type": "code",
154
+ "execution_count": 2,
155
  "metadata": {},
156
+ "outputs": [
157
+ {
158
+ "name": "stdout",
159
+ "output_type": "stream",
160
+ "text": [
161
+ "Benchmarking with tensor shape: torch.Size([1, 730, 32, 64])\n",
162
+ "Device: cuda:0\n",
163
+ "Warmup iterations: 10\n",
164
+ "Benchmark iterations: 100\n",
165
+ "--------------------------------------------------\n",
166
+ "Warming up regular rope...\n",
167
+ "Benchmarking regular rope...\n",
168
+ "Warming up fast rope...\n",
169
+ "Benchmarking fast rope...\n",
170
+ "\n",
171
+ "============================================================\n",
172
+ "BENCHMARK RESULTS\n",
173
+ "============================================================\n",
174
+ "\n",
175
+ "Regular Rope:\n",
176
+ " Mean: 0.338 ms\n",
177
+ " Median: 0.335 ms\n",
178
+ " Std: 0.009 ms\n",
179
+ " Min: 0.330 ms\n",
180
+ " Max: 0.385 ms\n",
181
+ "\n",
182
+ "Fast Rope (In-place):\n",
183
+ " Mean: 0.267 ms\n",
184
+ " Median: 0.265 ms\n",
185
+ " Std: 0.005 ms\n",
186
+ " Min: 0.261 ms\n",
187
+ " Max: 0.285 ms\n",
188
+ "\n",
189
+ "Speedup: 1.27x\n",
190
+ "Fast rope is 1.27x faster\n"
191
+ ]
192
+ }
193
+ ],
194
  "source": [
195
+ "import torch\n",
196
+ "import time\n",
197
+ "import statistics\n",
198
+ "from typing import List, Tuple\n",
199
+ "\n",
200
+ "def benchmark_rope_functions(\n",
201
+ " rope, \n",
202
+ " fast_rope, \n",
203
+ " tensor: torch.Tensor, \n",
204
+ " num_warmup: int = 10,\n",
205
+ " num_iterations: int = 100\n",
206
+ ") -> Tuple[float, float, List[float], List[float]]:\n",
207
+ " \"\"\"\n",
208
+ " Benchmark two rope functions, accounting for in-place modification.\n",
209
+ " \n",
210
+ " Args:\n",
211
+ " rope: Regular RotaryEmbedding instance\n",
212
+ " fast_rope: RotaryEmbeddingInPlace instance\n",
213
+ " tensor: Input tensor to benchmark with\n",
214
+ " num_warmup: Number of warmup iterations\n",
215
+ " num_iterations: Number of benchmark iterations\n",
216
+ " \n",
217
+ " Returns:\n",
218
+ " Tuple of (regular_avg_time, fast_avg_time, regular_times, fast_times)\n",
219
+ " \"\"\"\n",
220
+ " \n",
221
+ " # Ensure we're on the right device and in eval mode if applicable\n",
222
+ " device = tensor.device\n",
223
+ " \n",
224
+ " # Pre-allocate tensor copies to avoid allocation overhead during timing\n",
225
+ " tensor_copies = []\n",
226
+ " for _ in range(num_warmup + num_iterations):\n",
227
+ " tensor_copies.append(tensor.clone().detach())\n",
228
+ " \n",
229
+ " print(f\"Benchmarking with tensor shape: {tensor.shape}\")\n",
230
+ " print(f\"Device: {device}\")\n",
231
+ " print(f\"Warmup iterations: {num_warmup}\")\n",
232
+ " print(f\"Benchmark iterations: {num_iterations}\")\n",
233
+ " print(\"-\" * 50)\n",
234
+ " \n",
235
+ " # Warmup phase for regular rope\n",
236
+ " print(\"Warming up regular rope...\")\n",
237
+ " for i in range(num_warmup):\n",
238
+ " _ = rope.apply(tensor)\n",
239
+ " if device.type == 'cuda':\n",
240
+ " torch.cuda.synchronize()\n",
241
+ " \n",
242
+ " # Benchmark regular rope\n",
243
+ " print(\"Benchmarking regular rope...\")\n",
244
+ " regular_times = []\n",
245
+ " for i in range(num_iterations):\n",
246
+ " if device.type == 'cuda':\n",
247
+ " torch.cuda.synchronize()\n",
248
+ " \n",
249
+ " start_time = time.perf_counter()\n",
250
+ " result = rope.apply(tensor)\n",
251
+ " \n",
252
+ " if device.type == 'cuda':\n",
253
+ " torch.cuda.synchronize()\n",
254
+ " \n",
255
+ " end_time = time.perf_counter()\n",
256
+ " regular_times.append((end_time - start_time) * 1000) # Convert to milliseconds\n",
257
+ " \n",
258
+ " # Warmup phase for fast rope (in-place)\n",
259
+ " print(\"Warming up fast rope...\")\n",
260
+ " for i in range(num_warmup):\n",
261
+ " test_tensor = tensor_copies[i].clone() # Use a copy for warmup\n",
262
+ " _ = fast_rope.apply(test_tensor)\n",
263
+ " if device.type == 'cuda':\n",
264
+ " torch.cuda.synchronize()\n",
265
+ " \n",
266
+ " # Benchmark fast rope (in-place)\n",
267
+ " print(\"Benchmarking fast rope...\")\n",
268
+ " fast_times = []\n",
269
+ " copy_idx = num_warmup # Start from after warmup copies\n",
270
+ " \n",
271
+ " for i in range(num_iterations):\n",
272
+ " # Use pre-allocated copy\n",
273
+ " tensor_copy = tensor_copies[copy_idx + i]\n",
274
+ " \n",
275
+ " if device.type == 'cuda':\n",
276
+ " torch.cuda.synchronize()\n",
277
+ " \n",
278
+ " # Time only the apply operation, not the copy\n",
279
+ " start_time = time.perf_counter()\n",
280
+ " result = fast_rope.apply(tensor_copy)\n",
281
+ " \n",
282
+ " if device.type == 'cuda':\n",
283
+ " torch.cuda.synchronize()\n",
284
+ " \n",
285
+ " end_time = time.perf_counter()\n",
286
+ " fast_times.append((end_time - start_time) * 1000) # Convert to milliseconds\n",
287
+ " \n",
288
+ " # Calculate statistics\n",
289
+ " regular_avg = statistics.mean(regular_times)\n",
290
+ " fast_avg = statistics.mean(fast_times)\n",
291
+ " \n",
292
+ " return regular_avg, fast_avg, regular_times, fast_times\n",
293
+ "\n",
294
+ "def print_benchmark_results(regular_avg: float, fast_avg: float, \n",
295
+ " regular_times: List[float], fast_times: List[float]):\n",
296
+ " \"\"\"Print detailed benchmark results.\"\"\"\n",
297
+ " \n",
298
+ " regular_median = statistics.median(regular_times)\n",
299
+ " regular_std = statistics.stdev(regular_times) if len(regular_times) > 1 else 0\n",
300
+ " regular_min = min(regular_times)\n",
301
+ " regular_max = max(regular_times)\n",
302
+ " \n",
303
+ " fast_median = statistics.median(fast_times)\n",
304
+ " fast_std = statistics.stdev(fast_times) if len(fast_times) > 1 else 0\n",
305
+ " fast_min = min(fast_times)\n",
306
+ " fast_max = max(fast_times)\n",
307
+ " \n",
308
+ " speedup = regular_avg / fast_avg if fast_avg > 0 else float('inf')\n",
309
+ " \n",
310
+ " print(\"\\n\" + \"=\" * 60)\n",
311
+ " print(\"BENCHMARK RESULTS\")\n",
312
+ " print(\"=\" * 60)\n",
313
+ " \n",
314
+ " print(f\"\\nRegular Rope:\")\n",
315
+ " print(f\" Mean: {regular_avg:.3f} ms\")\n",
316
+ " print(f\" Median: {regular_median:.3f} ms\")\n",
317
+ " print(f\" Std: {regular_std:.3f} ms\")\n",
318
+ " print(f\" Min: {regular_min:.3f} ms\")\n",
319
+ " print(f\" Max: {regular_max:.3f} ms\")\n",
320
+ " \n",
321
+ " print(f\"\\nFast Rope (In-place):\")\n",
322
+ " print(f\" Mean: {fast_avg:.3f} ms\")\n",
323
+ " print(f\" Median: {fast_median:.3f} ms\")\n",
324
+ " print(f\" Std: {fast_std:.3f} ms\")\n",
325
+ " print(f\" Min: {fast_min:.3f} ms\")\n",
326
+ " print(f\" Max: {fast_max:.3f} ms\")\n",
327
+ " \n",
328
+ " print(f\"\\nSpeedup: {speedup:.2f}x\")\n",
329
+ " if speedup > 1:\n",
330
+ " print(f\"Fast rope is {speedup:.2f}x faster\")\n",
331
+ " else:\n",
332
+ " print(f\"Regular rope is {1/speedup:.2f}x faster\")\n",
333
+ "\n",
334
+ "# Example usage\n",
335
+ "def run_benchmark():\n",
336
+ " \"\"\"\n",
337
+ " Example of how to use the benchmark functions.\n",
338
+ " Replace with your actual RotaryEmbedding classes.\n",
339
+ " \"\"\"\n",
340
+ " \n",
341
+ " # Example parameters - adjust these to match your setup\n",
342
+ " dim_per_head = 64\n",
343
+ " n_heads = 32\n",
344
+ " max_context = 2048\n",
345
+ "\n",
346
+ " freq_dim = dim_per_head // 2\n",
347
+ "\n",
348
+ " torch.manual_seed(42)\n",
349
+ "\n",
350
+ " \n",
351
+ " # Create your rope instances\n",
352
+ " rope = RotaryEmbedding(dim_per_head, max_context)\n",
353
+ " fast_rope = RotaryEmbeddingInPlace(dim_per_head, max_context)\n",
354
+ " \n",
355
+ " # Create test tensor - adjust shape to match your use case\n",
356
+ " tensor = torch.rand(1, 730, n_heads, dim_per_head, device=device)\n",
357
+ "\n",
358
+ " regular_avg, fast_avg, regular_times, fast_times = benchmark_rope_functions(rope, fast_rope, tensor)\n",
359
+ " print_benchmark_results(regular_avg, fast_avg, regular_times, fast_times)\n",
360
+ "\n",
361
+ "if __name__ == \"__main__\":\n",
362
+ " run_benchmark()"
363
  ]
364
  },
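The hand-rolled harness above manages warmup, CUDA synchronization, and pre-allocated input copies itself. A shorter alternative is `torch.utils.benchmark`, sketched below; note that, unlike the harness above, the clone here is inside the timed statement:

```python
import torch.utils.benchmark as benchmark

# Timer handles warmup and CUDA synchronization internally.
timer = benchmark.Timer(
    stmt="fast_rope.apply(x.clone())",  # clone so each run sees fresh input
    globals={"fast_rope": fast_rope, "x": tensor},
)
print(timer.timeit(100))
```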
365
  {
386
  "name": "python",
387
  "nbconvert_exporter": "python",
388
  "pygments_lexer": "ipython3",
389
+ "version": "3.13.3"
390
  }
391
  },
392
  "nbformat": 4,