model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b73a606d306a09519e3fbe7bfd29077d39db48fee47ce19521b6b5c398cdcc32
3
- size 4054187824
 
 
 
 
model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6941d35ff1feae1603946f8746a71205bb86343b57968402df2e737faf9258a2
3
- size 1244659840
 
 
 
 
model-00001-of-00003.safetensors → model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f60b6bc3c8ed16d95c88b5b6d33101d0aa9464f5f3f33e204342859b12e371bb
3
- size 4969539560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b84184ec79aa409631e68dc76c3133bc1bbae76d61842fdcd4fe553dd6a3b579
3
+ size 10268388224
modular_isaac.py CHANGED
@@ -96,7 +96,6 @@ from transformers.models.siglip2.modeling_siglip2 import (
96
  Siglip2Attention,
97
  Siglip2Encoder,
98
  Siglip2EncoderLayer,
99
- Siglip2VisionTransformer
100
  )
101
 
102
 
@@ -973,6 +972,23 @@ class IsaacVisionTransformer(nn.Module):
973
  # Return the full sequence of embeddings
974
  return hidden_states
975
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
976
 
977
  class IsaacVisionEmbedding(nn.Module):
978
  """Vision embedding wrapper exposing tower and projector."""
@@ -982,14 +998,8 @@ class IsaacVisionEmbedding(nn.Module):
982
  def __init__(self, config: IsaacConfig):
983
  super().__init__()
984
  vision_cfg = config.vision_config
985
- hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2)
986
-
987
  self.vision_tower = IsaacVisionTransformer(vision_cfg)
988
- self.multimodal_projector = nn.Sequential(
989
- nn.Linear(hidden_dim, 4 * hidden_dim, bias=False),
990
- nn.SiLU(),
991
- nn.Linear(4 * hidden_dim, config.hidden_size, bias=False),
992
- )
993
 
994
  def forward(self, vision_tokens: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
995
  hidden_states = self.vision_tower(vision_tokens)
@@ -1569,8 +1579,9 @@ class IsaacModel(Qwen3PreTrainedModel):
1569
  raise ValueError("IsaacConfig should always have vision_config")
1570
 
1571
 
1572
- self.vision_embedding = IsaacVisionEmbedding(config)
1573
- self.vision_embedding._supports_sdpa = True
 
1574
 
1575
  # Dispatch table for TensorStream balanced embedding (text + vision)
1576
  self.embed_fns = {
@@ -1622,10 +1633,6 @@ class IsaacModel(Qwen3PreTrainedModel):
1622
  def vision_model(self) -> nn.Module:
1623
  return self.vision_embedding.vision_tower
1624
 
1625
- @property
1626
- def vision_tower(self) -> nn.Module:
1627
- return self.vision_embedding.vision_tower
1628
-
1629
  def embed_text_tokens(self, token_ids: torch.Tensor) -> torch.Tensor:
1630
  """Embed text tokens, squeezing singleton dimensions."""
1631
  # Text events are shaped as (..., 1); squeeze the singleton index dim
@@ -1637,7 +1644,7 @@ class IsaacModel(Qwen3PreTrainedModel):
1637
  def embed_vision(self, vision_tokens: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
1638
  """Embed vision tokens using the vision encoder."""
1639
  # vision tokens is (seq_patches, token_grids)
1640
- return self.vision_embedding(vision_tokens)
1641
 
1642
  def embed_stream(self, tensor_stream: TensorStream) -> torch.Tensor:
1643
  """
@@ -2096,7 +2103,8 @@ def _compute_residual_p_frames(frames: torch.Tensor, is_p_frame: list[bool]) ->
2096
  __all__ = [
2097
  "IsaacConfig",
2098
  "IsaacModel",
 
2099
  "IsaacForConditionalGeneration",
2100
  "IsaacImageProcessorFast",
2101
  "IsaacProcessor",
2102
- ]
 
96
  Siglip2Attention,
97
  Siglip2Encoder,
98
  Siglip2EncoderLayer,
 
99
  )
100
 
101
 
 
972
  # Return the full sequence of embeddings
973
  return hidden_states
974
 
975
class IsaacMultiModalProjector(nn.Module):
    """Bias-free two-layer MLP that maps vision features to the backbone width.

    The input width is ``config.vision_config.hidden_size`` multiplied by
    ``config.vision_config.pixel_shuffle_scale_factor ** 2``. The hidden layer
    is four times that width and the output width is ``config.hidden_size``.

    The submodule names ``linear_1``, ``silu`` and ``linear_2`` determine the
    ``state_dict`` keys, so they must not be renamed.
    """

    def __init__(self, config: IsaacConfig):
        """Create the two linear layers and the SiLU activation from ``config``."""
        super().__init__()
        vision_cfg = config.vision_config
        self.vision_hidden_size = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2)
        self.backbone_hidden_size = config.hidden_size
        # Layers are created in the same order as before so parameter
        # initialisation consumes the RNG identically.
        self.linear_1 = nn.Linear(self.vision_hidden_size, 4 * self.vision_hidden_size, bias=False)
        self.silu = nn.SiLU()
        self.linear_2 = nn.Linear(4 * self.vision_hidden_size, self.backbone_hidden_size, bias=False)

    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
        """Apply ``linear_1`` -> SiLU -> ``linear_2`` to ``image_features``.

        Args:
            image_features: Tensor whose last dimension equals
                ``self.vision_hidden_size``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            ``self.backbone_hidden_size``.
        """
        return self.linear_2(self.silu(self.linear_1(image_features)))
991
+
992
 
993
  class IsaacVisionEmbedding(nn.Module):
994
  """Vision embedding wrapper exposing tower and projector."""
 
998
  def __init__(self, config: IsaacConfig):
999
  super().__init__()
1000
  vision_cfg = config.vision_config
 
 
1001
  self.vision_tower = IsaacVisionTransformer(vision_cfg)
1002
+ self.multimodal_projector = IsaacMultiModalProjector(config)
 
 
 
 
1003
 
1004
  def forward(self, vision_tokens: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
1005
  hidden_states = self.vision_tower(vision_tokens)
 
1579
  raise ValueError("IsaacConfig should always have vision_config")
1580
 
1581
 
1582
+
1583
+ self.vision_tower = IsaacVisionTransformer(config.vision_config)
1584
+ self.multimodal_projector = IsaacMultiModalProjector(config)
1585
 
1586
  # Dispatch table for TensorStream balanced embedding (text + vision)
1587
  self.embed_fns = {
 
1633
  def vision_model(self) -> nn.Module:
1634
  return self.vision_embedding.vision_tower
1635
 
 
 
 
 
1636
  def embed_text_tokens(self, token_ids: torch.Tensor) -> torch.Tensor:
1637
  """Embed text tokens, squeezing singleton dimensions."""
1638
  # Text events are shaped as (..., 1); squeeze the singleton index dim
 
1644
  def embed_vision(self, vision_tokens: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
1645
  """Embed vision tokens using the vision encoder."""
1646
  # vision tokens is (seq_patches, token_grids)
1647
+ return self.multimodal_projector(self.vision_tower(vision_tokens))
1648
 
1649
  def embed_stream(self, tensor_stream: TensorStream) -> torch.Tensor:
1650
  """
 
2103
  __all__ = [
2104
  "IsaacConfig",
2105
  "IsaacModel",
2106
+ "IsaacPreTrainedModel", # noqa: F822
2107
  "IsaacForConditionalGeneration",
2108
  "IsaacImageProcessorFast",
2109
  "IsaacProcessor",
2110
+ ]