philperceptron committed on
Commit
3566174
·
1 Parent(s): c00ac74

checkpoint restructure

Browse files
model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b73a606d306a09519e3fbe7bfd29077d39db48fee47ce19521b6b5c398cdcc32
3
- size 4054187824
 
 
 
 
model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6941d35ff1feae1603946f8746a71205bb86343b57968402df2e737faf9258a2
3
- size 1244659840
 
 
 
 
model-00001-of-00003.safetensors → model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f60b6bc3c8ed16d95c88b5b6d33101d0aa9464f5f3f33e204342859b12e371bb
3
- size 4969539560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b84184ec79aa409631e68dc76c3133bc1bbae76d61842fdcd4fe553dd6a3b579
3
+ size 10268388224
modular_isaac.py CHANGED
@@ -1579,8 +1579,9 @@ class IsaacModel(Qwen3PreTrainedModel):
1579
  raise ValueError("IsaacConfig should always have vision_config")
1580
 
1581
 
1582
- self.vision_embedding = IsaacVisionEmbedding(config)
1583
- self.vision_embedding._supports_sdpa = True
 
1584
 
1585
  # Dispatch table for TensorStream balanced embedding (text + vision)
1586
  self.embed_fns = {
@@ -1632,10 +1633,6 @@ class IsaacModel(Qwen3PreTrainedModel):
1632
  def vision_model(self) -> nn.Module:
1633
  return self.vision_embedding.vision_tower
1634
 
1635
- @property
1636
- def vision_tower(self) -> nn.Module:
1637
- return self.vision_embedding.vision_tower
1638
-
1639
  def embed_text_tokens(self, token_ids: torch.Tensor) -> torch.Tensor:
1640
  """Embed text tokens, squeezing singleton dimensions."""
1641
  # Text events are shaped as (..., 1); squeeze the singleton index dim
@@ -1647,7 +1644,7 @@ class IsaacModel(Qwen3PreTrainedModel):
1647
  def embed_vision(self, vision_tokens: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
1648
  """Embed vision tokens using the vision encoder."""
1649
  # vision tokens is (seq_patches, token_grids)
1650
- return self.vision_embedding(vision_tokens)
1651
 
1652
  def embed_stream(self, tensor_stream: TensorStream) -> torch.Tensor:
1653
  """
@@ -2110,4 +2107,4 @@ __all__ = [
2110
  "IsaacForConditionalGeneration",
2111
  "IsaacImageProcessorFast",
2112
  "IsaacProcessor",
2113
- ]
 
1579
  raise ValueError("IsaacConfig should always have vision_config")
1580
 
1581
 
1582
+
1583
+ self.vision_tower = IsaacVisionTransformer(config.vision_config)
1584
+ self.multimodal_projector = IsaacMultiModalProjector(config)
1585
 
1586
  # Dispatch table for TensorStream balanced embedding (text + vision)
1587
  self.embed_fns = {
 
1633
  def vision_model(self) -> nn.Module:
1634
  return self.vision_embedding.vision_tower
1635
 
 
 
 
 
1636
  def embed_text_tokens(self, token_ids: torch.Tensor) -> torch.Tensor:
1637
  """Embed text tokens, squeezing singleton dimensions."""
1638
  # Text events are shaped as (..., 1); squeeze the singleton index dim
 
1644
  def embed_vision(self, vision_tokens: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
1645
  """Embed vision tokens using the vision encoder."""
1646
  # vision tokens is (seq_patches, token_grids)
1647
+ return self.multimodal_projector(self.vision_tower(vision_tokens))
1648
 
1649
  def embed_stream(self, tensor_stream: TensorStream) -> torch.Tensor:
1650
  """
 
2107
  "IsaacForConditionalGeneration",
2108
  "IsaacImageProcessorFast",
2109
  "IsaacProcessor",
2110
+ ]