5.0 structure fixes
#7
by philippguevorguian - opened
model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b73a606d306a09519e3fbe7bfd29077d39db48fee47ce19521b6b5c398cdcc32
-size 4054187824
model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6941d35ff1feae1603946f8746a71205bb86343b57968402df2e737faf9258a2
-size 1244659840
model-00001-of-00003.safetensors → model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b84184ec79aa409631e68dc76c3133bc1bbae76d61842fdcd4fe553dd6a3b579
+size 10268388224
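Net effect of the three file operations above: the sharded checkpoint is collapsed into a single model.safetensors (the previous oid and size of the renamed shard are not shown). For anyone reproducing the consolidation locally, here is a minimal sketch using the safetensors library; it assumes the shards hold disjoint tensor keys, as in a standard Hugging Face shard layout, and is not the script used for this PR:

```python
# Sketch: merge sharded safetensors files into one checkpoint.
# Assumption: shards carry disjoint tensor keys (standard HF shard layout).
from safetensors.torch import load_file, save_file

shards = [
    "model-00001-of-00003.safetensors",
    "model-00002-of-00003.safetensors",
    "model-00003-of-00003.safetensors",
]

state_dict = {}
for shard in shards:
    state_dict.update(load_file(shard))  # each shard maps tensor name -> tensor

save_file(state_dict, "model.safetensors", metadata={"format": "pt"})
```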
modular_isaac.py CHANGED
@@ -96,7 +96,6 @@ from transformers.models.siglip2.modeling_siglip2 import (
     Siglip2Attention,
     Siglip2Encoder,
     Siglip2EncoderLayer,
-    Siglip2VisionTransformer
 )
@@ -973,6 +972,23 @@ class IsaacVisionTransformer(nn.Module):
         # Return the full sequence of embeddings
         return hidden_states

+class IsaacMultiModalProjector(nn.Module):
+    def __init__(self, config: IsaacConfig):
+        super().__init__()
+        self.vision_hidden_size = config.vision_config.hidden_size * (
+            config.vision_config.pixel_shuffle_scale_factor**2
+        )
+        self.backbone_hidden_size = config.hidden_size
+        self.linear_1 = nn.Linear(self.vision_hidden_size, 4 * self.vision_hidden_size, bias=False)
+        self.silu = nn.SiLU()
+        self.linear_2 = nn.Linear(4 * self.vision_hidden_size, self.backbone_hidden_size, bias=False)
+
+    def forward(self, image_features):
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.silu(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
 class IsaacVisionEmbedding(nn.Module):
     """Vision embedding wrapper exposing tower and projector."""

@@ -982,14 +998,8 @@ class IsaacVisionEmbedding(nn.Module):
     def __init__(self, config: IsaacConfig):
         super().__init__()
         vision_cfg = config.vision_config
-        hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2)
-
         self.vision_tower = IsaacVisionTransformer(vision_cfg)
-        self.multimodal_projector = nn.Sequential(
-            nn.Linear(hidden_dim, 4 * hidden_dim, bias=False),
-            nn.SiLU(),
-            nn.Linear(4 * hidden_dim, config.hidden_size, bias=False),
-        )
+        self.multimodal_projector = IsaacMultiModalProjector(config)

     def forward(self, vision_tokens: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
         hidden_states = self.vision_tower(vision_tokens)
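The new IsaacMultiModalProjector is functionally identical to the inline nn.Sequential it replaces (Linear → SiLU → Linear, no biases); what changes is the module structure and therefore the parameter names. A quick equivalence check with toy dimensions (the sizes below are stand-ins, not the real config values):

```python
import torch
import torch.nn as nn

hidden_dim, out_dim = 64, 128  # stand-ins for vision_hidden_size / hidden_size

# Old layout: anonymous Sequential, parameters named "0.weight" / "2.weight".
old = nn.Sequential(
    nn.Linear(hidden_dim, 4 * hidden_dim, bias=False),
    nn.SiLU(),
    nn.Linear(4 * hidden_dim, out_dim, bias=False),
)

# New layout: named submodules, parameters "linear_1.weight" / "linear_2.weight".
linear_1 = nn.Linear(hidden_dim, 4 * hidden_dim, bias=False)
linear_2 = nn.Linear(4 * hidden_dim, out_dim, bias=False)
with torch.no_grad():
    linear_1.weight.copy_(old[0].weight)
    linear_2.weight.copy_(old[2].weight)

x = torch.randn(3, hidden_dim)
assert torch.allclose(old(x), linear_2(torch.nn.functional.silu(linear_1(x))))
```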
@@ -1569,8 +1579,9 @@ class IsaacModel(Qwen3PreTrainedModel):
             raise ValueError("IsaacConfig should always have vision_config")

-        self.vision_embedding = IsaacVisionEmbedding(config)
+        self.vision_tower = IsaacVisionTransformer(config.vision_config)
+        self.multimodal_projector = IsaacMultiModalProjector(config)

         # Dispatch table for TensorStream balanced embedding (text + vision)
         self.embed_fns = {
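These renames are the reason the checkpoint had to be re-uploaded: every vision parameter key changes, both through the wrapper prefix (vision_embedding.* moving to the top level of IsaacModel) and through the projector's Sequential indices becoming named layers. A hedged sketch of the corresponding key translation; the prefixes are inferred from the module names in this diff, not taken from an actual checkpoint:

```python
# Hypothetical helper: translate old checkpoint keys to the restructured layout.
# Verify the prefixes against a real state dict before relying on this.
def remap_keys(state_dict: dict) -> dict:
    renames = [
        ("vision_embedding.multimodal_projector.0.", "multimodal_projector.linear_1."),
        ("vision_embedding.multimodal_projector.2.", "multimodal_projector.linear_2."),
        ("vision_embedding.vision_tower.", "vision_tower."),
    ]
    out = {}
    for key, value in state_dict.items():
        for old_prefix, new_prefix in renames:
            if key.startswith(old_prefix):
                key = new_prefix + key[len(old_prefix):]
                break
        out[key] = value
    return out
```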
@@ -1622,10 +1633,6 @@ class IsaacModel(Qwen3PreTrainedModel):
     def vision_model(self) -> nn.Module:
         return self.vision_embedding.vision_tower

-    @property
-    def vision_tower(self) -> nn.Module:
-        return self.vision_embedding.vision_tower
-
     def embed_text_tokens(self, token_ids: torch.Tensor) -> torch.Tensor:
         """Embed text tokens, squeezing singleton dimensions."""
         # Text events are shaped as (..., 1); squeeze the singleton index dim
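Removing the vision_tower property is required rather than cosmetic: once __init__ assigns a real self.vision_tower submodule, a class-level property would shadow it, because nn.Module stores children in _modules and only falls back to that dict when normal attribute lookup fails. A toy illustration of the shadowing (not the actual model):

```python
import torch.nn as nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.tower = nn.Linear(2, 2)  # stored in self._modules["tower"]

    @property
    def tower(self):
        return "property wins"  # data descriptor on the class takes priority

m = Toy()
print(m.tower)              # "property wins" -- the property shadows the submodule
print(m._modules["tower"])  # Linear(in_features=2, out_features=2, bias=True)
```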
@@ -1637,7 +1644,7 @@ class IsaacModel(Qwen3PreTrainedModel):
     def embed_vision(self, vision_tokens: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
         """Embed vision tokens using the vision encoder."""
         # vision tokens is (seq_patches, token_grids)
-        return self.vision_embedding(vision_tokens)
+        return self.multimodal_projector(self.vision_tower(vision_tokens))

     def embed_stream(self, tensor_stream: TensorStream) -> torch.Tensor:
         """
@@ -2096,7 +2103,8 @@ def _compute_residual_p_frames(frames: torch.Tensor, is_p_frame: list[bool]) ->
 __all__ = [
     "IsaacConfig",
     "IsaacModel",
+    "IsaacPreTrainedModel",  # noqa: F822
     "IsaacForConditionalGeneration",
     "IsaacImageProcessorFast",
     "IsaacProcessor",
 ]
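A note on the # noqa: F822: pyflakes flags names listed in __all__ but not defined in the file, and IsaacPreTrainedModel does not appear anywhere in this modular file; presumably it is materialized when the modular source is converted into the generated modeling file, so the suppression is intentional.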