Update modeling_xvla.py
modeling_xvla.py  CHANGED  (+2 −9)
@@ -55,23 +55,20 @@ class XVLA(PreTrainedModel):
         # Core settings
         self.num_actions: int = config.num_actions
         self.use_proprio: bool = config.use_proprio
-
+        self.action_mode: str = config.action_mode.lower()
         # Action space (dimensions + hooks)
         self.action_space = build_action_space(config.action_mode.lower())
         dim_action = self.action_space.dim_action
         dim_proprio = getattr(self.action_space, "dim_proprio", dim_action)

         # Florence2 backbone (encoder only)
-        self.vlm = Florence2ForConditionalGeneration(config.florence_config)
+        self.vlm = Florence2ForConditionalGeneration(config.florence_config).to(torch.float32)
         if hasattr(self.vlm, "language_model"):
             lm = self.vlm.language_model
             if hasattr(lm, "model") and hasattr(lm.model, "decoder"):
                 del lm.model.decoder
             if hasattr(lm, "lm_head"):
                 del lm.lm_head
-        # ⚠️ VERY IMPORTANT: disable Florence2's tie_weights hooks to avoid decoder access
-        if hasattr(self.vlm, "tie_weights"):
-            self.vlm.tie_weights = lambda *a, **k: None

         projection_dim = getattr(self.vlm.config, "projection_dim", None)
         if projection_dim is None:
@@ -96,10 +93,6 @@ class XVLA(PreTrainedModel):
         # Deferred FastAPI app
         self.app: FastAPI | None = None

-    def tie_weights(self):
-        """Disable automatic weight tying (Florence is encoder-only)."""
-        return
-
     # ============================= Florence2 encoder =============================
     def forward_vlm(
         self,