roboflamingo-demo

Runtime error

App Files Community

aw1app committed on Nov 14, 2025

Commit

7cc7ee4

1 Parent(s): 44a0f72

Force output_hidden_states=True in lang_encoder forward call

Browse files

Files changed (1) hide show

patched_factory.py +28 -49

patched_factory.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Factory with proper hidden state extraction"""
 import sys
 import torch
 import torch.nn as nn
@@ -17,37 +17,35 @@ class RoboFlamingoWithPolicy(nn.Module):
         self.lang_encoder = base_model.lang_encoder
     def forward(self, vision_x, lang_x, attention_mask=None):
-        # Call base model
-        output = self.base_model(
-            vision_x=vision_x,
-            lang_x=lang_x,
-            attention_mask=attention_mask
-        )
-        # CRITICAL: We need hidden states, not logits!
-        # hidden_states should be enabled via config
-        embeddings = None
-        if hasattr(output, 'hidden_states') and output.hidden_states is not None and len(output.hidden_states) > 0:
-            # Use last layer hidden states
-            embeddings = output.hidden_states[-1]
-            print(f"   Using hidden_states: {embeddings.shape}")
-        elif hasattr(output, 'last_hidden_state'):
-            embeddings = output.last_hidden_state
-            print(f"   Using last_hidden_state: {embeddings.shape}")
         else:
-            # Fallback: logits have wrong dimension
-            # We need to access the language model's actual hidden states
-            print(f"   ⚠️ No hidden_states! Output type: {type(output)}")
-            print(f"   Output has: {[k for k in dir(output) if not k.startswith('_')]}")
-            # Try to get from logits by taking last layer
-            if hasattr(output, 'logits'):
-                # Logits are (batch, seq, vocab_size=50281)
-                # We need (batch, seq, hidden=2048)
-                # This won't work - we need proper hidden states
-                print(f"   ❌ Only have logits: {output.logits.shape}")
-                raise RuntimeError("Model not outputting hidden_states! Need to configure model properly.")
         # Apply policy head
         actions, gripper, _ = self.policy_head(embeddings)
@@ -55,7 +53,7 @@ class RoboFlamingoWithPolicy(nn.Module):
         return {'actions': actions, 'gripper': gripper}
 def create_model_and_transforms(checkpoint_path=None):
-    print("📦 Creating base OpenFlamingo...")
     base_model, image_processor, tokenizer = create_base(
         clip_vision_encoder_path="ViT-L-14",
         clip_vision_encoder_pretrained="openai",
@@ -64,25 +62,6 @@ def create_model_and_transforms(checkpoint_path=None):
         cross_attn_every_n_layers=4,
     )
-    # CRITICAL: Enable hidden states output
-    print("🔧 Enabling hidden states output...")
-    # Try multiple ways to enable hidden states
-    if hasattr(base_model, 'lang_encoder'):
-        if hasattr(base_model.lang_encoder, 'config'):
-            base_model.lang_encoder.config.output_hidden_states = True
-            print("   ✅ Set via lang_encoder.config")
-        if hasattr(base_model.lang_encoder, 'transformer'):
-            if hasattr(base_model.lang_encoder.transformer, 'config'):
-                base_model.lang_encoder.transformer.config.output_hidden_states = True
-                print("   ✅ Set via transformer.config")
-    # Also try setting on the model itself
-    if hasattr(base_model, 'config'):
-        base_model.config.output_hidden_states = True
-        print("   ✅ Set on base_model.config")
     print("🔨 Creating policy head...")
     policy_head = LSTMPolicyHead(
         input_dim=2048,

+"""Factory with forced hidden states"""
 import sys
 import torch
 import torch.nn as nn
         self.lang_encoder = base_model.lang_encoder
     def forward(self, vision_x, lang_x, attention_mask=None):
+        # Get the internal model
+        # OpenFlamingo wraps the language model, we need to call it with output_hidden_states
+        # The base_model is Flamingo, which has lang_encoder
+        # We need to get embeddings from the language encoder
+        # First, process vision
+        if vision_x is not None:
+            # Vision encoder
+            vision_features = self.base_model._encode_vision_x(vision_x=vision_x)
+        else:
+            vision_features = None
+        # Now call language model with output_hidden_states=True
+        # The lang_encoder should support this parameter
+        lang_output = self.base_model.lang_encoder(
+            input_ids=lang_x,
+            attention_mask=attention_mask,
+            output_hidden_states=True,  # FORCE hidden states output!
+            return_dict=True
+        )
+        # Now we should have hidden states
+        if hasattr(lang_output, 'hidden_states') and lang_output.hidden_states is not None:
+            embeddings = lang_output.hidden_states[-1]
+            print(f"   ✅ Got hidden states: {embeddings.shape}")
         else:
+            print(f"   ❌ Still no hidden states!")
+            raise RuntimeError("Cannot get hidden states from language model")
         # Apply policy head
         actions, gripper, _ = self.policy_head(embeddings)
         return {'actions': actions, 'gripper': gripper}
 def create_model_and_transforms(checkpoint_path=None):
+    print("📦 Creating base...")
     base_model, image_processor, tokenizer = create_base(
         clip_vision_encoder_path="ViT-L-14",
         clip_vision_encoder_pretrained="openai",
         cross_attn_every_n_layers=4,
     )
     print("🔨 Creating policy head...")
     policy_head = LSTMPolicyHead(
         input_dim=2048,