roboflamingo-demo

Runtime error

App Files Files Community

aw1app committed on Nov 14, 2025

Commit

adf1b6d

1 Parent(s): ed14d35

Final fix: Exact checkpoint dimensions (action=6, not 7)

Browse files

Files changed (2) hide show

patched_factory.py +16 -37
policy_head.py +14 -32

patched_factory.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Factory with correct dimensions"""
 import sys
 import torch
 import torch.nn as nn
@@ -9,42 +9,31 @@ from huggingface_hub import hf_hub_download
 from policy_head import LSTMPolicyHead
 class RoboFlamingoWithPolicy(nn.Module):
-    """Wraps OpenFlamingo + LSTM Policy Head"""
     def __init__(self, base_model, policy_head):
         super().__init__()
         self.base_model = base_model
         self.policy_head = policy_head
         self.vision_encoder = base_model.vision_encoder
         self.lang_encoder = base_model.lang_encoder
     def forward(self, vision_x, lang_x, attention_mask=None):
-        # Get embeddings with hidden states
         output = self.base_model(
             vision_x=vision_x,
             lang_x=lang_x,
             attention_mask=attention_mask
         )
-        # Get hidden states if available
         if hasattr(output, 'hidden_states') and output.hidden_states is not None:
             embeddings = output.hidden_states[-1]
         else:
-            # Fallback: use logits (not ideal)
             embeddings = output.logits
-        # Apply policy head
         actions, gripper, _ = self.policy_head(embeddings)
-        return {
-            'actions': actions,
-            'gripper': gripper
-        }
 def create_model_and_transforms(checkpoint_path=None):
-    """Load RoboFlamingo"""
-    print("📦 Creating base OpenFlamingo...")
     base_model, image_processor, tokenizer = create_base(
         clip_vision_encoder_path="ViT-L-14",
         clip_vision_encoder_pretrained="openai",
@@ -53,66 +42,56 @@ def create_model_and_transforms(checkpoint_path=None):
         cross_attn_every_n_layers=4,
     )
-    print("✅ Base created")
-    # Enable hidden states
     if hasattr(base_model.lang_encoder, 'config'):
         base_model.lang_encoder.config.output_hidden_states = True
-    # Create policy head with CORRECT dimensions from checkpoint
-    print("🔨 Creating policy head (4-layer LSTM, hidden=1024)...")
     policy_head = LSTMPolicyHead(
         input_dim=2048,
         hidden_dim=1024,
-        num_layers=4,
-        action_dim=7
     )
     model = RoboFlamingoWithPolicy(base_model, policy_head)
-    print("✅ Policy head attached")
     if checkpoint_path:
-        print("📥 Downloading checkpoint...")
         ckpt_file = hf_hub_download(
             repo_id="robovlms/RoboFlamingo",
             filename="checkpoint_gripper_post_hist_1_aug_10_4_traj_cons_ws_12_mpt_3b_4.pth",
             repo_type="model"
         )
-        print("📥 Loading...")
         checkpoint = torch.load(ckpt_file, map_location='cpu')
         state_dict = checkpoint.get('model_state_dict', checkpoint)
-        # Map keys
         new_state_dict = {}
         for key, value in state_dict.items():
-            # Map policy head
             if 'action_head.rnn' in key:
                 new_key = key.replace('module.action_head.rnn', 'policy_head.lstm')
                 new_state_dict[new_key] = value
             elif 'action_head.actions.mlp' in key:
-                # Map actions MLP layers
                 new_key = key.replace('module.action_head.actions.mlp', 'policy_head.action_head')
                 new_state_dict[new_key] = value
             elif 'action_head.gripper.mlp' in key:
-                # Map gripper MLP layers
                 new_key = key.replace('module.action_head.gripper.mlp', 'policy_head.gripper_head')
                 new_state_dict[new_key] = value
             else:
-                # Base model keys
                 new_key = key.replace('module.', 'base_model.')
                 new_state_dict[new_key] = value
-        # Load (strict=False to ignore size mismatches for vocab)
         missing, unexpected = model.load_state_dict(new_state_dict, strict=False)
-        print(f"✅ Loaded (missing: {len(missing)}, unexpected: {len(unexpected)})")
-        # Show any remaining mismatches
-        if len(missing) > 0:
-            print(f"   Missing keys: {list(missing)[:3]}")
-        if len(unexpected) > 0:
-            print(f"   Unexpected keys: {list(unexpected)[:3]}")
     return model, image_processor, tokenizer

+"""Factory - load checkpoint with exact dimensions"""
 import sys
 import torch
 import torch.nn as nn
 from policy_head import LSTMPolicyHead
 class RoboFlamingoWithPolicy(nn.Module):
     def __init__(self, base_model, policy_head):
         super().__init__()
         self.base_model = base_model
         self.policy_head = policy_head
         self.vision_encoder = base_model.vision_encoder
         self.lang_encoder = base_model.lang_encoder
     def forward(self, vision_x, lang_x, attention_mask=None):
         output = self.base_model(
             vision_x=vision_x,
             lang_x=lang_x,
             attention_mask=attention_mask
         )
         if hasattr(output, 'hidden_states') and output.hidden_states is not None:
             embeddings = output.hidden_states[-1]
         else:
             embeddings = output.logits
         actions, gripper, _ = self.policy_head(embeddings)
+        return {'actions': actions, 'gripper': gripper}
 def create_model_and_transforms(checkpoint_path=None):
+    print("📦 Creating base...")
     base_model, image_processor, tokenizer = create_base(
         clip_vision_encoder_path="ViT-L-14",
         clip_vision_encoder_pretrained="openai",
         cross_attn_every_n_layers=4,
     )
     if hasattr(base_model.lang_encoder, 'config'):
         base_model.lang_encoder.config.output_hidden_states = True
+    print("🔨 Creating policy head...")
     policy_head = LSTMPolicyHead(
         input_dim=2048,
         hidden_dim=1024,
+        num_layers=4
     )
     model = RoboFlamingoWithPolicy(base_model, policy_head)
+    print("✅ Model ready")
     if checkpoint_path:
+        print("📥 Loading checkpoint...")
         ckpt_file = hf_hub_download(
             repo_id="robovlms/RoboFlamingo",
             filename="checkpoint_gripper_post_hist_1_aug_10_4_traj_cons_ws_12_mpt_3b_4.pth",
             repo_type="model"
         )
         checkpoint = torch.load(ckpt_file, map_location='cpu')
         state_dict = checkpoint.get('model_state_dict', checkpoint)
         new_state_dict = {}
         for key, value in state_dict.items():
             if 'action_head.rnn' in key:
                 new_key = key.replace('module.action_head.rnn', 'policy_head.lstm')
                 new_state_dict[new_key] = value
             elif 'action_head.actions.mlp' in key:
                 new_key = key.replace('module.action_head.actions.mlp', 'policy_head.action_head')
                 new_state_dict[new_key] = value
             elif 'action_head.gripper.mlp' in key:
                 new_key = key.replace('module.action_head.gripper.mlp', 'policy_head.gripper_head')
                 new_state_dict[new_key] = value
+            elif 'transformer.wte.weight' in key:
+                # Handle vocab size mismatch (50280 -> 50281)
+                # Pad with zeros for the extra token
+                if value.shape[0] == 50280:
+                    value = torch.cat([value, torch.zeros(1, value.shape[1])], dim=0)
+                new_key = key.replace('module.', 'base_model.')
+                new_state_dict[new_key] = value
             else:
                 new_key = key.replace('module.', 'base_model.')
                 new_state_dict[new_key] = value
         missing, unexpected = model.load_state_dict(new_state_dict, strict=False)
+        print(f"✅ Checkpoint loaded!")
+        print(f"   Missing: {len(missing)}, Unexpected: {len(unexpected)}")
     return model, image_processor, tokenizer

policy_head.py CHANGED Viewed

@@ -1,21 +1,13 @@
-"""LSTM Policy Head with correct dimensions from checkpoint"""
 import torch
 import torch.nn as nn
 class LSTMPolicyHead(nn.Module):
-    """
-    LSTM-based policy head from RoboFlamingo checkpoint.
-    Dimensions extracted from checkpoint weights.
-    """
-    def __init__(self, input_dim=2048, hidden_dim=1024, num_layers=4, action_dim=7):
         super().__init__()
-        self.input_dim = input_dim
-        self.hidden_dim = hidden_dim
-        self.num_layers = num_layers
-        self.action_dim = action_dim
-        # LSTM with 4 layers, hidden_dim=1024
         self.lstm = nn.LSTM(
             input_size=input_dim,
             hidden_size=hidden_dim,
@@ -23,40 +15,30 @@ class LSTMPolicyHead(nn.Module):
             batch_first=True
         )
-        # Action MLP (4 layers based on checkpoint)
         self.action_head = nn.Sequential(
-            nn.Linear(hidden_dim, 512),
             nn.ReLU(),
-            nn.Linear(512, 256),
             nn.ReLU(),
-            nn.Linear(256, 128),
             nn.ReLU(),
-            nn.Linear(128, action_dim)
         )
-        # Gripper MLP (4 layers based on checkpoint)
         self.gripper_head = nn.Sequential(
-            nn.Linear(hidden_dim, 512),
             nn.ReLU(),
-            nn.Linear(512, 256),
             nn.ReLU(),
-            nn.Linear(256, 128),
             nn.ReLU(),
-            nn.Linear(128, 1),
             nn.Sigmoid()
         )
     def forward(self, x, hidden=None):
-        """
-        Args:
-            x: (batch, seq_len, input_dim)
-            hidden: tuple of (h_0, c_0)
-        Returns:
-            actions: (batch, seq_len, action_dim)
-            gripper: (batch, seq_len, 1)
-            hidden: tuple of (h_n, c_n)
-        """
         # LSTM
         lstm_out, hidden = self.lstm(x, hidden)

+"""LSTM Policy Head - EXACT checkpoint dimensions"""
 import torch
 import torch.nn as nn
 class LSTMPolicyHead(nn.Module):
+    """Exact architecture from RoboFlamingo checkpoint"""
+    def __init__(self, input_dim=2048, hidden_dim=1024, num_layers=4):
         super().__init__()
+        # LSTM: 4 layers, hidden=1024
         self.lstm = nn.LSTM(
             input_size=input_dim,
             hidden_size=hidden_dim,
             batch_first=True
         )
+        # Action MLP: 1024 -> 1024 -> 512 -> 256 -> 6
         self.action_head = nn.Sequential(
+            nn.Linear(1024, 1024),
             nn.ReLU(),
+            nn.Linear(1024, 512),
             nn.ReLU(),
+            nn.Linear(512, 256),
             nn.ReLU(),
+            nn.Linear(256, 6)  # 6 outputs (position + rotation, no gripper here)
         )
+        # Gripper MLP: 1024 -> 1024 -> 512 -> 256 -> 1
         self.gripper_head = nn.Sequential(
+            nn.Linear(1024, 1024),
             nn.ReLU(),
+            nn.Linear(1024, 512),
             nn.ReLU(),
+            nn.Linear(512, 256),
             nn.ReLU(),
+            nn.Linear(256, 1),
             nn.Sigmoid()
         )
     def forward(self, x, hidden=None):
         # LSTM
         lstm_out, hidden = self.lstm(x, hidden)