# File size: 16,303 Bytes

# e94400c

# Copyright 2025 starVLA community. All rights reserved.
# Licensed under the MIT License, Version 1.0 (the "License");
# Implemented by Jinhui YE / HKUST University in 2025.
"""
Qwen-GROOT Framework
A lightweight implementation that combines Qwen2.5-VL with a flow-matching head to directly predict continuous actions.
The flow-matching head is taken from GR00T N1.5 (copyright remains with its authors), with a simple MoE-style design inspired by PI_0.
"""
from typing import List
from tqdm import tqdm
from typing import List, Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from PIL import Image



from starVLA.training.trainer_utils import initialize_overwatch
from deployment.model_server.tools.image_tools import to_pil_preserve

# Module-level logger created through the project's `initialize_overwatch` helper.
logger = initialize_overwatch(__name__)

# HuggingFace Default / LLaMa-2 IGNORE_INDEX (for labels)
# NOTE(review): not referenced anywhere in the visible part of this file.
IGNORE_INDEX = -100

from starVLA.model.framework.base_framework import baseframework
from starVLA.model.modules.vlm import get_vlm_model
from starVLA.model.modules.action_model.LayerwiseFM_ActionHeader import get_action_model, LayerwiseFlowmatchingActionHead
from starVLA.training.trainer_utils.trainer_tools import resize_images
from starVLA.model.tools import FRAMEWORK_REGISTRY

####################################################
# ⚠️ Warning: This framework has been restructured and is NOT compatible with checkpoints created before 2025-10-20.
####################################################

@FRAMEWORK_REGISTRY.register("QwenPI")
class Qwen_PI(baseframework):
    """
    Multimodal vision-language-action model.

    Components:
      - Qwen VL interface for fused language/vision token embeddings
      - Layer-wise cross DiT diffusion head (one VLM hidden state is fed per head block)

    Focus: Predict future continuous actions conditioned on images + instruction.
    """

    def __init__(
        self,
        config: Optional[dict] = None,
        **kwargs,
    ) -> None:
        """
        Construct all submodules and cache key configuration values.

        Args:
            config: Hierarchical configuration (OmegaConf/dict) containing framework + trainer sections.
                Although the default is ``None``, the constructor reads ``config.framework`` and
                therefore needs a real configuration object.
            **kwargs: Reserved for future overrides (unused).

        Side effects:
            Writes ``vl_hidden_dim`` and ``num_vl_layers`` into ``config.framework.qwenvl`` so the
            action head is built with the dimensions of the VLM that was actually loaded.
        """

        super().__init__()
        self.config = config
        self.qwen_vl_interface = get_vlm_model(config=self.config)

        # dynamic get llm config from actual model (do NOT hardcode num_vl_layers)
        # Qwen3VLConfig nests language config under text_config; fall back to top-level for older models
        _model_cfg = self.qwen_vl_interface.model.config
        _text_cfg = getattr(_model_cfg, "text_config", _model_cfg)
        llm_hidden_size = _text_cfg.hidden_size
        num_vl_layers = _text_cfg.num_hidden_layers
        self.llm_hidden_size = llm_hidden_size
        self.config.framework.qwenvl.vl_hidden_dim = llm_hidden_size
        self.config.framework.qwenvl.num_vl_layers = num_vl_layers

        self.action_model: LayerwiseFlowmatchingActionHead = get_action_model(config=self.config)

        self.future_action_window_size = config.framework.action_model.future_action_window_size
        self.past_action_window_size = config.framework.action_model.past_action_window_size
        self.chunk_len = self.past_action_window_size + 1 + self.future_action_window_size

        # Dataset soft prompt: conditions VLM on dataset identity
        self.dataset_vocab_size = getattr(self.config.framework.action_model, "dataset_vocab_size", 256)
        self.num_data_tokens = getattr(self.config.framework.qwenvl, "num_data_tokens", 0)
        if self.num_data_tokens > 0:
            # One embedding row holds all `num_data_tokens` soft tokens of a dataset; it is
            # reshaped to (num_data_tokens, llm_hidden_size) when the prompt is built.
            self.dataset_embed = nn.Embedding(
                self.dataset_vocab_size,
                llm_hidden_size * self.num_data_tokens,
            )

    @staticmethod
    def _require_examples(examples) -> None:
        """Raise a clear error when the batch is missing or empty."""
        if not examples:
            raise ValueError("`examples` must be a non-empty list of sample dicts.")

    def _prepend_dataset_prompt(self, qwen_inputs, dataset_ids):
        """
        Prepend the dataset soft-prompt tokens to the VLM inputs (in place).

        Does nothing when soft prompts are disabled (``num_data_tokens <= 0``) or when
        ``qwen_inputs`` carries no ``input_ids``. Otherwise ``input_ids`` is replaced by
        ``inputs_embeds`` (soft prompt followed by the token embeddings) and, when present,
        ``attention_mask`` / ``position_ids`` are extended to cover the new prefix.

        Args:
            qwen_inputs: Dict-like batch produced by ``build_qwenvl_inputs``.
            dataset_ids: One integer dataset id per batch element.

        Returns:
            The same ``qwen_inputs`` object, for convenience.

        Raises:
            ValueError: If a dataset id lies outside ``[0, dataset_vocab_size)``.
        """
        if self.num_data_tokens <= 0 or "input_ids" not in qwen_inputs:
            return qwen_inputs

        # Validate on the host: an out-of-range index inside nn.Embedding surfaces on GPU
        # as an asynchronous device-side assert that is hard to trace back.
        bad_ids = [d for d in dataset_ids if not 0 <= int(d) < self.dataset_vocab_size]
        if bad_ids:
            raise ValueError(
                f"dataset_id values {bad_ids} are outside [0, {self.dataset_vocab_size})."
            )

        input_ids = qwen_inputs["input_ids"]
        dataset_ids_tensor = torch.tensor(dataset_ids, device=input_ids.device, dtype=torch.long)
        ds_embeds = self.dataset_embed(dataset_ids_tensor).view(
            len(dataset_ids), self.num_data_tokens, self.llm_hidden_size
        )
        token_embeds = self.qwen_vl_interface.model.get_input_embeddings()(input_ids)
        qwen_inputs["inputs_embeds"] = torch.cat((ds_embeds, token_embeds), dim=1)
        # NOTE(review): from here on the VLM receives `inputs_embeds` only; confirm that the
        # wrapped model still merges image features correctly without `input_ids`.
        qwen_inputs.pop("input_ids")

        if "attention_mask" in qwen_inputs:
            attention_mask = qwen_inputs["attention_mask"]
            # Soft-prompt tokens are always attended to.
            prefix_mask = torch.ones(
                (attention_mask.shape[0], self.num_data_tokens),
                device=attention_mask.device,
                dtype=attention_mask.dtype,
            )
            qwen_inputs["attention_mask"] = torch.cat((prefix_mask, attention_mask), dim=1)

        if "position_ids" in qwen_inputs:
            position_ids = qwen_inputs["position_ids"]
            # NOTE(review): assumes 2-D (batch, seq) position ids; 3-D multimodal rotary
            # position ids would not concatenate with this prefix — confirm against the VLM.
            prefix_pos = torch.arange(
                self.num_data_tokens,
                device=position_ids.device,
                dtype=position_ids.dtype,
            ).unsqueeze(0).expand(position_ids.shape[0], -1)
            # Shift the original positions so they continue after the prefix.
            qwen_inputs["position_ids"] = torch.cat(
                (prefix_pos, position_ids + self.num_data_tokens), dim=1
            )

        return qwen_inputs

    def _encode_vl_layers(self, qwen_inputs) -> list:
        """
        Run the VLM and return the hidden states consumed by the action head.

        The last N entries of the VLM's ``hidden_states`` are kept, where N is the number of
        transformer blocks in the action head, so each block gets its own VLM layer.
        """
        with torch.autocast("cuda", dtype=torch.bfloat16):
            qwenvl_outputs = self.qwen_vl_interface(
                **qwen_inputs,
                output_attentions=False,
                output_hidden_states=True,
                return_dict=True,
            )
            expected_layers = len(self.action_model.model.transformer_blocks)
            return list(qwenvl_outputs.hidden_states[-expected_layers:])

    def forward(
        self,
        examples: List[dict] = None,
        **kwargs,
    ) -> dict:
        """
        Compute the action-head training loss for a batch of samples.

        Args:
            examples: List[dict], each dict requires:
                - image: List[PIL.Image] (multi-view)
                - lang: str instruction
                - action: np.ndarray or list shaped [T, action_dim]
              and may optionally carry:
                - state: array-like passed to the action head (presence is decided from the first example)
                - dataset_id: int selecting the dataset soft prompt (defaults to 0)
        Returns:
            dict:
                action_loss (torch.Tensor): Loss returned by the action head.
        Raises:
            ValueError: If ``examples`` is ``None``/empty or a dataset id is out of range.
        """
        self._require_examples(examples)

        batch_images = [example["image"] for example in examples]  # [B, [PIL]]
        instructions = [example["lang"] for example in examples]  # [B, str]
        actions = [example["action"] for example in examples]  # labels, one [T, action_dim] array per sample

        state = [example["state"] for example in examples] if "state" in examples[0] else None
        dataset_ids = [example.get("dataset_id", 0) for example in examples]

        # Step 1: QWenVL input format (+ optional dataset soft prompt)
        qwen_inputs = self.qwen_vl_interface.build_qwenvl_inputs(images=batch_images, instructions=instructions)
        self._prepend_dataset_prompt(qwen_inputs, dataset_ids)

        # Keep the (possibly prefix-extended) attention mask for the action head.
        # In cross-embodied training, batch sequences have very different lengths due to
        # varying camera counts (different image token counts per environment). Without
        # masking, the DiT cross-attention attends to padding tokens, injecting
        # task-dependent noise that causes unstable performance across environments.
        encoder_attention_mask = qwen_inputs.get("attention_mask", None)

        # Step 2: VLM forward; keep the last N hidden states, one per action-head block
        vl_embs_list = self._encode_vl_layers(qwen_inputs)
        base_hidden = vl_embs_list[-1]

        # Step 3: Action expert forward and loss
        with torch.autocast("cuda", dtype=torch.float32):
            actions = torch.tensor(
                np.array(actions), device=base_hidden.device, dtype=base_hidden.dtype
            )  # [B, T_full, action_dim]
            # Label alignment: keep the last (future_action_window_size + 1) steps.
            # NOTE(review): `self.chunk_len` also counts the past window but is not used here.
            actions_target = actions[:, -(self.future_action_window_size + 1):, :]

            repeated_diffusion_steps = (
                self.config.trainer.get("repeated_diffusion_steps", 1) if self.config and self.config.trainer else 1
            )
            # Tile targets and conditioning along the batch axis, once per repeated step.
            actions_target_repeated = actions_target.repeat(repeated_diffusion_steps, 1, 1)
            vl_embs_list_repeated = [h.repeat(repeated_diffusion_steps, 1, 1) for h in vl_embs_list]
            encoder_attention_mask_repeated = (
                encoder_attention_mask.repeat(repeated_diffusion_steps, 1)
                if encoder_attention_mask is not None else None
            )

            state_repeated = None
            if state is not None:
                state = torch.tensor(
                    np.array(state), device=base_hidden.device, dtype=base_hidden.dtype
                )
                state_repeated = state.repeat(repeated_diffusion_steps, 1, 1)

            action_loss = self.action_model(
                vl_embs_list_repeated, actions_target_repeated, state_repeated,
                encoder_attention_mask=encoder_attention_mask_repeated,
            )

        return {"action_loss": action_loss}

    @torch.inference_mode()
    def predict_action(  # TODO align predict_action with forward, make api more flexible
        self,
        examples: List[dict] = None,
        **kwargs,
    ) -> dict:
        """
        Inference: encode the observations with the VLM and let the action head predict actions.

        Steps:
          1. Convert images with ``to_pil_preserve`` and resize them to the training resolution (if specified)
          2. Build QwenVL inputs and prepend the dataset soft prompt (if enabled)
          3. Encode with QwenVL (hidden states retained)
          4. Call ``self.action_model.predict_action`` on the layer-wise hidden states
          5. Return the normalized action trajectory

        Args:
            examples: List[dict], each dict requires ``image`` and ``lang`` and may carry
                ``state`` and ``dataset_id`` (same meaning as in ``forward``).

        Returns:
            dict:
                normalized_actions (np.ndarray): Output of the action head moved to CPU;
                    presumably shaped [B, T, action_dim] — TODO confirm against the action head.

        Raises:
            ValueError: If ``examples`` is ``None``/empty or a dataset id is out of range.
        """
        self._require_examples(examples)

        batch_images = [to_pil_preserve(example["image"]) for example in examples]  # [B, [PIL]]
        instructions = [example["lang"] for example in examples]  # [B, str]

        state = [example["state"] for example in examples] if "state" in examples[0] else None
        dataset_ids = [example.get("dataset_id", 0) for example in examples]

        train_obs_image_size = getattr(self.config.datasets.vla_data, "image_size", None)
        if train_obs_image_size:
            batch_images = resize_images(batch_images, target_size=train_obs_image_size)

        # Step 1: QWenVL input format (+ optional dataset soft prompt)
        qwen_inputs = self.qwen_vl_interface.build_qwenvl_inputs(images=batch_images, instructions=instructions)
        self._prepend_dataset_prompt(qwen_inputs, dataset_ids)

        # Mask lets the action head ignore padding tokens in its cross-attention.
        encoder_attention_mask = qwen_inputs.get("attention_mask", None)

        # Step 2: VLM forward; keep the last N hidden states, one per action-head block
        vl_embs_list = self._encode_vl_layers(qwen_inputs)
        base_hidden = vl_embs_list[-1]

        if state is not None:
            state = torch.from_numpy(np.array(state)).to(base_hidden.device, dtype=base_hidden.dtype)

        # Step 3: Action expert prediction
        with torch.autocast("cuda", dtype=torch.float32):
            pred_actions = self.action_model.predict_action(
                vl_embs_list, state, encoder_attention_mask=encoder_attention_mask
            )

        normalized_actions = pred_actions.detach().cpu().numpy()
        return {"normalized_actions": normalized_actions}



if __name__ == "__main__":
    # Manual smoke test: build the model from a YAML config, run one training forward pass
    # and one inference call on random data. Blocks until a debugpy client attaches.
    from omegaconf import OmegaConf
    import debugpy
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_yaml", type=str, default="./starVLA/config/training/starvla_cotrain_oxe.yaml", help="Path to YAML config")
    # Unknown CLI arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    debug_port = 10092
    debugpy.listen(("0.0.0.0", debug_port))
    print(f"🔍 Rank 0 waiting for debugger attach on port {debug_port}...")
    debugpy.wait_for_client()

    cfg = OmegaConf.load(args.config_yaml)
    # Point the config at a local VLM checkpoint for this test.
    cfg.framework.qwenvl.base_vlm = "./playground/Pretrained_models/Qwen3-VL-4B-Instruct"

    model = Qwen_PI(cfg)
    # ckpt="/mnt/petrelfs/yejinhui/Projects/llavavla/results/Checkpoints/1011_qwenpi/checkpoints/need_steps_10000_pytorch_model.pt"
    # model = Qwen_PI.from_pretrained(ckpt)
    print(model)

    # Fake sample: one random RGB image reused for both camera views.
    image = Image.fromarray(np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8))
    sample = {
        "action": np.random.uniform(-1, 1, size=(16, 7)).astype(np.float16),  # action_chunk, action_dim
        "image": [image, image],  # two views
        "lang": "This is a fake instruction for testing.",
        "state": np.random.uniform(-1, 1, size=(1, 7)).astype(np.float16),  # chunk, state_dim
    }

    batch = [sample, sample]  # batch size 2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    forward_output = model(batch)
    action_loss = forward_output['action_loss']
    print(f"Action Loss: {action_loss.item()}")

    # test predict action
    predict_output = model.predict_action([sample])
    normalized_actions = predict_output['normalized_actions']
    # The model returns actions in normalized space; label them accordingly.
    print(f"Normalized Action: {normalized_actions}")

    # # Advance: try forward model with dataloader
    # # can be a fake sample, but here it is taken from the dataloader for simplicity
    # from starVLA.dataloader.lerobot_datasets import get_vla_dataset, collate_fn

    # vla_dataset_cfg = cfg.datasets.vla_data
    # dataset = get_vla_dataset(data_cfg=vla_dataset_cfg)

    # from torch.utils.data import DataLoader

    # train_dataloader = DataLoader(
    #     dataset,
    #     batch_size=2,
    #     num_workers=1,  # For Debug
    #     collate_fn=collate_fn,
    # )
    # #
    # for batch in tqdm(train_dataloader, desc="Processing Batches"):
    #     batch
    #     break

    # # try get model
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # model = model.to(device)
    # model(batch)

    # action = model.predict_action(batch_images=[batch[0]["image"]], instructions=[batch[0]["lang"]])

    # # fake state
    # for ba in batch:
    #     ba["state"] = ba["action"][0][None]

    # model(batch)
    # action = model.predict_action(batch_images=[batch[0]["image"]], instructions=[batch[0]["lang"]], state=[batch[0]["state"]])