import torch
import torch.nn as nn
from typing import Dict, List, Optional, Union

from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
from transformers.modeling_outputs import CausalLMOutputWithPast

from accelerate.logging import get_logger

logger = get_logger(__name__)

def _construct_prompts(text):
    return text

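# Note: the Florence-2 processor normally rewrites the `text` argument through
# task-token templates in its own `_construct_prompts` (e.g. expanding "<OD>"
# into a canned object-detection prompt). Patching it with the identity function
# above (see `__init__` below) is intended to let free-form instructions pass
# through to the tokenizer unchanged; this description of the default behavior
# is based on the microsoft/Florence-2-large remote code and may drift with it.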

class _Florence_Interface(nn.Module):
    """
    Lightweight wrapper around Florence-2 (microsoft/Florence-2-large).

    This exists because of the diversity of VLMs; the backend-specific changes
    are encapsulated here.

    Purpose:
    - Unify the interface with other VLM backends (CausalLM-like usage).
    - Centralize preprocessing (tokenization + multimodal packing).
    - Provide consistent forward / generate signatures.
    """

    def __init__(self, config: Optional[dict] = None, **kwargs):
        """
        Initialize the VLM wrapper.
        Following https://huggingface.co/microsoft/Florence-2-large
        """
        super().__init__()

        qwenvl_config = config.framework.get("qwenvl", {})
        model_id = qwenvl_config.get("base_vlm", "microsoft/Florence-2-large")

        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
            attn_implementation="eager",
        )
        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

        # Bypass the processor's task-token prompt construction so that raw
        # instructions are tokenized as-is.
        self.processor._construct_prompts = _construct_prompts
        self.config = config

        # Expose the encoder feature width as hidden_size for downstream modules.
        self.model.config.hidden_size = self.model.config.projection_dim

        # Only the encoder is used for feature extraction, so drop the seq2seq
        # decoder and LM head to save memory.
        if hasattr(self.model, "decoder"):
            del self.model.decoder
        if hasattr(self.model, "lm_head"):
            del self.model.lm_head

    def forward(
        self,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        """
        Forward pass delegating to the underlying Florence-2 backbone.
        """
        with torch.autocast("cuda", dtype=torch.bfloat16):
            outputs = self.forward_vlm(
                **kwargs,
            )

        return outputs

    def forward_vlm(
        self,
        input_ids: torch.LongTensor,
        pixel_values: torch.FloatTensor,
        **kwargs,
    ):
        """
        Adapted from X-VLA: https://github.com/2toinf/X-VLA/blob/main/models/modeling_florence2.py

        Encode text + multi-view images via the Florence-2 encoder.
        Returns:
            enc_out.hidden_states: [B, T_enc, D]
        """
        # Move pixel values onto the model's device/dtype and extract visual tokens.
        param_dtype = next(self.model.parameters()).dtype
        pixel_values = pixel_values.to(self.model.device, dtype=param_dtype)
        valid_feats = self.model._encode_image(pixel_values)
        B_multiview, N, D = valid_feats.shape

        inputs_embeds = self.model.get_input_embeddings()(input_ids)

        # Fold all views of each example into a single sequence of image tokens.
        B, L, _ = inputs_embeds.shape
        image_features = valid_feats.view(B, -1, D)

        # Merge image tokens with the text embeddings using the Florence-2 helper.
        merged_embeds, attention_mask = self.model._merge_input_ids_with_image_features(
            image_features,
            inputs_embeds,
        )

        # Run only the encoder of the seq2seq language model.
        enc_out = self.model.language_model.model.encoder(
            attention_mask=attention_mask,
            inputs_embeds=merged_embeds,
        )
        enc_out.hidden_states = [enc_out.last_hidden_state]

        return enc_out

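    # Shape sketch for forward_vlm (assuming Florence-2's merge helper concatenates
    # the image tokens with the text tokens): valid_feats [B * n_views, N, D] is
    # reshaped to image_features [B, n_views * N, D], merged with inputs_embeds
    # [B, L, D], and the encoder returns last_hidden_state [B, n_views * N + L, D],
    # where D matches `projection_dim` (exposed as `hidden_size` in __init__).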
    def build_qwenvl_inputs(self, images, instructions, **kwargs):
        """
        Build model inputs from raw data (images + instructions).
        Follows the official Florence-2 format: https://huggingface.co/microsoft/Florence-2-large
        """
        assert len(images) == len(instructions), "Images and instructions must have the same length"
        assert len(images[0]) == 1, "Florence2 only supports batch size 1 for now"

        # Flatten the per-example image lists into a single batch for the processor.
        flatten_batch_images = []
        for example_images in images:
            flatten_batch_images.extend(example_images)

        # Prefix every instruction with the grounding task prompt.
        task_prompt = "Locate the objects with category name in the image."
        for index in range(len(instructions)):
            instruction = instructions[index]
            instructions[index] = task_prompt + " " + instruction

        inputs = self.processor(
            text=instructions,
            images=flatten_batch_images,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        inputs["labels"] = inputs["input_ids"].clone()

        return inputs.to(self.model.device)
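
# Minimal usage sketch (illustrative; mirrors the __main__ demo below and assumes a
# config object exposing `framework.qwenvl.base_vlm`):
#
#   vlm = _Florence_Interface(cfg)
#   batch = vlm.build_qwenvl_inputs(images=[[pil_image]], instructions=["pick up the cup"])
#   enc = vlm.forward_vlm(input_ids=batch["input_ids"], pixel_values=batch["pixel_values"])
#   features = enc.last_hidden_state  # [B, T_enc, hidden_size]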

if __name__ == "__main__":
    from omegaconf import OmegaConf
    import debugpy
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config_yaml",
        type=str,
        default="./starVLA/config/training/starvla_cotrain_oxe.yaml",
        help="Path to YAML config",
    )
    args, clipargs = parser.parse_known_args()

    debugpy.listen(("0.0.0.0", 10092))
    print("🔍 Rank 0 waiting for debugger attach on port 10092...")
    debugpy.wait_for_client()

    cfg = OmegaConf.load(args.config_yaml)

    model_id = "playground/Pretrained_models/Florence-2-large"
    cfg.framework.qwenvl.base_vlm = model_id
    qwen_vl = _Florence_Interface(cfg)
    qwen_vl.model.eval()
    import requests

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

    prompt = "<OD>"

    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
    image = Image.open(requests.get(url, stream=True).raw)

    inputs = qwen_vl.build_qwenvl_inputs(images=[[image]], instructions=[prompt])
    with torch.no_grad():
        with torch.autocast("cuda", dtype=torch.bfloat16):
            outputs = qwen_vl.forward_vlm(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
            )
    print(f"forward_vlm last_hidden_state shape: {outputs.last_hidden_state.shape}")
    print(f"forward_vlm hidden_states length: {len(outputs.hidden_states)}")