# Copyright 2025 starVLA community. All rights reserved. # Licensed under the MIT License, Version 1.0 (the "License"); # Implemented by [Jinhui YE / HKUST University] in [2025]. import torch from typing import Optional, List from transformers.modeling_outputs import CausalLMOutputWithPast from transformers import Qwen3VLForConditionalGeneration, AutoProcessor from transformers.modeling_outputs import CausalLMOutputWithPast from typing import Dict, Optional, List from torch.nn.utils.rnn import pad_sequence from transformers import BatchFeature from qwen_vl_utils import process_vision_info from accelerate.logging import get_logger logger = get_logger(__name__) IGNORE_INDEX = -100 IMAGE_TOKEN_INDEX = 151655 VIDEO_TOKEN_INDEX = 151656 DEFAULT_IMAGE_TOKEN = "" DEFAULT_VIDEO_TOKEN = "