import base64
import io
import logging

from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizerBase, ProcessorMixin

logger = logging.getLogger(__name__)

# Default image patch size for vision-language models
# Note: Qwen3-VL uses 16, Qwen2.5-VL uses 14
# Reference: https://github.com/QwenLM/Qwen3-VL/blob/main/qwen-vl-utils/README.md
DEFAULT_PATCH_SIZE = 14


def load_tokenizer(name_or_path: str, **kwargs):
    """Load a tokenizer via ``AutoTokenizer.from_pretrained``.

    Args:
        name_or_path: Model identifier or local path forwarded to
            ``AutoTokenizer.from_pretrained``.
        **kwargs: Extra keyword arguments forwarded unchanged.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns.
    """
    return AutoTokenizer.from_pretrained(name_or_path, **kwargs)


def build_processor_kwargs(multimodal_inputs: dict | None = None) -> dict:
    """Build the keyword arguments for a processor call.

    The top-level ``return_tensors`` is forced to ``None`` (for input_ids),
    while each per-modality kwargs group (``audio_kwargs``, ``images_kwargs``,
    ``videos_kwargs``) is forced to ``return_tensors="pt"``. Any other options
    already present in a modality group are preserved.

    Args:
        multimodal_inputs: Optional mapping used as the starting point of the
            result. It is shallow-copied and never mutated.

    Returns:
        A new dict containing the entries of ``multimodal_inputs`` plus the
        forced ``return_tensors`` settings described above.
    """
    modality_forced = {"return_tensors": "pt"}

    result = dict(multimodal_inputs) if multimodal_inputs else {}
    # force return_tensors to None for input_ids
    result["return_tensors"] = None

    # set return_tensors="pt" for modality-specific outputs
    for key in ("audio_kwargs", "images_kwargs", "videos_kwargs"):
        # `or {}` treats an explicit None the same as a missing group instead
        # of failing while unpacking it. A fresh dict is built every time so
        # the caller's nested dict is never mutated or shared.
        result[key] = {**(result.get(key) or {}), **modality_forced}
    return result


def load_processor(name_or_path: str, **kwargs):
    """Load a processor via ``AutoProcessor.from_pretrained``, or return ``None``.

    Args:
        name_or_path: Model identifier or local path forwarded to
            ``AutoProcessor.from_pretrained``.
        **kwargs: Extra keyword arguments forwarded unchanged.

    Returns:
        The loaded object when it is a ``ProcessorMixin`` and not a tokenizer;
        ``None`` when loading raised ``OSError``/``ValueError`` (a warning is
        logged) or when the loaded object is not a usable processor.
    """
    try:
        proc = AutoProcessor.from_pretrained(name_or_path, **kwargs)
    except (OSError, ValueError) as e:
        logger.warning(f"Failed to load processor from {name_or_path}: {e}")
        return None

    # If HF returned a tokenizer, discard it.
    if isinstance(proc, PreTrainedTokenizerBase) or not isinstance(proc, ProcessorMixin):
        return None
    return proc


def process_vision_info(prompt, processor):
    """Extract image and video inputs from ``prompt`` using ``qwen_vl_utils``.

    The patch size is read from ``processor.image_processor.patch_size``.
    When the processor is ``None``, has no image processor, or the image
    processor exposes no (non-``None``) patch size, ``DEFAULT_PATCH_SIZE`` is
    used instead and an info message is logged.

    Args:
        prompt: Passed through to ``qwen_vl_utils.process_vision_info``
            (presumably chat-style messages carrying image/video entries;
            confirm against callers).
        processor: Object whose ``image_processor.patch_size`` supplies the
            patch size; may be ``None``.

    Returns:
        A dict with keys ``"images"`` and ``"videos"`` holding the two values
        returned by ``qwen_vl_utils.process_vision_info``.
    """
    # TODO: temporary solution, will write image utils for slime later
    from qwen_vl_utils import process_vision_info as qwen_process_vision_info

    # Chained getattr so a missing processor / image processor falls back to
    # the default instead of raising AttributeError.
    image_processor = getattr(processor, "image_processor", None)
    image_patch_size = getattr(image_processor, "patch_size", None)
    if image_patch_size is None:
        logger.info(f"Using default patch size: {DEFAULT_PATCH_SIZE}")
        image_patch_size = DEFAULT_PATCH_SIZE

    images, videos = qwen_process_vision_info(prompt, image_patch_size=image_patch_size)
    return {"images": images, "videos": videos}


def encode_image_for_rollout_engine(image) -> str:
    """Encode an image as a base64 PNG data URI, converting to RGB if needed.

    Args:
        image: An already-loaded image object exposing ``mode``, ``convert``
            and ``save`` (presumably a ``PIL.Image.Image``; this function does
            not read from a path).

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    if image.mode != "RGB":
        image = image.convert("RGB")

    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{image_base64}"