JustinTX's picture
Add files using upload-large-folder tool
d7b3a74 verified
import base64
import io
import logging
from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizerBase, ProcessorMixin
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Default image patch size for vision-language models. It is the fallback
# used by process_vision_info() when the processor's image processor does not
# expose a `patch_size` attribute.
# Note: Qwen3-VL uses 16, Qwen2.5-VL uses 14 (so this default matches Qwen2.5-VL).
# Reference: https://github.com/QwenLM/Qwen3-VL/blob/main/qwen-vl-utils/README.md
DEFAULT_PATCH_SIZE = 14
def load_tokenizer(name_or_path: str, **kwargs):
    """Load a tokenizer through ``AutoTokenizer.from_pretrained``.

    Args:
        name_or_path: Identifier or path handed to ``from_pretrained``.
        **kwargs: Forwarded unchanged to ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns.
    """
    tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
    return tokenizer
def build_processor_kwargs(multimodal_inputs: dict | None = None) -> dict:
forced = {
# force return_tensors to None for input_ids
"return_tensors": None,
}
modality_forced = {"return_tensors": "pt"}
result = dict(multimodal_inputs) if multimodal_inputs else {}
result.update(forced)
# set return_tensors="pt" for modality-specific outputs
for key in ("audio_kwargs", "images_kwargs", "videos_kwargs"):
if key in result:
result[key] = {**result[key], **modality_forced}
else:
result[key] = modality_forced.copy()
return result
def load_processor(name_or_path: str, **kwargs):
    """Load a processor via ``AutoProcessor``, or return ``None``.

    Args:
        name_or_path: Identifier or path handed to ``from_pretrained``.
        **kwargs: Forwarded unchanged to ``AutoProcessor.from_pretrained``.

    Returns:
        The loaded object when it is a ``ProcessorMixin`` that is not also a
        tokenizer; otherwise ``None``. ``None`` is also returned (after a
        warning is logged) when loading raises ``OSError`` or ``ValueError``.
    """
    try:
        candidate = AutoProcessor.from_pretrained(name_or_path, **kwargs)
    except (OSError, ValueError) as e:
        logger.warning(f"Failed to load processor from {name_or_path}: {e}")
        return None

    # If HF returned a tokenizer (or anything that is not a real processor),
    # discard it.
    if isinstance(candidate, PreTrainedTokenizerBase):
        return None
    if not isinstance(candidate, ProcessorMixin):
        return None
    return candidate
def process_vision_info(prompt, processor):
    """Extract image and video inputs from ``prompt`` using qwen_vl_utils.

    Args:
        prompt: Passed as-is to ``qwen_vl_utils.process_vision_info``.
        processor: Object with an ``image_processor`` attribute. Its
            ``patch_size`` is used when present, otherwise
            ``DEFAULT_PATCH_SIZE``.

    Returns:
        A dict with keys ``"images"`` and ``"videos"`` holding the two values
        returned by ``qwen_vl_utils.process_vision_info``.
    """
    # TODO: temporary solution, will write image utils for slime later
    from qwen_vl_utils import process_vision_info as qwen_process_vision_info

    # A sentinel distinguishes "attribute missing" from any real value.
    missing = object()
    image_patch_size = getattr(processor.image_processor, "patch_size", missing)
    if image_patch_size is missing:
        logger.info(f"Using default patch size: {DEFAULT_PATCH_SIZE}")
        image_patch_size = DEFAULT_PATCH_SIZE

    images, videos = qwen_process_vision_info(prompt, image_patch_size=image_patch_size)
    return {"images": images, "videos": videos}
def encode_image_for_rollout_engine(image) -> str:
    """Encode an image object as a base64 PNG data URI.

    Args:
        image: An already-loaded image object exposing ``mode``,
            ``convert(mode)`` and ``save(fp, format=...)`` (presumably a
            PIL image; confirm against callers). It is converted to RGB
            first when its mode is anything else.

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    rgb_image = image if image.mode == "RGB" else image.convert("RGB")

    png_stream = io.BytesIO()
    rgb_image.save(png_stream, format="PNG")

    payload = base64.b64encode(png_stream.getvalue()).decode("utf-8")
    return "data:image/png;base64," + payload