import base64
import io
import logging
from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizerBase, ProcessorMixin
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Fallback image patch size for vision-language models, used when a processor's
# image_processor does not expose a `patch_size` attribute.
# Note: Qwen3-VL uses 16, Qwen2.5-VL uses 14
# Reference: https://github.com/QwenLM/Qwen3-VL/blob/main/qwen-vl-utils/README.md
DEFAULT_PATCH_SIZE = 14
def load_tokenizer(name_or_path: str, **kwargs):
    """Load a tokenizer with ``AutoTokenizer.from_pretrained``.

    Args:
        name_or_path: Identifier or path passed straight to ``from_pretrained``.
        **kwargs: Extra keyword arguments forwarded unchanged.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns.
    """
    tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
    return tokenizer
def build_processor_kwargs(multimodal_inputs: dict | None = None) -> dict:
forced = {
# force return_tensors to None for input_ids
"return_tensors": None,
}
modality_forced = {"return_tensors": "pt"}
result = dict(multimodal_inputs) if multimodal_inputs else {}
result.update(forced)
# set return_tensors="pt" for modality-specific outputs
for key in ("audio_kwargs", "images_kwargs", "videos_kwargs"):
if key in result:
result[key] = {**result[key], **modality_forced}
else:
result[key] = modality_forced.copy()
return result
def load_processor(name_or_path: str, **kwargs):
    """Load a processor with ``AutoProcessor.from_pretrained``, or return ``None``.

    Args:
        name_or_path: Identifier or path passed straight to ``from_pretrained``.
        **kwargs: Extra keyword arguments forwarded unchanged.

    Returns:
        The loaded object when it is a ``ProcessorMixin`` and not a
        ``PreTrainedTokenizerBase``; otherwise ``None``. ``None`` is also
        returned, after a warning is logged, when loading raises ``OSError``
        or ``ValueError``.
    """
    try:
        loaded = AutoProcessor.from_pretrained(name_or_path, **kwargs)
    except (OSError, ValueError) as e:
        logger.warning(f"Failed to load processor from {name_or_path}: {e}")
        return None

    # If HF returned a tokenizer, discard it.
    if isinstance(loaded, PreTrainedTokenizerBase):
        return None
    # Anything that is not an actual processor is discarded as well.
    if not isinstance(loaded, ProcessorMixin):
        return None
    return loaded
def process_vision_info(prompt, processor):
    """Collect the image and video inputs for ``prompt``.

    Delegates to ``qwen_vl_utils.process_vision_info``. The patch size passed
    along is ``processor.image_processor.patch_size`` when that attribute
    exists, and ``DEFAULT_PATCH_SIZE`` (logged at info level) otherwise.

    Args:
        prompt: Forwarded unchanged to the qwen helper (presumably chat-style
            messages -- confirm against callers).
        processor: Object exposing an ``image_processor`` attribute.

    Returns:
        A dict with keys ``"images"`` and ``"videos"`` holding the two values
        returned by the qwen helper.
    """
    # TODO: temporary solution, will write image utils for slime later
    from qwen_vl_utils import process_vision_info as qwen_process_vision_info

    image_processor = processor.image_processor
    if not hasattr(image_processor, "patch_size"):
        logger.info(f"Using default patch size: {DEFAULT_PATCH_SIZE}")
        patch = DEFAULT_PATCH_SIZE
    else:
        patch = image_processor.patch_size

    images, videos = qwen_process_vision_info(prompt, image_patch_size=patch)
    return {"images": images, "videos": videos}
def encode_image_for_rollout_engine(image) -> str:
    """Encode an image object as a base64 PNG data URI.

    The image is used only through its ``mode`` attribute and its ``convert``
    and ``save`` methods (presumably a PIL image -- confirm against callers).
    It is not loaded from a path here.

    Args:
        image: Image object; converted with ``convert("RGB")`` when its mode
            is not already ``"RGB"``.

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    rgb_image = image.convert("RGB") if image.mode != "RGB" else image

    png_stream = io.BytesIO()
    rgb_image.save(png_stream, format="PNG")

    payload = base64.b64encode(png_stream.getvalue()).decode("utf-8")
    return "data:image/png;base64," + payload
|