| import base64 |
| import io |
| import logging |
|
|
| from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizerBase, ProcessorMixin |
|
|
| logger = logging.getLogger(__name__) |
|
|
| |
| |
| |
# Fallback image patch size: process_vision_info uses this value (and logs that
# it did) when the processor's image_processor has no `patch_size` attribute.
DEFAULT_PATCH_SIZE = 14
|
|
|
|
def load_tokenizer(name_or_path: str, **kwargs):
    """Load a tokenizer through ``AutoTokenizer.from_pretrained``.

    Args:
        name_or_path: Passed to ``from_pretrained`` unchanged.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for these arguments.
    """
    tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
    return tokenizer
|
|
|
|
| def build_processor_kwargs(multimodal_inputs: dict | None = None) -> dict: |
|
|
| forced = { |
| |
| "return_tensors": None, |
| } |
| modality_forced = {"return_tensors": "pt"} |
|
|
| result = dict(multimodal_inputs) if multimodal_inputs else {} |
|
|
| result.update(forced) |
|
|
| |
| for key in ("audio_kwargs", "images_kwargs", "videos_kwargs"): |
| if key in result: |
| result[key] = {**result[key], **modality_forced} |
| else: |
| result[key] = modality_forced.copy() |
|
|
| return result |
|
|
|
|
def load_processor(name_or_path: str, **kwargs):
    """Load a processor through ``AutoProcessor.from_pretrained``, or return ``None``.

    Args:
        name_or_path: Passed to ``from_pretrained`` unchanged.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``.

    Returns:
        The loaded object when it is a ``ProcessorMixin`` instance and not a
        ``PreTrainedTokenizerBase`` instance; otherwise ``None``. ``None`` is
        also returned, after a warning is logged, when loading raises
        ``OSError`` or ``ValueError``. Other exceptions propagate.
    """
    try:
        candidate = AutoProcessor.from_pretrained(name_or_path, **kwargs)
    except (OSError, ValueError) as e:
        logger.warning(f"Failed to load processor from {name_or_path}: {e}")
        return None

    # Accept only real processors: anything that is a tokenizer, or that is
    # not a ProcessorMixin at all, is reported as "no processor".
    is_tokenizer = isinstance(candidate, PreTrainedTokenizerBase)
    is_processor = isinstance(candidate, ProcessorMixin)
    if is_processor and not is_tokenizer:
        return candidate
    return None
|
|
|
|
def process_vision_info(prompt, processor):
    """Extract image and video inputs from ``prompt`` using ``qwen_vl_utils``.

    Args:
        prompt: Passed unchanged as the first argument to
            ``qwen_vl_utils.process_vision_info``.
        processor: Object with an ``image_processor`` attribute. Its
            ``patch_size`` is used when present; otherwise
            ``DEFAULT_PATCH_SIZE`` is used and an info message is logged.

    Returns:
        A dict ``{"images": ..., "videos": ...}`` holding the two values
        returned by ``qwen_vl_utils.process_vision_info``.
    """
    # Imported inside the function, so qwen_vl_utils is only required when
    # this helper is actually called.
    from qwen_vl_utils import process_vision_info as qwen_process_vision_info

    image_processor = processor.image_processor
    if not hasattr(image_processor, "patch_size"):
        logger.info(f"Using default patch size: {DEFAULT_PATCH_SIZE}")
        image_patch_size = DEFAULT_PATCH_SIZE
    else:
        image_patch_size = image_processor.patch_size

    images, videos = qwen_process_vision_info(prompt, image_patch_size=image_patch_size)
    return {"images": images, "videos": videos}
|
|
|
|
def encode_image_for_rollout_engine(image) -> str:
    """Encode an image object as a base64 PNG data URI.

    Args:
        image: An already-loaded image object exposing ``mode``,
            ``convert(mode)`` and ``save(fp, format=...)`` (the PIL ``Image``
            interface is assumed — confirm against callers). It is converted
            to RGB first when its mode is anything other than ``"RGB"``.

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    rgb_image = image if image.mode == "RGB" else image.convert("RGB")

    with io.BytesIO() as png_buffer:
        rgb_image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()

    payload = base64.b64encode(png_bytes).decode("utf-8")
    return f"data:image/png;base64,{payload}"
|
|