import base64
import io

import torch
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info


class EndpointHandler:
    def __init__(self, model_dir):
        # Load the model in bfloat16 and let accelerate place it on the available device(s).
        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_dir,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        self.processor = AutoProcessor.from_pretrained(
            model_dir, trust_remote_code=True
        )

    def __call__(self, data):
        # Expected payload: {"inputs": {"image": <base64-encoded image>, "text": <prompt>}}
        image_b64 = data["inputs"]["image"]
        prompt = data["inputs"]["text"]
        image = Image.open(io.BytesIO(base64.b64decode(image_b64)))

        # Build a single-turn chat message pairing the image with the text prompt.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Render the chat template and extract the vision inputs for the processor.
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(self.model.device)

        # Generate, then strip the prompt tokens so only the new completion is decoded.
        outputs = self.model.generate(**inputs, max_new_tokens=512)
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, outputs)
        ]
        decoded = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True
        )
        return {"result": decoded[0]}
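

# Minimal local smoke test (illustrative sketch only): "Qwen/Qwen2.5-VL-7B-Instruct"
# and "example.jpg" below are placeholder assumptions, not values required by the
# handler. It exercises the same {"inputs": {"image": ..., "text": ...}} payload
# shape that the Inference Endpoint passes to __call__.
if __name__ == "__main__":
    handler = EndpointHandler("Qwen/Qwen2.5-VL-7B-Instruct")
    with open("example.jpg", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    response = handler({"inputs": {"image": image_b64, "text": "Describe this image."}})
    print(response["result"])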