from typing import Any

from transformers import (
    GenerationConfig,
    AutoProcessor,
    AutoTokenizer,
    AutoModelForImageTextToText,
    Qwen2_5_VLForConditionalGeneration,
)
from qwen_vl_utils import process_vision_info

model_name = "Qwen/Qwen2.5-VL-7B-Instruct"

# If it is any form of ID - return only a list of keys and values.


class EndpointHandler:
    """Callable wrapper that runs chat-style requests through Qwen2.5-VL.

    The model and its processor are loaded once at construction time and
    reused for every request.
    """

    def __init__(self, path: str = "") -> None:
        """Load the model onto the GPU together with its processor.

        Args:
            path: Accepted so the handler can be instantiated with a model
                directory argument; it is not used, weights are always
                loaded from ``model_name``.
                NOTE(review): presumably the serving runtime passes the
                repository path here - confirm against the deployment.
        """
        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_name, torch_dtype="auto", device_map="cuda"
        )
        self.processor = AutoProcessor.from_pretrained(model_name)

    async def __call__(self, data: dict[str, Any]) -> str:
        """Generate the assistant reply for one chat conversation.

        NOTE(review): the body is fully blocking (no ``await``); the
        coroutine signature is kept only so existing callers that await
        the handler keep working.

        Args:
            data: Request payload; the conversation is read from its
                ``"messages"`` key and handed to the processor's chat
                template and to ``process_vision_info``.

        Returns:
            The decoded text of the newly generated tokens for the single
            conversation in the batch.

        Raises:
            ValueError: If ``"messages"`` is missing or empty.
        """
        messages = data.get("messages")
        if not messages:
            # Fail early with a clear message instead of an obscure error
            # from inside the chat template.
            raise ValueError(
                'request payload must contain a non-empty "messages" list'
            )

        gen_cfg = GenerationConfig(
            max_new_tokens=2048,
            no_repeat_ngram_size=3,
            # The option is called `repetition_penalty`; the previous
            # `repeat_penalty` spelling is not a generation option, so the
            # penalty was never applied.
            repetition_penalty=1.2,
            # NOTE(review): per the transformers docs this flag is only
            # consulted by beam-based decoding - confirm it is still wanted.
            early_stopping=True,
        )

        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # The processor returns CPU tensors while the model lives on the
        # GPU; move the batch to the model's device before generating.
        inputs = inputs.to(self.model.device)

        generated_ids = self.model.generate(**inputs, generation_config=gen_cfg)
        # `generate` returns prompt + completion; strip the prompt tokens so
        # only the newly generated part is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return output_text[0]