from prompts import make_user_query, system_prompt
from transformers import (
    Qwen3_5ForConditionalGeneration,
    AutoProcessor,
)
from PIL import Image
import torch

MODEL_PATH = "M:/ai/qwen3.5_mm_trainer/Qwen3.5-4B-Base_k2"
DEVICE = 'cuda'

# Load the fine-tuned model in bfloat16 with SDPA attention on the target device.
model = Qwen3_5ForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
    device_map=DEVICE,
)

# min_pixels sets a lower bound on the image resolution the processor will produce.
processor = AutoProcessor.from_pretrained(
    MODEL_PATH,
    min_pixels=256 * 32 * 32,
    padding_side="right",
)

# Caption-generation options passed through to make_user_query.
C_TYPE = 'long_thoughts_v2'
USE_NAMES = True
ADD_TAGS = False
ADD_CHAR_LIST = False
ADD_CHARS_TAGS = False
ADD_CHARS_DESCR = False


def prepare_messages(item):
    """Build the chat-format message list: system prompt plus an image + text user turn."""
    user_query = make_user_query(
        item, C_TYPE, USE_NAMES, ADD_TAGS,
        ADD_CHAR_LIST, ADD_CHARS_TAGS, ADD_CHARS_DESCR,
    )
    return [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": user_query},
            ],
        },
    ]


img = Image.open('test_image.png')
images = [img]
msgs = prepare_messages({})

# Render the messages into the model's chat template, leaving the assistant turn open.
texts = [processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)]

# Tokenize the text and preprocess the image, then move all tensors to the device.
inputs = processor(text=texts, images=images, return_tensors="pt")
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

with torch.no_grad():
    generate_ids = model.generate(**inputs, max_new_tokens=1024)

# Decode only the newly generated tokens, slicing off the prompt portion.
generated_texts = processor.batch_decode(
    generate_ids[:, inputs["input_ids"].shape[1]:],
    skip_special_tokens=True,
)
print(generated_texts[0])