---
license: cc-by-sa-4.0
---

```python
# Minimal single-image inference example for the NaflexVLM2_5 checkpoint.
import io

import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

model_path = 'shilinxu/NaflexVLM2_5'

# trust_remote_code=True lets transformers load the custom model/processor
# classes shipped inside the model repository.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map='cuda:0',
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# Download the demo image. Fail fast on HTTP errors (instead of handing an
# error page to PIL) and never hang forever on a stalled connection.
url = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'
response = requests.get(url, timeout=30)
response.raise_for_status()
# Normalise to RGB so RGBA / palette / grayscale inputs also work when the
# URL is swapped for another image.
image = Image.open(io.BytesIO(response.content)).convert('RGB')

# One user turn containing a text instruction and one image placeholder;
# the actual pixels are passed to the processor separately via `images`.
messages = [
    {
        'role': 'user',
        'content': [
            {'type': 'text', 'text': 'Describe this image in detail.'},
            {'type': 'image'},
        ],
    }
]

# Render the conversation into the prompt string and append the
# generation prompt so the model continues with its own reply.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

inputs = processor(
    text=text,
    images=[image],
    padding=False,
    return_tensors="pt",
)
# Move the inputs to the model's device and use the same bfloat16 dtype the
# model was loaded with.
inputs = inputs.to(model.device, dtype=torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=128,
    temperature=1.0,
    repetition_penalty=1.2,
)
# Drop the leading prompt-length tokens from each output sequence so that
# only the newly generated tokens are decoded.
generated_ids = [
    out_ids[len(in_ids):]
    for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)
```