```python import os import torch from transformers import AutoModelForVision2Seq, AutoProcessor from peft import PeftModel from qwen_vl_utils import process_vision_info MODEL_PATH = "Qwen/Qwen3-VL-8B-Instruct" ADAPTER_PATH = "yang1232009/HanMoVLM" IMAGE_PATH = "" PROMPT_TEXT = ( "你是一位中国传统绘画鉴赏专家,熟悉笔墨技法、中国画美学、艺术史与文人画理论。" "请对输入的国风绘画图像进行深入、专业、客观以及细致的艺术评估。\n" "按以下格式输出:\n" "原因分析: [详细分析]\n" "最终分数: [0-5整数分数]" ) base_model = AutoModelForVision2Seq.from_pretrained( MODEL_PATH, torch_dtype=torch.float16, device_map="cuda", ) model = PeftModel.from_pretrained(base_model, ADAPTER_PATH) model = model.merge_and_unload() model.eval() processor = AutoProcessor.from_pretrained(MODEL_PATH) image_path = IMAGE_PATH if not os.path.exists(image_path): raise FileNotFoundError(f"Image file not found: {image_path}") messages = [ { "role": "user", "content": [ {"type": "image", "image": image_path}, {"type": "text", "text": PROMPT_TEXT}, ], } ] inputs = processor.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt" ) inputs = inputs.to(model.device) # Inference: Generation of the output generated_ids = model.generate(**inputs, max_new_tokens=4096) generated_ids_trimmed = [ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) ] output_text = processor.batch_decode( generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False ) print(output_text) ```