# IBIS / infer.py
# (origin: uploaded by manglu3935, commit 4b21f47 "Add infer.py")
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, GenerationConfig
from qwen_vl_utils import process_vision_info
# ==========================
# Initialize the local Qwen2.5-VL model
# ==========================
MODEL_PATH = "XXXXXX"  # placeholder — set to the local model checkpoint directory
print(f"[Init] Loading model from {MODEL_PATH}")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda:0"},  # pin the whole model to a single GPU (no sharding)
    attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained(MODEL_PATH)
gen_config = GenerationConfig(
    max_new_tokens=512,
    do_sample=False,  # greedy decoding; temperature is effectively ignored when sampling is off
    temperature=1,
    num_return_sequences=1,
    # NOTE(review): 151643 is presumably the Qwen tokenizer's <|endoftext|> id — confirm
    # against processor.tokenizer rather than hard-coding if the checkpoint changes.
    pad_token_id=151643,
)
# ==========================
# Inference function
# ==========================
def call_qwen_local(prompt, images=None, system_prompt=None):
    """Run one chat-style inference with the local Qwen2.5-VL model.

    Args:
        prompt: User text prompt.
        images: Optional list of image paths/URLs attached before the text.
            ``None`` or an empty list both produce a text-only user turn
            (previously an empty list still took the multimodal branch and
            built a user message with zero images).
        system_prompt: Optional system prompt, prepended as a system message.

    Returns:
        The decoded generation for the first (only) sequence, with the
        echoed prompt tokens stripped.
    """
    # ----- Build the chat message list -----
    messages = []
    if system_prompt:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}],
        })
    if images:
        # Images first, then the text, matching Qwen's multimodal convention.
        content = [{"type": "image", "image": img_path} for img_path in images]
        content.append({"type": "text", "text": prompt})
        messages.append({"role": "user", "content": content})
    else:
        messages.append({
            "role": "user",
            "content": [{"type": "text", "text": prompt}],
        })
    # ----- Prepare model inputs -----
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")
    # ----- Generate -----
    generated_ids = model.generate(
        **inputs,
        generation_config=gen_config,
    )
    # Drop the prompt prefix from each output so only new tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return output_text[0]
# ==========================
# Main: inputs are defined inline (no CLI parsing)
# ==========================
if __name__ == "__main__":
    # Text-only usage example, kept for reference:
    # SYSTEM_PROMPT = (
    #     "You are a helpful assistant. "
    # )
    # USER_PROMPT = "9.11和9.9谁更大?"
    # System prompt describing the interactive-segmentation agent protocol
    # (<think>/<action>/<answer> tags, normalized point coordinates).
    SYSTEM_PROMPT = (
        "You are a precise and expert medical segmentation agent. Your mission is to accurately segment a target object in a medical image through a series of interactive point placements. You will be given an image and an instruction. You must carefully analyze the image state. If there is no mask, it is an initialization step. Your goal is to place a Positive Point on a clear, representative part of the target object. If a semi-transparent green mask is present, it is a refinement step. Your goal is to improve its accuracy. Place a Positive Point on a region of the target that the mask has missed, or a Negative Point on an area the mask has incorrectly included.\nYour response must strictly follow this structure: first, your detailed reasoning within <think> tags, and then your single, decisive move within <action> tags. The only valid actions are Positive Point (x, y), Negative Point (x, y), or Terminate. All coordinates (x, y) must be normalized to a 0.0-1.0 scale and formatted to four decimal places, for example: Positive Point (0.5000, 0.2500). Only use Terminate when the mask perfectly aligns with the target boundary. If you Terminate, you must also append a final, concise summary in an <answer> tag."
    )
    USER_PROMPT = "<image>This part of the right lung is not included."
    # Placeholder path — replace with a real image on the local filesystem.
    IMAGE_PATHS = ["XXXX/biomedparse/img/COVID-QU-Ex_covid_28_X-Ray_chest_2_mask_4.png"]
    print("\n[Running Qwen2.5-VL inference with system prompt ...]")
    model_output = call_qwen_local(USER_PROMPT, IMAGE_PATHS, system_prompt=SYSTEM_PROMPT)
    # Echo the full exchange for manual inspection.
    print("\n===============================")
    print("System Prompt:")
    print(SYSTEM_PROMPT)
    print("-------------------------------")
    print("User Prompt:")
    print(USER_PROMPT)
    print("-------------------------------")
    print("Model Output:")
    print(model_output)
    print("===============================")