"""Run greedy-decoding inference of a Qwen2.5-VL checkpoint over a JSONL test set.

Usage:
    python <this_script> MODEL_PATH OUTPUT_NAME

Every non-blank line of ``TEST_FILE`` is parsed as one JSON sample. For each
sample a prompt is built from its optional ``description`` and ``question``
fields (plus the image at ``image_path`` when that file exists), the model's
reply is stored under ``model_output``, and all samples are written to
``EVAL_DIR/inference_results_<OUTPUT_NAME>.jsonl``.
"""
import json
import os
import sys
import time

import torch

EVAL_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
TEST_FILE = os.path.join(EVAL_DIR, "test_1533_openended.jsonl")

# Instruction appended to every prompt.
ANSWER_INSTRUCTION = "Please reason step by step, and put your final answer within \\boxed{}."
# Generation budget per sample.
MAX_NEW_TOKENS = 2048
# Print a progress line every this many samples.
LOG_EVERY = 50


def parse_args(argv):
    """Return ``(model_path, output_name)`` taken from *argv*.

    Extra trailing arguments are ignored.

    Raises:
        SystemExit: with a usage message when fewer than two arguments follow
            the program name.
    """
    if len(argv) < 3:
        prog = argv[0] if argv else "script"
        raise SystemExit(f"usage: {prog} MODEL_PATH OUTPUT_NAME")
    return argv[1], argv[2]


def load_samples(path):
    """Read a JSONL file and return its samples as a list of parsed objects.

    Blank and whitespace-only lines are skipped. The file is decoded as UTF-8
    explicitly so the result does not depend on the platform locale and
    matches the encoding used for the results file.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def build_prompt(sample):
    """Join the sample's non-empty description/question with the instruction."""
    candidates = (sample.get("description", ""), sample.get("question", ""))
    parts = [part for part in candidates if part]
    parts.append(ANSWER_INSTRUCTION)
    return "\n\n".join(parts)


def build_messages(sample):
    """Build the single-turn chat message list for one sample.

    The image entry is included only when ``image_path`` is set and the file
    exists. A path that is set but missing falls back to a text-only prompt,
    with a warning on stderr so broken paths are visible in the run log.
    """
    content = []
    img = sample.get("image_path", "")
    if img:
        if os.path.exists(img):
            content.append({"type": "image", "image": f"file://{img}"})
        else:
            print(f"Warning: image not found, running text-only: {img}", file=sys.stderr)
    content.append({"type": "text", "text": build_prompt(sample)})
    return [{"role": "user", "content": content}]


def generate_response(model, processor, process_vision_info, messages):
    """Greedy-decode the model's reply to *messages* and return it as text.

    ``process_vision_info`` is passed in because it comes from an import that
    is deferred until ``main`` runs.
    """
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")
    with torch.no_grad():
        ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)
    # Keep only the tokens that follow the prompt in the generated sequence.
    out_ids = ids[0][len(inputs.input_ids[0]):]
    return processor.decode(out_ids, skip_special_tokens=True)


def write_results(results, output_file):
    """Write *results* to *output_file* as UTF-8 JSONL, one object per line."""
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in results)


def main(model_path, output_name):
    """Load the model, run inference over ``TEST_FILE`` and save the results."""
    print(f"Model: {model_path}")
    print(f"Output: {output_name}")

    # Deferred so the module can be imported (and its helpers reused) without
    # these packages being installed.
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="cuda",
        attn_implementation="sdpa",
    )
    processor = AutoProcessor.from_pretrained(model_path)
    model.eval()

    samples = load_samples(TEST_FILE)
    print(f"Loaded {len(samples)} samples")

    results = []
    t0 = time.time()
    for idx, sample in enumerate(samples):
        messages = build_messages(sample)
        # The sample dict is annotated in place and reused as the result row.
        sample["model_output"] = generate_response(
            model, processor, process_vision_info, messages
        )
        results.append(sample)

        if (idx + 1) % LOG_EVERY == 0:
            elapsed = time.time() - t0
            rate = (idx + 1) / elapsed
            eta = (len(samples) - idx - 1) / rate / 60
            print(f"  {idx+1}/{len(samples)} ({rate:.1f}/s, ETA {eta:.0f}min)", flush=True)

    output_file = os.path.join(EVAL_DIR, f"inference_results_{output_name}.jsonl")
    write_results(results, output_file)
    print(f"\nDone: {len(results)} -> {output_file}")


if __name__ == "__main__":
    MODEL_PATH, OUTPUT_NAME = parse_args(sys.argv)
    main(MODEL_PATH, OUTPUT_NAME)