rl4phyx-backup / eval_footprint /eval_single_model_template.py

Upload folder using huggingface_hub

c6cbaf4 verified 2 months ago

7.61 kB

	#!/usr/bin/env python3
	"""
	Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).

	Single-model template: runs the model configured as SFT_MODEL on the
	1533-sample open-ended physics test set, sharding the samples across 8 GPUs.
	Saves raw model outputs for later judging.

	Usage (inside Docker container), after filling in the *_PLACEHOLDER values:
	cd /tmp && python3 /path/to/eval_single_model_template.py

	Output:
	<OUTPUT_DIR>/<OUTPUT_NAME_PLACEHOLDER>.jsonl (one JSON record per test sample)
	"""
	import os
	import sys
	import json
	import re
	import time
	import torch
	import multiprocessing as mp
	from collections import Counter

	# ============ CONFIG ============
	# Put the Hugging Face libraries into offline mode so that nothing tries to
	# reach the hub; all models are loaded from local paths.
	os.environ["HF_HUB_OFFLINE"] = "1"
	os.environ["TRANSFORMERS_OFFLINE"] = "1"

	# NOTE(review): BASE_MODEL is not referenced by any function visible in this
	# file; presumably a leftover from the two-model script — confirm.
	BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
	# Template placeholder; presumably substituted with a real checkpoint path
	# before the script is run.
	SFT_MODEL = "MODEL_PATH_PLACEHOLDER"
	# JSONL test set, one sample per line.
	TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
	# Directory that receives the per-GPU shards and the final merged results.
	OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
	# Each sample's 'image' field is joined onto this directory.
	IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"

	# Multi-GPU config: base on GPUs 0-3, SFT on GPUs 4-7
	# NOTE(review): main() uses its own 8-GPU list rather than these two
	# constants, so they currently have no effect on a run — confirm intent.
	BASE_GPUS = [0, 1, 2, 3]
	SFT_GPUS = [4, 5, 6, 7]
	# Upper bound on generated tokens per sample (passed to model.generate).
	MAX_NEW_TOKENS = 2048
	# ================================


	def load_test_data():
	    """Read the JSONL test set at TEST_FILE and return its records.

	    Every non-blank line is parsed as one JSON document; blank or
	    whitespace-only lines are skipped.

	    Returns:
	        list: the parsed samples, in file order.
	    """
	    with open(TEST_FILE, 'r', encoding='utf-8') as handle:
	        return [json.loads(raw) for raw in handle if raw.strip()]


	def build_open_ended_prompt(sample):
	    """Compose the free-form (no multiple-choice options) prompt for a sample.

	    The sample's 'description' and 'question' fields, each defaulting to an
	    empty string when absent, are inserted into a fixed template that asks
	    for step-by-step reasoning and a boxed final answer. Leading and trailing
	    whitespace is stripped from the result.
	    """
	    question = sample.get('question', '')
	    desc = sample.get('description', '')

	    return f"""Look at the image and answer the physics question.

	{desc}

	{question}

	Please reason step by step, and put your final answer within \\boxed{{}}.
	""".strip()


	def worker_inference(gpu_id, model_path, samples, output_file, model_name):
	    """Worker: load the model on one GPU and run inference on its samples.

	    Each record is written to ``output_file`` (JSONL) and flushed as soon as
	    its sample finishes, so a crash or preemption late in the run does not
	    discard the work already done.

	    Args:
	        gpu_id: CUDA device ordinal; the model is placed on ``cuda:{gpu_id}``.
	        model_path: Local checkpoint path (loaded with
	            ``local_files_only=True``).
	        samples: List of sample dicts. Each must provide ``index``,
	            ``image``, ``category``, ``question`` and ``ground_truth_value``;
	            ``description``, ``subfield`` and ``ground_truth_letter`` are
	            optional.
	        output_file: Path of the per-GPU JSONL file to (over)write.
	        model_name: Label used in log lines and stored in every record.

	    Returns:
	        int: number of records written.
	    """
	    # Heavy imports stay local so they are only paid for inside the worker.
	    import torch
	    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
	    from qwen_vl_utils import process_vision_info

	    device = f"cuda:{gpu_id}"
	    tag = f"[{model_name}][GPU {gpu_id}]"
	    print(f"{tag} Loading model...", flush=True)

	    processor = AutoProcessor.from_pretrained(
	        model_path,
	        min_pixels=3136,
	        max_pixels=200704,
	        local_files_only=True,
	        trust_remote_code=True,
	    )
	    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	        model_path,
	        torch_dtype=torch.bfloat16,
	        device_map=device,
	        local_files_only=True,
	        trust_remote_code=True,
	    )
	    model.eval()
	    total = len(samples)
	    print(f"{tag} Model loaded. Processing {total} samples.", flush=True)

	    out_dir = os.path.dirname(output_file)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)

	    written = 0
	    with open(output_file, 'w', encoding='utf-8') as out:
	        for i, sample in enumerate(samples):
	            idx = sample['index']
	            prompt_text = build_open_ended_prompt(sample)
	            image_path = os.path.join(IMAGE_DIR, sample['image'])

	            # Single-turn chat message: one image followed by the text prompt.
	            messages = [{
	                "role": "user",
	                "content": [
	                    {"type": "image", "image": f"file://{image_path}"},
	                    {"type": "text", "text": prompt_text},
	                ],
	            }]

	            try:
	                text = processor.apply_chat_template(
	                    messages, tokenize=False, add_generation_prompt=True
	                )
	                image_inputs, video_inputs = process_vision_info(messages)
	                inputs = processor(
	                    text=[text],
	                    images=image_inputs,
	                    videos=video_inputs,
	                    padding=True,
	                    return_tensors="pt",
	                ).to(device)

	                with torch.no_grad():
	                    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

	                # Drop the prompt tokens; keep only the newly generated ones.
	                generated = output_ids[0][inputs.input_ids.shape[1]:]
	                response = processor.decode(generated, skip_special_tokens=True)
	            except Exception as e:
	                # Deliberate best-effort: one bad sample must not abort the
	                # shard. The failure is stored in the record and also logged.
	                response = f"ERROR: {str(e)}"
	                print(f"{tag} sample {idx} failed: {e}", flush=True)

	            record = {
	                "index": idx,
	                "category": sample['category'],
	                "subfield": sample.get('subfield', ''),
	                "question": sample['question'],
	                "ground_truth_value": sample['ground_truth_value'],
	                "ground_truth_letter": sample.get('ground_truth_letter', ''),
	                "model_output": response,
	                "model_name": model_name,
	                "gpu_id": gpu_id,
	            }
	            out.write(json.dumps(record, ensure_ascii=False) + '\n')
	            # Generation takes far longer than a flush, so persisting every
	            # record immediately is effectively free.
	            out.flush()
	            written += 1

	            if (i + 1) % 20 == 0 or (i + 1) == total:
	                print(f"{tag} {i+1}/{total} done", flush=True)

	    print(f"{tag} Saved {written} results to {output_file}", flush=True)
	    return written


	def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
	    """Split samples across GPUs and run one inference worker per GPU.

	    Samples are divided into contiguous, near-equal chunks (sizes differ by
	    at most one) so every GPU gets work whenever there are at least as many
	    samples as GPUs. Each worker writes ``{output_base}_gpu{gpu_id}.jsonl``.

	    Args:
	        model_path: Checkpoint path handed to every worker.
	        model_name: Label handed to every worker and used in log lines.
	        gpu_ids: CUDA device ordinals, one worker process per entry.
	        samples: Full list of samples to distribute.
	        output_base: Path prefix for the per-GPU output files.

	    Returns:
	        list[str]: paths of the per-GPU files of the workers that were
	        started, in ``gpu_ids`` order. Empty when there are no samples.

	    Raises:
	        ValueError: if there are samples to process but ``gpu_ids`` is empty.
	    """
	    n = len(samples)
	    if n == 0:
	        return []
	    k = len(gpu_ids)
	    if k == 0:
	        raise ValueError("gpu_ids must not be empty when there are samples to process")

	    # The first `extra` GPUs take one additional sample each.
	    base, extra = divmod(n, k)

	    workers = []
	    output_files = []
	    start = 0
	    for i, gpu_id in enumerate(gpu_ids):
	        size = base + (1 if i < extra else 0)
	        if size == 0:
	            # Fewer samples than GPUs: the remaining GPUs stay idle.
	            continue
	        chunk = samples[start:start + size]
	        start += size
	        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
	        output_files.append(out_file)
	        proc = mp.Process(
	            target=worker_inference,
	            args=(gpu_id, model_path, chunk, out_file, model_name),
	        )
	        workers.append((gpu_id, proc))

	    for _, proc in workers:
	        proc.start()
	    for _, proc in workers:
	        proc.join()

	    # A non-zero exit code means the worker died before finishing its chunk.
	    failed = [gpu_id for gpu_id, proc in workers if proc.exitcode != 0]
	    if failed:
	        print(
	            f"[{model_name}] WARNING: workers on GPUs {failed} exited abnormally; "
	            f"results may be incomplete",
	            flush=True,
	        )

	    return output_files


	def merge_results(output_files, final_output):
	    """Combine the per-GPU JSONL shards into a single file ordered by index.

	    Shards that do not exist are skipped. Records from all existing shards
	    are parsed, sorted by their 'index' field, written to ``final_output``
	    as JSONL, and the shards are then deleted.

	    Returns:
	        list: the merged records, sorted by 'index'.
	    """
	    merged = []
	    for shard_path in output_files:
	        if not os.path.exists(shard_path):
	            continue
	        with open(shard_path, 'r', encoding='utf-8') as shard:
	            merged.extend(json.loads(raw) for raw in shard if raw.strip())

	    # Ordering by index makes the merged file independent of the sharding.
	    merged = sorted(merged, key=lambda rec: rec['index'])

	    with open(final_output, 'w', encoding='utf-8') as sink:
	        sink.writelines(
	            json.dumps(rec, ensure_ascii=False) + '\n' for rec in merged
	        )

	    # The shards are redundant once the merged file has been written.
	    for shard_path in output_files:
	        if os.path.exists(shard_path):
	            os.remove(shard_path)

	    return merged


	def main():
	    """Run the single-model evaluation end to end.

	    Loads the test set, runs SFT_MODEL over it on 8 GPUs in parallel, then
	    merges the per-GPU shards into ``<OUTPUT_DIR>/<output name>.jsonl``.

	    The merge always runs, so a results file left over from an earlier run
	    is overwritten instead of being reported as the current output while
	    the fresh shards are left unmerged on disk.
	    """
	    samples = load_test_data()
	    print(f"Loaded {len(samples)} test samples")
	    print(f"Model: {SFT_MODEL}")

	    sft_output = os.path.join(OUTPUT_DIR, "OUTPUT_NAME_PLACEHOLDER")
	    # The single-model template uses all 8 GPUs for the one model.
	    gpu_ids = list(range(8))
	    output_files = run_model_parallel(SFT_MODEL, "sft", gpu_ids, samples, sft_output)

	    # Append the extension only when the configured name lacks it.
	    sft_final = sft_output if sft_output.endswith(".jsonl") else sft_output + ".jsonl"

	    # merge_results reads and writes UTF-8, sorts by index, and removes the
	    # per-GPU shards once the merged file is written.
	    all_results = merge_results(output_files, sft_final)
	    print(f"Total: {len(all_results)} results -> {sft_final}")

	# Entry-point guard: run the evaluation only when executed as a script.
	# The multiprocessing docs require this guard so that child processes which
	# re-import this module (e.g. under the "spawn" start method) do not start
	# the whole run again.
	if __name__ == "__main__":
	    main()