"""Convert MetaPhyX inference results (JSONL) into a validation parquet file.

Reads one JSON object per non-blank line from ``SRC_PATH``, turns each into a
row holding a user prompt, the ground-truth answer and bookkeeping metadata
(including the absolute path ``IMAGE_BASE/<index>.png``), writes all rows to
``OUT_PATH`` and then reads the file back to print a few sanity checks.
"""

import json
import os

import pandas as pd

SRC_PATH = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/inference_results_base.jsonl"
# Use absolute path for test images
IMAGE_BASE = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
OUT_PATH = "/workspace/rl4phyx/RL4Phyx/oneshot/validation_data/metaphyx_oe_1533.parquet"

# Row positions spot-checked after writing; positions beyond the end of the
# written table are skipped instead of raising IndexError.
SAMPLE_POSITIONS = (0, 100, 500, 1000, 1532)


def load_records(path: str) -> list[dict]:
    """Return the JSON objects stored one per line in *path*, skipping blank lines."""
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def build_row(record: dict, image_base: str = IMAGE_BASE) -> dict:
    """Build one output row from a single inference-result record.

    Args:
        record: Parsed JSON object. The keys ``index``, ``category``,
            ``subfield``, ``ground_truth_value`` and ``question`` are read;
            each falls back to a default when missing.
        image_base: Directory joined with ``<index>.png`` to form the
            row's image path.

    Returns:
        A dict with the keys ``data_source``, ``prompt``, ``ability``,
        ``reward_model`` and ``extra_info``.
    """
    idx = record.get("index", 0)
    question = record.get("question", "")
    # The ground truth is always stored as a whitespace-stripped string.
    gt_value = str(record.get("ground_truth_value", "")).strip()
    prompt_text = f"Look at the image and answer the physics question.\n\n{question}\n\nPlease reason step by step and put your final answer (with units if applicable) in \\boxed{{}}."
    return {
        "data_source": "metaphyx_physics",
        "prompt": [{"content": prompt_text, "role": "user"}],
        "ability": "physics",
        "reward_model": {"ground_truth": gt_value, "style": "rule"},
        "extra_info": {
            "category": record.get("category", "unknown"),
            "subfield": record.get("subfield", ""),
            "index": idx,
            "image_path": os.path.join(image_base, f"{idx}.png"),
            "split": "test",
        },
    }


def valid_positions(candidates, n_rows: int) -> list[int]:
    """Return the entries of *candidates* that are valid row positions for *n_rows* rows."""
    return [i for i in candidates if 0 <= i < n_rows]


def main() -> None:
    """Run the conversion and print verification output."""
    rows = [build_row(r) for r in load_records(SRC_PATH)]
    df = pd.DataFrame(rows)

    # Create the target directory up front so to_parquet cannot fail on a
    # missing parent folder.
    os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
    df.to_parquet(OUT_PATH, index=False)

    # Verify by reading back what was actually written.
    df2 = pd.read_parquet(OUT_PATH)
    print(f"Saved: {OUT_PATH}")
    print(f"Shape: {df2.shape}")

    if df2.empty:
        # Nothing to index into; the checks below would raise IndexError.
        print("No rows written; skipping image checks.")
        return

    # Check first image exists
    img0 = df2.iloc[0]["extra_info"]["image_path"]
    print(f"First image: {img0}")
    print(f"Exists: {os.path.exists(img0)}")

    # Check a few more
    for i in valid_positions(SAMPLE_POSITIONS, len(df2)):
        ip = df2.iloc[i]["extra_info"]["image_path"]
        print(f"  [{i}] {os.path.basename(ip)}: exists={os.path.exists(ip)}")


if __name__ == "__main__":
    main()