Upload root_scripts/make_val_parquet.py with huggingface_hub
Browse files
root_scripts/make_val_parquet.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""Convert base-model inference results (JSONL) into a validation parquet file.

Each non-blank line of the source JSONL is parsed as one record and turned
into a row with the keys ``data_source``, ``prompt``, ``ability``,
``reward_model`` and ``extra_info``.  The rows are written to a parquet file,
which is then read back so a few image paths can be checked on disk.

NOTE(review): the row layout looks like the one RL training frameworks such as
verl consume -- confirm against the actual data loader.
"""
import json
import os

import pandas as pd

SRC_PATH = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/inference_results_base.jsonl"

# Use absolute path for test images
IMAGE_BASE = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"

OUT_PATH = "/workspace/rl4phyx/RL4Phyx/oneshot/validation_data/metaphyx_oe_1533.parquet"


def load_records(path):
    """Return the parsed JSON objects from *path*, skipping blank lines.

    The file is read as UTF-8 explicitly so the result does not depend on the
    platform's default encoding.

    Raises:
        OSError: if *path* cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def build_row(record, image_base=IMAGE_BASE):
    """Build one output row from a single inference-result *record*.

    Missing keys fall back to the defaults used below (``index`` -> 0,
    ``category`` -> "unknown", everything else -> empty string).  The image
    path is ``<image_base>/<index>.png``; it is not checked for existence here.
    """
    idx = record.get("index", 0)
    question = record.get("question", "")
    # Ground truth is always stored as a stripped string, whatever its JSON type.
    gt_value = str(record.get("ground_truth_value", "")).strip()

    prompt_text = f"Look at the image and answer the physics question.\n\n{question}\n\nPlease reason step by step and put your final answer (with units if applicable) in \\boxed{{}}."

    return {
        "data_source": "metaphyx_physics",
        "prompt": [{"content": prompt_text, "role": "user"}],
        "ability": "physics",
        "reward_model": {"ground_truth": gt_value, "style": "rule"},
        "extra_info": {
            "category": record.get("category", "unknown"),
            "subfield": record.get("subfield", ""),
            "index": idx,
            # Use absolute path for image
            "image_path": os.path.join(image_base, f"{idx}.png"),
            "split": "test",
        },
    }


def sample_positions(n_rows):
    """Return the sorted row positions to spot-check for a table of *n_rows*.

    Checks positions 0, 100, 500, 1000 and the last row, dropping any that
    fall outside the table.  For 1533 rows this is ``[0, 100, 500, 1000, 1532]``;
    for an empty table it is ``[]``.
    """
    candidates = (0, 100, 500, 1000, n_rows - 1)
    return sorted({i for i in candidates if 0 <= i < n_rows})


def main(src=SRC_PATH, out_path=OUT_PATH, image_base=IMAGE_BASE):
    """Write the parquet file, then read it back and report on image paths."""
    rows = [build_row(r, image_base) for r in load_records(src)]
    df = pd.DataFrame(rows)

    # to_parquet does not create missing parent directories.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_parquet(out_path, index=False)

    # Verify
    df2 = pd.read_parquet(out_path)
    print(f"Saved: {out_path}")
    print(f"Shape: {df2.shape}")

    if len(df2) == 0:
        # Nothing to index into; avoid an IndexError on an empty table.
        print("No rows written; skipping image checks.")
        return

    # Check first image exists
    img0 = df2.iloc[0]["extra_info"]["image_path"]
    print(f"First image: {img0}")
    print(f"Exists: {os.path.exists(img0)}")

    # Check a few more
    for i in sample_positions(len(df2)):
        ip = df2.iloc[i]["extra_info"]["image_path"]
        print(f" [{i}] {os.path.basename(ip)}: exists={os.path.exists(ip)}")


if __name__ == "__main__":
    main()
|