YUNTA88 committed on
Commit
7735006
·
verified ·
1 Parent(s): 0f9dccc

Upload root_scripts/make_val_parquet.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. root_scripts/make_val_parquet.py +55 -0
root_scripts/make_val_parquet.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build the MetaPhyX validation parquet from a JSONL inference-results file.

Each JSONL record is turned into one row holding a single-turn user prompt,
the rule-based reward ground truth, and bookkeeping metadata that includes
the absolute path of the record's test image. The rows are written to a
parquet file, which is then read back and spot-checked.
"""

import json
import os

import pandas as pd

SRC_PATH = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/inference_results_base.jsonl"

# Use absolute path for test images
IMAGE_BASE = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"

OUT_PATH = "/workspace/rl4phyx/RL4Phyx/oneshot/validation_data/metaphyx_oe_1533.parquet"

# Row positions whose image paths are spot-checked after writing. Positions
# beyond the end of the written table are skipped instead of raising.
SPOT_CHECK_INDICES = (0, 100, 500, 1000, 1532)


def load_records(path: str) -> list:
    """Return the JSON objects stored one per line in *path*.

    Blank (whitespace-only) lines are ignored. The file is decoded as UTF-8
    explicitly so the result does not depend on the platform default encoding.

    Raises:
        OSError: if *path* cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def build_row(record: dict, image_base: str = IMAGE_BASE) -> dict:
    """Convert one inference-result record into a validation row.

    Missing keys fall back to the defaults used below: index ``0``, category
    ``"unknown"``, and empty strings for the remaining fields.

    Args:
        record: one parsed JSONL record.
        image_base: directory that holds the ``<index>.png`` test images.

    Returns:
        A dict with the keys ``data_source``, ``prompt``, ``ability``,
        ``reward_model`` and ``extra_info``.
    """
    idx = record.get("index", 0)
    cat = record.get("category", "unknown")
    subfield = record.get("subfield", "")
    # Stringify before stripping so non-string ground truths are accepted too.
    gt_value = str(record.get("ground_truth_value", "")).strip()
    question = record.get("question", "")

    prompt_text = (
        "Look at the image and answer the physics question.\n\n"
        f"{question}\n\n"
        "Please reason step by step and put your final answer "
        "(with units if applicable) in \\boxed{}."
    )

    # Use absolute path for image
    abs_image_path = os.path.join(image_base, f"{idx}.png")

    return {
        "data_source": "metaphyx_physics",
        "prompt": [{"content": prompt_text, "role": "user"}],
        "ability": "physics",
        "reward_model": {"ground_truth": gt_value, "style": "rule"},
        "extra_info": {
            "category": cat,
            "subfield": subfield,
            "index": idx,
            "image_path": abs_image_path,
            "split": "test",
        },
    }


def verify_output(path: str, sample_indices=SPOT_CHECK_INDICES) -> None:
    """Read the parquet at *path* back and print a short sanity report.

    Prints the path, the table shape, and whether the image files referenced
    by the first row and by each in-range position of *sample_indices* exist.
    """
    df2 = pd.read_parquet(path)
    print(f"Saved: {path}")
    print(f"Shape: {df2.shape}")

    if df2.empty:
        # Nothing to index into; iloc[0] would raise IndexError.
        print("No rows written; skipping image existence checks.")
        return

    # Check first image exists
    img0 = df2.iloc[0]["extra_info"]["image_path"]
    print(f"First image: {img0}")
    print(f"Exists: {os.path.exists(img0)}")

    # Check a few more
    row_count = len(df2)
    for i in sample_indices:
        if i >= row_count:
            # Input had fewer rows than expected; skip rather than crash.
            continue
        ip = df2.iloc[i]["extra_info"]["image_path"]
        print(f" [{i}] {os.path.basename(ip)}: exists={os.path.exists(ip)}")


def main() -> None:
    """Load the source JSONL, write the validation parquet, and verify it."""
    rows = [build_row(record) for record in load_records(SRC_PATH)]
    df = pd.DataFrame(rows)

    # to_parquet does not create missing parent directories.
    os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
    df.to_parquet(OUT_PATH, index=False)

    # Verify
    verify_output(OUT_PATH)


if __name__ == "__main__":
    main()