# Source: rl4phyx-backup / root_scripts/fix_parquet.py
# Uploaded by YUNTA88 with huggingface_hub (commit 26d786a, verified).
import pandas as pd
# Open-ended prompt text, written from scratch; it contains no answer options.
# The doubled backslash yields a literal "\boxed{}" in the resulting string.
open_ended_prompt = """Look at the image and answer the physics question.
A patient with a dislocated shoulder is put into a traction apparatus as shown in figure. The pulls A and B have equal magnitudes and must combine to produce an outward traction force of 12.8 N on the patient's arm.
Question: How large should these pulls be?
Please reason step by step and put your final numerical answer (with units) in \\boxed{}."""

# Parquet file to repair; it is loaded here and overwritten later in the script.
path = "/workspace/rl4phyx/RL4Phyx/ZeroSearch/One-Shot-RLVR/data/train/physics_vlm/mechanics/mechanics_1_rl_numerical.parquet"
df = pd.read_parquet(path)
# Rebuild every row around the clean prompt. Only the five columns listed
# below are written back; any other column in the loaded frame is dropped.
kept_columns = ["data_source", "prompt", "ability", "reward_model", "extra_info"]
new_rows = [
    {
        "data_source": row["data_source"],
        # A fresh list/dict per row, so rows never share one mutable object.
        "prompt": [{"content": open_ended_prompt, "role": "user"}],
        "ability": row["ability"],
        "reward_model": row["reward_model"],  # keeps {'ground_truth': '7.55N', 'style': 'rule'}
        "extra_info": row["extra_info"],
    }
    for _, row in df.iterrows()
]
# Passing columns= explicitly keeps the column names even if df has no rows;
# pd.DataFrame([]) on its own would produce a frame with no columns at all.
new_df = pd.DataFrame(new_rows, columns=kept_columns)
# Overwrites the input file in place; index=False keeps the index out of it.
new_df.to_parquet(path, index=False)
# Verify: reload the file that was just written and print a summary of it.
df2 = pd.read_parquet(path)
print(f"Shape: {df2.shape}")
print(f"Columns: {list(df2.columns)}")
print()
print("Prompt:")
# Only the first row is inspected from here on.
first_row = df2.iloc[0]
prompt_text = first_row["prompt"][0]["content"]
print(prompt_text)
print()
print("reward_model:", first_row["reward_model"])
print("extra_info:", first_row["extra_info"])
print()
# Reports whether the substring "Options" appears in the first row's prompt.
has_options = "Options" in prompt_text
print(f"Has Options: {has_options}")
print("DONE!")