Upload root_scripts/fix_parquet.py with huggingface_hub

26d786a verified about 2 months ago

1.53 kB


	# One-off repair script: rewrite the "prompt" column of a single parquet file
	# in place so that every row carries the same clean, open-ended prompt.
	#
	# Steps:
	#   1. Read the parquet file at `path`.
	#   2. Rebuild every row, keeping data_source / ability / reward_model /
	#      extra_info from the original row and replacing the prompt.
	#   3. Write the result back to the same `path` (the file is overwritten).
	#   4. Re-read the file and print a summary for manual verification.
	#
	# NOTE(review): only the five columns listed below are written back; any other
	# column present in the input file is not carried over. Confirm that is intended.
	import pandas as pd

	path = "/workspace/rl4phyx/RL4Phyx/ZeroSearch/One-Shot-RLVR/data/train/physics_vlm/mechanics/mechanics_1_rl_numerical.parquet"
	df = pd.read_parquet(path)

	# Build a clean open-ended prompt from scratch.
	# The text is assembled from explicit pieces (instead of a triple-quoted
	# literal) so that the indentation of this source file can never leak
	# into the prompt that is stored in the dataset.
	open_ended_prompt = (
		"Look at the image and answer the physics question.\n"
		"\n"
		"A patient with a dislocated shoulder is put into a traction apparatus as shown in figure. The pulls A and B have equal magnitudes and must combine to produce an outward traction force of 12.8 N on the patient's arm.\n"
		"\n"
		"Question: How large should these pulls be?\n"
		"\n"
		"Please reason step by step and put your final numerical answer (with units) in \\boxed{}."
	)

	# Rebuild all rows with the clean prompt; the same prompt is used for every row.
	new_rows = [
		{
			"data_source": row["data_source"],
			"prompt": [{"content": open_ended_prompt, "role": "user"}],
			"ability": row["ability"],
			"reward_model": row["reward_model"],  # keeps {'ground_truth': '7.55N', 'style': 'rule'}
			"extra_info": row["extra_info"],
		}
		for _, row in df.iterrows()
	]

	new_df = pd.DataFrame(new_rows)
	new_df.to_parquet(path, index=False)

	# Verify: read the file back and print what was actually stored.
	df2 = pd.read_parquet(path)
	first_row = df2.iloc[0]
	first_prompt = first_row["prompt"][0]["content"]

	print("Shape:", df2.shape)
	print("Columns:", list(df2.columns))
	print()
	print("Prompt:")
	print(first_prompt)
	print()
	print("reward_model:", first_row["reward_model"])
	print("extra_info:", first_row["extra_info"])
	print()
	# Reports whether the literal text "Options" appears in the stored prompt.
	has_options = "Options" in first_prompt
	print("Has Options:", has_options)
	print("DONE!")