| """ |
| Step 5: Create a HuggingFace dataset from the results and push to Hub. |
| |
| This creates a dataset with: |
| - The original problem and ground truth answer |
| - The greedy (N=1) solution and whether it was correct |
| - The Best-of-N (N=16) weighted answer and whether it was correct |
| - All 16 sampled solutions with their PRM scores |
| - The PRM score breakdown per answer group |
| |
| Co-authored with Claude (Anthropic). I can explain all code logic. |
| """ |
|
|
| import json |
| from datasets import Dataset, Features, Value, Sequence |
| from huggingface_hub import HfApi |
|
|
|
|
| |
| |
| |
# Directory holding the JSON outputs produced by the earlier pipeline steps.
OUTPUT_DIR = "/Users/cmpatino/Projects/ml-intern/exercise/outputs"


def _load_json(path):
    """Load and return the parsed contents of the JSON file at *path*."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# Per-problem results from each pipeline step. These are parallel lists —
# presumably one entry per problem, in the same order across all three
# files (the downstream zip relies on that; verified there).
greedy_results = _load_json(f"{OUTPUT_DIR}/greedy_results.json")
scored_results = _load_json(f"{OUTPUT_DIR}/scored_results.json")
bon_results = _load_json(f"{OUTPUT_DIR}/bon_results.json")
|
|
| |
| |
| |
# Sanity check: the three result files must describe the same problems in
# the same order. Without this, zip() would silently truncate to the
# shortest list and rows would pair data from different problems.
if not (len(greedy_results) == len(scored_results) == len(bon_results)):
    raise ValueError(
        "Result files have mismatched lengths: "
        f"greedy={len(greedy_results)}, scored={len(scored_results)}, "
        f"bon={len(bon_results)}"
    )

# Merge the three per-problem records into one flat row per problem.
rows = []
for greedy, scored, bon in zip(greedy_results, scored_results, bon_results):
    row = {
        # Original problem metadata and ground truth (from the greedy file).
        "problem": greedy["problem"],
        "ground_truth_solution": greedy["solution"],
        "ground_truth_answer": greedy["answer"],
        "subject": greedy["subject"],
        "level": greedy["level"],
        "unique_id": greedy["unique_id"],
        # Greedy (N=1) decoding result.
        "greedy_solution": greedy["generated_solutions"][0],
        "greedy_extracted_answer": greedy["greedy_extracted_answer"],
        "greedy_correct": greedy["greedy_correct"],
        # Best-of-N selection results under three strategies:
        # PRM-weighted vote, standard (max-score) BoN, and majority vote.
        "bon_weighted_answer": bon["weighted_bon_answer"],
        "bon_weighted_correct": bon["weighted_bon_correct"],
        "bon_standard_answer": bon["standard_bon_answer"],
        "bon_standard_correct": bon["standard_bon_correct"],
        "bon_majority_answer": bon["majority_vote_answer"],
        "bon_majority_correct": bon["majority_vote_correct"],
        # All sampled solutions with their extracted answers and PRM scores.
        "sampled_solutions": scored["sampled_solutions"],
        "sampled_extracted_answers": scored["extracted_answers"],
        "sampled_prm_scores": scored["prm_scores"],
        # Aggregate stats. The breakdown is JSON-encoded because its keys
        # vary per problem, which doesn't fit a fixed Arrow schema.
        "n_correct_in_16": bon["n_correct_in_16"],
        "answer_score_breakdown": json.dumps(bon["answer_score_breakdown"]),
    }
    rows.append(row)
|
|
| |
| |
| |
# Build the HuggingFace dataset from the merged rows and print a quick
# summary so the run can be eyeballed before/after the push.
dataset = Dataset.from_list(rows)
print(f"Created dataset with {len(dataset)} rows")
print(f"Columns: {dataset.column_names}")
# Plain string, not an f-string — there are no placeholders here.
print("\nSample row:")
for col in ["unique_id", "level", "subject", "ground_truth_answer", "greedy_correct", "bon_weighted_correct"]:
    print(f"  {col}: {dataset[0][col]}")


# Upload to the Hub as the "test" split (MATH-500 is an eval-only set).
# Requires prior `huggingface-cli login` or an HF token in the environment.
DATASET_ID = "cmpatino/math500-bon-weighted-results"
dataset.push_to_hub(DATASET_ID, split="test")
print(f"\nDataset pushed to: https://huggingface.co/datasets/{DATASET_ID}")
|
|