# math500-bon-exercise / code / step5_push_dataset.py
# Uploaded by cmpatino (HF Staff) via huggingface_hub, commit 962718c (verified).
"""
Step 5: Create a HuggingFace dataset from the results and push to Hub.
This creates a dataset with:
- The original problem and ground truth answer
- The greedy (N=1) solution and whether it was correct
- The Best-of-N (N=16) weighted answer and whether it was correct
- All 16 sampled solutions with their PRM scores
- The PRM score breakdown per answer group
Co-authored with Claude (Anthropic). I can explain all code logic.
"""
import json
import os
from pathlib import Path

from datasets import Dataset, Features, Value, Sequence
from huggingface_hub import HfApi
# ──────────────────────────────────────────────────────────────────────────────
# Load all results
# ──────────────────────────────────────────────────────────────────────────────
# The original hard-coded a machine-specific absolute path three times; keep it
# as the default for backward compatibility, but allow overriding via an
# environment variable so the script runs on other machines/CI.
OUTPUTS_DIR = Path(
    os.environ.get(
        "MATH500_OUTPUTS_DIR",
        "/Users/cmpatino/Projects/ml-intern/exercise/outputs",
    )
)


def _load_json(path):
    """Read and return the JSON payload stored at *path*."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# One record per MATH-500 problem, produced by the earlier pipeline steps:
# step "greedy" (N=1 decoding), step "scored" (16 samples + PRM scores),
# and step "bon" (Best-of-N aggregation over those samples).
greedy_results = _load_json(OUTPUTS_DIR / "greedy_results.json")
scored_results = _load_json(OUTPUTS_DIR / "scored_results.json")
bon_results = _load_json(OUTPUTS_DIR / "bon_results.json")
# ──────────────────────────────────────────────────────────────────────────────
# Build dataset rows
# ──────────────────────────────────────────────────────────────────────────────
def _build_row(greedy, scored, bon):
    """Merge one problem's greedy/scored/Best-of-N records into a flat row.

    Args:
        greedy: record with the original problem fields plus the N=1 solution.
        scored: record with the 16 sampled solutions and their PRM scores.
        bon: record with the Best-of-N aggregated answers and correctness flags.

    Returns:
        A dict with one column per dataset field. ``answer_score_breakdown`` is
        JSON-encoded because its per-answer keys vary across problems, which a
        fixed Arrow schema cannot represent directly.
    """
    return {
        # Original problem info
        "problem": greedy["problem"],
        "ground_truth_solution": greedy["solution"],
        "ground_truth_answer": greedy["answer"],
        "subject": greedy["subject"],
        "level": greedy["level"],
        "unique_id": greedy["unique_id"],
        # Greedy solution
        "greedy_solution": greedy["generated_solutions"][0],
        "greedy_extracted_answer": greedy["greedy_extracted_answer"],
        "greedy_correct": greedy["greedy_correct"],
        # Best-of-N results
        "bon_weighted_answer": bon["weighted_bon_answer"],
        "bon_weighted_correct": bon["weighted_bon_correct"],
        "bon_standard_answer": bon["standard_bon_answer"],
        "bon_standard_correct": bon["standard_bon_correct"],
        "bon_majority_answer": bon["majority_vote_answer"],
        "bon_majority_correct": bon["majority_vote_correct"],
        # All N=16 sampled solutions
        "sampled_solutions": scored["sampled_solutions"],
        "sampled_extracted_answers": scored["extracted_answers"],
        "sampled_prm_scores": scored["prm_scores"],
        # Summary stats
        "n_correct_in_16": bon["n_correct_in_16"],
        "answer_score_breakdown": json.dumps(bon["answer_score_breakdown"]),
    }


# `zip` would silently truncate to the shortest list if a pipeline step wrote
# a partial file, producing a corrupted dataset with no error — fail loudly
# instead.
if not (len(greedy_results) == len(scored_results) == len(bon_results)):
    raise ValueError(
        "Result files are misaligned: "
        f"{len(greedy_results)} greedy, {len(scored_results)} scored, "
        f"{len(bon_results)} bon records."
    )

rows = [
    _build_row(greedy, scored, bon)
    for greedy, scored, bon in zip(greedy_results, scored_results, bon_results)
]
# ──────────────────────────────────────────────────────────────────────────────
# Create and push dataset
# ──────────────────────────────────────────────────────────────────────────────
DATASET_ID = "cmpatino/math500-bon-weighted-results"

# Columns shown as a quick sanity check before uploading.
_PREVIEW_COLUMNS = (
    "unique_id",
    "level",
    "subject",
    "ground_truth_answer",
    "greedy_correct",
    "bon_weighted_correct",
)

dataset = Dataset.from_list(rows)
print(f"Created dataset with {len(dataset)} rows")
print(f"Columns: {dataset.column_names}")
print("\nSample row:")
first_row = dataset[0]
for column in _PREVIEW_COLUMNS:
    print(f" {column}: {first_row[column]}")

# Requires prior `huggingface-cli login` (or HF_TOKEN) with write access.
dataset.push_to_hub(DATASET_ID, split="test")
print(f"\nDataset pushed to: https://huggingface.co/datasets/{DATASET_ID}")