File size: 4,258 Bytes

962718c

"""
Step 5: Create a HuggingFace dataset from the results and push to Hub.

This creates a dataset with:
- The original problem and ground truth answer
- The greedy (N=1) solution and whether it was correct
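- The Best-of-N (N=16) weighted, standard, and majority-vote answers and whether each was correct
- All 16 sampled solutions with their extracted answers and PRM scores
- The `n_correct_in_16` count and the PRM score breakdown per answer group (stored as a JSON string)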

Co-authored with Claude (Anthropic). I can explain all code logic.
"""

import json
from datasets import Dataset, Features, Value, Sequence
from huggingface_hub import HfApi


# ──────────────────────────────────────────────────────────────────────────────
# Load all results
# ──────────────────────────────────────────────────────────────────────────────
# Directory the three result files are read from.
RESULTS_DIR = "/Users/cmpatino/Projects/ml-intern/exercise/outputs"


def _load_json(path):
    """Return the parsed contents of the JSON file at *path*.

    The file is opened with an explicit UTF-8 encoding so decoding does not
    depend on the platform's locale default.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


greedy_results = _load_json(f"{RESULTS_DIR}/greedy_results.json")
scored_results = _load_json(f"{RESULTS_DIR}/scored_results.json")
bon_results = _load_json(f"{RESULTS_DIR}/bon_results.json")

# ──────────────────────────────────────────────────────────────────────────────
# Build dataset rows
# ──────────────────────────────────────────────────────────────────────────────
# The three result collections are combined positionally. zip() stops at the
# shortest input, so a length mismatch would otherwise silently drop entries
# from the dataset; fail loudly instead.
if not (len(greedy_results) == len(scored_results) == len(bon_results)):
    raise ValueError(
        "Result files have different lengths: "
        f"greedy={len(greedy_results)}, scored={len(scored_results)}, "
        f"bon={len(bon_results)}"
    )

# One output row per problem, merging the i-th entry of each result file.
rows = []
for greedy, scored, bon in zip(greedy_results, scored_results, bon_results):
    row = {
        # Original problem info (copied from the greedy results entry)
        "problem": greedy["problem"],
        "ground_truth_solution": greedy["solution"],
        "ground_truth_answer": greedy["answer"],
        "subject": greedy["subject"],
        "level": greedy["level"],
        "unique_id": greedy["unique_id"],
        # Greedy solution: the first (index 0) generated solution of the entry
        "greedy_solution": greedy["generated_solutions"][0],
        "greedy_extracted_answer": greedy["greedy_extracted_answer"],
        "greedy_correct": greedy["greedy_correct"],
        # Best-of-N results: weighted, standard, and majority-vote variants
        "bon_weighted_answer": bon["weighted_bon_answer"],
        "bon_weighted_correct": bon["weighted_bon_correct"],
        "bon_standard_answer": bon["standard_bon_answer"],
        "bon_standard_correct": bon["standard_bon_correct"],
        "bon_majority_answer": bon["majority_vote_answer"],
        "bon_majority_correct": bon["majority_vote_correct"],
        # All sampled solutions with their extracted answers and PRM scores
        "sampled_solutions": scored["sampled_solutions"],
        "sampled_extracted_answers": scored["extracted_answers"],
        "sampled_prm_scores": scored["prm_scores"],
        # Summary stats; the breakdown is serialized to a JSON string so the
        # column holds a plain string rather than a nested structure.
        "n_correct_in_16": bon["n_correct_in_16"],
        "answer_score_breakdown": json.dumps(bon["answer_score_breakdown"]),
    }
    rows.append(row)

# ──────────────────────────────────────────────────────────────────────────────
# Create and push dataset
# ──────────────────────────────────────────────────────────────────────────────
# Build the in-memory dataset from the list of row dicts and report its shape.
dataset = Dataset.from_list(rows)
print(f"Created dataset with {len(dataset)} rows")
print(f"Columns: {dataset.column_names}")

# With no rows, indexing dataset[0] below would fail with an opaque
# IndexError; stop with a clear message before anything is pushed.
if len(dataset) == 0:
    raise ValueError("No rows were built; refusing to push an empty dataset")

# Show a few fields of the first row as a quick sanity check. The row is
# fetched once instead of re-indexing the dataset for every column.
print("\nSample row:")
sample_row = dataset[0]
for col in ["unique_id", "level", "subject", "ground_truth_answer", "greedy_correct", "bon_weighted_correct"]:
    print(f"  {col}: {sample_row[col]}")

# Target repository on the Hub; the data is uploaded as the "test" split.
DATASET_ID = "cmpatino/math500-bon-weighted-results"
dataset.push_to_hub(DATASET_ID, split="test")
print(f"\nDataset pushed to: https://huggingface.co/datasets/{DATASET_ID}")