""" Step 5: Create a HuggingFace dataset from the results and push to Hub. This creates a dataset with: - The original problem and ground truth answer - The greedy (N=1) solution and whether it was correct - The Best-of-N (N=16) weighted answer and whether it was correct - All 16 sampled solutions with their PRM scores - The PRM score breakdown per answer group Co-authored with Claude (Anthropic). I can explain all code logic. """ import json from datasets import Dataset, Features, Value, Sequence from huggingface_hub import HfApi # ────────────────────────────────────────────────────────────────────────────── # Load all results # ────────────────────────────────────────────────────────────────────────────── with open("/Users/cmpatino/Projects/ml-intern/exercise/outputs/greedy_results.json") as f: greedy_results = json.load(f) with open("/Users/cmpatino/Projects/ml-intern/exercise/outputs/scored_results.json") as f: scored_results = json.load(f) with open("/Users/cmpatino/Projects/ml-intern/exercise/outputs/bon_results.json") as f: bon_results = json.load(f) # ────────────────────────────────────────────────────────────────────────────── # Build dataset rows # ────────────────────────────────────────────────────────────────────────────── rows = [] for greedy, scored, bon in zip(greedy_results, scored_results, bon_results): row = { # Original problem info "problem": greedy["problem"], "ground_truth_solution": greedy["solution"], "ground_truth_answer": greedy["answer"], "subject": greedy["subject"], "level": greedy["level"], "unique_id": greedy["unique_id"], # Greedy solution "greedy_solution": greedy["generated_solutions"][0], "greedy_extracted_answer": greedy["greedy_extracted_answer"], "greedy_correct": greedy["greedy_correct"], # Best-of-N results "bon_weighted_answer": bon["weighted_bon_answer"], "bon_weighted_correct": bon["weighted_bon_correct"], "bon_standard_answer": bon["standard_bon_answer"], "bon_standard_correct": bon["standard_bon_correct"], "bon_majority_answer": bon["majority_vote_answer"], "bon_majority_correct": bon["majority_vote_correct"], # All N=16 sampled solutions "sampled_solutions": scored["sampled_solutions"], "sampled_extracted_answers": scored["extracted_answers"], "sampled_prm_scores": scored["prm_scores"], # Summary stats "n_correct_in_16": bon["n_correct_in_16"], "answer_score_breakdown": json.dumps(bon["answer_score_breakdown"]), } rows.append(row) # ────────────────────────────────────────────────────────────────────────────── # Create and push dataset # ────────────────────────────────────────────────────────────────────────────── dataset = Dataset.from_list(rows) print(f"Created dataset with {len(dataset)} rows") print(f"Columns: {dataset.column_names}") print(f"\nSample row:") for col in ["unique_id", "level", "subject", "ground_truth_answer", "greedy_correct", "bon_weighted_correct"]: print(f" {col}: {dataset[0][col]}") DATASET_ID = "cmpatino/math500-bon-weighted-results" dataset.push_to_hub(DATASET_ID, split="test") print(f"\nDataset pushed to: https://huggingface.co/datasets/{DATASET_ID}")