| """ |
| Step 5: Create a HuggingFace dataset from the results and push to Hub. |
| |
| This creates a dataset with: |
| - The original problem and ground truth answer |
| - The greedy (N=1) solution and whether it was correct |
| - The Best-of-N (N=16) weighted answer and whether it was correct |
| - All 16 sampled solutions with their PRM scores |
| - The PRM score breakdown per answer group |
| |
| Co-authored with Claude (Anthropic). I can explain all code logic. |
| """ |
|
|
| import json |
| from datasets import Dataset, Features, Value, Sequence |
| from huggingface_hub import HfApi |
|
|
|
|
| |
| |
| |
# Directory holding the JSON outputs produced by the earlier pipeline steps.
OUTPUT_DIR = "/Users/cmpatino/Projects/ml-intern/exercise/outputs"


def _load_json(path):
    """Load and return the parsed contents of the JSON file at *path*."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# Per-problem results from each pipeline step. These are parallel lists —
# presumably one entry per problem, in the same order across all three
# files (the downstream zip relies on that; verified there).
greedy_results = _load_json(f"{OUTPUT_DIR}/greedy_results.json")
scored_results = _load_json(f"{OUTPUT_DIR}/scored_results.json")
bon_results = _load_json(f"{OUTPUT_DIR}/bon_results.json")
|
|
| |
| |
| |
# Sanity check: the three result files must describe the same problems in
# the same order. Without this, zip() would silently truncate to the
# shortest list and rows would pair data from different problems.
if not (len(greedy_results) == len(scored_results) == len(bon_results)):
    raise ValueError(
        "Result files have mismatched lengths: "
        f"greedy={len(greedy_results)}, scored={len(scored_results)}, "
        f"bon={len(bon_results)}"
    )

# Merge the three per-problem records into one flat row per problem.
rows = []
for greedy, scored, bon in zip(greedy_results, scored_results, bon_results):
    row = {
        # Original problem metadata and ground truth (from the greedy file).
        "problem": greedy["problem"],
        "ground_truth_solution": greedy["solution"],
        "ground_truth_answer": greedy["answer"],
        "subject": greedy["subject"],
        "level": greedy["level"],
        "unique_id": greedy["unique_id"],
        # Greedy (N=1) decoding result.
        "greedy_solution": greedy["generated_solutions"][0],
        "greedy_extracted_answer": greedy["greedy_extracted_answer"],
        "greedy_correct": greedy["greedy_correct"],
        # Best-of-N selection results under three strategies:
        # PRM-weighted vote, standard (max-score) BoN, and majority vote.
        "bon_weighted_answer": bon["weighted_bon_answer"],
        "bon_weighted_correct": bon["weighted_bon_correct"],
        "bon_standard_answer": bon["standard_bon_answer"],
        "bon_standard_correct": bon["standard_bon_correct"],
        "bon_majority_answer": bon["majority_vote_answer"],
        "bon_majority_correct": bon["majority_vote_correct"],
        # All sampled solutions with their extracted answers and PRM scores.
        "sampled_solutions": scored["sampled_solutions"],
        "sampled_extracted_answers": scored["extracted_answers"],
        "sampled_prm_scores": scored["prm_scores"],
        # Aggregate stats. The breakdown is JSON-encoded because its keys
        # vary per problem, which doesn't fit a fixed Arrow schema.
        "n_correct_in_16": bon["n_correct_in_16"],
        "answer_score_breakdown": json.dumps(bon["answer_score_breakdown"]),
    }
    rows.append(row)
|
|
| |
| |
| |
# Build the HuggingFace dataset from the merged rows and print a quick
# summary so the run can be eyeballed before/after the push.
dataset = Dataset.from_list(rows)
print(f"Created dataset with {len(dataset)} rows")
print(f"Columns: {dataset.column_names}")
# Plain string, not an f-string — there are no placeholders here.
print("\nSample row:")
for col in ["unique_id", "level", "subject", "ground_truth_answer", "greedy_correct", "bon_weighted_correct"]:
    print(f"  {col}: {dataset[0][col]}")


# Upload to the Hub as the "test" split (MATH-500 is an eval-only set).
# Requires prior `huggingface-cli login` or an HF token in the environment.
DATASET_ID = "cmpatino/math500-bon-weighted-results"
dataset.push_to_hub(DATASET_ID, split="test")
print(f"\nDataset pushed to: https://huggingface.co/datasets/{DATASET_ID}")
|
|