| """ |
| Complete pipeline for Best-of-N weighted selection on MATH-500. |
| |
| This single script runs all steps: |
| 1. Filter MATH-500 to 20 level 1-3 problems |
| 2. Generate greedy (N=1) solutions and compute baseline accuracy |
| 3. Sample N=16 solutions per problem with temperature sampling |
| 4. Score all solutions with Skywork PRM (last-step prediction) |
| 5. Compute weighted Best-of-N accuracy |
| 6. Create dataset and push to HuggingFace Hub |
| 7. Generate analysis plots and push them too |
| |
| Reference papers: |
| - DeepMind (2408.03314): Scaling LLM Test-Time Compute, Section 5.1 + Appendix E |
| - Math-Shepherd (2312.08935): Process Reward Models, Section 3.4 |
| |
| Co-authored with Claude (Anthropic) as part of the HuggingFace internship exercise. |
| I can explain all code logic in detail. |
| """ |
|
|
| import json |
| import os |
| import random |
| import subprocess |
| import sys |
| import torch |
| import numpy as np |
| from collections import defaultdict |
| from typing import Optional |
|
|
| from datasets import Dataset, load_dataset |
| from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
|
|
| |
| |
| |
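# Experiment configuration (problem count, sampling budget, models, and output dataset).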
| N_PROBLEMS = 20 |
| N_SAMPLES = 16 |
| TEMPERATURE = 0.7 |
| MAX_NEW_TOKENS = 2048 |
| SEED = 42 |
| LLM_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct" |
| PRM_MODEL_ID = "Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B" |
| DATASET_ID = "cmpatino/math500-bon-weighted-results" |
|
|
| OUTPUT_DIR = "/tmp/exercise_outputs" |
| os.makedirs(OUTPUT_DIR, exist_ok=True) |
|
|
| |
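# System prompt: ask for step-by-step reasoning with the final answer in \boxed{}.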
| SYSTEM_PROMPT = ( |
| "You are a helpful math assistant. Solve the problem step by step, " |
| "showing your reasoning clearly. Put your final answer inside " |
| "\\boxed{answer} at the end of your solution." |
| ) |
|
|
|
|
| |
| |
| |
|
|
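# Helper functions: \boxed{} answer extraction and the three selection rules.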
| def extract_boxed_solution(text: str) -> Optional[str]: |
| """ |
| Extract content of the last \\boxed{} in text. |
| Uses bracket-balanced parsing for nested braces. |
| Source: https://gist.github.com/lewtun/9c2ce1937b741404090a3dc4c7c022b3 |
| """ |
| try: |
| start_index = text.rindex("\\boxed{") |
| content_start = start_index + 7 |
| bracket_count = 1 |
| current_pos = content_start |
| while bracket_count > 0 and current_pos < len(text): |
| if text[current_pos] == "{": |
| bracket_count += 1 |
| elif text[current_pos] == "}": |
| bracket_count -= 1 |
| current_pos += 1 |
| if bracket_count == 0: |
| return text[content_start : current_pos - 1].strip() |
| return None |
    except ValueError:
        # rindex raises ValueError when no "\boxed{" is present.
        return None
|
|
|
|
| def weighted_best_of_n(extracted_answers, prm_scores): |
| """ |
    Weighted Best-of-N selection (DeepMind 2408.03314, selection rule from Section 5.1):
        â = argmax_a Σᵢ 𝟙(aᵢ = a) · score(sᵢ)
| |
| Groups solutions by final answer, sums their PRM scores, |
| and selects the answer group with the highest total. |
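
    Example (illustrative values):
        >>> weighted_best_of_n(["4", "5", "4"], [0.5, 0.25, 0.5])
        ('4', {'4': 1.0, '5': 0.25})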
| """ |
| answer_scores = defaultdict(float) |
| for answer, score in zip(extracted_answers, prm_scores): |
| if answer is None: |
| continue |
| answer_scores[answer] += score |
| if not answer_scores: |
| return None, {} |
| best_answer = max(answer_scores, key=answer_scores.get) |
| return best_answer, dict(answer_scores) |
|
|
|
|
| def standard_best_of_n(extracted_answers, prm_scores): |
| """Standard Best-of-N: pick the single solution with highest PRM score.""" |
| best_idx, best_score = None, -1 |
| for i, (answer, score) in enumerate(zip(extracted_answers, prm_scores)): |
| if answer is not None and score > best_score: |
| best_score = score |
| best_idx = i |
| return extracted_answers[best_idx] if best_idx is not None else None |
|
|
|
|
| def majority_vote(extracted_answers): |
| """Pure majority vote: pick the most frequent answer.""" |
| counts = defaultdict(int) |
| for answer in extracted_answers: |
| if answer is not None: |
| counts[answer] += 1 |
| return max(counts, key=counts.get) if counts else None |
|
|
|
|
| |
| |
| |
| print("=" * 70) |
| print("STEP 1: Loading and filtering MATH-500 dataset") |
| print("=" * 70) |
|
|
| dataset = load_dataset("HuggingFaceH4/MATH-500", split="test") |
| print(f"Total problems: {len(dataset)}") |
|
|
| |
| |
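# Keep only level 1-3 problems.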
| filtered = dataset.filter(lambda x: x["level"] in [1, 2, 3]) |
| print(f"Level 1-3 problems: {len(filtered)}") |
|
|
| |
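# Fix the seed and sample N_PROBLEMS problems from the filtered set.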
| random.seed(SEED) |
| indices = random.sample(range(len(filtered)), k=N_PROBLEMS) |
| problems = filtered.select(indices) |
|
|
| problems_data = [] |
| for i, p in enumerate(problems): |
| problems_data.append({ |
| "idx": i, |
| "problem": p["problem"], |
| "solution": p["solution"], |
| "answer": p["answer"], |
| "subject": p["subject"], |
| "level": p["level"], |
| "unique_id": p["unique_id"], |
| }) |
| print(f" [{i+1:2d}] L{p['level']} {p['subject']:25s} {p['unique_id']}") |
|
|
| |
| with open(os.path.join(OUTPUT_DIR, "filtered_problems.json"), "w") as f: |
| json.dump(problems_data, f, indent=2) |
|
|
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("STEP 2: Generating greedy solutions (N=1)") |
| print("=" * 70) |
|
|
| tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID) |
| model = AutoModelForCausalLM.from_pretrained( |
| LLM_MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto" |
| ) |
|
|
|
|
| def generate_batch(problems_data, model, tokenizer, n, do_sample, temperature=None): |
| """Generate n solutions per problem. Returns list of solution lists.""" |
| all_solutions = [] |
| for i, p in enumerate(problems_data): |
| messages = [ |
| {"role": "system", "content": SYSTEM_PROMPT}, |
| {"role": "user", "content": p["problem"]}, |
| ] |
| prompt = tokenizer.apply_chat_template( |
| messages, tokenize=False, add_generation_prompt=True |
| ) |
| inputs = tokenizer(prompt, return_tensors="pt").to(model.device) |
|
|
| solutions = [] |
| for j in range(n): |
| gen_kwargs = {"max_new_tokens": MAX_NEW_TOKENS, "do_sample": do_sample} |
| if do_sample and temperature: |
| gen_kwargs["temperature"] = temperature |
| gen_kwargs["top_p"] = 0.95 |
| with torch.no_grad(): |
| output = model.generate(**inputs, **gen_kwargs) |
| generated = output[0][inputs["input_ids"].shape[1]:] |
| solutions.append(tokenizer.decode(generated, skip_special_tokens=True)) |
|
|
| all_solutions.append(solutions) |
| ans = extract_boxed_solution(solutions[0]) if n == 1 else "..." |
| tag = "greedy" if n == 1 else f"N={n}" |
| print(f" [{i+1:2d}/{len(problems_data)}] {tag} | {p['unique_id']} | answer={ans}") |
|
|
| return all_solutions |
|
|
|
|
| |
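# Greedy decoding (do_sample=False) provides the N=1 baseline.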
| greedy_solutions = generate_batch(problems_data, model, tokenizer, n=1, do_sample=False) |
|
|
| |
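# Grade greedy answers by exact string match against the ground-truth answer.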
| greedy_correct = 0 |
| for p, sols in zip(problems_data, greedy_solutions): |
| extracted = extract_boxed_solution(sols[0]) |
| p["greedy_solution"] = sols[0] |
| p["greedy_extracted_answer"] = extracted |
| p["greedy_correct"] = (extracted is not None) and (extracted == p["answer"]) |
| if p["greedy_correct"]: |
| greedy_correct += 1 |
| status = "✓" if p["greedy_correct"] else "✗" |
| print(f" {status} Expected: {p['answer']:20s} | Got: {str(extracted):20s} | {p['unique_id']}") |
|
|
| greedy_acc = greedy_correct / len(problems_data) |
| print(f"\n>>> Greedy accuracy: {greedy_correct}/{len(problems_data)} = {greedy_acc:.0%}") |
|
|
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print(f"STEP 3: Sampling N={N_SAMPLES} solutions per problem (T={TEMPERATURE})") |
| print("=" * 70) |
|
|
| sampled_solutions = generate_batch( |
| problems_data, model, tokenizer, |
| n=N_SAMPLES, do_sample=True, temperature=TEMPERATURE |
| ) |
|
|
| |
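# Attach the sampled solutions to each problem record before saving.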
| for p, sols in zip(problems_data, sampled_solutions): |
| p["sampled_solutions"] = sols |
|
|
| with open(os.path.join(OUTPUT_DIR, "sampled_solutions.json"), "w") as f: |
| json.dump(problems_data, f, indent=2) |
|
|
| del model |
| torch.cuda.empty_cache() |
| print("Freed LLM memory for PRM loading.") |
|
|
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("STEP 4: Scoring solutions with Skywork PRM") |
| print("=" * 70) |
|
|
| |
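# The Skywork PRM's inference helpers live in a separate repo; clone it and add it to sys.path.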
| PRM_REPO_PATH = "/tmp/skywork-o1-prm-inference" |
| if not os.path.exists(PRM_REPO_PATH): |
| print("Cloning Skywork PRM inference repo...") |
| subprocess.run( |
| ["git", "clone", "https://github.com/SkyworkAI/skywork-o1-prm-inference.git", PRM_REPO_PATH], |
| check=True, |
| ) |
| sys.path.insert(0, PRM_REPO_PATH) |
|
|
| from model_utils.prm_model import PRM_MODEL |
| from model_utils.io_utils import prepare_input, prepare_batch_input_for_model, derive_step_rewards |
|
|
| prm_tokenizer = AutoTokenizer.from_pretrained(PRM_MODEL_ID, trust_remote_code=True) |
| prm_model = PRM_MODEL.from_pretrained(PRM_MODEL_ID, device_map="auto").eval() |
| prm_device = next(prm_model.pretrained_model.parameters()).device |
| print(f"PRM loaded on {prm_device}") |
|
|
|
|
| def score_solution(problem: str, solution: str) -> float: |
| """ |
| Score a single solution using the PRM's last-step prediction. |
| |
| Per DeepMind (2408.03314, Appendix E): "We use the PRM's prediction at the |
| last step as the full-answer score" — this outperforms min/product aggregation |
| when the PRM is trained with soft MC-return labels. |
| |
| Returns: float in [0, 1] — the sigmoid-normalized score at the last step. |
| """ |
| input_ids, steps, reward_flags = prepare_input( |
| problem, solution, prm_tokenizer, step_token="\n" |
| ) |
| input_ids_t, attn_mask_t, flags_t = prepare_batch_input_for_model( |
| [input_ids], [reward_flags], prm_tokenizer.pad_token_id |
| ) |
| input_ids_t = input_ids_t.to(prm_device) |
| attn_mask_t = attn_mask_t.to(prm_device) |
| flags_t = flags_t.to(prm_device) |
|
|
| with torch.no_grad(): |
| _, _, rewards = prm_model( |
| input_ids=input_ids_t, attention_mask=attn_mask_t, return_probs=True |
| ) |
| step_rewards = derive_step_rewards(rewards, flags_t) |
| |
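    # Use the reward at the last step as the score for the whole solution.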
| return step_rewards[0][-1] if step_rewards[0] else 0.0 |
|
|
|
|
| |
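# Score every sampled solution with the PRM and extract its final answer.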
| for i, p in enumerate(problems_data): |
| print(f"\n Scoring problem {i+1}/{len(problems_data)}: {p['unique_id']}") |
| scores = [] |
| extracted_answers = [] |
| for j, sol in enumerate(p["sampled_solutions"]): |
| score = score_solution(p["problem"], sol) |
| scores.append(score) |
| extracted_answers.append(extract_boxed_solution(sol)) |
| if (j + 1) % 8 == 0: |
| print(f" Scored {j+1}/{N_SAMPLES} (last: {score:.4f})") |
| p["prm_scores"] = scores |
| p["extracted_answers"] = extracted_answers |
|
|
| |
| with open(os.path.join(OUTPUT_DIR, "scored_results.json"), "w") as f: |
| json.dump(problems_data, f, indent=2) |
|
|
| del prm_model |
| torch.cuda.empty_cache() |
|
|
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("STEP 5: Computing Best-of-N accuracy") |
| print("=" * 70) |
|
|
| weighted_correct = 0 |
| standard_correct = 0 |
| majority_correct_count = 0 |
|
|
| bon_summary = [] |
| for p in problems_data: |
| gt = p["answer"] |
|
|
| |
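    # Weighted Best-of-N: sum PRM scores within each answer group.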
| w_ans, w_scores = weighted_best_of_n(p["extracted_answers"], p["prm_scores"]) |
| w_ok = (w_ans is not None) and (w_ans == gt) |
| if w_ok: weighted_correct += 1 |
|
|
| |
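    # Standard Best-of-N: single highest-scoring solution.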
| s_ans = standard_best_of_n(p["extracted_answers"], p["prm_scores"]) |
| s_ok = (s_ans is not None) and (s_ans == gt) |
| if s_ok: standard_correct += 1 |
|
|
| |
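    # Majority vote over extracted answers (ignores PRM scores).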
| m_ans = majority_vote(p["extracted_answers"]) |
| m_ok = (m_ans is not None) and (m_ans == gt) |
| if m_ok: majority_correct_count += 1 |
|
|
| n_correct = sum(1 for a in p["extracted_answers"] if a == gt) |
|
|
| bon_summary.append({ |
| "unique_id": p["unique_id"], |
| "level": p["level"], |
| "subject": p["subject"], |
| "ground_truth": gt, |
| "greedy_answer": p["greedy_extracted_answer"], |
| "greedy_correct": p["greedy_correct"], |
| "weighted_bon_answer": w_ans, |
| "weighted_bon_correct": w_ok, |
| "standard_bon_answer": s_ans, |
| "standard_bon_correct": s_ok, |
| "majority_vote_answer": m_ans, |
| "majority_vote_correct": m_ok, |
| "n_correct_in_16": n_correct, |
| "answer_score_breakdown": w_scores, |
| "prm_scores": p["prm_scores"], |
| }) |
|
|
| sg = "✓" if p["greedy_correct"] else "✗" |
| sw = "✓" if w_ok else "✗" |
| print(f" {sg}→{sw} | {p['unique_id']:40s} | GT={gt:15s} | Greedy={str(p['greedy_extracted_answer']):15s} | WBoN={str(w_ans):15s} | {n_correct}/16 correct") |
|
|
| n = len(problems_data) |
| greedy_total = sum(1 for p in problems_data if p["greedy_correct"]) |
| print(f"\n{'='*70}") |
| print(f"RESULTS SUMMARY") |
| print(f"{'='*70}") |
| print(f" Greedy (N=1): {greedy_total}/{n} = {greedy_total/n:.0%}") |
| print(f" Majority Vote (N=16): {majority_correct_count}/{n} = {majority_correct_count/n:.0%}") |
| print(f" Standard Best-of-N (N=16): {standard_correct}/{n} = {standard_correct/n:.0%}") |
| print(f" Weighted Best-of-N (N=16): {weighted_correct}/{n} = {weighted_correct/n:.0%}") |
|
|
| with open(os.path.join(OUTPUT_DIR, "bon_results.json"), "w") as f: |
| json.dump(bon_summary, f, indent=2) |
|
|
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("ANALYSIS: Accuracy vs N") |
| print("=" * 70) |
|
|
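# Estimate accuracy at smaller N by subsampling the 16 stored solutions (no extra generation needed).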
| random.seed(SEED) |
| n_values = [1, 2, 4, 8, 16] |
| n_trials = 50 |
|
|
| accuracy_by_n = {} |
for n_val in n_values:
    if n_val == N_SAMPLES:
        # Full budget: evaluate weighted BoN once over all stored solutions.
        correct = sum(
            1 for p in problems_data
            if weighted_best_of_n(p["extracted_answers"], p["prm_scores"])[0] == p["answer"]
        )
        acc = correct / len(problems_data)
    else:
        # Smaller N: average accuracy over random subsamples of the 16 solutions.
        trial_accs = []
        for _ in range(n_trials):
            correct = 0
            for p in problems_data:
                idx = random.sample(range(N_SAMPLES), n_val)
                sub_a = [p["extracted_answers"][j] for j in idx]
                sub_s = [p["prm_scores"][j] for j in idx]
                ans, _ = weighted_best_of_n(sub_a, sub_s)
                if ans == p["answer"]:
                    correct += 1
            trial_accs.append(correct / len(problems_data))
        acc = sum(trial_accs) / len(trial_accs)
    accuracy_by_n[n_val] = acc
    print(f" N={n_val:2d}: {acc:.1%}")
|
|
| with open(os.path.join(OUTPUT_DIR, "accuracy_by_n.json"), "w") as f: |
| json.dump(accuracy_by_n, f, indent=2) |
|
|
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("STEP 6: Generating analysis plots") |
| print("=" * 70) |
|
|
| import matplotlib |
| matplotlib.use("Agg") |
| import matplotlib.pyplot as plt |
| from matplotlib.patches import Patch |
|
|
| plt.rcParams.update({"font.size": 11, "figure.dpi": 150}) |
|
|
| |
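# Plot 1: accuracy comparison across greedy, majority vote, standard BoN, and weighted BoN.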
| fig, ax = plt.subplots(figsize=(8, 5)) |
| methods = ["Greedy\n(N=1)", "Majority Vote\n(N=16)", "Standard BoN\n(N=16)", "Weighted BoN\n(N=16)"] |
| accs = [ |
| greedy_total / n, |
| majority_correct_count / n, |
| standard_correct / n, |
| weighted_correct / n, |
| ] |
| colors = ["#4C72B0", "#55A868", "#C44E52", "#8172B2"] |
| bars = ax.bar(methods, accs, color=colors, edgecolor="white", linewidth=1.5) |
| for bar, a in zip(bars, accs): |
| ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, |
| f"{a:.0%}", ha="center", va="bottom", fontweight="bold", fontsize=12) |
| ax.set_ylabel("Accuracy") |
| ax.set_title("Math Problem Accuracy: Greedy vs Best-of-N Methods\n(20 MATH-500 problems, Levels 1-3)") |
| ax.set_ylim(0, 1.15) |
| ax.grid(axis="y", alpha=0.3) |
| plt.tight_layout() |
| plt.savefig(os.path.join(OUTPUT_DIR, "plot1_accuracy_comparison.png")) |
| plt.close() |
|
|
| |
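# Plot 2: weighted BoN accuracy vs N, with the greedy baseline for reference.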
| fig, ax = plt.subplots(figsize=(7, 5)) |
| ns = sorted(accuracy_by_n.keys()) |
| acc_vals = [accuracy_by_n[nv] for nv in ns] |
| ax.plot(ns, acc_vals, "o-", color="#8172B2", linewidth=2, markersize=8, label="Weighted BoN") |
| ax.axhline(y=greedy_total/n, color="#4C72B0", linestyle="--", linewidth=1.5, |
| label=f"Greedy baseline ({greedy_total/n:.0%})") |
| for nv, a in zip(ns, acc_vals): |
| ax.annotate(f"{a:.0%}", (nv, a), textcoords="offset points", xytext=(0, 10), ha="center") |
| ax.set_xlabel("N (number of samples)") |
| ax.set_ylabel("Accuracy") |
| ax.set_title("Weighted Best-of-N Accuracy vs Number of Samples") |
| ax.set_xticks(ns) |
| ax.set_ylim(0, 1.1) |
| ax.legend() |
| ax.grid(alpha=0.3) |
| plt.tight_layout() |
| plt.savefig(os.path.join(OUTPUT_DIR, "plot2_accuracy_vs_n.png")) |
| plt.close() |
|
|
| |
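# Plot 3: per-problem count of correct solutions among the 16 samples, colored by greedy/BoN outcome.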
| fig, ax = plt.subplots(figsize=(12, 5)) |
| cat_colors = { |
| "Both correct": "#55A868", "Only BoN correct": "#8172B2", |
| "Only Greedy correct": "#C44E52", "Both wrong": "#CCCCCC" |
| } |
| bar_colors = [] |
| for s in bon_summary: |
| g, b = s["greedy_correct"], s["weighted_bon_correct"] |
| if g and b: bar_colors.append(cat_colors["Both correct"]) |
| elif not g and b: bar_colors.append(cat_colors["Only BoN correct"]) |
| elif g and not b: bar_colors.append(cat_colors["Only Greedy correct"]) |
| else: bar_colors.append(cat_colors["Both wrong"]) |
|
|
| x = range(len(bon_summary)) |
| heights = [s["n_correct_in_16"] for s in bon_summary] |
| ax.bar(x, heights, color=bar_colors, edgecolor="white", linewidth=0.5) |
| ax.set_xticks(x) |
| labels = [f"L{s['level']}: {s['unique_id'].split('/')[-1].replace('.json','')[:12]}" for s in bon_summary] |
| ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=8) |
| ax.set_ylabel("# Correct Solutions (out of 16)") |
| ax.set_title("Per-Problem: Correct Solutions in N=16 Sample") |
| legend_elements = [Patch(facecolor=c, label=l) for l, c in cat_colors.items()] |
| ax.legend(handles=legend_elements, loc="upper right", fontsize=9) |
| ax.grid(axis="y", alpha=0.3) |
| plt.tight_layout() |
| plt.savefig(os.path.join(OUTPUT_DIR, "plot3_per_problem.png")) |
| plt.close() |
|
|
| |
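# Plot 4: PRM last-step score distributions for correct vs incorrect solutions.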
| fig, ax = plt.subplots(figsize=(7, 5)) |
| correct_scores, incorrect_scores = [], [] |
| for p in problems_data: |
| for ans, sc in zip(p["extracted_answers"], p["prm_scores"]): |
| (correct_scores if ans == p["answer"] else incorrect_scores).append(sc) |
|
|
| bins = np.linspace(0, 1, 25) |
| ax.hist(correct_scores, bins=bins, alpha=0.7, label=f"Correct ({len(correct_scores)})", color="#55A868") |
| ax.hist(incorrect_scores, bins=bins, alpha=0.7, label=f"Incorrect ({len(incorrect_scores)})", color="#C44E52") |
| ax.set_xlabel("PRM Last-Step Score") |
| ax.set_ylabel("Count") |
| ax.set_title("PRM Score Distribution: Correct vs Incorrect Solutions") |
| ax.legend() |
| ax.grid(alpha=0.3) |
| plt.tight_layout() |
| plt.savefig(os.path.join(OUTPUT_DIR, "plot4_prm_scores.png")) |
| plt.close() |
|
|
| print("All plots saved.") |
|
|
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("STEP 7: Pushing dataset to HuggingFace Hub") |
| print("=" * 70) |
|
|
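# Build one dataset row per problem, combining generations, PRM scores, and BoN outcomes.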
| rows = [] |
| for p, s in zip(problems_data, bon_summary): |
| rows.append({ |
| "problem": p["problem"], |
| "ground_truth_solution": p["solution"], |
| "ground_truth_answer": p["answer"], |
| "subject": p["subject"], |
| "level": p["level"], |
| "unique_id": p["unique_id"], |
| "greedy_solution": p["greedy_solution"], |
| "greedy_extracted_answer": p["greedy_extracted_answer"], |
| "greedy_correct": p["greedy_correct"], |
| "bon_weighted_answer": s["weighted_bon_answer"], |
| "bon_weighted_correct": s["weighted_bon_correct"], |
| "bon_standard_answer": s["standard_bon_answer"], |
| "bon_standard_correct": s["standard_bon_correct"], |
| "bon_majority_answer": s["majority_vote_answer"], |
| "bon_majority_correct": s["majority_vote_correct"], |
| "sampled_solutions": p["sampled_solutions"], |
| "sampled_extracted_answers": p["extracted_answers"], |
| "sampled_prm_scores": p["prm_scores"], |
| "n_correct_in_16": s["n_correct_in_16"], |
| "answer_score_breakdown": json.dumps(s["answer_score_breakdown"]), |
| }) |
|
|
| hf_dataset = Dataset.from_list(rows) |
| hf_dataset.push_to_hub(DATASET_ID, split="test") |
| print(f"Dataset pushed to: https://huggingface.co/datasets/{DATASET_ID}") |
|
|
| |
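# Upload the analysis plots to the dataset repo.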
| from huggingface_hub import HfApi |
| api = HfApi() |
| for plot_file in ["plot1_accuracy_comparison.png", "plot2_accuracy_vs_n.png", |
| "plot3_per_problem.png", "plot4_prm_scores.png"]: |
| plot_path = os.path.join(OUTPUT_DIR, plot_file) |
| if os.path.exists(plot_path): |
| api.upload_file( |
| path_or_fileobj=plot_path, |
| path_in_repo=f"plots/{plot_file}", |
| repo_id=DATASET_ID, |
| repo_type="dataset", |
| ) |
| print(f" Uploaded {plot_file}") |
|
|
| |
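# Upload the intermediate JSON results as well.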
| for json_file in ["filtered_problems.json", "bon_results.json", "accuracy_by_n.json"]: |
| json_path = os.path.join(OUTPUT_DIR, json_file) |
| if os.path.exists(json_path): |
| api.upload_file( |
| path_or_fileobj=json_path, |
| path_in_repo=f"results/{json_file}", |
| repo_id=DATASET_ID, |
| repo_type="dataset", |
| ) |
| print(f" Uploaded {json_file}") |
|
|
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("FINAL RESULTS") |
| print("=" * 70) |
| print(f" Greedy (N=1): {greedy_total}/{len(problems_data)} = {greedy_total/len(problems_data):.0%}") |
| print(f" Majority Vote (N=16): {majority_correct_count}/{len(problems_data)} = {majority_correct_count/len(problems_data):.0%}") |
| print(f" Standard Best-of-N (N=16): {standard_correct}/{len(problems_data)} = {standard_correct/len(problems_data):.0%}") |
| print(f" Weighted Best-of-N (N=16): {weighted_correct}/{len(problems_data)} = {weighted_correct/len(problems_data):.0%}") |
| print(f"\n Dataset: https://huggingface.co/datasets/{DATASET_ID}") |
| print("=" * 70) |
| print("DONE!") |
|
|