Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +4 -0
- eval_results/arc_challenge/outputs__grpo-full__final/results_2026-05-26T11-44-23.900648.json +141 -0
- eval_results/arc_challenge/outputs__grpo-full__final/samples_arc_challenge_2026-05-26T11-44-23.900648.jsonl +3 -0
- eval_results/arc_easy/outputs__grpo-full__final/results_2026-05-26T11-37-35.592842.json +141 -0
- eval_results/arc_easy/outputs__grpo-full__final/samples_arc_easy_2026-05-26T11-37-35.592842.jsonl +0 -0
- eval_results/benchmark_summary.md +8 -0
- eval_results/combined_results.json +6 -0
- eval_results/custom_eval.md +159 -0
- eval_results/gsm8k/outputs__grpo-full__final/results_2026-05-26T09-33-14.995899.json +175 -0
- eval_results/gsm8k/outputs__grpo-full__final/samples_gsm8k_2026-05-26T09-33-14.995899.jsonl +3 -0
- eval_results/hellaswag/outputs__grpo-full__final/results_2026-05-26T12-29-19.954733.json +139 -0
- eval_results/hellaswag/outputs__grpo-full__final/samples_hellaswag_2026-05-26T12-29-19.954733.jsonl +3 -0
- eval_results/minerva_math_algebra/outputs__grpo-full__final/results_2026-05-26T10-07-08.234107.json +145 -0
- eval_results/minerva_math_algebra/outputs__grpo-full__final/samples_minerva_math_algebra_2026-05-26T10-07-08.234107.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/results_2026-05-26T12-51-35.680076.json +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_abstract_algebra_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_anatomy_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_astronomy_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_business_ethics_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_clinical_knowledge_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_biology_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_chemistry_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_computer_science_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_mathematics_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_medicine_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_physics_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_computer_security_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_conceptual_physics_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_econometrics_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_electrical_engineering_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_elementary_mathematics_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_formal_logic_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_global_facts_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_biology_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_chemistry_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_computer_science_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_european_history_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_geography_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_government_and_politics_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_macroeconomics_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_mathematics_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_microeconomics_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_physics_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_psychology_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_statistics_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_us_history_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_world_history_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_human_aging_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_human_sexuality_2026-05-26T12-51-35.680076.jsonl +0 -0
- eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_international_law_2026-05-26T12-51-35.680076.jsonl +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
eval_results/arc_challenge/outputs__grpo-full__final/samples_arc_challenge_2026-05-26T11-44-23.900648.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
eval_results/gsm8k/outputs__grpo-full__final/samples_gsm8k_2026-05-26T09-33-14.995899.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
eval_results/hellaswag/outputs__grpo-full__final/samples_hellaswag_2026-05-26T12-29-19.954733.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_professional_law_2026-05-26T12-51-35.680076.jsonl filter=lfs diff=lfs merge=lfs -text
|
eval_results/arc_challenge/outputs__grpo-full__final/results_2026-05-26T11-44-23.900648.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_challenge": {
|
| 4 |
+
"name": "arc_challenge",
|
| 5 |
+
"alias": "arc_challenge",
|
| 6 |
+
"sample_len": 1172,
|
| 7 |
+
"acc,none": 0.19197952218430034,
|
| 8 |
+
"acc_stderr,none": 0.01150959890659822,
|
| 9 |
+
"acc_norm,none": 0.22781569965870307,
|
| 10 |
+
"acc_norm_stderr,none": 0.012256708602326964
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"group_subtasks": {},
|
| 14 |
+
"configs": {
|
| 15 |
+
"arc_challenge": {
|
| 16 |
+
"task": "arc_challenge",
|
| 17 |
+
"dataset_path": "allenai/ai2_arc",
|
| 18 |
+
"dataset_name": "ARC-Challenge",
|
| 19 |
+
"training_split": "train",
|
| 20 |
+
"validation_split": "validation",
|
| 21 |
+
"test_split": "test",
|
| 22 |
+
"doc_to_text": "Question: {{question}}\nAnswer:",
|
| 23 |
+
"doc_to_target": "{{choices.label.index(answerKey)}}",
|
| 24 |
+
"unsafe_code": false,
|
| 25 |
+
"doc_to_choice": "{{choices.text}}",
|
| 26 |
+
"description": "",
|
| 27 |
+
"target_delimiter": " ",
|
| 28 |
+
"fewshot_delimiter": "\n\n",
|
| 29 |
+
"fewshot_config": {
|
| 30 |
+
"sampler": "default",
|
| 31 |
+
"split": null,
|
| 32 |
+
"process_docs": null,
|
| 33 |
+
"fewshot_indices": null,
|
| 34 |
+
"samples": null,
|
| 35 |
+
"doc_to_text": "Question: {{question}}\nAnswer:",
|
| 36 |
+
"doc_to_choice": "{{choices.text}}",
|
| 37 |
+
"doc_to_target": "{{choices.label.index(answerKey)}}",
|
| 38 |
+
"gen_prefix": null,
|
| 39 |
+
"fewshot_delimiter": "\n\n",
|
| 40 |
+
"target_delimiter": " "
|
| 41 |
+
},
|
| 42 |
+
"num_fewshot": 25,
|
| 43 |
+
"metric_list": [
|
| 44 |
+
{
|
| 45 |
+
"metric": "acc",
|
| 46 |
+
"aggregation": "mean",
|
| 47 |
+
"higher_is_better": true
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"metric": "acc_norm",
|
| 51 |
+
"aggregation": "mean",
|
| 52 |
+
"higher_is_better": true
|
| 53 |
+
}
|
| 54 |
+
],
|
| 55 |
+
"output_type": "multiple_choice",
|
| 56 |
+
"repeats": 1,
|
| 57 |
+
"should_decontaminate": true,
|
| 58 |
+
"doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
|
| 59 |
+
"metadata": {
|
| 60 |
+
"version": 1.0,
|
| 61 |
+
"pretrained": "outputs/grpo-full/final",
|
| 62 |
+
"dtype": "bfloat16",
|
| 63 |
+
"config_source": "/home/himanshu/TinyMathReason-1B/venv/lib/python3.12/site-packages/lm_eval/tasks/arc/arc_challenge.yaml"
|
| 64 |
+
}
|
| 65 |
+
}
|
| 66 |
+
},
|
| 67 |
+
"versions": {
|
| 68 |
+
"arc_challenge": 1.0
|
| 69 |
+
},
|
| 70 |
+
"n-shot": {
|
| 71 |
+
"arc_challenge": 25
|
| 72 |
+
},
|
| 73 |
+
"higher_is_better": {
|
| 74 |
+
"arc_challenge": {
|
| 75 |
+
"acc": true,
|
| 76 |
+
"acc_norm": true
|
| 77 |
+
}
|
| 78 |
+
},
|
| 79 |
+
"n-samples": {
|
| 80 |
+
"arc_challenge": {
|
| 81 |
+
"original": 1172,
|
| 82 |
+
"effective": 1172
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"config": {
|
| 86 |
+
"model": "hf",
|
| 87 |
+
"model_args": {
|
| 88 |
+
"pretrained": "outputs/grpo-full/final",
|
| 89 |
+
"dtype": "bfloat16"
|
| 90 |
+
},
|
| 91 |
+
"model_num_parameters": 1123125248,
|
| 92 |
+
"model_dtype": "torch.bfloat16",
|
| 93 |
+
"model_revision": "main",
|
| 94 |
+
"model_sha": "",
|
| 95 |
+
"batch_size": "auto",
|
| 96 |
+
"batch_sizes": [
|
| 97 |
+
64
|
| 98 |
+
],
|
| 99 |
+
"device": "cuda",
|
| 100 |
+
"use_cache": null,
|
| 101 |
+
"limit": null,
|
| 102 |
+
"bootstrap_iters": 100000,
|
| 103 |
+
"gen_kwargs": {},
|
| 104 |
+
"random_seed": 0,
|
| 105 |
+
"numpy_seed": 1234,
|
| 106 |
+
"torch_seed": 1234,
|
| 107 |
+
"fewshot_seed": 1234
|
| 108 |
+
},
|
| 109 |
+
"git_hash": "03fcf23",
|
| 110 |
+
"date": 1779795467.4101148,
|
| 111 |
+
"pretty_env_info": "PyTorch version: 2.12.0+cu130\nIs debug build: False\nCUDA used to build PyTorch: 13.0\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 24.04.4 LTS (x86_64)\nGCC version: (Ubuntu 13.3.0-6ubuntu2~24.04.1) 13.3.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.39\n\nPython version: 3.12.3 (main, Mar 23 2026, 19:04:32) [GCC 13.3.0] (64-bit runtime)\nPython platform: Linux-6.17.0-1016-gcp-x86_64-with-glibc2.39\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA L4\nNvidia driver version: 580.126.20\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_adv.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_cnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_precompiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_graph.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_heuristic.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_ops.so.9.13.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\nCaching allocator config: N/A\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 16\nOn-line CPU(s) list: 0-15\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.20GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 8\nSocket(s): 1\nStepping: 7\nBogoMIPS: 4400.43\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 256 KiB (8 instances)\nL1i cache: 256 KiB (8 instances)\nL2 cache: 8 MiB (8 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-15\nVulnerability Gather data sampling: Not affected\nVulnerability Ghostwrite: Not affected\nVulnerability Indirect target selection: Mitigation; Aligned branch/return thunks\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Old microcode: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; Enhanced IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; PBRSB-eIBRS SW sequence; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsa: Not affected\nVulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Vmscape: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==2.4.6\n[pip3] nvidia-cublas==13.1.1.3\n[pip3] nvidia-cuda-cupti==13.0.85\n[pip3] nvidia-cuda-nvrtc==13.0.88\n[pip3] nvidia-cuda-runtime==13.0.96\n[pip3] nvidia-cudnn-cu13==9.20.0.48\n[pip3] nvidia-cufft==12.0.0.61\n[pip3] nvidia-curand==10.4.0.35\n[pip3] nvidia-cusolver==12.0.4.66\n[pip3] nvidia-cusparse==12.6.3.3\n[pip3] nvidia-cusparselt-cu13==0.8.1\n[pip3] nvidia-nccl-cu13==2.29.7\n[pip3] nvidia-nvjitlink==13.0.88\n[pip3] nvidia-nvtx==13.0.85\n[pip3] torch==2.12.0\n[pip3] triton==3.7.0\n[conda] Could not collect",
|
| 112 |
+
"transformers_version": "5.9.0",
|
| 113 |
+
"lm_eval_version": "0.4.12",
|
| 114 |
+
"upper_git_hash": null,
|
| 115 |
+
"tokenizer_pad_token": [
|
| 116 |
+
"<|pad|>",
|
| 117 |
+
"3"
|
| 118 |
+
],
|
| 119 |
+
"tokenizer_eos_token": [
|
| 120 |
+
"<|eos|>",
|
| 121 |
+
"1"
|
| 122 |
+
],
|
| 123 |
+
"tokenizer_bos_token": [
|
| 124 |
+
"<|bos|>",
|
| 125 |
+
"0"
|
| 126 |
+
],
|
| 127 |
+
"eot_token_id": 1,
|
| 128 |
+
"max_length": 4096,
|
| 129 |
+
"task_hashes": {
|
| 130 |
+
"arc_challenge": "55e883475b5650b20d8d9dc1e9cdf59ef645a257fcd74bf43a9dbb5c632c529c"
|
| 131 |
+
},
|
| 132 |
+
"model_source": "hf",
|
| 133 |
+
"model_name": "outputs/grpo-full/final",
|
| 134 |
+
"model_name_sanitized": "outputs__grpo-full__final",
|
| 135 |
+
"system_instruction": null,
|
| 136 |
+
"system_instruction_sha": null,
|
| 137 |
+
"fewshot_as_multiturn": null,
|
| 138 |
+
"chat_template": null,
|
| 139 |
+
"chat_template_sha": null,
|
| 140 |
+
"total_evaluation_time_seconds": "405.71644051300245"
|
| 141 |
+
}
|
eval_results/arc_challenge/outputs__grpo-full__final/samples_arc_challenge_2026-05-26T11-44-23.900648.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd9bfe557b60e908f99194a2f751c31995efbca3ef3d37873954b352b3cceb07
|
| 3 |
+
size 23316200
|
eval_results/arc_easy/outputs__grpo-full__final/results_2026-05-26T11-37-35.592842.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_easy": {
|
| 4 |
+
"name": "arc_easy",
|
| 5 |
+
"alias": "arc_easy",
|
| 6 |
+
"sample_len": 2376,
|
| 7 |
+
"acc,none": 0.2760942760942761,
|
| 8 |
+
"acc_stderr,none": 0.00917355987383544,
|
| 9 |
+
"acc_norm,none": 0.2878787878787879,
|
| 10 |
+
"acc_norm_stderr,none": 0.009290733161670239
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"group_subtasks": {},
|
| 14 |
+
"configs": {
|
| 15 |
+
"arc_easy": {
|
| 16 |
+
"task": "arc_easy",
|
| 17 |
+
"dataset_path": "allenai/ai2_arc",
|
| 18 |
+
"dataset_name": "ARC-Easy",
|
| 19 |
+
"training_split": "train",
|
| 20 |
+
"validation_split": "validation",
|
| 21 |
+
"test_split": "test",
|
| 22 |
+
"doc_to_text": "Question: {{question}}\nAnswer:",
|
| 23 |
+
"doc_to_target": "{{choices.label.index(answerKey)}}",
|
| 24 |
+
"unsafe_code": false,
|
| 25 |
+
"doc_to_choice": "{{choices.text}}",
|
| 26 |
+
"description": "",
|
| 27 |
+
"target_delimiter": " ",
|
| 28 |
+
"fewshot_delimiter": "\n\n",
|
| 29 |
+
"fewshot_config": {
|
| 30 |
+
"sampler": "default",
|
| 31 |
+
"split": null,
|
| 32 |
+
"process_docs": null,
|
| 33 |
+
"fewshot_indices": null,
|
| 34 |
+
"samples": null,
|
| 35 |
+
"doc_to_text": "Question: {{question}}\nAnswer:",
|
| 36 |
+
"doc_to_choice": "{{choices.text}}",
|
| 37 |
+
"doc_to_target": "{{choices.label.index(answerKey)}}",
|
| 38 |
+
"gen_prefix": null,
|
| 39 |
+
"fewshot_delimiter": "\n\n",
|
| 40 |
+
"target_delimiter": " "
|
| 41 |
+
},
|
| 42 |
+
"num_fewshot": 0,
|
| 43 |
+
"metric_list": [
|
| 44 |
+
{
|
| 45 |
+
"metric": "acc",
|
| 46 |
+
"aggregation": "mean",
|
| 47 |
+
"higher_is_better": true
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"metric": "acc_norm",
|
| 51 |
+
"aggregation": "mean",
|
| 52 |
+
"higher_is_better": true
|
| 53 |
+
}
|
| 54 |
+
],
|
| 55 |
+
"output_type": "multiple_choice",
|
| 56 |
+
"repeats": 1,
|
| 57 |
+
"should_decontaminate": true,
|
| 58 |
+
"doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
|
| 59 |
+
"metadata": {
|
| 60 |
+
"version": 1.0,
|
| 61 |
+
"pretrained": "outputs/grpo-full/final",
|
| 62 |
+
"dtype": "bfloat16",
|
| 63 |
+
"config_source": "/home/himanshu/TinyMathReason-1B/venv/lib/python3.12/site-packages/lm_eval/tasks/arc/arc_easy.yaml"
|
| 64 |
+
}
|
| 65 |
+
}
|
| 66 |
+
},
|
| 67 |
+
"versions": {
|
| 68 |
+
"arc_easy": 1.0
|
| 69 |
+
},
|
| 70 |
+
"n-shot": {
|
| 71 |
+
"arc_easy": 0
|
| 72 |
+
},
|
| 73 |
+
"higher_is_better": {
|
| 74 |
+
"arc_easy": {
|
| 75 |
+
"acc": true,
|
| 76 |
+
"acc_norm": true
|
| 77 |
+
}
|
| 78 |
+
},
|
| 79 |
+
"n-samples": {
|
| 80 |
+
"arc_easy": {
|
| 81 |
+
"original": 2376,
|
| 82 |
+
"effective": 2376
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"config": {
|
| 86 |
+
"model": "hf",
|
| 87 |
+
"model_args": {
|
| 88 |
+
"pretrained": "outputs/grpo-full/final",
|
| 89 |
+
"dtype": "bfloat16"
|
| 90 |
+
},
|
| 91 |
+
"model_num_parameters": 1123125248,
|
| 92 |
+
"model_dtype": "torch.bfloat16",
|
| 93 |
+
"model_revision": "main",
|
| 94 |
+
"model_sha": "",
|
| 95 |
+
"batch_size": "auto",
|
| 96 |
+
"batch_sizes": [
|
| 97 |
+
64
|
| 98 |
+
],
|
| 99 |
+
"device": "cuda",
|
| 100 |
+
"use_cache": null,
|
| 101 |
+
"limit": null,
|
| 102 |
+
"bootstrap_iters": 100000,
|
| 103 |
+
"gen_kwargs": {},
|
| 104 |
+
"random_seed": 0,
|
| 105 |
+
"numpy_seed": 1234,
|
| 106 |
+
"torch_seed": 1234,
|
| 107 |
+
"fewshot_seed": 1234
|
| 108 |
+
},
|
| 109 |
+
"git_hash": "03fcf23",
|
| 110 |
+
"date": 1779795408.3004034,
|
| 111 |
+
"pretty_env_info": "PyTorch version: 2.12.0+cu130\nIs debug build: False\nCUDA used to build PyTorch: 13.0\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 24.04.4 LTS (x86_64)\nGCC version: (Ubuntu 13.3.0-6ubuntu2~24.04.1) 13.3.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.39\n\nPython version: 3.12.3 (main, Mar 23 2026, 19:04:32) [GCC 13.3.0] (64-bit runtime)\nPython platform: Linux-6.17.0-1016-gcp-x86_64-with-glibc2.39\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA L4\nNvidia driver version: 580.126.20\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_adv.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_cnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_precompiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_graph.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_heuristic.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_ops.so.9.13.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\nCaching allocator config: N/A\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 16\nOn-line CPU(s) list: 0-15\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.20GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 8\nSocket(s): 1\nStepping: 7\nBogoMIPS: 4400.43\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 256 KiB (8 instances)\nL1i cache: 256 KiB (8 instances)\nL2 cache: 8 MiB (8 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-15\nVulnerability Gather data sampling: Not affected\nVulnerability Ghostwrite: Not affected\nVulnerability Indirect target selection: Mitigation; Aligned branch/return thunks\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Old microcode: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; Enhanced IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; PBRSB-eIBRS SW sequence; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsa: Not affected\nVulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Vmscape: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==2.4.6\n[pip3] nvidia-cublas==13.1.1.3\n[pip3] nvidia-cuda-cupti==13.0.85\n[pip3] nvidia-cuda-nvrtc==13.0.88\n[pip3] nvidia-cuda-runtime==13.0.96\n[pip3] nvidia-cudnn-cu13==9.20.0.48\n[pip3] nvidia-cufft==12.0.0.61\n[pip3] nvidia-curand==10.4.0.35\n[pip3] nvidia-cusolver==12.0.4.66\n[pip3] nvidia-cusparse==12.6.3.3\n[pip3] nvidia-cusparselt-cu13==0.8.1\n[pip3] nvidia-nccl-cu13==2.29.7\n[pip3] nvidia-nvjitlink==13.0.88\n[pip3] nvidia-nvtx==13.0.85\n[pip3] torch==2.12.0\n[pip3] triton==3.7.0\n[conda] Could not collect",
|
| 112 |
+
"transformers_version": "5.9.0",
|
| 113 |
+
"lm_eval_version": "0.4.12",
|
| 114 |
+
"upper_git_hash": null,
|
| 115 |
+
"tokenizer_pad_token": [
|
| 116 |
+
"<|pad|>",
|
| 117 |
+
"3"
|
| 118 |
+
],
|
| 119 |
+
"tokenizer_eos_token": [
|
| 120 |
+
"<|eos|>",
|
| 121 |
+
"1"
|
| 122 |
+
],
|
| 123 |
+
"tokenizer_bos_token": [
|
| 124 |
+
"<|bos|>",
|
| 125 |
+
"0"
|
| 126 |
+
],
|
| 127 |
+
"eot_token_id": 1,
|
| 128 |
+
"max_length": 4096,
|
| 129 |
+
"task_hashes": {
|
| 130 |
+
"arc_easy": "dce0d9b0f0cecd55bf2ac264042c5e45487df708d13123af3ae9e67bbbefdeb1"
|
| 131 |
+
},
|
| 132 |
+
"model_source": "hf",
|
| 133 |
+
"model_name": "outputs/grpo-full/final",
|
| 134 |
+
"model_name_sanitized": "outputs__grpo-full__final",
|
| 135 |
+
"system_instruction": null,
|
| 136 |
+
"system_instruction_sha": null,
|
| 137 |
+
"fewshot_as_multiturn": null,
|
| 138 |
+
"chat_template": null,
|
| 139 |
+
"chat_template_sha": null,
|
| 140 |
+
"total_evaluation_time_seconds": "56.55086992699944"
|
| 141 |
+
}
|
eval_results/arc_easy/outputs__grpo-full__final/samples_arc_easy_2026-05-26T11-37-35.592842.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/benchmark_summary.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Benchmark Results for outputs/grpo-full/final
|
| 2 |
+
|
| 3 |
+
| Benchmark | Shots | Score |
|
| 4 |
+
| :--- | :---: | :---: |
|
| 5 |
+
| arc_easy | 0 | No results file |
|
| 6 |
+
| arc_challenge | 25 | No results file |
|
| 7 |
+
| hellaswag | 10 | No results file |
|
| 8 |
+
| mmlu | 5 | No results file |
|
eval_results/combined_results.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"arc_easy": "No results file",
|
| 3 |
+
"arc_challenge": "No results file",
|
| 4 |
+
"hellaswag": "No results file",
|
| 5 |
+
"mmlu": "No results file"
|
| 6 |
+
}
|
eval_results/custom_eval.md
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Custom Evaluation: outputs/grpo-full/final
|
| 2 |
+
|
| 3 |
+
### Level: Easy
|
| 4 |
+
**Q: What is 15% of 200?**
|
| 5 |
+
|
| 6 |
+
**A:**
|
| 7 |
+
|
| 8 |
+
There are 5% of the population of unknown families, which is 3% of the population.
|
| 9 |
+
|
| 10 |
+
In the case of the multi-choice, the population is 2%.
|
| 11 |
+
The proportion of the population has an impact on the population of 500 people.
|
| 12 |
+
The proportion of individuals living in rural areas is 4.
|
| 13 |
+
The population of the people living in rural areas in urban areas has 4% of the population.
|
| 14 |
+
The population of the population of West Africa is 2/100, which is the number of people living in rural areas:
|
| 15 |
+
|
| 16 |
+
The proportion of the proportion of people in the proportion of people living in the rural regions is 0.6/3,
|
| 17 |
+
|
| 18 |
+
The poverty in the rural area is 2/4.
|
| 19 |
+
If the number of children living in the population is 25/2. The population of the population is 2.5.6.
|
| 20 |
+
The population of people in rural areas is the population that is 2/3.6.
|
| 21 |
+
The population of the proportion of the people living in rural areas is 10.6.
|
| 22 |
+
The population of the population of the population is 4.
|
| 23 |
+
The population of the proportion of individuals who live in poverty are 65 years older than the poverty-free population, and in poverty.
|
| 24 |
+
The population of
|
| 25 |
+
|
| 26 |
+
---
|
| 27 |
+
### Level: Easy
|
| 28 |
+
**Q: Solve for x: 3x + 5 = 20**
|
| 29 |
+
|
| 30 |
+
**A:**
|
| 31 |
+
|
| 32 |
+
There are only x 2 = 8
|
| 33 |
+
The value of x is 2x + 3 = 2.
|
| 34 |
+
The value of x is 2.
|
| 35 |
+
The value of x is 2.
|
| 36 |
+
#### 2.
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
### Level: Easy
|
| 40 |
+
**Q: If a train travels 60 miles per hour, how far does it go in 2.5 hours?**
|
| 41 |
+
|
| 42 |
+
**A:**
|
| 43 |
+
|
| 44 |
+
The train tracks can be used to determine the speed of the train that is currently 10 km long, and the train is 10 meters.
|
| 45 |
+
If the train was in a train of 6 km, then the train would be 4 hours in the train is in the train, then the train would be a total of 12.5 km/hr, and a train to drive the train to the train.
|
| 46 |
+
If the train was running, how many miles traveled?
|
| 47 |
+
If the train passed through the train, then the train would be able to take it at the train station.
|
| 48 |
+
If it ran 50 mph, what is the distance from the train was to the train?
|
| 49 |
+
If we know the answer to the above question is 3, what is the average speed of the train after the train is 3 feet, what is the average speed of the train?
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
### Level: Medium
|
| 53 |
+
**Q: Find the roots of the quadratic equation: x^2 - 5x + 6 = 0**
|
| 54 |
+
|
| 55 |
+
**A:**
|
| 56 |
+
|
| 57 |
+
To solve this system of equations, we need to determine the value of $x^2 - 10 = 10$.
|
| 58 |
+
The value of $x^2$ is $x^2 = 0.1$.
|
| 59 |
+
The value of $x^2$ is given by:
|
| 60 |
+
|
| 61 |
+
$x^2 - 5 = 5$.
|
| 62 |
+
|
| 63 |
+
The value of $x^2$ is $2$ and the roots of the roots of $x^2$.
|
| 64 |
+
The value of x is $0$, which is $x = \frac{1}{2} + 2 = 0$.
|
| 65 |
+
Solving for $x = \frac{1}{2}$.
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
---
|
| 69 |
+
### Level: Medium
|
| 70 |
+
**Q: What is the derivative of f(x) = 3x^4 - 2x^2 + x?**
|
| 71 |
+
|
| 72 |
+
**A:**
|
| 73 |
+
|
| 74 |
+
The derivative of the derivative of f(x) = 2(x^2 - 4x + 2x^2 + 2x + 4x + 3x + 2x^2 + 2x + 2x + 2x^2 + 4x^2 + 2x + 2x^2 + 2x^2) = 4.
|
| 75 |
+
|
| 76 |
+
The answer is: 8
|
| 77 |
+
|
| 78 |
+
---
|
| 79 |
+
### Level: Medium
|
| 80 |
+
**Q: If A = {1, 2, 3} and B = {3, 4, 5}, what is the union of sets A and B?**
|
| 81 |
+
|
| 82 |
+
**A:**
|
| 83 |
+
|
| 84 |
+
Let's solve the multi-choice question.
|
| 85 |
+
Then, we have:
|
| 86 |
+
|
| 87 |
+
A = {1, 2, 2, 3, 3, 3, 3, 4, 4, 5, 7, 3, 6, 6, 8, 9, 7, 9, 9, 10, 9, 6, 9, 7, 8, 7, 7, 8, 7, 8, 8, 7, 7, 9, 9, 8, 8, 9, 7, 9}.
|
| 88 |
+
We can conclude that the feasible difference in this case is not possible in the multi-choice question is also the smallest integer.
|
| 89 |
+
Since there are two possible cases, the multi-choice question is equivalent to the multi-choice question.
|
| 90 |
+
Answer Choices:
|
| 91 |
+
Number of arrangements: (3, 6, 8, 13, 12), (2, 12), (3, 15), (3, 9, 8, 9) and (3, 15), (4, 8, 9), (2, 6, 13), (5, 9), (3, 9), (4, 9), (3, 9), (5, 5), (4, 1), (3, 2), (6, (3, 3), (4,
|
| 92 |
+
|
| 93 |
+
---
|
| 94 |
+
### Level: Hard
|
| 95 |
+
**Q: Prove that the sum of the first n positive integers is n(n+1)/2.**
|
| 96 |
+
|
| 97 |
+
**A:**
|
| 98 |
+
n(n+1) = n(n-1)
|
| 99 |
+
n(n+1) = n(n-1) + (n-1) = n(n-1)n(n+1) + n(n+1)n(n-1)n+1
|
| 100 |
+
|
| 101 |
+
n1 = n(n-1)
|
| 102 |
+
n2 = n2
|
| 103 |
+
n2 = n2
|
| 104 |
+
n3 = n2(n3)
|
| 105 |
+
n3 = n2 + n3 + 1(n3)
|
| 106 |
+
n4 = n3
|
| 107 |
+
n4 = n4
|
| 108 |
+
|
| 109 |
+
n2 = n3 = n1 + n4 + n2 = n2
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
where n1 = n1 = n3 = 1.
|
| 113 |
+
n3 = n1(n2) + n3(n3) = n1 + n1(n3) = n2(n1(n3) + n3(n3) = n2(n3)
|
| 114 |
+
|
| 115 |
+
Now, we can write:
|
| 116 |
+
|
| 117 |
+
n1 = n1(n1) = n3 + n3(n3) + n3 + n3(n3)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
n2 = n1 + (
|
| 121 |
+
|
| 122 |
+
---
|
| 123 |
+
### Level: Hard
|
| 124 |
+
**Q: Evaluate the integral of x * e^x dx from 0 to 1.**
|
| 125 |
+
|
| 126 |
+
**A:**
|
| 127 |
+
import numpy as np
|
| 128 |
+
|
| 129 |
+
# Define a value of x, which represents the value of x, y, and y, and y, which represents the value of x, y, and y, and y
|
| 130 |
+
|
| 131 |
+
print(num_x)
|
| 132 |
+
|
| 133 |
+
---
|
| 134 |
+
### Level: Hard
|
| 135 |
+
**Q: Find the eigenvalues of the matrix [[2, 1], [1, 2]].**
|
| 136 |
+
|
| 137 |
+
**A:**
|
| 138 |
+
import numpy as np
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# define the eigenvalues of the eigenvalues, and to get the eigenvalues of the eigenvalues.
|
| 142 |
+
# Define the eigenvalues
|
| 143 |
+
|
| 144 |
+
2.0 = (0.0,0)
|
| 145 |
+
|
| 146 |
+
# compute the eigenvalues of the eigenvalues of the eigenvalues of the eigenvalues.
|
| 147 |
+
|
| 148 |
+
# calculate the eigenvalues
|
| 149 |
+
# The eigenvalues of the eigenvalues are given by the eigenvalues and eigenvectors of the eigenvalues.fill_1 and 0.1
|
| 150 |
+
# calculate the eigenvalues of the eigenvalues of the eigenvalues
|
| 151 |
+
# calculate the eigenvalues
|
| 152 |
+
|
| 153 |
+
# the eigenvalues of the eigenvalues
|
| 154 |
+
print(total_n) #print(# print the eigenvalues
|
| 155 |
+
print(row_1)
|
| 156 |
+
print(total_n)
|
| 157 |
+
print(answer_n)
|
| 158 |
+
|
| 159 |
+
---
|
eval_results/gsm8k/outputs__grpo-full__final/results_2026-05-26T09-33-14.995899.json
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"gsm8k": {
|
| 4 |
+
"name": "gsm8k",
|
| 5 |
+
"alias": "gsm8k",
|
| 6 |
+
"sample_len": 1319,
|
| 7 |
+
"exact_match,strict-match": 0.009855951478392721,
|
| 8 |
+
"exact_match_stderr,strict-match": 0.0027210765770416616,
|
| 9 |
+
"exact_match,flexible-extract": 0.021986353297952996,
|
| 10 |
+
"exact_match_stderr,flexible-extract": 0.0040391627581100546
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"group_subtasks": {},
|
| 14 |
+
"configs": {
|
| 15 |
+
"gsm8k": {
|
| 16 |
+
"task": "gsm8k",
|
| 17 |
+
"dataset_path": "openai/gsm8k",
|
| 18 |
+
"dataset_name": "main",
|
| 19 |
+
"training_split": "train",
|
| 20 |
+
"test_split": "test",
|
| 21 |
+
"fewshot_split": "train",
|
| 22 |
+
"doc_to_text": "Question: {{question}}\nAnswer:",
|
| 23 |
+
"doc_to_target": "{{answer}}",
|
| 24 |
+
"unsafe_code": false,
|
| 25 |
+
"description": "",
|
| 26 |
+
"target_delimiter": " ",
|
| 27 |
+
"fewshot_delimiter": "\n\n",
|
| 28 |
+
"fewshot_config": {
|
| 29 |
+
"sampler": "default",
|
| 30 |
+
"split": "train",
|
| 31 |
+
"process_docs": null,
|
| 32 |
+
"fewshot_indices": null,
|
| 33 |
+
"samples": null,
|
| 34 |
+
"doc_to_text": "Question: {{question}}\nAnswer:",
|
| 35 |
+
"doc_to_choice": null,
|
| 36 |
+
"doc_to_target": "{{answer}}",
|
| 37 |
+
"gen_prefix": null,
|
| 38 |
+
"fewshot_delimiter": "\n\n",
|
| 39 |
+
"target_delimiter": " "
|
| 40 |
+
},
|
| 41 |
+
"num_fewshot": 8,
|
| 42 |
+
"metric_list": [
|
| 43 |
+
{
|
| 44 |
+
"metric": "exact_match",
|
| 45 |
+
"aggregation": "mean",
|
| 46 |
+
"higher_is_better": true,
|
| 47 |
+
"ignore_case": true,
|
| 48 |
+
"ignore_punctuation": false,
|
| 49 |
+
"regexes_to_ignore": [
|
| 50 |
+
",",
|
| 51 |
+
"\\$",
|
| 52 |
+
"(?s).*#### ",
|
| 53 |
+
"\\.$"
|
| 54 |
+
]
|
| 55 |
+
}
|
| 56 |
+
],
|
| 57 |
+
"output_type": "generate_until",
|
| 58 |
+
"generation_kwargs": {
|
| 59 |
+
"until": [
|
| 60 |
+
"Question:",
|
| 61 |
+
"</s>",
|
| 62 |
+
"<|im_end|>"
|
| 63 |
+
],
|
| 64 |
+
"do_sample": false,
|
| 65 |
+
"temperature": 0.0
|
| 66 |
+
},
|
| 67 |
+
"repeats": 1,
|
| 68 |
+
"filter_list": [
|
| 69 |
+
{
|
| 70 |
+
"name": "strict-match",
|
| 71 |
+
"filter": [
|
| 72 |
+
{
|
| 73 |
+
"function": "regex",
|
| 74 |
+
"regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"function": "take_first"
|
| 78 |
+
}
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"name": "flexible-extract",
|
| 83 |
+
"filter": [
|
| 84 |
+
{
|
| 85 |
+
"function": "regex",
|
| 86 |
+
"group_select": -1,
|
| 87 |
+
"regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"function": "take_first"
|
| 91 |
+
}
|
| 92 |
+
]
|
| 93 |
+
}
|
| 94 |
+
],
|
| 95 |
+
"should_decontaminate": false,
|
| 96 |
+
"metadata": {
|
| 97 |
+
"version": 3.0,
|
| 98 |
+
"pretrained": "outputs/grpo-full/final",
|
| 99 |
+
"dtype": "bfloat16",
|
| 100 |
+
"config_source": "/home/himanshu/TinyMathReason-1B/venv/lib/python3.12/site-packages/lm_eval/tasks/gsm8k/gsm8k.yaml"
|
| 101 |
+
}
|
| 102 |
+
}
|
| 103 |
+
},
|
| 104 |
+
"versions": {
|
| 105 |
+
"gsm8k": 3.0
|
| 106 |
+
},
|
| 107 |
+
"n-shot": {
|
| 108 |
+
"gsm8k": 8
|
| 109 |
+
},
|
| 110 |
+
"higher_is_better": {
|
| 111 |
+
"gsm8k": {
|
| 112 |
+
"exact_match": true
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"n-samples": {
|
| 116 |
+
"gsm8k": {
|
| 117 |
+
"original": 1319,
|
| 118 |
+
"effective": 1319
|
| 119 |
+
}
|
| 120 |
+
},
|
| 121 |
+
"config": {
|
| 122 |
+
"model": "hf",
|
| 123 |
+
"model_args": {
|
| 124 |
+
"pretrained": "outputs/grpo-full/final",
|
| 125 |
+
"dtype": "bfloat16"
|
| 126 |
+
},
|
| 127 |
+
"model_num_parameters": 1123125248,
|
| 128 |
+
"model_dtype": "torch.bfloat16",
|
| 129 |
+
"model_revision": "main",
|
| 130 |
+
"model_sha": "",
|
| 131 |
+
"batch_size": "auto",
|
| 132 |
+
"batch_sizes": [],
|
| 133 |
+
"device": "cuda",
|
| 134 |
+
"use_cache": null,
|
| 135 |
+
"limit": null,
|
| 136 |
+
"bootstrap_iters": 100000,
|
| 137 |
+
"gen_kwargs": {},
|
| 138 |
+
"random_seed": 0,
|
| 139 |
+
"numpy_seed": 1234,
|
| 140 |
+
"torch_seed": 1234,
|
| 141 |
+
"fewshot_seed": 1234
|
| 142 |
+
},
|
| 143 |
+
"git_hash": "03fcf23",
|
| 144 |
+
"date": 1779784898.324978,
|
| 145 |
+
"pretty_env_info": "PyTorch version: 2.12.0+cu130\nIs debug build: False\nCUDA used to build PyTorch: 13.0\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 24.04.4 LTS (x86_64)\nGCC version: (Ubuntu 13.3.0-6ubuntu2~24.04.1) 13.3.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.39\n\nPython version: 3.12.3 (main, Mar 23 2026, 19:04:32) [GCC 13.3.0] (64-bit runtime)\nPython platform: Linux-6.17.0-1016-gcp-x86_64-with-glibc2.39\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA L4\nNvidia driver version: 580.126.20\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_adv.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_cnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_precompiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_graph.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_heuristic.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_ops.so.9.13.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\nCaching allocator config: N/A\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 16\nOn-line CPU(s) list: 0-15\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.20GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 8\nSocket(s): 1\nStepping: 7\nBogoMIPS: 4400.43\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 256 KiB (8 instances)\nL1i cache: 256 KiB (8 instances)\nL2 cache: 8 MiB (8 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-15\nVulnerability Gather data sampling: Not affected\nVulnerability Ghostwrite: Not affected\nVulnerability Indirect target selection: Mitigation; Aligned branch/return thunks\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Old microcode: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; Enhanced IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; PBRSB-eIBRS SW sequence; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsa: Not affected\nVulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Vmscape: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==2.4.6\n[pip3] nvidia-cublas==13.1.1.3\n[pip3] nvidia-cuda-cupti==13.0.85\n[pip3] nvidia-cuda-nvrtc==13.0.88\n[pip3] nvidia-cuda-runtime==13.0.96\n[pip3] nvidia-cudnn-cu13==9.20.0.48\n[pip3] nvidia-cufft==12.0.0.61\n[pip3] nvidia-curand==10.4.0.35\n[pip3] nvidia-cusolver==12.0.4.66\n[pip3] nvidia-cusparse==12.6.3.3\n[pip3] nvidia-cusparselt-cu13==0.8.1\n[pip3] nvidia-nccl-cu13==2.29.7\n[pip3] nvidia-nvjitlink==13.0.88\n[pip3] nvidia-nvtx==13.0.85\n[pip3] torch==2.12.0\n[pip3] triton==3.7.0\n[conda] Could not collect",
|
| 146 |
+
"transformers_version": "5.9.0",
|
| 147 |
+
"lm_eval_version": "0.4.12",
|
| 148 |
+
"upper_git_hash": null,
|
| 149 |
+
"tokenizer_pad_token": [
|
| 150 |
+
"<|pad|>",
|
| 151 |
+
"3"
|
| 152 |
+
],
|
| 153 |
+
"tokenizer_eos_token": [
|
| 154 |
+
"<|eos|>",
|
| 155 |
+
"1"
|
| 156 |
+
],
|
| 157 |
+
"tokenizer_bos_token": [
|
| 158 |
+
"<|bos|>",
|
| 159 |
+
"0"
|
| 160 |
+
],
|
| 161 |
+
"eot_token_id": 1,
|
| 162 |
+
"max_length": 4096,
|
| 163 |
+
"task_hashes": {
|
| 164 |
+
"gsm8k": "5edaa24ff4f3d939c3e1c5fd65a53cead84d4a52171818c453ec47099bd2a422"
|
| 165 |
+
},
|
| 166 |
+
"model_source": "hf",
|
| 167 |
+
"model_name": "outputs/grpo-full/final",
|
| 168 |
+
"model_name_sanitized": "outputs__grpo-full__final",
|
| 169 |
+
"system_instruction": null,
|
| 170 |
+
"system_instruction_sha": null,
|
| 171 |
+
"fewshot_as_multiturn": null,
|
| 172 |
+
"chat_template": null,
|
| 173 |
+
"chat_template_sha": null,
|
| 174 |
+
"total_evaluation_time_seconds": "3105.920454603998"
|
| 175 |
+
}
|
eval_results/gsm8k/outputs__grpo-full__final/samples_gsm8k_2026-05-26T09-33-14.995899.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:19d5589c2ca07cc8c977a4a341647694d37a6f6e76925e364b283957fb5313fa
|
| 3 |
+
size 17686475
|
eval_results/hellaswag/outputs__grpo-full__final/results_2026-05-26T12-29-19.954733.json
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"hellaswag": {
|
| 4 |
+
"name": "hellaswag",
|
| 5 |
+
"alias": "hellaswag",
|
| 6 |
+
"sample_len": 10042,
|
| 7 |
+
"acc,none": 0.2590121489743079,
|
| 8 |
+
"acc_stderr,none": 0.004371969542814812,
|
| 9 |
+
"acc_norm,none": 0.2629954192391954,
|
| 10 |
+
"acc_norm_stderr,none": 0.004393601887506574
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"group_subtasks": {},
|
| 14 |
+
"configs": {
|
| 15 |
+
"hellaswag": {
|
| 16 |
+
"task": "hellaswag",
|
| 17 |
+
"dataset_path": "Rowan/hellaswag",
|
| 18 |
+
"training_split": "train",
|
| 19 |
+
"validation_split": "validation",
|
| 20 |
+
"process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
|
| 21 |
+
"doc_to_text": "{{query}}",
|
| 22 |
+
"doc_to_target": "{{label}}",
|
| 23 |
+
"unsafe_code": false,
|
| 24 |
+
"doc_to_choice": "choices",
|
| 25 |
+
"description": "",
|
| 26 |
+
"target_delimiter": " ",
|
| 27 |
+
"fewshot_delimiter": "\n\n",
|
| 28 |
+
"fewshot_config": {
|
| 29 |
+
"sampler": "default",
|
| 30 |
+
"split": null,
|
| 31 |
+
"process_docs": "<function process_docs at 0x74288899f7e0>",
|
| 32 |
+
"fewshot_indices": null,
|
| 33 |
+
"samples": null,
|
| 34 |
+
"doc_to_text": "{{query}}",
|
| 35 |
+
"doc_to_choice": "choices",
|
| 36 |
+
"doc_to_target": "{{label}}",
|
| 37 |
+
"gen_prefix": null,
|
| 38 |
+
"fewshot_delimiter": "\n\n",
|
| 39 |
+
"target_delimiter": " "
|
| 40 |
+
},
|
| 41 |
+
"num_fewshot": 10,
|
| 42 |
+
"metric_list": [
|
| 43 |
+
{
|
| 44 |
+
"metric": "acc",
|
| 45 |
+
"aggregation": "mean",
|
| 46 |
+
"higher_is_better": true
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"metric": "acc_norm",
|
| 50 |
+
"aggregation": "mean",
|
| 51 |
+
"higher_is_better": true
|
| 52 |
+
}
|
| 53 |
+
],
|
| 54 |
+
"output_type": "multiple_choice",
|
| 55 |
+
"repeats": 1,
|
| 56 |
+
"should_decontaminate": false,
|
| 57 |
+
"metadata": {
|
| 58 |
+
"version": 1.0,
|
| 59 |
+
"pretrained": "outputs/grpo-full/final",
|
| 60 |
+
"dtype": "bfloat16",
|
| 61 |
+
"config_source": "/home/himanshu/TinyMathReason-1B/venv/lib/python3.12/site-packages/lm_eval/tasks/hellaswag/hellaswag.yaml"
|
| 62 |
+
}
|
| 63 |
+
}
|
| 64 |
+
},
|
| 65 |
+
"versions": {
|
| 66 |
+
"hellaswag": 1.0
|
| 67 |
+
},
|
| 68 |
+
"n-shot": {
|
| 69 |
+
"hellaswag": 10
|
| 70 |
+
},
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"hellaswag": {
|
| 73 |
+
"acc": true,
|
| 74 |
+
"acc_norm": true
|
| 75 |
+
}
|
| 76 |
+
},
|
| 77 |
+
"n-samples": {
|
| 78 |
+
"hellaswag": {
|
| 79 |
+
"original": 10042,
|
| 80 |
+
"effective": 10042
|
| 81 |
+
}
|
| 82 |
+
},
|
| 83 |
+
"config": {
|
| 84 |
+
"model": "hf",
|
| 85 |
+
"model_args": {
|
| 86 |
+
"pretrained": "outputs/grpo-full/final",
|
| 87 |
+
"dtype": "bfloat16"
|
| 88 |
+
},
|
| 89 |
+
"model_num_parameters": 1123125248,
|
| 90 |
+
"model_dtype": "torch.bfloat16",
|
| 91 |
+
"model_revision": "main",
|
| 92 |
+
"model_sha": "",
|
| 93 |
+
"batch_size": "auto",
|
| 94 |
+
"batch_sizes": [
|
| 95 |
+
64
|
| 96 |
+
],
|
| 97 |
+
"device": "cuda",
|
| 98 |
+
"use_cache": null,
|
| 99 |
+
"limit": null,
|
| 100 |
+
"bootstrap_iters": 100000,
|
| 101 |
+
"gen_kwargs": {},
|
| 102 |
+
"random_seed": 0,
|
| 103 |
+
"numpy_seed": 1234,
|
| 104 |
+
"torch_seed": 1234,
|
| 105 |
+
"fewshot_seed": 1234
|
| 106 |
+
},
|
| 107 |
+
"git_hash": "03fcf23",
|
| 108 |
+
"date": 1779795876.0037582,
|
| 109 |
+
"pretty_env_info": "PyTorch version: 2.12.0+cu130\nIs debug build: False\nCUDA used to build PyTorch: 13.0\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 24.04.4 LTS (x86_64)\nGCC version: (Ubuntu 13.3.0-6ubuntu2~24.04.1) 13.3.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.39\n\nPython version: 3.12.3 (main, Mar 23 2026, 19:04:32) [GCC 13.3.0] (64-bit runtime)\nPython platform: Linux-6.17.0-1016-gcp-x86_64-with-glibc2.39\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA L4\nNvidia driver version: 580.126.20\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_adv.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_cnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_precompiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_graph.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_heuristic.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_ops.so.9.13.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\nCaching allocator config: N/A\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 16\nOn-line CPU(s) list: 0-15\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.20GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 8\nSocket(s): 1\nStepping: 7\nBogoMIPS: 4400.43\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 256 KiB (8 instances)\nL1i cache: 256 KiB (8 instances)\nL2 cache: 8 MiB (8 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-15\nVulnerability Gather data sampling: Not affected\nVulnerability Ghostwrite: Not affected\nVulnerability Indirect target selection: Mitigation; Aligned branch/return thunks\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Old microcode: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; Enhanced IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; PBRSB-eIBRS SW sequence; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsa: Not affected\nVulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Vmscape: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==2.4.6\n[pip3] nvidia-cublas==13.1.1.3\n[pip3] nvidia-cuda-cupti==13.0.85\n[pip3] nvidia-cuda-nvrtc==13.0.88\n[pip3] nvidia-cuda-runtime==13.0.96\n[pip3] nvidia-cudnn-cu13==9.20.0.48\n[pip3] nvidia-cufft==12.0.0.61\n[pip3] nvidia-curand==10.4.0.35\n[pip3] nvidia-cusolver==12.0.4.66\n[pip3] nvidia-cusparse==12.6.3.3\n[pip3] nvidia-cusparselt-cu13==0.8.1\n[pip3] nvidia-nccl-cu13==2.29.7\n[pip3] nvidia-nvjitlink==13.0.88\n[pip3] nvidia-nvtx==13.0.85\n[pip3] torch==2.12.0\n[pip3] triton==3.7.0\n[conda] Could not collect",
|
| 110 |
+
"transformers_version": "5.9.0",
|
| 111 |
+
"lm_eval_version": "0.4.12",
|
| 112 |
+
"upper_git_hash": null,
|
| 113 |
+
"tokenizer_pad_token": [
|
| 114 |
+
"<|pad|>",
|
| 115 |
+
"3"
|
| 116 |
+
],
|
| 117 |
+
"tokenizer_eos_token": [
|
| 118 |
+
"<|eos|>",
|
| 119 |
+
"1"
|
| 120 |
+
],
|
| 121 |
+
"tokenizer_bos_token": [
|
| 122 |
+
"<|bos|>",
|
| 123 |
+
"0"
|
| 124 |
+
],
|
| 125 |
+
"eot_token_id": 1,
|
| 126 |
+
"max_length": 4096,
|
| 127 |
+
"task_hashes": {
|
| 128 |
+
"hellaswag": "d4bcb44ec68db2b8a65f050c3c64c48454179b48fd8aee3e73b55e2ec51e6d82"
|
| 129 |
+
},
|
| 130 |
+
"model_source": "hf",
|
| 131 |
+
"model_name": "outputs/grpo-full/final",
|
| 132 |
+
"model_name_sanitized": "outputs__grpo-full__final",
|
| 133 |
+
"system_instruction": null,
|
| 134 |
+
"system_instruction_sha": null,
|
| 135 |
+
"fewshot_as_multiturn": null,
|
| 136 |
+
"chat_template": null,
|
| 137 |
+
"chat_template_sha": null,
|
| 138 |
+
"total_evaluation_time_seconds": "2693.2404373019963"
|
| 139 |
+
}
|
eval_results/hellaswag/outputs__grpo-full__final/samples_hellaswag_2026-05-26T12-29-19.954733.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf9e82964443ca811ee5103dca5c32ec42a4ef3fe4391929b5a6148d22e18613
|
| 3 |
+
size 186310702
|
eval_results/minerva_math_algebra/outputs__grpo-full__final/results_2026-05-26T10-07-08.234107.json
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"minerva_math_algebra": {
|
| 4 |
+
"name": "minerva_math_algebra",
|
| 5 |
+
"alias": "minerva_math_algebra",
|
| 6 |
+
"sample_len": 1187,
|
| 7 |
+
"exact_match,none": 0.0,
|
| 8 |
+
"exact_match_stderr,none": 0.0,
|
| 9 |
+
"math_verify,none": 0.020219039595619208,
|
| 10 |
+
"math_verify_stderr,none": 0.00408697908051843
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"group_subtasks": {},
|
| 14 |
+
"configs": {
|
| 15 |
+
"minerva_math_algebra": {
|
| 16 |
+
"task": "minerva_math_algebra",
|
| 17 |
+
"dataset_path": "EleutherAI/hendrycks_math",
|
| 18 |
+
"dataset_name": "algebra",
|
| 19 |
+
"training_split": "train",
|
| 20 |
+
"test_split": "test",
|
| 21 |
+
"process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc: dict) -> dict:\n out_doc = {\n \"problem\": doc[\"problem\"],\n \"solution\": doc[\"solution\"],\n \"answer\": normalize_final_answer(\n remove_boxed(last_boxed_only_string(doc[\"solution\"]))\n ),\n }\n if getattr(doc, \"few_shot\", None) is not None:\n out_doc[\"few_shot\"] = True\n return out_doc\n\n return dataset.map(_process_doc)\n",
|
| 22 |
+
"doc_to_text": "def doc_to_text(doc: dict) -> str:\n return \"Problem:\" + \"\\n\" + doc[\"problem\"] + \"\\n\\n\" + \"Solution:\"\n",
|
| 23 |
+
"doc_to_target": "{{answer if few_shot is undefined else solution}}",
|
| 24 |
+
"unsafe_code": false,
|
| 25 |
+
"process_results": "def process_results(doc: dict, results: list[str]) -> dict[str, int]:\n candidates = results[0]\n\n unnormalized_answer = get_unnormalized_answer(candidates)\n answer = normalize_final_answer(unnormalized_answer)\n\n if is_equiv(answer, doc[\"answer\"]):\n retval = 1\n else:\n retval = 0\n\n # math_verify\n _mvres = verify(\n gold=parse(doc[\"solution\"]),\n target=parse(candidates),\n )\n mathval = 1 if _mvres else 0\n\n res = {\n \"exact_match\": retval,\n \"math_verify\": mathval,\n }\n return res\n",
|
| 26 |
+
"description": "",
|
| 27 |
+
"target_delimiter": " ",
|
| 28 |
+
"fewshot_delimiter": "\n\n",
|
| 29 |
+
"fewshot_config": {
|
| 30 |
+
"sampler": "first_n",
|
| 31 |
+
"split": null,
|
| 32 |
+
"process_docs": "<function process_docs at 0x7dbd24753740>",
|
| 33 |
+
"fewshot_indices": null,
|
| 34 |
+
"samples": "<function list_fewshot_samples at 0x7dbd1874ec00>",
|
| 35 |
+
"doc_to_text": "<function doc_to_text at 0x7dbd246d1580>",
|
| 36 |
+
"doc_to_choice": null,
|
| 37 |
+
"doc_to_target": "{{answer if few_shot is undefined else solution}}",
|
| 38 |
+
"gen_prefix": null,
|
| 39 |
+
"fewshot_delimiter": "\n\n",
|
| 40 |
+
"target_delimiter": " "
|
| 41 |
+
},
|
| 42 |
+
"num_fewshot": 4,
|
| 43 |
+
"metric_list": [
|
| 44 |
+
{
|
| 45 |
+
"metric": "exact_match",
|
| 46 |
+
"aggregation": "mean",
|
| 47 |
+
"higher_is_better": true
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"metric": "math_verify",
|
| 51 |
+
"aggregation": "mean",
|
| 52 |
+
"higher_is_better": true
|
| 53 |
+
}
|
| 54 |
+
],
|
| 55 |
+
"output_type": "generate_until",
|
| 56 |
+
"generation_kwargs": {
|
| 57 |
+
"until": [
|
| 58 |
+
"Problem:"
|
| 59 |
+
],
|
| 60 |
+
"do_sample": false,
|
| 61 |
+
"temperature": 0.0
|
| 62 |
+
},
|
| 63 |
+
"repeats": 1,
|
| 64 |
+
"should_decontaminate": false,
|
| 65 |
+
"metadata": {
|
| 66 |
+
"version": 3.0,
|
| 67 |
+
"pretrained": "outputs/grpo-full/final",
|
| 68 |
+
"dtype": "bfloat16",
|
| 69 |
+
"config_source": "/home/himanshu/TinyMathReason-1B/venv/lib/python3.12/site-packages/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml"
|
| 70 |
+
}
|
| 71 |
+
}
|
| 72 |
+
},
|
| 73 |
+
"versions": {
|
| 74 |
+
"minerva_math_algebra": 3.0
|
| 75 |
+
},
|
| 76 |
+
"n-shot": {
|
| 77 |
+
"minerva_math_algebra": 4
|
| 78 |
+
},
|
| 79 |
+
"higher_is_better": {
|
| 80 |
+
"minerva_math_algebra": {
|
| 81 |
+
"exact_match": true,
|
| 82 |
+
"math_verify": true
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"n-samples": {
|
| 86 |
+
"minerva_math_algebra": {
|
| 87 |
+
"original": 1187,
|
| 88 |
+
"effective": 1187
|
| 89 |
+
}
|
| 90 |
+
},
|
| 91 |
+
"config": {
|
| 92 |
+
"model": "hf",
|
| 93 |
+
"model_args": {
|
| 94 |
+
"pretrained": "outputs/grpo-full/final",
|
| 95 |
+
"dtype": "bfloat16"
|
| 96 |
+
},
|
| 97 |
+
"model_num_parameters": 1123125248,
|
| 98 |
+
"model_dtype": "torch.bfloat16",
|
| 99 |
+
"model_revision": "main",
|
| 100 |
+
"model_sha": "",
|
| 101 |
+
"batch_size": "auto",
|
| 102 |
+
"batch_sizes": [],
|
| 103 |
+
"device": "cuda",
|
| 104 |
+
"use_cache": null,
|
| 105 |
+
"limit": null,
|
| 106 |
+
"bootstrap_iters": 100000,
|
| 107 |
+
"gen_kwargs": {},
|
| 108 |
+
"random_seed": 0,
|
| 109 |
+
"numpy_seed": 1234,
|
| 110 |
+
"torch_seed": 1234,
|
| 111 |
+
"fewshot_seed": 1234
|
| 112 |
+
},
|
| 113 |
+
"git_hash": "03fcf23",
|
| 114 |
+
"date": 1779789023.103882,
|
| 115 |
+
"pretty_env_info": "PyTorch version: 2.12.0+cu130\nIs debug build: False\nCUDA used to build PyTorch: 13.0\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 24.04.4 LTS (x86_64)\nGCC version: (Ubuntu 13.3.0-6ubuntu2~24.04.1) 13.3.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.39\n\nPython version: 3.12.3 (main, Mar 23 2026, 19:04:32) [GCC 13.3.0] (64-bit runtime)\nPython platform: Linux-6.17.0-1016-gcp-x86_64-with-glibc2.39\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA L4\nNvidia driver version: 580.126.20\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_adv.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_cnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_precompiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_graph.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_heuristic.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_ops.so.9.13.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\nCaching allocator config: N/A\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 16\nOn-line CPU(s) list: 0-15\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.20GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 8\nSocket(s): 1\nStepping: 7\nBogoMIPS: 4400.43\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 256 KiB (8 instances)\nL1i cache: 256 KiB (8 instances)\nL2 cache: 8 MiB (8 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-15\nVulnerability Gather data sampling: Not affected\nVulnerability Ghostwrite: Not affected\nVulnerability Indirect target selection: Mitigation; Aligned branch/return thunks\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Old microcode: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; Enhanced IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; PBRSB-eIBRS SW sequence; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsa: Not affected\nVulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Vmscape: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==2.4.6\n[pip3] nvidia-cublas==13.1.1.3\n[pip3] nvidia-cuda-cupti==13.0.85\n[pip3] nvidia-cuda-nvrtc==13.0.88\n[pip3] nvidia-cuda-runtime==13.0.96\n[pip3] nvidia-cudnn-cu13==9.20.0.48\n[pip3] nvidia-cufft==12.0.0.61\n[pip3] nvidia-curand==10.4.0.35\n[pip3] nvidia-cusolver==12.0.4.66\n[pip3] nvidia-cusparse==12.6.3.3\n[pip3] nvidia-cusparselt-cu13==0.8.1\n[pip3] nvidia-nccl-cu13==2.29.7\n[pip3] nvidia-nvjitlink==13.0.88\n[pip3] nvidia-nvtx==13.0.85\n[pip3] torch==2.12.0\n[pip3] triton==3.7.0\n[conda] Could not collect",
|
| 116 |
+
"transformers_version": "5.9.0",
|
| 117 |
+
"lm_eval_version": "0.4.12",
|
| 118 |
+
"upper_git_hash": null,
|
| 119 |
+
"tokenizer_pad_token": [
|
| 120 |
+
"<|pad|>",
|
| 121 |
+
"3"
|
| 122 |
+
],
|
| 123 |
+
"tokenizer_eos_token": [
|
| 124 |
+
"<|eos|>",
|
| 125 |
+
"1"
|
| 126 |
+
],
|
| 127 |
+
"tokenizer_bos_token": [
|
| 128 |
+
"<|bos|>",
|
| 129 |
+
"0"
|
| 130 |
+
],
|
| 131 |
+
"eot_token_id": 1,
|
| 132 |
+
"max_length": 4096,
|
| 133 |
+
"task_hashes": {
|
| 134 |
+
"minerva_math_algebra": "5c955bbc89ad645142d61b1594b7c36b552b722edf416ae40fcc71a4c50bd24b"
|
| 135 |
+
},
|
| 136 |
+
"model_source": "hf",
|
| 137 |
+
"model_name": "outputs/grpo-full/final",
|
| 138 |
+
"model_name_sanitized": "outputs__grpo-full__final",
|
| 139 |
+
"system_instruction": null,
|
| 140 |
+
"system_instruction_sha": null,
|
| 141 |
+
"fewshot_as_multiturn": null,
|
| 142 |
+
"chat_template": null,
|
| 143 |
+
"chat_template_sha": null,
|
| 144 |
+
"total_evaluation_time_seconds": "1014.535830836001"
|
| 145 |
+
}
|
eval_results/minerva_math_algebra/outputs__grpo-full__final/samples_minerva_math_algebra_2026-05-26T10-07-08.234107.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/results_2026-05-26T12-51-35.680076.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_abstract_algebra_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_anatomy_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_astronomy_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_business_ethics_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_clinical_knowledge_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_biology_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_chemistry_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_computer_science_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_mathematics_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_medicine_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_physics_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_computer_security_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_conceptual_physics_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_econometrics_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_electrical_engineering_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_elementary_mathematics_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_formal_logic_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_global_facts_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_biology_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_chemistry_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_computer_science_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_european_history_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_geography_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_government_and_politics_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_macroeconomics_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_mathematics_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_microeconomics_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_physics_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_psychology_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_statistics_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_us_history_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_world_history_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_human_aging_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_human_sexuality_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_international_law_2026-05-26T12-51-35.680076.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|