himanshunakrani9 commited on
Commit
070e175
·
verified ·
1 Parent(s): 9e34e99

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. eval_results/arc_challenge/outputs__grpo-full__final/results_2026-05-26T11-44-23.900648.json +141 -0
  3. eval_results/arc_challenge/outputs__grpo-full__final/samples_arc_challenge_2026-05-26T11-44-23.900648.jsonl +3 -0
  4. eval_results/arc_easy/outputs__grpo-full__final/results_2026-05-26T11-37-35.592842.json +141 -0
  5. eval_results/arc_easy/outputs__grpo-full__final/samples_arc_easy_2026-05-26T11-37-35.592842.jsonl +0 -0
  6. eval_results/benchmark_summary.md +8 -0
  7. eval_results/combined_results.json +6 -0
  8. eval_results/custom_eval.md +159 -0
  9. eval_results/gsm8k/outputs__grpo-full__final/results_2026-05-26T09-33-14.995899.json +175 -0
  10. eval_results/gsm8k/outputs__grpo-full__final/samples_gsm8k_2026-05-26T09-33-14.995899.jsonl +3 -0
  11. eval_results/hellaswag/outputs__grpo-full__final/results_2026-05-26T12-29-19.954733.json +139 -0
  12. eval_results/hellaswag/outputs__grpo-full__final/samples_hellaswag_2026-05-26T12-29-19.954733.jsonl +3 -0
  13. eval_results/minerva_math_algebra/outputs__grpo-full__final/results_2026-05-26T10-07-08.234107.json +145 -0
  14. eval_results/minerva_math_algebra/outputs__grpo-full__final/samples_minerva_math_algebra_2026-05-26T10-07-08.234107.jsonl +0 -0
  15. eval_results/mmlu/outputs__grpo-full__final/results_2026-05-26T12-51-35.680076.json +0 -0
  16. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_abstract_algebra_2026-05-26T12-51-35.680076.jsonl +0 -0
  17. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_anatomy_2026-05-26T12-51-35.680076.jsonl +0 -0
  18. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_astronomy_2026-05-26T12-51-35.680076.jsonl +0 -0
  19. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_business_ethics_2026-05-26T12-51-35.680076.jsonl +0 -0
  20. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_clinical_knowledge_2026-05-26T12-51-35.680076.jsonl +0 -0
  21. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_biology_2026-05-26T12-51-35.680076.jsonl +0 -0
  22. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_chemistry_2026-05-26T12-51-35.680076.jsonl +0 -0
  23. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_computer_science_2026-05-26T12-51-35.680076.jsonl +0 -0
  24. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_mathematics_2026-05-26T12-51-35.680076.jsonl +0 -0
  25. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_medicine_2026-05-26T12-51-35.680076.jsonl +0 -0
  26. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_physics_2026-05-26T12-51-35.680076.jsonl +0 -0
  27. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_computer_security_2026-05-26T12-51-35.680076.jsonl +0 -0
  28. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_conceptual_physics_2026-05-26T12-51-35.680076.jsonl +0 -0
  29. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_econometrics_2026-05-26T12-51-35.680076.jsonl +0 -0
  30. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_electrical_engineering_2026-05-26T12-51-35.680076.jsonl +0 -0
  31. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_elementary_mathematics_2026-05-26T12-51-35.680076.jsonl +0 -0
  32. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_formal_logic_2026-05-26T12-51-35.680076.jsonl +0 -0
  33. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_global_facts_2026-05-26T12-51-35.680076.jsonl +0 -0
  34. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_biology_2026-05-26T12-51-35.680076.jsonl +0 -0
  35. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_chemistry_2026-05-26T12-51-35.680076.jsonl +0 -0
  36. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_computer_science_2026-05-26T12-51-35.680076.jsonl +0 -0
  37. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_european_history_2026-05-26T12-51-35.680076.jsonl +0 -0
  38. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_geography_2026-05-26T12-51-35.680076.jsonl +0 -0
  39. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_government_and_politics_2026-05-26T12-51-35.680076.jsonl +0 -0
  40. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_macroeconomics_2026-05-26T12-51-35.680076.jsonl +0 -0
  41. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_mathematics_2026-05-26T12-51-35.680076.jsonl +0 -0
  42. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_microeconomics_2026-05-26T12-51-35.680076.jsonl +0 -0
  43. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_physics_2026-05-26T12-51-35.680076.jsonl +0 -0
  44. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_psychology_2026-05-26T12-51-35.680076.jsonl +0 -0
  45. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_statistics_2026-05-26T12-51-35.680076.jsonl +0 -0
  46. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_us_history_2026-05-26T12-51-35.680076.jsonl +0 -0
  47. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_world_history_2026-05-26T12-51-35.680076.jsonl +0 -0
  48. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_human_aging_2026-05-26T12-51-35.680076.jsonl +0 -0
  49. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_human_sexuality_2026-05-26T12-51-35.680076.jsonl +0 -0
  50. eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_international_law_2026-05-26T12-51-35.680076.jsonl +0 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ eval_results/arc_challenge/outputs__grpo-full__final/samples_arc_challenge_2026-05-26T11-44-23.900648.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ eval_results/gsm8k/outputs__grpo-full__final/samples_gsm8k_2026-05-26T09-33-14.995899.jsonl filter=lfs diff=lfs merge=lfs -text
38
+ eval_results/hellaswag/outputs__grpo-full__final/samples_hellaswag_2026-05-26T12-29-19.954733.jsonl filter=lfs diff=lfs merge=lfs -text
39
+ eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_professional_law_2026-05-26T12-51-35.680076.jsonl filter=lfs diff=lfs merge=lfs -text
eval_results/arc_challenge/outputs__grpo-full__final/results_2026-05-26T11-44-23.900648.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_challenge": {
4
+ "name": "arc_challenge",
5
+ "alias": "arc_challenge",
6
+ "sample_len": 1172,
7
+ "acc,none": 0.19197952218430034,
8
+ "acc_stderr,none": 0.01150959890659822,
9
+ "acc_norm,none": 0.22781569965870307,
10
+ "acc_norm_stderr,none": 0.012256708602326964
11
+ }
12
+ },
13
+ "group_subtasks": {},
14
+ "configs": {
15
+ "arc_challenge": {
16
+ "task": "arc_challenge",
17
+ "dataset_path": "allenai/ai2_arc",
18
+ "dataset_name": "ARC-Challenge",
19
+ "training_split": "train",
20
+ "validation_split": "validation",
21
+ "test_split": "test",
22
+ "doc_to_text": "Question: {{question}}\nAnswer:",
23
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
24
+ "unsafe_code": false,
25
+ "doc_to_choice": "{{choices.text}}",
26
+ "description": "",
27
+ "target_delimiter": " ",
28
+ "fewshot_delimiter": "\n\n",
29
+ "fewshot_config": {
30
+ "sampler": "default",
31
+ "split": null,
32
+ "process_docs": null,
33
+ "fewshot_indices": null,
34
+ "samples": null,
35
+ "doc_to_text": "Question: {{question}}\nAnswer:",
36
+ "doc_to_choice": "{{choices.text}}",
37
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
38
+ "gen_prefix": null,
39
+ "fewshot_delimiter": "\n\n",
40
+ "target_delimiter": " "
41
+ },
42
+ "num_fewshot": 25,
43
+ "metric_list": [
44
+ {
45
+ "metric": "acc",
46
+ "aggregation": "mean",
47
+ "higher_is_better": true
48
+ },
49
+ {
50
+ "metric": "acc_norm",
51
+ "aggregation": "mean",
52
+ "higher_is_better": true
53
+ }
54
+ ],
55
+ "output_type": "multiple_choice",
56
+ "repeats": 1,
57
+ "should_decontaminate": true,
58
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
59
+ "metadata": {
60
+ "version": 1.0,
61
+ "pretrained": "outputs/grpo-full/final",
62
+ "dtype": "bfloat16",
63
+ "config_source": "/home/himanshu/TinyMathReason-1B/venv/lib/python3.12/site-packages/lm_eval/tasks/arc/arc_challenge.yaml"
64
+ }
65
+ }
66
+ },
67
+ "versions": {
68
+ "arc_challenge": 1.0
69
+ },
70
+ "n-shot": {
71
+ "arc_challenge": 25
72
+ },
73
+ "higher_is_better": {
74
+ "arc_challenge": {
75
+ "acc": true,
76
+ "acc_norm": true
77
+ }
78
+ },
79
+ "n-samples": {
80
+ "arc_challenge": {
81
+ "original": 1172,
82
+ "effective": 1172
83
+ }
84
+ },
85
+ "config": {
86
+ "model": "hf",
87
+ "model_args": {
88
+ "pretrained": "outputs/grpo-full/final",
89
+ "dtype": "bfloat16"
90
+ },
91
+ "model_num_parameters": 1123125248,
92
+ "model_dtype": "torch.bfloat16",
93
+ "model_revision": "main",
94
+ "model_sha": "",
95
+ "batch_size": "auto",
96
+ "batch_sizes": [
97
+ 64
98
+ ],
99
+ "device": "cuda",
100
+ "use_cache": null,
101
+ "limit": null,
102
+ "bootstrap_iters": 100000,
103
+ "gen_kwargs": {},
104
+ "random_seed": 0,
105
+ "numpy_seed": 1234,
106
+ "torch_seed": 1234,
107
+ "fewshot_seed": 1234
108
+ },
109
+ "git_hash": "03fcf23",
110
+ "date": 1779795467.4101148,
111
+ "pretty_env_info": "PyTorch version: 2.12.0+cu130\nIs debug build: False\nCUDA used to build PyTorch: 13.0\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 24.04.4 LTS (x86_64)\nGCC version: (Ubuntu 13.3.0-6ubuntu2~24.04.1) 13.3.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.39\n\nPython version: 3.12.3 (main, Mar 23 2026, 19:04:32) [GCC 13.3.0] (64-bit runtime)\nPython platform: Linux-6.17.0-1016-gcp-x86_64-with-glibc2.39\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA L4\nNvidia driver version: 580.126.20\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_adv.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_cnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_precompiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_graph.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_heuristic.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_ops.so.9.13.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\nCaching allocator config: N/A\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 16\nOn-line CPU(s) list: 0-15\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.20GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 8\nSocket(s): 1\nStepping: 7\nBogoMIPS: 4400.43\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 256 KiB (8 instances)\nL1i cache: 256 KiB (8 instances)\nL2 cache: 8 MiB (8 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-15\nVulnerability Gather data sampling: Not affected\nVulnerability Ghostwrite: Not affected\nVulnerability Indirect target selection: Mitigation; Aligned branch/return thunks\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Old microcode: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; Enhanced IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; PBRSB-eIBRS SW sequence; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsa: Not affected\nVulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Vmscape: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==2.4.6\n[pip3] nvidia-cublas==13.1.1.3\n[pip3] nvidia-cuda-cupti==13.0.85\n[pip3] nvidia-cuda-nvrtc==13.0.88\n[pip3] nvidia-cuda-runtime==13.0.96\n[pip3] nvidia-cudnn-cu13==9.20.0.48\n[pip3] nvidia-cufft==12.0.0.61\n[pip3] nvidia-curand==10.4.0.35\n[pip3] nvidia-cusolver==12.0.4.66\n[pip3] nvidia-cusparse==12.6.3.3\n[pip3] nvidia-cusparselt-cu13==0.8.1\n[pip3] nvidia-nccl-cu13==2.29.7\n[pip3] nvidia-nvjitlink==13.0.88\n[pip3] nvidia-nvtx==13.0.85\n[pip3] torch==2.12.0\n[pip3] triton==3.7.0\n[conda] Could not collect",
112
+ "transformers_version": "5.9.0",
113
+ "lm_eval_version": "0.4.12",
114
+ "upper_git_hash": null,
115
+ "tokenizer_pad_token": [
116
+ "<|pad|>",
117
+ "3"
118
+ ],
119
+ "tokenizer_eos_token": [
120
+ "<|eos|>",
121
+ "1"
122
+ ],
123
+ "tokenizer_bos_token": [
124
+ "<|bos|>",
125
+ "0"
126
+ ],
127
+ "eot_token_id": 1,
128
+ "max_length": 4096,
129
+ "task_hashes": {
130
+ "arc_challenge": "55e883475b5650b20d8d9dc1e9cdf59ef645a257fcd74bf43a9dbb5c632c529c"
131
+ },
132
+ "model_source": "hf",
133
+ "model_name": "outputs/grpo-full/final",
134
+ "model_name_sanitized": "outputs__grpo-full__final",
135
+ "system_instruction": null,
136
+ "system_instruction_sha": null,
137
+ "fewshot_as_multiturn": null,
138
+ "chat_template": null,
139
+ "chat_template_sha": null,
140
+ "total_evaluation_time_seconds": "405.71644051300245"
141
+ }
eval_results/arc_challenge/outputs__grpo-full__final/samples_arc_challenge_2026-05-26T11-44-23.900648.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd9bfe557b60e908f99194a2f751c31995efbca3ef3d37873954b352b3cceb07
3
+ size 23316200
eval_results/arc_easy/outputs__grpo-full__final/results_2026-05-26T11-37-35.592842.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_easy": {
4
+ "name": "arc_easy",
5
+ "alias": "arc_easy",
6
+ "sample_len": 2376,
7
+ "acc,none": 0.2760942760942761,
8
+ "acc_stderr,none": 0.00917355987383544,
9
+ "acc_norm,none": 0.2878787878787879,
10
+ "acc_norm_stderr,none": 0.009290733161670239
11
+ }
12
+ },
13
+ "group_subtasks": {},
14
+ "configs": {
15
+ "arc_easy": {
16
+ "task": "arc_easy",
17
+ "dataset_path": "allenai/ai2_arc",
18
+ "dataset_name": "ARC-Easy",
19
+ "training_split": "train",
20
+ "validation_split": "validation",
21
+ "test_split": "test",
22
+ "doc_to_text": "Question: {{question}}\nAnswer:",
23
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
24
+ "unsafe_code": false,
25
+ "doc_to_choice": "{{choices.text}}",
26
+ "description": "",
27
+ "target_delimiter": " ",
28
+ "fewshot_delimiter": "\n\n",
29
+ "fewshot_config": {
30
+ "sampler": "default",
31
+ "split": null,
32
+ "process_docs": null,
33
+ "fewshot_indices": null,
34
+ "samples": null,
35
+ "doc_to_text": "Question: {{question}}\nAnswer:",
36
+ "doc_to_choice": "{{choices.text}}",
37
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
38
+ "gen_prefix": null,
39
+ "fewshot_delimiter": "\n\n",
40
+ "target_delimiter": " "
41
+ },
42
+ "num_fewshot": 0,
43
+ "metric_list": [
44
+ {
45
+ "metric": "acc",
46
+ "aggregation": "mean",
47
+ "higher_is_better": true
48
+ },
49
+ {
50
+ "metric": "acc_norm",
51
+ "aggregation": "mean",
52
+ "higher_is_better": true
53
+ }
54
+ ],
55
+ "output_type": "multiple_choice",
56
+ "repeats": 1,
57
+ "should_decontaminate": true,
58
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
59
+ "metadata": {
60
+ "version": 1.0,
61
+ "pretrained": "outputs/grpo-full/final",
62
+ "dtype": "bfloat16",
63
+ "config_source": "/home/himanshu/TinyMathReason-1B/venv/lib/python3.12/site-packages/lm_eval/tasks/arc/arc_easy.yaml"
64
+ }
65
+ }
66
+ },
67
+ "versions": {
68
+ "arc_easy": 1.0
69
+ },
70
+ "n-shot": {
71
+ "arc_easy": 0
72
+ },
73
+ "higher_is_better": {
74
+ "arc_easy": {
75
+ "acc": true,
76
+ "acc_norm": true
77
+ }
78
+ },
79
+ "n-samples": {
80
+ "arc_easy": {
81
+ "original": 2376,
82
+ "effective": 2376
83
+ }
84
+ },
85
+ "config": {
86
+ "model": "hf",
87
+ "model_args": {
88
+ "pretrained": "outputs/grpo-full/final",
89
+ "dtype": "bfloat16"
90
+ },
91
+ "model_num_parameters": 1123125248,
92
+ "model_dtype": "torch.bfloat16",
93
+ "model_revision": "main",
94
+ "model_sha": "",
95
+ "batch_size": "auto",
96
+ "batch_sizes": [
97
+ 64
98
+ ],
99
+ "device": "cuda",
100
+ "use_cache": null,
101
+ "limit": null,
102
+ "bootstrap_iters": 100000,
103
+ "gen_kwargs": {},
104
+ "random_seed": 0,
105
+ "numpy_seed": 1234,
106
+ "torch_seed": 1234,
107
+ "fewshot_seed": 1234
108
+ },
109
+ "git_hash": "03fcf23",
110
+ "date": 1779795408.3004034,
111
+ "pretty_env_info": "PyTorch version: 2.12.0+cu130\nIs debug build: False\nCUDA used to build PyTorch: 13.0\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 24.04.4 LTS (x86_64)\nGCC version: (Ubuntu 13.3.0-6ubuntu2~24.04.1) 13.3.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.39\n\nPython version: 3.12.3 (main, Mar 23 2026, 19:04:32) [GCC 13.3.0] (64-bit runtime)\nPython platform: Linux-6.17.0-1016-gcp-x86_64-with-glibc2.39\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA L4\nNvidia driver version: 580.126.20\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_adv.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_cnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_precompiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_graph.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_heuristic.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_ops.so.9.13.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\nCaching allocator config: N/A\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 16\nOn-line CPU(s) list: 0-15\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.20GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 8\nSocket(s): 1\nStepping: 7\nBogoMIPS: 4400.43\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 256 KiB (8 instances)\nL1i cache: 256 KiB (8 instances)\nL2 cache: 8 MiB (8 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-15\nVulnerability Gather data sampling: Not affected\nVulnerability Ghostwrite: Not affected\nVulnerability Indirect target selection: Mitigation; Aligned branch/return thunks\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Old microcode: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; Enhanced IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; PBRSB-eIBRS SW sequence; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsa: Not affected\nVulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Vmscape: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==2.4.6\n[pip3] nvidia-cublas==13.1.1.3\n[pip3] nvidia-cuda-cupti==13.0.85\n[pip3] nvidia-cuda-nvrtc==13.0.88\n[pip3] nvidia-cuda-runtime==13.0.96\n[pip3] nvidia-cudnn-cu13==9.20.0.48\n[pip3] nvidia-cufft==12.0.0.61\n[pip3] nvidia-curand==10.4.0.35\n[pip3] nvidia-cusolver==12.0.4.66\n[pip3] nvidia-cusparse==12.6.3.3\n[pip3] nvidia-cusparselt-cu13==0.8.1\n[pip3] nvidia-nccl-cu13==2.29.7\n[pip3] nvidia-nvjitlink==13.0.88\n[pip3] nvidia-nvtx==13.0.85\n[pip3] torch==2.12.0\n[pip3] triton==3.7.0\n[conda] Could not collect",
112
+ "transformers_version": "5.9.0",
113
+ "lm_eval_version": "0.4.12",
114
+ "upper_git_hash": null,
115
+ "tokenizer_pad_token": [
116
+ "<|pad|>",
117
+ "3"
118
+ ],
119
+ "tokenizer_eos_token": [
120
+ "<|eos|>",
121
+ "1"
122
+ ],
123
+ "tokenizer_bos_token": [
124
+ "<|bos|>",
125
+ "0"
126
+ ],
127
+ "eot_token_id": 1,
128
+ "max_length": 4096,
129
+ "task_hashes": {
130
+ "arc_easy": "dce0d9b0f0cecd55bf2ac264042c5e45487df708d13123af3ae9e67bbbefdeb1"
131
+ },
132
+ "model_source": "hf",
133
+ "model_name": "outputs/grpo-full/final",
134
+ "model_name_sanitized": "outputs__grpo-full__final",
135
+ "system_instruction": null,
136
+ "system_instruction_sha": null,
137
+ "fewshot_as_multiturn": null,
138
+ "chat_template": null,
139
+ "chat_template_sha": null,
140
+ "total_evaluation_time_seconds": "56.55086992699944"
141
+ }
eval_results/arc_easy/outputs__grpo-full__final/samples_arc_easy_2026-05-26T11-37-35.592842.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/benchmark_summary.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Benchmark Results for outputs/grpo-full/final
2
+
3
+ | Benchmark | Shots | Score |
4
+ | :--- | :---: | :---: |
5
+ | arc_easy | 0 | No results file |
6
+ | arc_challenge | 25 | No results file |
7
+ | hellaswag | 10 | No results file |
8
+ | mmlu | 5 | No results file |
eval_results/combined_results.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "arc_easy": "No results file",
3
+ "arc_challenge": "No results file",
4
+ "hellaswag": "No results file",
5
+ "mmlu": "No results file"
6
+ }
eval_results/custom_eval.md ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Custom Evaluation: outputs/grpo-full/final
2
+
3
+ ### Level: Easy
4
+ **Q: What is 15% of 200?**
5
+
6
+ **A:**
7
+
8
+ There are 5% of the population of unknown families, which is 3% of the population.
9
+
10
+ In the case of the multi-choice, the population is 2%.
11
+ The proportion of the population has an impact on the population of 500 people.
12
+ The proportion of individuals living in rural areas is 4.
13
+ The population of the people living in rural areas in urban areas has 4% of the population.
14
+ The population of the population of West Africa is 2/100, which is the number of people living in rural areas:
15
+
16
+ The proportion of the proportion of people in the proportion of people living in the rural regions is 0.6/3,
17
+
18
+ The poverty in the rural area is 2/4.
19
+ If the number of children living in the population is 25/2. The population of the population is 2.5.6.
20
+ The population of people in rural areas is the population that is 2/3.6.
21
+ The population of the proportion of the people living in rural areas is 10.6.
22
+ The population of the population of the population is 4.
23
+ The population of the proportion of individuals who live in poverty are 65 years older than the poverty-free population, and in poverty.
24
+ The population of
25
+
26
+ ---
27
+ ### Level: Easy
28
+ **Q: Solve for x: 3x + 5 = 20**
29
+
30
+ **A:**
31
+
32
+ There are only x 2 = 8
33
+ The value of x is 2x + 3 = 2.
34
+ The value of x is 2.
35
+ The value of x is 2.
36
+ #### 2.
37
+
38
+ ---
39
+ ### Level: Easy
40
+ **Q: If a train travels 60 miles per hour, how far does it go in 2.5 hours?**
41
+
42
+ **A:**
43
+
44
+ The train tracks can be used to determine the speed of the train that is currently 10 km long, and the train is 10 meters.
45
+ If the train was in a train of 6 km, then the train would be 4 hours in the train is in the train, then the train would be a total of 12.5 km/hr, and a train to drive the train to the train.
46
+ If the train was running, how many miles traveled?
47
+ If the train passed through the train, then the train would be able to take it at the train station.
48
+ If it ran 50 mph, what is the distance from the train was to the train?
49
+ If we know the answer to the above question is 3, what is the average speed of the train after the train is 3 feet, what is the average speed of the train?
50
+
51
+ ---
52
+ ### Level: Medium
53
+ **Q: Find the roots of the quadratic equation: x^2 - 5x + 6 = 0**
54
+
55
+ **A:**
56
+
57
+ To solve this system of equations, we need to determine the value of $x^2 - 10 = 10$.
58
+ The value of $x^2$ is $x^2 = 0.1$.
59
+ The value of $x^2$ is given by:
60
+
61
+ $x^2 - 5 = 5$.
62
+
63
+ The value of $x^2$ is $2$ and the roots of the roots of $x^2$.
64
+ The value of x is $0$, which is $x = \frac{1}{2} + 2 = 0$.
65
+ Solving for $x = \frac{1}{2}$.
66
+
67
+
68
+ ---
69
+ ### Level: Medium
70
+ **Q: What is the derivative of f(x) = 3x^4 - 2x^2 + x?**
71
+
72
+ **A:**
73
+
74
+ The derivative of the derivative of f(x) = 2(x^2 - 4x + 2x^2 + 2x + 4x + 3x + 2x^2 + 2x + 2x + 2x^2 + 4x^2 + 2x + 2x^2 + 2x^2) = 4.
75
+
76
+ The answer is: 8
77
+
78
+ ---
79
+ ### Level: Medium
80
+ **Q: If A = {1, 2, 3} and B = {3, 4, 5}, what is the union of sets A and B?**
81
+
82
+ **A:**
83
+
84
+ Let's solve the multi-choice question.
85
+ Then, we have:
86
+
87
+ A = {1, 2, 2, 3, 3, 3, 3, 4, 4, 5, 7, 3, 6, 6, 8, 9, 7, 9, 9, 10, 9, 6, 9, 7, 8, 7, 7, 8, 7, 8, 8, 7, 7, 9, 9, 8, 8, 9, 7, 9}.
88
+ We can conclude that the feasible difference in this case is not possible in the multi-choice question is also the smallest integer.
89
+ Since there are two possible cases, the multi-choice question is equivalent to the multi-choice question.
90
+ Answer Choices:
91
+ Number of arrangements: (3, 6, 8, 13, 12), (2, 12), (3, 15), (3, 9, 8, 9) and (3, 15), (4, 8, 9), (2, 6, 13), (5, 9), (3, 9), (4, 9), (3, 9), (5, 5), (4, 1), (3, 2), (6, (3, 3), (4,
92
+
93
+ ---
94
+ ### Level: Hard
95
+ **Q: Prove that the sum of the first n positive integers is n(n+1)/2.**
96
+
97
+ **A:**
98
+ n(n+1) = n(n-1)
99
+ n(n+1) = n(n-1) + (n-1) = n(n-1)n(n+1) + n(n+1)n(n-1)n+1
100
+
101
+ n1 = n(n-1)
102
+ n2 = n2
103
+ n2 = n2
104
+ n3 = n2(n3)
105
+ n3 = n2 + n3 + 1(n3)
106
+ n4 = n3
107
+ n4 = n4
108
+
109
+ n2 = n3 = n1 + n4 + n2 = n2
110
+
111
+
112
+ where n1 = n1 = n3 = 1.
113
+ n3 = n1(n2) + n3(n3) = n1 + n1(n3) = n2(n1(n3) + n3(n3) = n2(n3)
114
+
115
+ Now, we can write:
116
+
117
+ n1 = n1(n1) = n3 + n3(n3) + n3 + n3(n3)
118
+
119
+
120
+ n2 = n1 + (
121
+
122
+ ---
123
+ ### Level: Hard
124
+ **Q: Evaluate the integral of x * e^x dx from 0 to 1.**
125
+
126
+ **A:**
127
+ import numpy as np
128
+
129
+ # Define a value of x, which represents the value of x, y, and y, and y, which represents the value of x, y, and y, and y
130
+
131
+ print(num_x)
132
+
133
+ ---
134
+ ### Level: Hard
135
+ **Q: Find the eigenvalues of the matrix [[2, 1], [1, 2]].**
136
+
137
+ **A:**
138
+ import numpy as np
139
+
140
+
141
+ # define the eigenvalues of the eigenvalues, and to get the eigenvalues of the eigenvalues.
142
+ # Define the eigenvalues
143
+
144
+ 2.0 = (0.0,0)
145
+
146
+ # compute the eigenvalues of the eigenvalues of the eigenvalues of the eigenvalues.
147
+
148
+ # calculate the eigenvalues
149
+ # The eigenvalues of the eigenvalues are given by the eigenvalues and eigenvectors of the eigenvalues.fill_1 and 0.1
150
+ # calculate the eigenvalues of the eigenvalues of the eigenvalues
151
+ # calculate the eigenvalues
152
+
153
+ # the eigenvalues of the eigenvalues
154
+ print(total_n) #print(# print the eigenvalues
155
+ print(row_1)
156
+ print(total_n)
157
+ print(answer_n)
158
+
159
+ ---
eval_results/gsm8k/outputs__grpo-full__final/results_2026-05-26T09-33-14.995899.json ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "gsm8k": {
4
+ "name": "gsm8k",
5
+ "alias": "gsm8k",
6
+ "sample_len": 1319,
7
+ "exact_match,strict-match": 0.009855951478392721,
8
+ "exact_match_stderr,strict-match": 0.0027210765770416616,
9
+ "exact_match,flexible-extract": 0.021986353297952996,
10
+ "exact_match_stderr,flexible-extract": 0.0040391627581100546
11
+ }
12
+ },
13
+ "group_subtasks": {},
14
+ "configs": {
15
+ "gsm8k": {
16
+ "task": "gsm8k",
17
+ "dataset_path": "openai/gsm8k",
18
+ "dataset_name": "main",
19
+ "training_split": "train",
20
+ "test_split": "test",
21
+ "fewshot_split": "train",
22
+ "doc_to_text": "Question: {{question}}\nAnswer:",
23
+ "doc_to_target": "{{answer}}",
24
+ "unsafe_code": false,
25
+ "description": "",
26
+ "target_delimiter": " ",
27
+ "fewshot_delimiter": "\n\n",
28
+ "fewshot_config": {
29
+ "sampler": "default",
30
+ "split": "train",
31
+ "process_docs": null,
32
+ "fewshot_indices": null,
33
+ "samples": null,
34
+ "doc_to_text": "Question: {{question}}\nAnswer:",
35
+ "doc_to_choice": null,
36
+ "doc_to_target": "{{answer}}",
37
+ "gen_prefix": null,
38
+ "fewshot_delimiter": "\n\n",
39
+ "target_delimiter": " "
40
+ },
41
+ "num_fewshot": 8,
42
+ "metric_list": [
43
+ {
44
+ "metric": "exact_match",
45
+ "aggregation": "mean",
46
+ "higher_is_better": true,
47
+ "ignore_case": true,
48
+ "ignore_punctuation": false,
49
+ "regexes_to_ignore": [
50
+ ",",
51
+ "\\$",
52
+ "(?s).*#### ",
53
+ "\\.$"
54
+ ]
55
+ }
56
+ ],
57
+ "output_type": "generate_until",
58
+ "generation_kwargs": {
59
+ "until": [
60
+ "Question:",
61
+ "</s>",
62
+ "<|im_end|>"
63
+ ],
64
+ "do_sample": false,
65
+ "temperature": 0.0
66
+ },
67
+ "repeats": 1,
68
+ "filter_list": [
69
+ {
70
+ "name": "strict-match",
71
+ "filter": [
72
+ {
73
+ "function": "regex",
74
+ "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
75
+ },
76
+ {
77
+ "function": "take_first"
78
+ }
79
+ ]
80
+ },
81
+ {
82
+ "name": "flexible-extract",
83
+ "filter": [
84
+ {
85
+ "function": "regex",
86
+ "group_select": -1,
87
+ "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
88
+ },
89
+ {
90
+ "function": "take_first"
91
+ }
92
+ ]
93
+ }
94
+ ],
95
+ "should_decontaminate": false,
96
+ "metadata": {
97
+ "version": 3.0,
98
+ "pretrained": "outputs/grpo-full/final",
99
+ "dtype": "bfloat16",
100
+ "config_source": "/home/himanshu/TinyMathReason-1B/venv/lib/python3.12/site-packages/lm_eval/tasks/gsm8k/gsm8k.yaml"
101
+ }
102
+ }
103
+ },
104
+ "versions": {
105
+ "gsm8k": 3.0
106
+ },
107
+ "n-shot": {
108
+ "gsm8k": 8
109
+ },
110
+ "higher_is_better": {
111
+ "gsm8k": {
112
+ "exact_match": true
113
+ }
114
+ },
115
+ "n-samples": {
116
+ "gsm8k": {
117
+ "original": 1319,
118
+ "effective": 1319
119
+ }
120
+ },
121
+ "config": {
122
+ "model": "hf",
123
+ "model_args": {
124
+ "pretrained": "outputs/grpo-full/final",
125
+ "dtype": "bfloat16"
126
+ },
127
+ "model_num_parameters": 1123125248,
128
+ "model_dtype": "torch.bfloat16",
129
+ "model_revision": "main",
130
+ "model_sha": "",
131
+ "batch_size": "auto",
132
+ "batch_sizes": [],
133
+ "device": "cuda",
134
+ "use_cache": null,
135
+ "limit": null,
136
+ "bootstrap_iters": 100000,
137
+ "gen_kwargs": {},
138
+ "random_seed": 0,
139
+ "numpy_seed": 1234,
140
+ "torch_seed": 1234,
141
+ "fewshot_seed": 1234
142
+ },
143
+ "git_hash": "03fcf23",
144
+ "date": 1779784898.324978,
145
+ "pretty_env_info": "PyTorch version: 2.12.0+cu130\nIs debug build: False\nCUDA used to build PyTorch: 13.0\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 24.04.4 LTS (x86_64)\nGCC version: (Ubuntu 13.3.0-6ubuntu2~24.04.1) 13.3.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.39\n\nPython version: 3.12.3 (main, Mar 23 2026, 19:04:32) [GCC 13.3.0] (64-bit runtime)\nPython platform: Linux-6.17.0-1016-gcp-x86_64-with-glibc2.39\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA L4\nNvidia driver version: 580.126.20\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_adv.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_cnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_precompiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_graph.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_heuristic.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_ops.so.9.13.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\nCaching allocator config: N/A\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 16\nOn-line CPU(s) list: 0-15\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.20GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 8\nSocket(s): 1\nStepping: 7\nBogoMIPS: 4400.43\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 256 KiB (8 instances)\nL1i cache: 256 KiB (8 instances)\nL2 cache: 8 MiB (8 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-15\nVulnerability Gather data sampling: Not affected\nVulnerability Ghostwrite: Not affected\nVulnerability Indirect target selection: Mitigation; Aligned branch/return thunks\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Old microcode: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; Enhanced IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; PBRSB-eIBRS SW sequence; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsa: Not affected\nVulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Vmscape: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==2.4.6\n[pip3] nvidia-cublas==13.1.1.3\n[pip3] nvidia-cuda-cupti==13.0.85\n[pip3] nvidia-cuda-nvrtc==13.0.88\n[pip3] nvidia-cuda-runtime==13.0.96\n[pip3] nvidia-cudnn-cu13==9.20.0.48\n[pip3] nvidia-cufft==12.0.0.61\n[pip3] nvidia-curand==10.4.0.35\n[pip3] nvidia-cusolver==12.0.4.66\n[pip3] nvidia-cusparse==12.6.3.3\n[pip3] nvidia-cusparselt-cu13==0.8.1\n[pip3] nvidia-nccl-cu13==2.29.7\n[pip3] nvidia-nvjitlink==13.0.88\n[pip3] nvidia-nvtx==13.0.85\n[pip3] torch==2.12.0\n[pip3] triton==3.7.0\n[conda] Could not collect",
146
+ "transformers_version": "5.9.0",
147
+ "lm_eval_version": "0.4.12",
148
+ "upper_git_hash": null,
149
+ "tokenizer_pad_token": [
150
+ "<|pad|>",
151
+ "3"
152
+ ],
153
+ "tokenizer_eos_token": [
154
+ "<|eos|>",
155
+ "1"
156
+ ],
157
+ "tokenizer_bos_token": [
158
+ "<|bos|>",
159
+ "0"
160
+ ],
161
+ "eot_token_id": 1,
162
+ "max_length": 4096,
163
+ "task_hashes": {
164
+ "gsm8k": "5edaa24ff4f3d939c3e1c5fd65a53cead84d4a52171818c453ec47099bd2a422"
165
+ },
166
+ "model_source": "hf",
167
+ "model_name": "outputs/grpo-full/final",
168
+ "model_name_sanitized": "outputs__grpo-full__final",
169
+ "system_instruction": null,
170
+ "system_instruction_sha": null,
171
+ "fewshot_as_multiturn": null,
172
+ "chat_template": null,
173
+ "chat_template_sha": null,
174
+ "total_evaluation_time_seconds": "3105.920454603998"
175
+ }
eval_results/gsm8k/outputs__grpo-full__final/samples_gsm8k_2026-05-26T09-33-14.995899.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19d5589c2ca07cc8c977a4a341647694d37a6f6e76925e364b283957fb5313fa
3
+ size 17686475
eval_results/hellaswag/outputs__grpo-full__final/results_2026-05-26T12-29-19.954733.json ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "name": "hellaswag",
5
+ "alias": "hellaswag",
6
+ "sample_len": 10042,
7
+ "acc,none": 0.2590121489743079,
8
+ "acc_stderr,none": 0.004371969542814812,
9
+ "acc_norm,none": 0.2629954192391954,
10
+ "acc_norm_stderr,none": 0.004393601887506574
11
+ }
12
+ },
13
+ "group_subtasks": {},
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "dataset_path": "Rowan/hellaswag",
18
+ "training_split": "train",
19
+ "validation_split": "validation",
20
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
21
+ "doc_to_text": "{{query}}",
22
+ "doc_to_target": "{{label}}",
23
+ "unsafe_code": false,
24
+ "doc_to_choice": "choices",
25
+ "description": "",
26
+ "target_delimiter": " ",
27
+ "fewshot_delimiter": "\n\n",
28
+ "fewshot_config": {
29
+ "sampler": "default",
30
+ "split": null,
31
+ "process_docs": "<function process_docs at 0x74288899f7e0>",
32
+ "fewshot_indices": null,
33
+ "samples": null,
34
+ "doc_to_text": "{{query}}",
35
+ "doc_to_choice": "choices",
36
+ "doc_to_target": "{{label}}",
37
+ "gen_prefix": null,
38
+ "fewshot_delimiter": "\n\n",
39
+ "target_delimiter": " "
40
+ },
41
+ "num_fewshot": 10,
42
+ "metric_list": [
43
+ {
44
+ "metric": "acc",
45
+ "aggregation": "mean",
46
+ "higher_is_better": true
47
+ },
48
+ {
49
+ "metric": "acc_norm",
50
+ "aggregation": "mean",
51
+ "higher_is_better": true
52
+ }
53
+ ],
54
+ "output_type": "multiple_choice",
55
+ "repeats": 1,
56
+ "should_decontaminate": false,
57
+ "metadata": {
58
+ "version": 1.0,
59
+ "pretrained": "outputs/grpo-full/final",
60
+ "dtype": "bfloat16",
61
+ "config_source": "/home/himanshu/TinyMathReason-1B/venv/lib/python3.12/site-packages/lm_eval/tasks/hellaswag/hellaswag.yaml"
62
+ }
63
+ }
64
+ },
65
+ "versions": {
66
+ "hellaswag": 1.0
67
+ },
68
+ "n-shot": {
69
+ "hellaswag": 10
70
+ },
71
+ "higher_is_better": {
72
+ "hellaswag": {
73
+ "acc": true,
74
+ "acc_norm": true
75
+ }
76
+ },
77
+ "n-samples": {
78
+ "hellaswag": {
79
+ "original": 10042,
80
+ "effective": 10042
81
+ }
82
+ },
83
+ "config": {
84
+ "model": "hf",
85
+ "model_args": {
86
+ "pretrained": "outputs/grpo-full/final",
87
+ "dtype": "bfloat16"
88
+ },
89
+ "model_num_parameters": 1123125248,
90
+ "model_dtype": "torch.bfloat16",
91
+ "model_revision": "main",
92
+ "model_sha": "",
93
+ "batch_size": "auto",
94
+ "batch_sizes": [
95
+ 64
96
+ ],
97
+ "device": "cuda",
98
+ "use_cache": null,
99
+ "limit": null,
100
+ "bootstrap_iters": 100000,
101
+ "gen_kwargs": {},
102
+ "random_seed": 0,
103
+ "numpy_seed": 1234,
104
+ "torch_seed": 1234,
105
+ "fewshot_seed": 1234
106
+ },
107
+ "git_hash": "03fcf23",
108
+ "date": 1779795876.0037582,
109
+ "pretty_env_info": "PyTorch version: 2.12.0+cu130\nIs debug build: False\nCUDA used to build PyTorch: 13.0\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 24.04.4 LTS (x86_64)\nGCC version: (Ubuntu 13.3.0-6ubuntu2~24.04.1) 13.3.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.39\n\nPython version: 3.12.3 (main, Mar 23 2026, 19:04:32) [GCC 13.3.0] (64-bit runtime)\nPython platform: Linux-6.17.0-1016-gcp-x86_64-with-glibc2.39\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA L4\nNvidia driver version: 580.126.20\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_adv.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_cnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_precompiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_graph.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_heuristic.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_ops.so.9.13.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\nCaching allocator config: N/A\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 16\nOn-line CPU(s) list: 0-15\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.20GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 8\nSocket(s): 1\nStepping: 7\nBogoMIPS: 4400.43\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 256 KiB (8 instances)\nL1i cache: 256 KiB (8 instances)\nL2 cache: 8 MiB (8 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-15\nVulnerability Gather data sampling: Not affected\nVulnerability Ghostwrite: Not affected\nVulnerability Indirect target selection: Mitigation; Aligned branch/return thunks\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Old microcode: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; Enhanced IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; PBRSB-eIBRS SW sequence; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsa: Not affected\nVulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Vmscape: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==2.4.6\n[pip3] nvidia-cublas==13.1.1.3\n[pip3] nvidia-cuda-cupti==13.0.85\n[pip3] nvidia-cuda-nvrtc==13.0.88\n[pip3] nvidia-cuda-runtime==13.0.96\n[pip3] nvidia-cudnn-cu13==9.20.0.48\n[pip3] nvidia-cufft==12.0.0.61\n[pip3] nvidia-curand==10.4.0.35\n[pip3] nvidia-cusolver==12.0.4.66\n[pip3] nvidia-cusparse==12.6.3.3\n[pip3] nvidia-cusparselt-cu13==0.8.1\n[pip3] nvidia-nccl-cu13==2.29.7\n[pip3] nvidia-nvjitlink==13.0.88\n[pip3] nvidia-nvtx==13.0.85\n[pip3] torch==2.12.0\n[pip3] triton==3.7.0\n[conda] Could not collect",
110
+ "transformers_version": "5.9.0",
111
+ "lm_eval_version": "0.4.12",
112
+ "upper_git_hash": null,
113
+ "tokenizer_pad_token": [
114
+ "<|pad|>",
115
+ "3"
116
+ ],
117
+ "tokenizer_eos_token": [
118
+ "<|eos|>",
119
+ "1"
120
+ ],
121
+ "tokenizer_bos_token": [
122
+ "<|bos|>",
123
+ "0"
124
+ ],
125
+ "eot_token_id": 1,
126
+ "max_length": 4096,
127
+ "task_hashes": {
128
+ "hellaswag": "d4bcb44ec68db2b8a65f050c3c64c48454179b48fd8aee3e73b55e2ec51e6d82"
129
+ },
130
+ "model_source": "hf",
131
+ "model_name": "outputs/grpo-full/final",
132
+ "model_name_sanitized": "outputs__grpo-full__final",
133
+ "system_instruction": null,
134
+ "system_instruction_sha": null,
135
+ "fewshot_as_multiturn": null,
136
+ "chat_template": null,
137
+ "chat_template_sha": null,
138
+ "total_evaluation_time_seconds": "2693.2404373019963"
139
+ }
eval_results/hellaswag/outputs__grpo-full__final/samples_hellaswag_2026-05-26T12-29-19.954733.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf9e82964443ca811ee5103dca5c32ec42a4ef3fe4391929b5a6148d22e18613
3
+ size 186310702
eval_results/minerva_math_algebra/outputs__grpo-full__final/results_2026-05-26T10-07-08.234107.json ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "minerva_math_algebra": {
4
+ "name": "minerva_math_algebra",
5
+ "alias": "minerva_math_algebra",
6
+ "sample_len": 1187,
7
+ "exact_match,none": 0.0,
8
+ "exact_match_stderr,none": 0.0,
9
+ "math_verify,none": 0.020219039595619208,
10
+ "math_verify_stderr,none": 0.00408697908051843
11
+ }
12
+ },
13
+ "group_subtasks": {},
14
+ "configs": {
15
+ "minerva_math_algebra": {
16
+ "task": "minerva_math_algebra",
17
+ "dataset_path": "EleutherAI/hendrycks_math",
18
+ "dataset_name": "algebra",
19
+ "training_split": "train",
20
+ "test_split": "test",
21
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc: dict) -> dict:\n out_doc = {\n \"problem\": doc[\"problem\"],\n \"solution\": doc[\"solution\"],\n \"answer\": normalize_final_answer(\n remove_boxed(last_boxed_only_string(doc[\"solution\"]))\n ),\n }\n if getattr(doc, \"few_shot\", None) is not None:\n out_doc[\"few_shot\"] = True\n return out_doc\n\n return dataset.map(_process_doc)\n",
22
+ "doc_to_text": "def doc_to_text(doc: dict) -> str:\n return \"Problem:\" + \"\\n\" + doc[\"problem\"] + \"\\n\\n\" + \"Solution:\"\n",
23
+ "doc_to_target": "{{answer if few_shot is undefined else solution}}",
24
+ "unsafe_code": false,
25
+ "process_results": "def process_results(doc: dict, results: list[str]) -> dict[str, int]:\n candidates = results[0]\n\n unnormalized_answer = get_unnormalized_answer(candidates)\n answer = normalize_final_answer(unnormalized_answer)\n\n if is_equiv(answer, doc[\"answer\"]):\n retval = 1\n else:\n retval = 0\n\n # math_verify\n _mvres = verify(\n gold=parse(doc[\"solution\"]),\n target=parse(candidates),\n )\n mathval = 1 if _mvres else 0\n\n res = {\n \"exact_match\": retval,\n \"math_verify\": mathval,\n }\n return res\n",
26
+ "description": "",
27
+ "target_delimiter": " ",
28
+ "fewshot_delimiter": "\n\n",
29
+ "fewshot_config": {
30
+ "sampler": "first_n",
31
+ "split": null,
32
+ "process_docs": "<function process_docs at 0x7dbd24753740>",
33
+ "fewshot_indices": null,
34
+ "samples": "<function list_fewshot_samples at 0x7dbd1874ec00>",
35
+ "doc_to_text": "<function doc_to_text at 0x7dbd246d1580>",
36
+ "doc_to_choice": null,
37
+ "doc_to_target": "{{answer if few_shot is undefined else solution}}",
38
+ "gen_prefix": null,
39
+ "fewshot_delimiter": "\n\n",
40
+ "target_delimiter": " "
41
+ },
42
+ "num_fewshot": 4,
43
+ "metric_list": [
44
+ {
45
+ "metric": "exact_match",
46
+ "aggregation": "mean",
47
+ "higher_is_better": true
48
+ },
49
+ {
50
+ "metric": "math_verify",
51
+ "aggregation": "mean",
52
+ "higher_is_better": true
53
+ }
54
+ ],
55
+ "output_type": "generate_until",
56
+ "generation_kwargs": {
57
+ "until": [
58
+ "Problem:"
59
+ ],
60
+ "do_sample": false,
61
+ "temperature": 0.0
62
+ },
63
+ "repeats": 1,
64
+ "should_decontaminate": false,
65
+ "metadata": {
66
+ "version": 3.0,
67
+ "pretrained": "outputs/grpo-full/final",
68
+ "dtype": "bfloat16",
69
+ "config_source": "/home/himanshu/TinyMathReason-1B/venv/lib/python3.12/site-packages/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml"
70
+ }
71
+ }
72
+ },
73
+ "versions": {
74
+ "minerva_math_algebra": 3.0
75
+ },
76
+ "n-shot": {
77
+ "minerva_math_algebra": 4
78
+ },
79
+ "higher_is_better": {
80
+ "minerva_math_algebra": {
81
+ "exact_match": true,
82
+ "math_verify": true
83
+ }
84
+ },
85
+ "n-samples": {
86
+ "minerva_math_algebra": {
87
+ "original": 1187,
88
+ "effective": 1187
89
+ }
90
+ },
91
+ "config": {
92
+ "model": "hf",
93
+ "model_args": {
94
+ "pretrained": "outputs/grpo-full/final",
95
+ "dtype": "bfloat16"
96
+ },
97
+ "model_num_parameters": 1123125248,
98
+ "model_dtype": "torch.bfloat16",
99
+ "model_revision": "main",
100
+ "model_sha": "",
101
+ "batch_size": "auto",
102
+ "batch_sizes": [],
103
+ "device": "cuda",
104
+ "use_cache": null,
105
+ "limit": null,
106
+ "bootstrap_iters": 100000,
107
+ "gen_kwargs": {},
108
+ "random_seed": 0,
109
+ "numpy_seed": 1234,
110
+ "torch_seed": 1234,
111
+ "fewshot_seed": 1234
112
+ },
113
+ "git_hash": "03fcf23",
114
+ "date": 1779789023.103882,
115
+ "pretty_env_info": "PyTorch version: 2.12.0+cu130\nIs debug build: False\nCUDA used to build PyTorch: 13.0\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 24.04.4 LTS (x86_64)\nGCC version: (Ubuntu 13.3.0-6ubuntu2~24.04.1) 13.3.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.39\n\nPython version: 3.12.3 (main, Mar 23 2026, 19:04:32) [GCC 13.3.0] (64-bit runtime)\nPython platform: Linux-6.17.0-1016-gcp-x86_64-with-glibc2.39\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: GPU 0: NVIDIA L4\nNvidia driver version: 580.126.20\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.13.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_adv.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_cnn.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_precompiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_engines_runtime_compiled.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_graph.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_heuristic.so.9.13.0\n/usr/local/cuda-12.9/targets/x86_64-linux/lib/libcudnn_ops.so.9.13.0\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\nCaching allocator config: N/A\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 16\nOn-line CPU(s) list: 0-15\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.20GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 8\nSocket(s): 1\nStepping: 7\nBogoMIPS: 4400.43\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 256 KiB (8 instances)\nL1i cache: 256 KiB (8 instances)\nL2 cache: 8 MiB (8 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-15\nVulnerability Gather data sampling: Not affected\nVulnerability Ghostwrite: Not affected\nVulnerability Indirect target selection: Mitigation; Aligned branch/return thunks\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Old microcode: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; Enhanced IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; PBRSB-eIBRS SW sequence; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsa: Not affected\nVulnerability Tsx async abort: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Vmscape: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==2.4.6\n[pip3] nvidia-cublas==13.1.1.3\n[pip3] nvidia-cuda-cupti==13.0.85\n[pip3] nvidia-cuda-nvrtc==13.0.88\n[pip3] nvidia-cuda-runtime==13.0.96\n[pip3] nvidia-cudnn-cu13==9.20.0.48\n[pip3] nvidia-cufft==12.0.0.61\n[pip3] nvidia-curand==10.4.0.35\n[pip3] nvidia-cusolver==12.0.4.66\n[pip3] nvidia-cusparse==12.6.3.3\n[pip3] nvidia-cusparselt-cu13==0.8.1\n[pip3] nvidia-nccl-cu13==2.29.7\n[pip3] nvidia-nvjitlink==13.0.88\n[pip3] nvidia-nvtx==13.0.85\n[pip3] torch==2.12.0\n[pip3] triton==3.7.0\n[conda] Could not collect",
116
+ "transformers_version": "5.9.0",
117
+ "lm_eval_version": "0.4.12",
118
+ "upper_git_hash": null,
119
+ "tokenizer_pad_token": [
120
+ "<|pad|>",
121
+ "3"
122
+ ],
123
+ "tokenizer_eos_token": [
124
+ "<|eos|>",
125
+ "1"
126
+ ],
127
+ "tokenizer_bos_token": [
128
+ "<|bos|>",
129
+ "0"
130
+ ],
131
+ "eot_token_id": 1,
132
+ "max_length": 4096,
133
+ "task_hashes": {
134
+ "minerva_math_algebra": "5c955bbc89ad645142d61b1594b7c36b552b722edf416ae40fcc71a4c50bd24b"
135
+ },
136
+ "model_source": "hf",
137
+ "model_name": "outputs/grpo-full/final",
138
+ "model_name_sanitized": "outputs__grpo-full__final",
139
+ "system_instruction": null,
140
+ "system_instruction_sha": null,
141
+ "fewshot_as_multiturn": null,
142
+ "chat_template": null,
143
+ "chat_template_sha": null,
144
+ "total_evaluation_time_seconds": "1014.535830836001"
145
+ }
eval_results/minerva_math_algebra/outputs__grpo-full__final/samples_minerva_math_algebra_2026-05-26T10-07-08.234107.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/results_2026-05-26T12-51-35.680076.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_abstract_algebra_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_anatomy_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_astronomy_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_business_ethics_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_clinical_knowledge_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_biology_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_chemistry_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_computer_science_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_mathematics_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_medicine_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_college_physics_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_computer_security_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_conceptual_physics_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_econometrics_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_electrical_engineering_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_elementary_mathematics_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_formal_logic_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_global_facts_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_biology_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_chemistry_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_computer_science_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_european_history_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_geography_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_government_and_politics_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_macroeconomics_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_mathematics_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_microeconomics_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_physics_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_psychology_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_statistics_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_us_history_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_high_school_world_history_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_human_aging_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_human_sexuality_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/mmlu/outputs__grpo-full__final/samples_mmlu_international_law_2026-05-26T12-51-35.680076.jsonl ADDED
The diff for this file is too large to render. See raw diff