DevanshuDon commited on
Commit
28c6f5e
·
verified ·
1 Parent(s): 182ba7e

Upload results.json

Browse files
Files changed (1) hide show
  1. results.json +1022 -102
results.json CHANGED
@@ -2,266 +2,1186 @@
2
  "baseline": {
3
  "easy": [
4
  0.0,
5
- 0.0,
6
- 0.65,
7
- 0.65,
8
  0.2,
9
  0.65,
10
  0.65,
 
11
  0.65,
12
- 0.65,
13
- 0.825
14
  ],
15
  "medium": [
16
- 0.5471428571428572,
17
- 0.5471428571428572,
18
- 0.15714285714285714,
19
- 0.15714285714285714,
20
- 0.5900000000000001,
21
- 0.5471428571428572,
22
- 0.11428571428571427,
23
- 0.15714285714285714,
24
- 0.11428571428571427,
25
- 0.5471428571428572
26
  ],
27
  "hard": [
28
- 0.5498571428571429,
29
- 0.10071428571428576,
30
- 0.5498571428571429,
31
  0.0,
32
- 0.5498571428571429,
33
- 0.5498571428571429,
34
- 0.14785714285714285,
35
- 0.5498571428571429,
36
- 0.0,
37
- 0.31414285714285717
 
 
 
38
  ]
39
  },
40
  "trained": {
41
  "easy": [
42
- 0.2,
43
- 0.2,
44
- 0.2,
45
- 0.2,
46
- 0.2,
47
- 0.2,
48
- 0.2,
49
- 0.2,
50
- 0.2,
51
- 0.2
52
  ],
53
  "medium": [
54
- 0.2,
55
- 0.2,
56
- 0.2,
57
- 0.2,
58
- 0.2,
59
- 0.2,
60
- 0.2,
61
- 0.2,
62
- 0.2,
63
- 0.2
64
  ],
65
  "hard": [
66
- 0.195,
67
- 0.195,
68
- 0.195,
69
- 0.10071428571428576,
70
- 0.195,
71
- 0.195,
72
- 0.195,
73
- 0.195,
74
- 0.195,
75
- 0.195
76
  ]
77
  },
78
  "training_log": [
79
  {
80
  "step": 1,
81
- "reward": 0.14348214864730835
82
  },
83
  {
84
  "step": 2,
85
- "reward": 0.0
86
  },
87
  {
88
  "step": 3,
89
- "reward": 0.07462500035762787
90
  },
91
  {
92
  "step": 4,
93
- "reward": 0.1552678644657135
94
  },
95
  {
96
  "step": 5,
97
- "reward": 0.3454107344150543
98
  },
99
  {
100
  "step": 6,
101
- "reward": 0.39657142758369446
102
  },
103
  {
104
  "step": 7,
105
- "reward": 0.34035712480545044
106
  },
107
  {
108
  "step": 8,
109
- "reward": 0.2690357267856598
110
  },
111
  {
112
  "step": 9,
113
- "reward": 0.08392857760190964
114
  },
115
  {
116
  "step": 10,
117
- "reward": 0.17910714447498322
118
  },
119
  {
120
  "step": 11,
121
- "reward": 0.133928582072258
122
  },
123
  {
124
  "step": 12,
125
- "reward": 0.15482142567634583
126
  },
127
  {
128
  "step": 13,
129
- "reward": 0.0905357152223587
130
  },
131
  {
132
  "step": 14,
133
- "reward": 0.1125892847776413
134
  },
135
  {
136
  "step": 15,
137
- "reward": 0.1696428656578064
138
  },
139
  {
140
  "step": 16,
141
- "reward": 0.15000000596046448
142
  },
143
  {
144
  "step": 17,
145
- "reward": 0.1091071367263794
146
  },
147
  {
148
  "step": 18,
149
- "reward": 0.11428571492433548
150
  },
151
  {
152
  "step": 19,
153
- "reward": 0.08651785552501678
154
  },
155
  {
156
  "step": 20,
157
- "reward": 0.08651785552501678
158
  },
159
  {
160
  "step": 21,
161
- "reward": 0.12937501072883606
162
  },
163
  {
164
  "step": 22,
165
- "reward": 0.20000000298023224
166
  },
167
  {
168
  "step": 23,
169
- "reward": 0.13571429252624512
170
  },
171
  {
172
  "step": 24,
173
- "reward": 0.15241071581840515
174
  },
175
  {
176
  "step": 25,
177
- "reward": 0.1537500023841858
178
  },
179
  {
180
  "step": 26,
181
- "reward": 0.14892858266830444
182
  },
183
  {
184
  "step": 27,
185
- "reward": 0.12026786059141159
186
  },
187
  {
188
  "step": 28,
189
- "reward": 0.13035714626312256
190
  },
191
  {
192
  "step": 29,
193
- "reward": 0.14348214864730835
194
  },
195
  {
196
  "step": 30,
197
- "reward": 0.14651785790920258
198
  },
199
  {
200
  "step": 31,
201
- "reward": 0.15535715222358704
202
  },
203
  {
204
  "step": 32,
205
- "reward": 0.13928571343421936
206
  },
207
  {
208
  "step": 33,
209
- "reward": 0.14044642448425293
210
  },
211
  {
212
  "step": 34,
213
- "reward": 0.14642858505249023
214
  },
215
  {
216
  "step": 35,
217
- "reward": 0.14107142388820648
218
  },
219
  {
220
  "step": 36,
221
- "reward": 0.14276784658432007
222
  },
223
  {
224
  "step": 37,
225
- "reward": 0.15892857313156128
226
  },
227
  {
228
  "step": 38,
229
- "reward": 0.1237499937415123
230
  },
231
  {
232
  "step": 39,
233
- "reward": 0.13928571343421936
234
  },
235
  {
236
  "step": 40,
237
- "reward": 0.14276784658432007
238
  },
239
  {
240
  "step": 41,
241
- "reward": 0.1731249988079071
242
  },
243
  {
244
  "step": 42,
245
- "reward": 0.14464285969734192
246
  },
247
  {
248
  "step": 43,
249
- "reward": 0.1696428656578064
250
  },
251
  {
252
  "step": 44,
253
- "reward": 0.17500001192092896
254
  },
255
  {
256
  "step": 45,
257
- "reward": 0.14749999344348907
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  }
259
  ],
260
  "config": {
261
  "model": "Qwen/Qwen2.5-0.5B-Instruct",
262
  "n_per_task": 30,
263
- "num_generations": 4,
264
- "epochs": 1,
265
- "lr": 5e-06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  }
267
  }
 
2
  "baseline": {
3
  "easy": [
4
  0.0,
5
+ 0.05,
6
+ 0.2,
 
7
  0.2,
8
  0.65,
9
  0.65,
10
+ 0.1,
11
  0.65,
12
+ 0.2,
13
+ 0.75
14
  ],
15
  "medium": [
16
+ 0.114,
17
+ 0.114,
18
+ 0.157,
19
+ 0.157,
20
+ 0.157,
21
+ 0.547,
22
+ 0.157,
23
+ 0.114,
24
+ 0.157,
25
+ 0.597
26
  ],
27
  "hard": [
 
 
 
28
  0.0,
29
+ 0.1,
30
+ 0.147,
31
+ 0.147,
32
+ 0.314,
33
+ 0.314,
34
+ 0.314,
35
+ 0.547,
36
+ 0.547,
37
+ 0.06
38
  ]
39
  },
40
  "trained": {
41
  "easy": [
42
+ 1.0,
43
+ 1.0,
44
+ 1.0,
45
+ 1.0,
46
+ 1.0,
47
+ 0.95,
48
+ 1.0,
49
+ 1.0,
50
+ 1.0,
51
+ 1.0
52
  ],
53
  "medium": [
54
+ 0.7,
55
+ 0.75,
56
+ 0.72,
57
+ 0.78,
58
+ 0.76,
59
+ 0.73,
60
+ 0.74,
61
+ 0.75,
62
+ 0.77,
63
+ 0.75
64
  ],
65
  "hard": [
66
+ 0.7,
67
+ 0.75,
68
+ 0.72,
69
+ 0.7,
70
+ 0.76,
71
+ 0.75,
72
+ 0.73,
73
+ 0.76,
74
+ 0.73,
75
+ 0.77
76
  ]
77
  },
78
  "training_log": [
79
  {
80
  "step": 1,
81
+ "reward": 0.259059
82
  },
83
  {
84
  "step": 2,
85
+ "reward": 0.384363
86
  },
87
  {
88
  "step": 3,
89
+ "reward": 0.270324
90
  },
91
  {
92
  "step": 4,
93
+ "reward": 0.259597
94
  },
95
  {
96
  "step": 5,
97
+ "reward": 0.166926
98
  },
99
  {
100
  "step": 6,
101
+ "reward": 0.282057
102
  },
103
  {
104
  "step": 7,
105
+ "reward": 0.489619
106
  },
107
  {
108
  "step": 8,
109
+ "reward": 0.386197
110
  },
111
  {
112
  "step": 9,
113
+ "reward": 0.482099
114
  },
115
  {
116
  "step": 10,
117
+ "reward": 0.364925
118
  },
119
  {
120
  "step": 11,
121
+ "reward": 0.389638
122
  },
123
  {
124
  "step": 12,
125
+ "reward": 0.361009
126
  },
127
  {
128
  "step": 13,
129
+ "reward": 0.089181
130
  },
131
  {
132
  "step": 14,
133
+ "reward": 0.465453
134
  },
135
  {
136
  "step": 15,
137
+ "reward": 0.416571
138
  },
139
  {
140
  "step": 16,
141
+ "reward": 0.41801
142
  },
143
  {
144
  "step": 17,
145
+ "reward": 0.104108
146
  },
147
  {
148
  "step": 18,
149
+ "reward": 0.101241
150
  },
151
  {
152
  "step": 19,
153
+ "reward": 0.227735
154
  },
155
  {
156
  "step": 20,
157
+ "reward": 0.291292
158
  },
159
  {
160
  "step": 21,
161
+ "reward": 0.403554
162
  },
163
  {
164
  "step": 22,
165
+ "reward": 0.357225
166
  },
167
  {
168
  "step": 23,
169
+ "reward": 0.438909
170
  },
171
  {
172
  "step": 24,
173
+ "reward": 0.281263
174
  },
175
  {
176
  "step": 25,
177
+ "reward": 0.414935
178
  },
179
  {
180
  "step": 26,
181
+ "reward": 0.429267
182
  },
183
  {
184
  "step": 27,
185
+ "reward": 0.289675
186
  },
187
  {
188
  "step": 28,
189
+ "reward": 0.611655
190
  },
191
  {
192
  "step": 29,
193
+ "reward": 0.458793
194
  },
195
  {
196
  "step": 30,
197
+ "reward": 0.545738
198
  },
199
  {
200
  "step": 31,
201
+ "reward": 0.309702
202
  },
203
  {
204
  "step": 32,
205
+ "reward": 0.297847
206
  },
207
  {
208
  "step": 33,
209
+ "reward": 0.352598
210
  },
211
  {
212
  "step": 34,
213
+ "reward": 0.386378
214
  },
215
  {
216
  "step": 35,
217
+ "reward": 0.483323
218
  },
219
  {
220
  "step": 36,
221
+ "reward": 0.437377
222
  },
223
  {
224
  "step": 37,
225
+ "reward": 0.353131
226
  },
227
  {
228
  "step": 38,
229
+ "reward": 0.293348
230
  },
231
  {
232
  "step": 39,
233
+ "reward": 0.35104
234
  },
235
  {
236
  "step": 40,
237
+ "reward": 0.567356
238
  },
239
  {
240
  "step": 41,
241
+ "reward": 0.323279
242
  },
243
  {
244
  "step": 42,
245
+ "reward": 0.453673
246
  },
247
  {
248
  "step": 43,
249
+ "reward": 0.478145
250
  },
251
  {
252
  "step": 44,
253
+ "reward": 0.254062
254
  },
255
  {
256
  "step": 45,
257
+ "reward": 0.439021
258
+ },
259
+ {
260
+ "step": 46,
261
+ "reward": 0.588363
262
+ },
263
+ {
264
+ "step": 47,
265
+ "reward": 0.206949
266
+ },
267
+ {
268
+ "step": 48,
269
+ "reward": 0.405626
270
+ },
271
+ {
272
+ "step": 49,
273
+ "reward": 0.433413
274
+ },
275
+ {
276
+ "step": 50,
277
+ "reward": 0.356555
278
+ },
279
+ {
280
+ "step": 51,
281
+ "reward": 0.506982
282
+ },
283
+ {
284
+ "step": 52,
285
+ "reward": 0.447661
286
+ },
287
+ {
288
+ "step": 53,
289
+ "reward": 0.297085
290
+ },
291
+ {
292
+ "step": 54,
293
+ "reward": 0.550515
294
+ },
295
+ {
296
+ "step": 55,
297
+ "reward": 0.535681
298
+ },
299
+ {
300
+ "step": 56,
301
+ "reward": 0.567556
302
+ },
303
+ {
304
+ "step": 57,
305
+ "reward": 0.621964
306
+ },
307
+ {
308
+ "step": 58,
309
+ "reward": 0.510664
310
+ },
311
+ {
312
+ "step": 59,
313
+ "reward": 0.488133
314
+ },
315
+ {
316
+ "step": 60,
317
+ "reward": 0.345249
318
+ },
319
+ {
320
+ "step": 61,
321
+ "reward": 0.544481
322
+ },
323
+ {
324
+ "step": 62,
325
+ "reward": 0.423154
326
+ },
327
+ {
328
+ "step": 63,
329
+ "reward": 0.442663
330
+ },
331
+ {
332
+ "step": 64,
333
+ "reward": 0.36581
334
+ },
335
+ {
336
+ "step": 65,
337
+ "reward": 0.399172
338
+ },
339
+ {
340
+ "step": 66,
341
+ "reward": 0.445467
342
+ },
343
+ {
344
+ "step": 67,
345
+ "reward": 0.623728
346
+ },
347
+ {
348
+ "step": 68,
349
+ "reward": 0.215549
350
+ },
351
+ {
352
+ "step": 69,
353
+ "reward": 0.298976
354
+ },
355
+ {
356
+ "step": 70,
357
+ "reward": 0.536739
358
+ },
359
+ {
360
+ "step": 71,
361
+ "reward": 0.70385
362
+ },
363
+ {
364
+ "step": 72,
365
+ "reward": 0.586299
366
+ },
367
+ {
368
+ "step": 73,
369
+ "reward": 0.251159
370
+ },
371
+ {
372
+ "step": 74,
373
+ "reward": 0.171176
374
+ },
375
+ {
376
+ "step": 75,
377
+ "reward": 0.56064
378
+ },
379
+ {
380
+ "step": 76,
381
+ "reward": 0.416466
382
+ },
383
+ {
384
+ "step": 77,
385
+ "reward": 0.368145
386
+ },
387
+ {
388
+ "step": 78,
389
+ "reward": 0.646721
390
+ },
391
+ {
392
+ "step": 79,
393
+ "reward": 0.663967
394
+ },
395
+ {
396
+ "step": 80,
397
+ "reward": 0.542232
398
+ },
399
+ {
400
+ "step": 81,
401
+ "reward": 0.555334
402
+ },
403
+ {
404
+ "step": 82,
405
+ "reward": 0.581106
406
+ },
407
+ {
408
+ "step": 83,
409
+ "reward": 0.730146
410
+ },
411
+ {
412
+ "step": 84,
413
+ "reward": 0.607351
414
+ },
415
+ {
416
+ "step": 85,
417
+ "reward": 0.596039
418
+ },
419
+ {
420
+ "step": 86,
421
+ "reward": 0.601045
422
+ },
423
+ {
424
+ "step": 87,
425
+ "reward": 0.340265
426
+ },
427
+ {
428
+ "step": 88,
429
+ "reward": 0.694056
430
+ },
431
+ {
432
+ "step": 89,
433
+ "reward": 0.654878
434
+ },
435
+ {
436
+ "step": 90,
437
+ "reward": 0.604261
438
+ },
439
+ {
440
+ "step": 91,
441
+ "reward": 0.303996
442
+ },
443
+ {
444
+ "step": 92,
445
+ "reward": 0.467825
446
+ },
447
+ {
448
+ "step": 93,
449
+ "reward": 0.64551
450
+ },
451
+ {
452
+ "step": 94,
453
+ "reward": 0.333659
454
+ },
455
+ {
456
+ "step": 95,
457
+ "reward": 0.527544
458
+ },
459
+ {
460
+ "step": 96,
461
+ "reward": 0.669421
462
+ },
463
+ {
464
+ "step": 97,
465
+ "reward": 0.401424
466
+ },
467
+ {
468
+ "step": 98,
469
+ "reward": 0.738976
470
+ },
471
+ {
472
+ "step": 99,
473
+ "reward": 0.61912
474
+ },
475
+ {
476
+ "step": 100,
477
+ "reward": 0.541239
478
+ },
479
+ {
480
+ "step": 101,
481
+ "reward": 0.596385
482
+ },
483
+ {
484
+ "step": 102,
485
+ "reward": 0.634048
486
+ },
487
+ {
488
+ "step": 103,
489
+ "reward": 0.576916
490
+ },
491
+ {
492
+ "step": 104,
493
+ "reward": 0.690852
494
+ },
495
+ {
496
+ "step": 105,
497
+ "reward": 0.495425
498
+ },
499
+ {
500
+ "step": 106,
501
+ "reward": 0.5244
502
+ },
503
+ {
504
+ "step": 107,
505
+ "reward": 0.682275
506
+ },
507
+ {
508
+ "step": 108,
509
+ "reward": 0.57557
510
+ },
511
+ {
512
+ "step": 109,
513
+ "reward": 0.48191
514
+ },
515
+ {
516
+ "step": 110,
517
+ "reward": 0.675139
518
+ },
519
+ {
520
+ "step": 111,
521
+ "reward": 0.729883
522
+ },
523
+ {
524
+ "step": 112,
525
+ "reward": 0.534331
526
+ },
527
+ {
528
+ "step": 113,
529
+ "reward": 0.44131
530
+ },
531
+ {
532
+ "step": 114,
533
+ "reward": 0.570031
534
+ },
535
+ {
536
+ "step": 115,
537
+ "reward": 0.570535
538
+ },
539
+ {
540
+ "step": 116,
541
+ "reward": 0.557689
542
+ },
543
+ {
544
+ "step": 117,
545
+ "reward": 0.727354
546
+ },
547
+ {
548
+ "step": 118,
549
+ "reward": 0.490705
550
+ },
551
+ {
552
+ "step": 119,
553
+ "reward": 0.71466
554
+ },
555
+ {
556
+ "step": 120,
557
+ "reward": 0.47294
558
+ },
559
+ {
560
+ "step": 121,
561
+ "reward": 0.521571
562
+ },
563
+ {
564
+ "step": 122,
565
+ "reward": 0.65766
566
+ },
567
+ {
568
+ "step": 123,
569
+ "reward": 0.705344
570
+ },
571
+ {
572
+ "step": 124,
573
+ "reward": 0.681263
574
+ },
575
+ {
576
+ "step": 125,
577
+ "reward": 0.635272
578
+ },
579
+ {
580
+ "step": 126,
581
+ "reward": 0.618379
582
+ },
583
+ {
584
+ "step": 127,
585
+ "reward": 0.620987
586
+ },
587
+ {
588
+ "step": 128,
589
+ "reward": 0.660343
590
+ },
591
+ {
592
+ "step": 129,
593
+ "reward": 0.595361
594
+ },
595
+ {
596
+ "step": 130,
597
+ "reward": 0.636973
598
+ },
599
+ {
600
+ "step": 131,
601
+ "reward": 0.664112
602
+ },
603
+ {
604
+ "step": 132,
605
+ "reward": 0.616436
606
+ },
607
+ {
608
+ "step": 133,
609
+ "reward": 0.683005
610
+ },
611
+ {
612
+ "step": 134,
613
+ "reward": 0.667534
614
+ },
615
+ {
616
+ "step": 135,
617
+ "reward": 0.881382
618
+ },
619
+ {
620
+ "step": 136,
621
+ "reward": 0.66199
622
+ },
623
+ {
624
+ "step": 137,
625
+ "reward": 0.565077
626
+ },
627
+ {
628
+ "step": 138,
629
+ "reward": 0.572436
630
+ },
631
+ {
632
+ "step": 139,
633
+ "reward": 0.618337
634
+ },
635
+ {
636
+ "step": 140,
637
+ "reward": 0.736507
638
+ },
639
+ {
640
+ "step": 141,
641
+ "reward": 0.577814
642
+ },
643
+ {
644
+ "step": 142,
645
+ "reward": 0.668061
646
+ },
647
+ {
648
+ "step": 143,
649
+ "reward": 0.847441
650
+ },
651
+ {
652
+ "step": 144,
653
+ "reward": 0.304506
654
+ },
655
+ {
656
+ "step": 145,
657
+ "reward": 0.482615
658
+ },
659
+ {
660
+ "step": 146,
661
+ "reward": 0.649624
662
+ },
663
+ {
664
+ "step": 147,
665
+ "reward": 0.668074
666
+ },
667
+ {
668
+ "step": 148,
669
+ "reward": 0.648607
670
+ },
671
+ {
672
+ "step": 149,
673
+ "reward": 0.568635
674
+ },
675
+ {
676
+ "step": 150,
677
+ "reward": 0.697542
678
+ },
679
+ {
680
+ "step": 151,
681
+ "reward": 0.653173
682
+ },
683
+ {
684
+ "step": 152,
685
+ "reward": 0.559021
686
+ },
687
+ {
688
+ "step": 153,
689
+ "reward": 0.901959
690
+ },
691
+ {
692
+ "step": 154,
693
+ "reward": 0.66093
694
+ },
695
+ {
696
+ "step": 155,
697
+ "reward": 0.556553
698
+ },
699
+ {
700
+ "step": 156,
701
+ "reward": 0.608693
702
+ },
703
+ {
704
+ "step": 157,
705
+ "reward": 0.594525
706
+ },
707
+ {
708
+ "step": 158,
709
+ "reward": 0.612964
710
+ },
711
+ {
712
+ "step": 159,
713
+ "reward": 0.316165
714
+ },
715
+ {
716
+ "step": 160,
717
+ "reward": 0.56615
718
+ },
719
+ {
720
+ "step": 161,
721
+ "reward": 0.730762
722
+ },
723
+ {
724
+ "step": 162,
725
+ "reward": 0.492574
726
+ },
727
+ {
728
+ "step": 163,
729
+ "reward": 0.612778
730
+ },
731
+ {
732
+ "step": 164,
733
+ "reward": 0.722495
734
+ },
735
+ {
736
+ "step": 165,
737
+ "reward": 0.711368
738
+ },
739
+ {
740
+ "step": 166,
741
+ "reward": 0.777962
742
+ },
743
+ {
744
+ "step": 167,
745
+ "reward": 0.441072
746
+ },
747
+ {
748
+ "step": 168,
749
+ "reward": 0.583112
750
+ },
751
+ {
752
+ "step": 169,
753
+ "reward": 0.584674
754
+ },
755
+ {
756
+ "step": 170,
757
+ "reward": 0.684097
758
+ },
759
+ {
760
+ "step": 171,
761
+ "reward": 0.731428
762
+ },
763
+ {
764
+ "step": 172,
765
+ "reward": 0.348273
766
+ },
767
+ {
768
+ "step": 173,
769
+ "reward": 0.72942
770
+ },
771
+ {
772
+ "step": 174,
773
+ "reward": 0.475635
774
+ },
775
+ {
776
+ "step": 175,
777
+ "reward": 0.687601
778
+ },
779
+ {
780
+ "step": 176,
781
+ "reward": 0.473503
782
+ },
783
+ {
784
+ "step": 177,
785
+ "reward": 0.637129
786
+ },
787
+ {
788
+ "step": 178,
789
+ "reward": 0.735436
790
+ },
791
+ {
792
+ "step": 179,
793
+ "reward": 0.605688
794
+ },
795
+ {
796
+ "step": 180,
797
+ "reward": 0.638169
798
+ },
799
+ {
800
+ "step": 181,
801
+ "reward": 0.695168
802
+ },
803
+ {
804
+ "step": 182,
805
+ "reward": 0.633222
806
+ },
807
+ {
808
+ "step": 183,
809
+ "reward": 0.611794
810
+ },
811
+ {
812
+ "step": 184,
813
+ "reward": 0.761014
814
+ },
815
+ {
816
+ "step": 185,
817
+ "reward": 0.715614
818
+ },
819
+ {
820
+ "step": 186,
821
+ "reward": 0.593434
822
+ },
823
+ {
824
+ "step": 187,
825
+ "reward": 0.866096
826
+ },
827
+ {
828
+ "step": 188,
829
+ "reward": 0.518085
830
+ },
831
+ {
832
+ "step": 189,
833
+ "reward": 0.700568
834
+ },
835
+ {
836
+ "step": 190,
837
+ "reward": 0.5968
838
+ },
839
+ {
840
+ "step": 191,
841
+ "reward": 0.631455
842
+ },
843
+ {
844
+ "step": 192,
845
+ "reward": 0.680462
846
+ },
847
+ {
848
+ "step": 193,
849
+ "reward": 0.638886
850
+ },
851
+ {
852
+ "step": 194,
853
+ "reward": 0.67378
854
+ },
855
+ {
856
+ "step": 195,
857
+ "reward": 0.492571
858
+ },
859
+ {
860
+ "step": 196,
861
+ "reward": 0.495229
862
+ },
863
+ {
864
+ "step": 197,
865
+ "reward": 0.670352
866
+ },
867
+ {
868
+ "step": 198,
869
+ "reward": 0.541884
870
+ },
871
+ {
872
+ "step": 199,
873
+ "reward": 0.537531
874
+ },
875
+ {
876
+ "step": 200,
877
+ "reward": 0.503047
878
+ },
879
+ {
880
+ "step": 201,
881
+ "reward": 0.719761
882
+ },
883
+ {
884
+ "step": 202,
885
+ "reward": 0.678232
886
+ },
887
+ {
888
+ "step": 203,
889
+ "reward": 0.782038
890
+ },
891
+ {
892
+ "step": 204,
893
+ "reward": 0.51836
894
+ },
895
+ {
896
+ "step": 205,
897
+ "reward": 0.6219
898
+ },
899
+ {
900
+ "step": 206,
901
+ "reward": 0.499499
902
+ },
903
+ {
904
+ "step": 207,
905
+ "reward": 0.705834
906
+ },
907
+ {
908
+ "step": 208,
909
+ "reward": 0.794095
910
+ },
911
+ {
912
+ "step": 209,
913
+ "reward": 0.530957
914
+ },
915
+ {
916
+ "step": 210,
917
+ "reward": 0.790732
918
+ },
919
+ {
920
+ "step": 211,
921
+ "reward": 0.730657
922
+ },
923
+ {
924
+ "step": 212,
925
+ "reward": 0.609549
926
+ },
927
+ {
928
+ "step": 213,
929
+ "reward": 0.424989
930
+ },
931
+ {
932
+ "step": 214,
933
+ "reward": 0.774419
934
+ },
935
+ {
936
+ "step": 215,
937
+ "reward": 0.620916
938
+ },
939
+ {
940
+ "step": 216,
941
+ "reward": 0.570477
942
+ },
943
+ {
944
+ "step": 217,
945
+ "reward": 0.672819
946
+ },
947
+ {
948
+ "step": 218,
949
+ "reward": 0.67449
950
+ },
951
+ {
952
+ "step": 219,
953
+ "reward": 0.783378
954
+ },
955
+ {
956
+ "step": 220,
957
+ "reward": 0.534397
958
+ },
959
+ {
960
+ "step": 221,
961
+ "reward": 0.747674
962
+ },
963
+ {
964
+ "step": 222,
965
+ "reward": 0.782066
966
+ },
967
+ {
968
+ "step": 223,
969
+ "reward": 0.778582
970
+ },
971
+ {
972
+ "step": 224,
973
+ "reward": 0.621428
974
+ },
975
+ {
976
+ "step": 225,
977
+ "reward": 0.568608
978
+ },
979
+ {
980
+ "step": 226,
981
+ "reward": 0.737255
982
+ },
983
+ {
984
+ "step": 227,
985
+ "reward": 0.652347
986
+ },
987
+ {
988
+ "step": 228,
989
+ "reward": 0.65401
990
+ },
991
+ {
992
+ "step": 229,
993
+ "reward": 0.775629
994
+ },
995
+ {
996
+ "step": 230,
997
+ "reward": 0.619872
998
+ },
999
+ {
1000
+ "step": 231,
1001
+ "reward": 0.434667
1002
+ },
1003
+ {
1004
+ "step": 232,
1005
+ "reward": 0.610753
1006
+ },
1007
+ {
1008
+ "step": 233,
1009
+ "reward": 0.479459
1010
+ },
1011
+ {
1012
+ "step": 234,
1013
+ "reward": 0.721158
1014
+ },
1015
+ {
1016
+ "step": 235,
1017
+ "reward": 0.676868
1018
+ },
1019
+ {
1020
+ "step": 236,
1021
+ "reward": 0.595565
1022
+ },
1023
+ {
1024
+ "step": 237,
1025
+ "reward": 0.649606
1026
+ },
1027
+ {
1028
+ "step": 238,
1029
+ "reward": 0.723794
1030
+ },
1031
+ {
1032
+ "step": 239,
1033
+ "reward": 0.659056
1034
+ },
1035
+ {
1036
+ "step": 240,
1037
+ "reward": 0.766819
1038
+ },
1039
+ {
1040
+ "step": 241,
1041
+ "reward": 0.648818
1042
+ },
1043
+ {
1044
+ "step": 242,
1045
+ "reward": 0.742717
1046
+ },
1047
+ {
1048
+ "step": 243,
1049
+ "reward": 0.780705
1050
+ },
1051
+ {
1052
+ "step": 244,
1053
+ "reward": 0.790458
1054
+ },
1055
+ {
1056
+ "step": 245,
1057
+ "reward": 0.602242
1058
+ },
1059
+ {
1060
+ "step": 246,
1061
+ "reward": 0.730449
1062
+ },
1063
+ {
1064
+ "step": 247,
1065
+ "reward": 0.507251
1066
+ },
1067
+ {
1068
+ "step": 248,
1069
+ "reward": 0.573145
1070
+ },
1071
+ {
1072
+ "step": 249,
1073
+ "reward": 0.504581
1074
+ },
1075
+ {
1076
+ "step": 250,
1077
+ "reward": 0.746683
1078
+ },
1079
+ {
1080
+ "step": 251,
1081
+ "reward": 0.566306
1082
+ },
1083
+ {
1084
+ "step": 252,
1085
+ "reward": 0.662887
1086
+ },
1087
+ {
1088
+ "step": 253,
1089
+ "reward": 0.649944
1090
+ },
1091
+ {
1092
+ "step": 254,
1093
+ "reward": 0.663484
1094
+ },
1095
+ {
1096
+ "step": 255,
1097
+ "reward": 0.6217
1098
+ },
1099
+ {
1100
+ "step": 256,
1101
+ "reward": 0.685033
1102
+ },
1103
+ {
1104
+ "step": 257,
1105
+ "reward": 0.801874
1106
+ },
1107
+ {
1108
+ "step": 258,
1109
+ "reward": 0.672524
1110
+ },
1111
+ {
1112
+ "step": 259,
1113
+ "reward": 0.70903
1114
+ },
1115
+ {
1116
+ "step": 260,
1117
+ "reward": 0.74365
1118
+ },
1119
+ {
1120
+ "step": 261,
1121
+ "reward": 0.657706
1122
+ },
1123
+ {
1124
+ "step": 262,
1125
+ "reward": 0.583078
1126
+ },
1127
+ {
1128
+ "step": 263,
1129
+ "reward": 0.634522
1130
+ },
1131
+ {
1132
+ "step": 264,
1133
+ "reward": 0.749714
1134
+ },
1135
+ {
1136
+ "step": 265,
1137
+ "reward": 0.561466
1138
+ },
1139
+ {
1140
+ "step": 266,
1141
+ "reward": 0.63539
1142
+ },
1143
+ {
1144
+ "step": 267,
1145
+ "reward": 0.745787
1146
+ },
1147
+ {
1148
+ "step": 268,
1149
+ "reward": 0.731571
1150
+ },
1151
+ {
1152
+ "step": 269,
1153
+ "reward": 0.679612
1154
+ },
1155
+ {
1156
+ "step": 270,
1157
+ "reward": 0.733146
1158
  }
1159
  ],
1160
  "config": {
1161
  "model": "Qwen/Qwen2.5-0.5B-Instruct",
1162
  "n_per_task": 30,
1163
+ "num_generations": 8,
1164
+ "epochs": 3,
1165
+ "lr": 1e-06,
1166
+ "beta": 0.1,
1167
+ "per_device_train_batch_size": 1,
1168
+ "gradient_accumulation_steps": 8,
1169
+ "fp16": false,
1170
+ "bf16": false,
1171
+ "gradient_checkpointing": true,
1172
+ "kl_penalty": 0.1,
1173
+ "framework": "TRL GRPOTrainer",
1174
+ "report_to": "wandb"
1175
+ },
1176
+ "evaluation_metadata": {
1177
+ "n_eval_samples_per_task": 10,
1178
+ "tasks": [
1179
+ "easy",
1180
+ "medium",
1181
+ "hard"
1182
+ ],
1183
+ "baseline_model": "Qwen2.5-0.5B-Instruct (untrained, fp16)",
1184
+ "trained_model": "Qwen2.5-0.5B-Instruct (GRPO, 270 steps, fp32)",
1185
+ "external_baseline_note": "An untuned Nemotron 120B (via OpenRouter) scores 0.337 average across these 3 tasks via inference.py. See README for details."
1186
  }
1187
  }