aman-jaglan commited on
Commit
2e3a075
·
verified ·
1 Parent(s): d8e999d

Copy to checkpoint/trainer_state.json

Browse files
Files changed (1) hide show
  1. checkpoint/trainer_state.json +664 -0
checkpoint/trainer_state.json ADDED
@@ -0,0 +1,664 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.1568627450980392,
6
+ "eval_steps": 500,
7
+ "global_step": 30,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "accuracy_delta": -0.03125,
14
+ "baseline_accuracy": 0.5625,
15
+ "completion_length": 1660.1953125,
16
+ "degradation_rate": 0.1875,
17
+ "epoch": 0.00522875816993464,
18
+ "grad_norm": 1.8285036167620026,
19
+ "improvement_rate": 0.15625,
20
+ "kl": 0.0,
21
+ "learning_rate": 2e-06,
22
+ "loss": 0.0,
23
+ "reward": 0.24004681408405304,
24
+ "reward_std": 0.25635848194360733,
25
+ "rewards/AdaptiveTeachingReward": 0.24004681408405304,
26
+ "step": 1,
27
+ "student_accuracy": 0.53125,
28
+ "student_approach_length": 500.0,
29
+ "teaching_length_mean": 1349.5,
30
+ "teaching_length_std": 1533.2601122433693,
31
+ "token_efficiency": 0.01778511088825826
32
+ },
33
+ {
34
+ "accuracy_delta": -0.21875,
35
+ "baseline_accuracy": 1.0,
36
+ "completion_length": 2174.171875,
37
+ "degradation_rate": 0.21875,
38
+ "epoch": 0.01045751633986928,
39
+ "grad_norm": 1.2225533588422857,
40
+ "improvement_rate": 0.0,
41
+ "kl": 0.002572178840637207,
42
+ "learning_rate": 2e-06,
43
+ "loss": 0.0001,
44
+ "reward": 0.2656950503587723,
45
+ "reward_std": 0.17467603832483292,
46
+ "rewards/AdaptiveTeachingReward": 0.2656950503587723,
47
+ "step": 2,
48
+ "student_accuracy": 0.78125,
49
+ "student_approach_length": 489.875,
50
+ "teaching_length_mean": 1645.90625,
51
+ "teaching_length_std": 1777.2254866782437,
52
+ "token_efficiency": 0.016061369340544678
53
+ },
54
+ {
55
+ "accuracy_delta": -0.0625,
56
+ "baseline_accuracy": 0.0625,
57
+ "completion_length": 2889.65625,
58
+ "degradation_rate": 0.0625,
59
+ "epoch": 0.01568627450980392,
60
+ "grad_norm": 0.4912196165940296,
61
+ "improvement_rate": 0.0,
62
+ "kl": 0.0021309852600097656,
63
+ "learning_rate": 2e-06,
64
+ "loss": 0.0001,
65
+ "reward": 0.1566198617219925,
66
+ "reward_std": 0.17452973127365112,
67
+ "rewards/AdaptiveTeachingReward": 0.1566198617219925,
68
+ "step": 3,
69
+ "student_accuracy": 0.0,
70
+ "student_approach_length": 500.0,
71
+ "teaching_length_mean": 3435.71875,
72
+ "teaching_length_std": 874.6175996443687,
73
+ "token_efficiency": 0.005029539554335019
74
+ },
75
+ {
76
+ "accuracy_delta": 0.0,
77
+ "baseline_accuracy": 0.0,
78
+ "completion_length": 1765.4765625,
79
+ "degradation_rate": 0.0,
80
+ "epoch": 0.02091503267973856,
81
+ "grad_norm": 0.014919439029800962,
82
+ "improvement_rate": 0.0,
83
+ "kl": 0.0024797916412353516,
84
+ "learning_rate": 2e-06,
85
+ "loss": 0.0001,
86
+ "reward": 0.0,
87
+ "reward_std": 0.0,
88
+ "rewards/AdaptiveTeachingReward": 0.0,
89
+ "step": 4,
90
+ "student_accuracy": 0.0,
91
+ "student_approach_length": 500.0,
92
+ "teaching_length_mean": 1644.53125,
93
+ "teaching_length_std": 1544.9233390329066,
94
+ "token_efficiency": 0.0
95
+ },
96
+ {
97
+ "accuracy_delta": 0.1875,
98
+ "baseline_accuracy": 0.375,
99
+ "completion_length": 2330.296875,
100
+ "degradation_rate": 0.09375,
101
+ "epoch": 0.026143790849673203,
102
+ "grad_norm": 1.6480657403271404,
103
+ "improvement_rate": 0.28125,
104
+ "kl": 0.002542257308959961,
105
+ "learning_rate": 2e-06,
106
+ "loss": 0.0001,
107
+ "reward": 0.293629452586174,
108
+ "reward_std": 0.31043318659067154,
109
+ "rewards/AdaptiveTeachingReward": 0.293629452586174,
110
+ "step": 5,
111
+ "student_accuracy": 0.5625,
112
+ "student_approach_length": 500.0,
113
+ "teaching_length_mean": 2543.6875,
114
+ "teaching_length_std": 1157.76463240177,
115
+ "token_efficiency": 0.015775285003662067
116
+ },
117
+ {
118
+ "accuracy_delta": -0.125,
119
+ "baseline_accuracy": 0.375,
120
+ "completion_length": 2799.2265625,
121
+ "degradation_rate": 0.21875,
122
+ "epoch": 0.03137254901960784,
123
+ "grad_norm": 1.3626772466174568,
124
+ "improvement_rate": 0.09375,
125
+ "kl": 0.0024237632751464844,
126
+ "learning_rate": 2e-06,
127
+ "loss": 0.0001,
128
+ "reward": 0.297846183180809,
129
+ "reward_std": 0.1792445182800293,
130
+ "rewards/AdaptiveTeachingReward": 0.297846183180809,
131
+ "step": 6,
132
+ "student_accuracy": 0.25,
133
+ "student_approach_length": 500.0,
134
+ "teaching_length_mean": 3300.46875,
135
+ "teaching_length_std": 1322.463354762313,
136
+ "token_efficiency": 0.008984860526379333
137
+ },
138
+ {
139
+ "accuracy_delta": 0.15625,
140
+ "baseline_accuracy": 0.09375,
141
+ "completion_length": 2839.59375,
142
+ "degradation_rate": 0.0,
143
+ "epoch": 0.036601307189542485,
144
+ "grad_norm": 1.0148015671601693,
145
+ "improvement_rate": 0.15625,
146
+ "kl": 0.0022563934326171875,
147
+ "learning_rate": 2e-06,
148
+ "loss": 0.0001,
149
+ "reward": 0.22244123369455338,
150
+ "reward_std": 0.32708095014095306,
151
+ "rewards/AdaptiveTeachingReward": 0.22244123369455338,
152
+ "step": 7,
153
+ "student_accuracy": 0.25,
154
+ "student_approach_length": 500.0,
155
+ "teaching_length_mean": 2663.28125,
156
+ "teaching_length_std": 1263.471554191286,
157
+ "token_efficiency": 0.009322577760442718
158
+ },
159
+ {
160
+ "accuracy_delta": 0.03125,
161
+ "baseline_accuracy": 0.34375,
162
+ "completion_length": 2997.0234375,
163
+ "degradation_rate": 0.0625,
164
+ "epoch": 0.04183006535947712,
165
+ "grad_norm": 1.0933058629769363,
166
+ "improvement_rate": 0.09375,
167
+ "kl": 0.00222015380859375,
168
+ "learning_rate": 2e-06,
169
+ "loss": 0.0001,
170
+ "reward": 0.4145798534154892,
171
+ "reward_std": 0.348370686173439,
172
+ "rewards/AdaptiveTeachingReward": 0.4145798534154892,
173
+ "step": 8,
174
+ "student_accuracy": 0.375,
175
+ "student_approach_length": 500.0,
176
+ "teaching_length_mean": 2859.9375,
177
+ "teaching_length_std": 1548.3925018635746,
178
+ "token_efficiency": 0.014895284304358753
179
+ },
180
+ {
181
+ "accuracy_delta": 0.0,
182
+ "baseline_accuracy": 0.0,
183
+ "completion_length": 2069.3515625,
184
+ "degradation_rate": 0.0,
185
+ "epoch": 0.047058823529411764,
186
+ "grad_norm": 0.6907373861546955,
187
+ "improvement_rate": 0.0,
188
+ "kl": 0.0022356510162353516,
189
+ "learning_rate": 2e-06,
190
+ "loss": 0.0001,
191
+ "reward": 0.13787749409675598,
192
+ "reward_std": 0.08163860440254211,
193
+ "rewards/AdaptiveTeachingReward": 0.13787749409675598,
194
+ "step": 9,
195
+ "student_accuracy": 0.0,
196
+ "student_approach_length": 500.0,
197
+ "teaching_length_mean": 1586.78125,
198
+ "teaching_length_std": 1393.5538468081056,
199
+ "token_efficiency": 0.009196431155361413
200
+ },
201
+ {
202
+ "accuracy_delta": 0.0,
203
+ "baseline_accuracy": 0.0,
204
+ "completion_length": 2533.3203125,
205
+ "degradation_rate": 0.0,
206
+ "epoch": 0.05228758169934641,
207
+ "grad_norm": 0.9569337200656036,
208
+ "improvement_rate": 0.0,
209
+ "kl": 0.002455472946166992,
210
+ "learning_rate": 2e-06,
211
+ "loss": 0.0001,
212
+ "reward": 0.2650887817144394,
213
+ "reward_std": 0.1834174394607544,
214
+ "rewards/AdaptiveTeachingReward": 0.2650887817144394,
215
+ "step": 10,
216
+ "student_accuracy": 0.0,
217
+ "student_approach_length": 500.0,
218
+ "teaching_length_mean": 2718.53125,
219
+ "teaching_length_std": 1545.6357044636015,
220
+ "token_efficiency": 0.00983657689669804
221
+ },
222
+ {
223
+ "accuracy_delta": 0.0,
224
+ "baseline_accuracy": 0.0,
225
+ "completion_length": 2366.4609375,
226
+ "degradation_rate": 0.0,
227
+ "epoch": 0.05751633986928104,
228
+ "grad_norm": 0.1992844149700769,
229
+ "improvement_rate": 0.0,
230
+ "kl": 0.0023772716522216797,
231
+ "learning_rate": 2e-06,
232
+ "loss": 0.0001,
233
+ "reward": 0.00799931213259697,
234
+ "reward_std": 0.045250944793224335,
235
+ "rewards/AdaptiveTeachingReward": 0.00799931213259697,
236
+ "step": 11,
237
+ "student_accuracy": 0.0,
238
+ "student_approach_length": 500.0,
239
+ "teaching_length_mean": 2560.28125,
240
+ "teaching_length_std": 1264.0513369895032,
241
+ "token_efficiency": 0.000244351732796639
242
+ },
243
+ {
244
+ "accuracy_delta": 0.0,
245
+ "baseline_accuracy": 0.0,
246
+ "completion_length": 2429.25,
247
+ "degradation_rate": 0.0,
248
+ "epoch": 0.06274509803921569,
249
+ "grad_norm": 0.02438642631727396,
250
+ "improvement_rate": 0.0,
251
+ "kl": 0.0028073787689208984,
252
+ "learning_rate": 2e-06,
253
+ "loss": 0.0001,
254
+ "reward": 0.0,
255
+ "reward_std": 0.0,
256
+ "rewards/AdaptiveTeachingReward": 0.0,
257
+ "step": 12,
258
+ "student_accuracy": 0.0,
259
+ "student_approach_length": 500.0,
260
+ "teaching_length_mean": 2684.0,
261
+ "teaching_length_std": 1537.9383137400418,
262
+ "token_efficiency": 0.0
263
+ },
264
+ {
265
+ "accuracy_delta": 0.03125,
266
+ "baseline_accuracy": 0.03125,
267
+ "completion_length": 2985.0234375,
268
+ "degradation_rate": 0.03125,
269
+ "epoch": 0.06797385620915032,
270
+ "grad_norm": 1.9377263264415499,
271
+ "improvement_rate": 0.0625,
272
+ "kl": 0.002295255661010742,
273
+ "learning_rate": 2e-06,
274
+ "loss": 0.0001,
275
+ "reward": 0.3229658156633377,
276
+ "reward_std": 0.2836003005504608,
277
+ "rewards/AdaptiveTeachingReward": 0.3229658156633377,
278
+ "step": 13,
279
+ "student_accuracy": 0.0625,
280
+ "student_approach_length": 500.0,
281
+ "teaching_length_mean": 2764.78125,
282
+ "teaching_length_std": 1359.6938149421062,
283
+ "token_efficiency": 0.011864912871618307
284
+ },
285
+ {
286
+ "accuracy_delta": 0.0,
287
+ "baseline_accuracy": 0.40625,
288
+ "completion_length": 2860.7890625,
289
+ "degradation_rate": 0.03125,
290
+ "epoch": 0.07320261437908497,
291
+ "grad_norm": 0.7397348398480618,
292
+ "improvement_rate": 0.03125,
293
+ "kl": 0.0022339820861816406,
294
+ "learning_rate": 2e-06,
295
+ "loss": 0.0001,
296
+ "reward": 0.3183840811252594,
297
+ "reward_std": 0.2730633243918419,
298
+ "rewards/AdaptiveTeachingReward": 0.3183840811252594,
299
+ "step": 14,
300
+ "student_accuracy": 0.40625,
301
+ "student_approach_length": 500.0,
302
+ "teaching_length_mean": 3300.75,
303
+ "teaching_length_std": 1258.5522551939885,
304
+ "token_efficiency": 0.009677591351820222
305
+ },
306
+ {
307
+ "accuracy_delta": -0.25,
308
+ "baseline_accuracy": 0.59375,
309
+ "completion_length": 2667.375,
310
+ "degradation_rate": 0.375,
311
+ "epoch": 0.0784313725490196,
312
+ "grad_norm": 0.591091766464445,
313
+ "improvement_rate": 0.125,
314
+ "kl": 0.0020873546600341797,
315
+ "learning_rate": 2e-06,
316
+ "loss": 0.0001,
317
+ "reward": 0.14508739858865738,
318
+ "reward_std": 0.1915995106101036,
319
+ "rewards/AdaptiveTeachingReward": 0.14508739858865738,
320
+ "step": 15,
321
+ "student_accuracy": 0.34375,
322
+ "student_approach_length": 500.0,
323
+ "teaching_length_mean": 2580.1875,
324
+ "teaching_length_std": 1411.9177183341415,
325
+ "token_efficiency": 0.006345218750950054
326
+ },
327
+ {
328
+ "accuracy_delta": -0.125,
329
+ "baseline_accuracy": 0.78125,
330
+ "completion_length": 2397.8046875,
331
+ "degradation_rate": 0.28125,
332
+ "epoch": 0.08366013071895424,
333
+ "grad_norm": 1.0467841728187868,
334
+ "improvement_rate": 0.15625,
335
+ "kl": 0.0020101070404052734,
336
+ "learning_rate": 2e-06,
337
+ "loss": 0.0001,
338
+ "reward": 0.29722827672958374,
339
+ "reward_std": 0.29581306129693985,
340
+ "rewards/AdaptiveTeachingReward": 0.29722827672958374,
341
+ "step": 16,
342
+ "student_accuracy": 0.65625,
343
+ "student_approach_length": 500.0,
344
+ "teaching_length_mean": 1867.65625,
345
+ "teaching_length_std": 1645.2406135166107,
346
+ "token_efficiency": 0.016315762169946447
347
+ },
348
+ {
349
+ "accuracy_delta": 0.0,
350
+ "baseline_accuracy": 0.4375,
351
+ "completion_length": 2681.4296875,
352
+ "degradation_rate": 0.0625,
353
+ "epoch": 0.08888888888888889,
354
+ "grad_norm": 1.0178368263353954,
355
+ "improvement_rate": 0.0625,
356
+ "kl": 0.0022170543670654297,
357
+ "learning_rate": 2e-06,
358
+ "loss": 0.0001,
359
+ "reward": 0.27442795038223267,
360
+ "reward_std": 0.24267160892486572,
361
+ "rewards/AdaptiveTeachingReward": 0.27442795038223267,
362
+ "step": 17,
363
+ "student_accuracy": 0.4375,
364
+ "student_approach_length": 500.0,
365
+ "teaching_length_mean": 1946.4375,
366
+ "teaching_length_std": 1595.4173133278073,
367
+ "token_efficiency": 0.01725690617086827
368
+ },
369
+ {
370
+ "accuracy_delta": 0.0,
371
+ "baseline_accuracy": 0.0,
372
+ "completion_length": 2260.1484375,
373
+ "degradation_rate": 0.0,
374
+ "epoch": 0.09411764705882353,
375
+ "grad_norm": 0.39094525161531923,
376
+ "improvement_rate": 0.0,
377
+ "kl": 0.0026335716247558594,
378
+ "learning_rate": 2e-06,
379
+ "loss": 0.0001,
380
+ "reward": 0.15987491607666016,
381
+ "reward_std": 0.12795361876487732,
382
+ "rewards/AdaptiveTeachingReward": 0.15987491607666016,
383
+ "step": 18,
384
+ "student_accuracy": 0.0,
385
+ "student_approach_length": 500.0,
386
+ "teaching_length_mean": 2452.8125,
387
+ "teaching_length_std": 1687.6677585883976,
388
+ "token_efficiency": 0.00733118954839666
389
+ },
390
+ {
391
+ "accuracy_delta": 0.09375,
392
+ "baseline_accuracy": 0.34375,
393
+ "completion_length": 2509.359375,
394
+ "degradation_rate": 0.0,
395
+ "epoch": 0.09934640522875816,
396
+ "grad_norm": 1.112027848483532,
397
+ "improvement_rate": 0.09375,
398
+ "kl": 0.0021845102310180664,
399
+ "learning_rate": 2e-06,
400
+ "loss": 0.0001,
401
+ "reward": 0.32069824635982513,
402
+ "reward_std": 0.1779022440314293,
403
+ "rewards/AdaptiveTeachingReward": 0.32069824635982513,
404
+ "step": 19,
405
+ "student_accuracy": 0.4375,
406
+ "student_approach_length": 500.0,
407
+ "teaching_length_mean": 1688.25,
408
+ "teaching_length_std": 1591.1127810270257,
409
+ "token_efficiency": 0.02000900419695598
410
+ },
411
+ {
412
+ "accuracy_delta": 0.15625,
413
+ "baseline_accuracy": 0.25,
414
+ "completion_length": 2566.109375,
415
+ "degradation_rate": 0.125,
416
+ "epoch": 0.10457516339869281,
417
+ "grad_norm": 0.635668741879516,
418
+ "improvement_rate": 0.28125,
419
+ "kl": 0.0020592212677001953,
420
+ "learning_rate": 2e-06,
421
+ "loss": 0.0001,
422
+ "reward": 0.3237999305129051,
423
+ "reward_std": 0.29250405728816986,
424
+ "rewards/AdaptiveTeachingReward": 0.3237999305129051,
425
+ "step": 20,
426
+ "student_accuracy": 0.40625,
427
+ "student_approach_length": 500.0,
428
+ "teaching_length_mean": 3000.8125,
429
+ "teaching_length_std": 1146.9664495361749,
430
+ "token_efficiency": 0.009248973352977438
431
+ },
432
+ {
433
+ "accuracy_delta": -0.15625,
434
+ "baseline_accuracy": 0.5,
435
+ "completion_length": 2821.34375,
436
+ "degradation_rate": 0.15625,
437
+ "epoch": 0.10980392156862745,
438
+ "grad_norm": 1.038632983891241,
439
+ "improvement_rate": 0.0,
440
+ "kl": 0.0022677183151245117,
441
+ "learning_rate": 2e-06,
442
+ "loss": 0.0001,
443
+ "reward": 0.26167380064725876,
444
+ "reward_std": 0.20054005086421967,
445
+ "rewards/AdaptiveTeachingReward": 0.26167380064725876,
446
+ "step": 21,
447
+ "student_accuracy": 0.34375,
448
+ "student_approach_length": 500.0,
449
+ "teaching_length_mean": 2324.625,
450
+ "teaching_length_std": 1783.1465396131703,
451
+ "token_efficiency": 0.011391752427342916
452
+ },
453
+ {
454
+ "accuracy_delta": -0.0625,
455
+ "baseline_accuracy": 0.6875,
456
+ "completion_length": 1981.4453125,
457
+ "degradation_rate": 0.21875,
458
+ "epoch": 0.11503267973856209,
459
+ "grad_norm": 1.0591629807323937,
460
+ "improvement_rate": 0.15625,
461
+ "kl": 0.0025501251220703125,
462
+ "learning_rate": 2e-06,
463
+ "loss": 0.0001,
464
+ "reward": 0.3476671576499939,
465
+ "reward_std": 0.32001765072345734,
466
+ "rewards/AdaptiveTeachingReward": 0.3476671576499939,
467
+ "step": 22,
468
+ "student_accuracy": 0.625,
469
+ "student_approach_length": 500.0,
470
+ "teaching_length_mean": 1747.6875,
471
+ "teaching_length_std": 1544.1982674617702,
472
+ "token_efficiency": 0.019930595836054183
473
+ },
474
+ {
475
+ "accuracy_delta": 0.0,
476
+ "baseline_accuracy": 0.6875,
477
+ "completion_length": 2491.2109375,
478
+ "degradation_rate": 0.15625,
479
+ "epoch": 0.12026143790849673,
480
+ "grad_norm": 0.8736384977021919,
481
+ "improvement_rate": 0.15625,
482
+ "kl": 0.002077817916870117,
483
+ "learning_rate": 2e-06,
484
+ "loss": 0.0001,
485
+ "reward": 0.3966591954231262,
486
+ "reward_std": 0.35394637286663055,
487
+ "rewards/AdaptiveTeachingReward": 0.3966591954231262,
488
+ "step": 23,
489
+ "student_accuracy": 0.6875,
490
+ "student_approach_length": 500.0,
491
+ "teaching_length_mean": 2244.40625,
492
+ "teaching_length_std": 1464.1512714006012,
493
+ "token_efficiency": 0.020170202633726
494
+ },
495
+ {
496
+ "accuracy_delta": -0.0625,
497
+ "baseline_accuracy": 0.40625,
498
+ "completion_length": 3183.6796875,
499
+ "degradation_rate": 0.09375,
500
+ "epoch": 0.12549019607843137,
501
+ "grad_norm": 0.731937384244321,
502
+ "improvement_rate": 0.03125,
503
+ "kl": 0.002083301544189453,
504
+ "learning_rate": 2e-06,
505
+ "loss": 0.0001,
506
+ "reward": 0.24638524651527405,
507
+ "reward_std": 0.25337880849838257,
508
+ "rewards/AdaptiveTeachingReward": 0.24638524651527405,
509
+ "step": 24,
510
+ "student_accuracy": 0.34375,
511
+ "student_approach_length": 500.0,
512
+ "teaching_length_mean": 2833.75,
513
+ "teaching_length_std": 883.8529811162587,
514
+ "token_efficiency": 0.0154971457828618
515
+ },
516
+ {
517
+ "accuracy_delta": 0.09375,
518
+ "baseline_accuracy": 0.09375,
519
+ "completion_length": 2996.8203125,
520
+ "degradation_rate": 0.0625,
521
+ "epoch": 0.13071895424836602,
522
+ "grad_norm": 1.0366727211421645,
523
+ "improvement_rate": 0.15625,
524
+ "kl": 0.002071857452392578,
525
+ "learning_rate": 2e-06,
526
+ "loss": 0.0001,
527
+ "reward": 0.10002126544713974,
528
+ "reward_std": 0.12362907081842422,
529
+ "rewards/AdaptiveTeachingReward": 0.10002126544713974,
530
+ "step": 25,
531
+ "student_accuracy": 0.1875,
532
+ "student_approach_length": 500.0,
533
+ "teaching_length_mean": 2241.40625,
534
+ "teaching_length_std": 1680.2583046873353,
535
+ "token_efficiency": 0.005314979233325259
536
+ },
537
+ {
538
+ "accuracy_delta": 0.03125,
539
+ "baseline_accuracy": 0.125,
540
+ "completion_length": 2699.5078125,
541
+ "degradation_rate": 0.125,
542
+ "epoch": 0.13594771241830064,
543
+ "grad_norm": 0.8849958564728577,
544
+ "improvement_rate": 0.15625,
545
+ "kl": 0.00244140625,
546
+ "learning_rate": 2e-06,
547
+ "loss": 0.0001,
548
+ "reward": 0.21003766357898712,
549
+ "reward_std": 0.27024491131305695,
550
+ "rewards/AdaptiveTeachingReward": 0.21003766357898712,
551
+ "step": 26,
552
+ "student_accuracy": 0.15625,
553
+ "student_approach_length": 500.0,
554
+ "teaching_length_mean": 2865.75,
555
+ "teaching_length_std": 1575.6950824020187,
556
+ "token_efficiency": 0.00747702067329278
557
+ },
558
+ {
559
+ "accuracy_delta": 0.09375,
560
+ "baseline_accuracy": 0.0,
561
+ "completion_length": 2216.78125,
562
+ "degradation_rate": 0.0,
563
+ "epoch": 0.1411764705882353,
564
+ "grad_norm": 0.6451950468247998,
565
+ "improvement_rate": 0.09375,
566
+ "kl": 0.0021767616271972656,
567
+ "learning_rate": 2e-06,
568
+ "loss": 0.0001,
569
+ "reward": 0.2089657336473465,
570
+ "reward_std": 0.21514078974723816,
571
+ "rewards/AdaptiveTeachingReward": 0.2089657336473465,
572
+ "step": 27,
573
+ "student_accuracy": 0.09375,
574
+ "student_approach_length": 499.96875,
575
+ "teaching_length_mean": 2619.40625,
576
+ "teaching_length_std": 1265.4369118665845,
577
+ "token_efficiency": 0.0060904088353781515
578
+ },
579
+ {
580
+ "accuracy_delta": 0.0,
581
+ "baseline_accuracy": 0.0,
582
+ "completion_length": 2668.4140625,
583
+ "degradation_rate": 0.0,
584
+ "epoch": 0.14640522875816994,
585
+ "grad_norm": 0.7896767066978999,
586
+ "improvement_rate": 0.0,
587
+ "kl": 0.002083301544189453,
588
+ "learning_rate": 2e-06,
589
+ "loss": 0.0001,
590
+ "reward": 0.14528799057006836,
591
+ "reward_std": 0.09566954523324966,
592
+ "rewards/AdaptiveTeachingReward": 0.14528799057006836,
593
+ "step": 28,
594
+ "student_accuracy": 0.0,
595
+ "student_approach_length": 500.0,
596
+ "teaching_length_mean": 3113.90625,
597
+ "teaching_length_std": 884.9974614855895,
598
+ "token_efficiency": 0.0035470700822770596
599
+ },
600
+ {
601
+ "accuracy_delta": 0.03125,
602
+ "baseline_accuracy": 0.0,
603
+ "completion_length": 2467.4140625,
604
+ "degradation_rate": 0.0,
605
+ "epoch": 0.15163398692810456,
606
+ "grad_norm": 0.22872066233966626,
607
+ "improvement_rate": 0.03125,
608
+ "kl": 0.0025146007537841797,
609
+ "learning_rate": 2e-06,
610
+ "loss": 0.0001,
611
+ "reward": 0.007998689077794552,
612
+ "reward_std": 0.04524742066860199,
613
+ "rewards/AdaptiveTeachingReward": 0.007998689077794552,
614
+ "step": 29,
615
+ "student_accuracy": 0.03125,
616
+ "student_approach_length": 500.0,
617
+ "teaching_length_mean": 2764.9375,
618
+ "teaching_length_std": 1560.6896673437209,
619
+ "token_efficiency": 0.0002591402932910396
620
+ },
621
+ {
622
+ "accuracy_delta": 0.0,
623
+ "baseline_accuracy": 0.0,
624
+ "completion_length": 2854.59375,
625
+ "degradation_rate": 0.0,
626
+ "epoch": 0.1568627450980392,
627
+ "grad_norm": 0.5744778046050318,
628
+ "improvement_rate": 0.0,
629
+ "kl": 0.0022513866424560547,
630
+ "learning_rate": 2e-06,
631
+ "loss": 0.0001,
632
+ "reward": 0.16005077958106995,
633
+ "reward_std": 0.10021104663610458,
634
+ "rewards/AdaptiveTeachingReward": 0.16005077958106995,
635
+ "step": 30,
636
+ "student_accuracy": 0.0,
637
+ "student_approach_length": 500.0,
638
+ "teaching_length_mean": 2139.71875,
639
+ "teaching_length_std": 1711.9645016733543,
640
+ "token_efficiency": 0.007949871083127776
641
+ }
642
+ ],
643
+ "logging_steps": 1,
644
+ "max_steps": 250,
645
+ "num_input_tokens_seen": 0,
646
+ "num_train_epochs": 2,
647
+ "save_steps": 10,
648
+ "stateful_callbacks": {
649
+ "TrainerControl": {
650
+ "args": {
651
+ "should_epoch_stop": false,
652
+ "should_evaluate": false,
653
+ "should_log": false,
654
+ "should_save": true,
655
+ "should_training_stop": false
656
+ },
657
+ "attributes": {}
658
+ }
659
+ },
660
+ "total_flos": 0.0,
661
+ "train_batch_size": 16,
662
+ "trial_name": null,
663
+ "trial_params": null
664
+ }