| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 20.0, | |
| "eval_steps": 10, | |
| "global_step": 1640, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.12195121951219512, | |
| "grad_norm": 35.852298736572266, | |
| "learning_rate": 1.219512195121951e-07, | |
| "loss": 7.8912, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.12195121951219512, | |
| "eval_loss": 7.806792736053467, | |
| "eval_runtime": 39.7349, | |
| "eval_samples_per_second": 444.47, | |
| "eval_steps_per_second": 0.453, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.24390243902439024, | |
| "grad_norm": 35.98800277709961, | |
| "learning_rate": 2.439024390243902e-07, | |
| "loss": 7.6722, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.24390243902439024, | |
| "eval_loss": 7.456321716308594, | |
| "eval_runtime": 39.2616, | |
| "eval_samples_per_second": 449.828, | |
| "eval_steps_per_second": 0.458, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.36585365853658536, | |
| "grad_norm": 33.745487213134766, | |
| "learning_rate": 3.6585365853658536e-07, | |
| "loss": 7.2095, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.36585365853658536, | |
| "eval_loss": 6.869460105895996, | |
| "eval_runtime": 39.2191, | |
| "eval_samples_per_second": 450.316, | |
| "eval_steps_per_second": 0.459, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.4878048780487805, | |
| "grad_norm": 28.555143356323242, | |
| "learning_rate": 4.878048780487804e-07, | |
| "loss": 6.5213, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.4878048780487805, | |
| "eval_loss": 6.08616828918457, | |
| "eval_runtime": 39.2368, | |
| "eval_samples_per_second": 450.114, | |
| "eval_steps_per_second": 0.459, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.6097560975609756, | |
| "grad_norm": 17.861717224121094, | |
| "learning_rate": 6.097560975609756e-07, | |
| "loss": 5.6988, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.6097560975609756, | |
| "eval_loss": 5.294862270355225, | |
| "eval_runtime": 39.2476, | |
| "eval_samples_per_second": 449.989, | |
| "eval_steps_per_second": 0.459, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.7317073170731707, | |
| "grad_norm": 9.703508377075195, | |
| "learning_rate": 7.317073170731707e-07, | |
| "loss": 5.0031, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.7317073170731707, | |
| "eval_loss": 4.72209358215332, | |
| "eval_runtime": 39.2415, | |
| "eval_samples_per_second": 450.059, | |
| "eval_steps_per_second": 0.459, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.8536585365853658, | |
| "grad_norm": 5.830469131469727, | |
| "learning_rate": 8.536585365853657e-07, | |
| "loss": 4.5171, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.8536585365853658, | |
| "eval_loss": 4.342286586761475, | |
| "eval_runtime": 39.2288, | |
| "eval_samples_per_second": 450.205, | |
| "eval_steps_per_second": 0.459, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.975609756097561, | |
| "grad_norm": 4.307955265045166, | |
| "learning_rate": 9.756097560975609e-07, | |
| "loss": 4.1858, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.975609756097561, | |
| "eval_loss": 4.0347723960876465, | |
| "eval_runtime": 39.2556, | |
| "eval_samples_per_second": 449.897, | |
| "eval_steps_per_second": 0.459, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.0975609756097562, | |
| "grad_norm": 3.8736400604248047, | |
| "learning_rate": 1.0975609756097562e-06, | |
| "loss": 3.8907, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.0975609756097562, | |
| "eval_loss": 3.7496390342712402, | |
| "eval_runtime": 39.2415, | |
| "eval_samples_per_second": 450.059, | |
| "eval_steps_per_second": 0.459, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.2195121951219512, | |
| "grad_norm": 3.2329907417297363, | |
| "learning_rate": 1.2195121951219512e-06, | |
| "loss": 3.6078, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.2195121951219512, | |
| "eval_loss": 3.480637788772583, | |
| "eval_runtime": 39.2636, | |
| "eval_samples_per_second": 449.805, | |
| "eval_steps_per_second": 0.458, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.3414634146341464, | |
| "grad_norm": 2.6243083477020264, | |
| "learning_rate": 1.3414634146341463e-06, | |
| "loss": 3.3481, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.3414634146341464, | |
| "eval_loss": 3.248450756072998, | |
| "eval_runtime": 39.2605, | |
| "eval_samples_per_second": 449.841, | |
| "eval_steps_per_second": 0.458, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.4634146341463414, | |
| "grad_norm": 2.2950096130371094, | |
| "learning_rate": 1.4634146341463414e-06, | |
| "loss": 3.1262, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.4634146341463414, | |
| "eval_loss": 3.042900323867798, | |
| "eval_runtime": 39.3067, | |
| "eval_samples_per_second": 449.313, | |
| "eval_steps_per_second": 0.458, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.5853658536585367, | |
| "grad_norm": 2.118208408355713, | |
| "learning_rate": 1.5853658536585366e-06, | |
| "loss": 2.9296, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.5853658536585367, | |
| "eval_loss": 2.842834711074829, | |
| "eval_runtime": 39.3323, | |
| "eval_samples_per_second": 449.021, | |
| "eval_steps_per_second": 0.458, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.7073170731707317, | |
| "grad_norm": 1.656348466873169, | |
| "learning_rate": 1.7073170731707315e-06, | |
| "loss": 2.7478, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.7073170731707317, | |
| "eval_loss": 2.671419382095337, | |
| "eval_runtime": 39.2963, | |
| "eval_samples_per_second": 449.431, | |
| "eval_steps_per_second": 0.458, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.8292682926829267, | |
| "grad_norm": 1.4909868240356445, | |
| "learning_rate": 1.8292682926829268e-06, | |
| "loss": 2.5869, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.8292682926829267, | |
| "eval_loss": 2.5269365310668945, | |
| "eval_runtime": 39.305, | |
| "eval_samples_per_second": 449.332, | |
| "eval_steps_per_second": 0.458, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.951219512195122, | |
| "grad_norm": 1.4536538124084473, | |
| "learning_rate": 1.9512195121951218e-06, | |
| "loss": 2.4527, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.951219512195122, | |
| "eval_loss": 2.4036409854888916, | |
| "eval_runtime": 39.3364, | |
| "eval_samples_per_second": 448.974, | |
| "eval_steps_per_second": 0.458, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 2.073170731707317, | |
| "grad_norm": 1.1609474420547485, | |
| "learning_rate": 1.9999184556954774e-06, | |
| "loss": 2.3461, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.073170731707317, | |
| "eval_loss": 2.299896717071533, | |
| "eval_runtime": 39.3179, | |
| "eval_samples_per_second": 449.185, | |
| "eval_steps_per_second": 0.458, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.1951219512195124, | |
| "grad_norm": 0.9852533340454102, | |
| "learning_rate": 1.999420177550043e-06, | |
| "loss": 2.243, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.1951219512195124, | |
| "eval_loss": 2.2197790145874023, | |
| "eval_runtime": 39.305, | |
| "eval_samples_per_second": 449.332, | |
| "eval_steps_per_second": 0.458, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.317073170731707, | |
| "grad_norm": 0.7964138388633728, | |
| "learning_rate": 1.9984691491033903e-06, | |
| "loss": 2.1769, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.317073170731707, | |
| "eval_loss": 2.1610889434814453, | |
| "eval_runtime": 39.3444, | |
| "eval_samples_per_second": 448.883, | |
| "eval_steps_per_second": 0.457, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.4390243902439024, | |
| "grad_norm": 0.8308465480804443, | |
| "learning_rate": 1.9970658011837403e-06, | |
| "loss": 2.1223, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.4390243902439024, | |
| "eval_loss": 2.120553493499756, | |
| "eval_runtime": 39.3228, | |
| "eval_samples_per_second": 449.129, | |
| "eval_steps_per_second": 0.458, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.5609756097560976, | |
| "grad_norm": 0.593834638595581, | |
| "learning_rate": 1.995210769525899e-06, | |
| "loss": 2.0966, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.5609756097560976, | |
| "eval_loss": 2.0926828384399414, | |
| "eval_runtime": 39.3549, | |
| "eval_samples_per_second": 448.762, | |
| "eval_steps_per_second": 0.457, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.682926829268293, | |
| "grad_norm": 0.5518779158592224, | |
| "learning_rate": 1.9929048944832634e-06, | |
| "loss": 2.0771, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.682926829268293, | |
| "eval_loss": 2.072599172592163, | |
| "eval_runtime": 39.3161, | |
| "eval_samples_per_second": 449.205, | |
| "eval_steps_per_second": 0.458, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.8048780487804876, | |
| "grad_norm": 0.5161460041999817, | |
| "learning_rate": 1.9901492206471324e-06, | |
| "loss": 2.0494, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.8048780487804876, | |
| "eval_loss": 2.056934118270874, | |
| "eval_runtime": 39.345, | |
| "eval_samples_per_second": 448.876, | |
| "eval_steps_per_second": 0.457, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.926829268292683, | |
| "grad_norm": 0.49138274788856506, | |
| "learning_rate": 1.986944996373489e-06, | |
| "loss": 2.0358, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.926829268292683, | |
| "eval_loss": 2.0439107418060303, | |
| "eval_runtime": 39.3247, | |
| "eval_samples_per_second": 449.107, | |
| "eval_steps_per_second": 0.458, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 3.048780487804878, | |
| "grad_norm": 0.4589332044124603, | |
| "learning_rate": 1.9832936732174833e-06, | |
| "loss": 2.0267, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 3.048780487804878, | |
| "eval_loss": 2.0325710773468018, | |
| "eval_runtime": 39.3333, | |
| "eval_samples_per_second": 449.008, | |
| "eval_steps_per_second": 0.458, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 3.1707317073170733, | |
| "grad_norm": 0.5090314745903015, | |
| "learning_rate": 1.979196905275856e-06, | |
| "loss": 2.0137, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 3.1707317073170733, | |
| "eval_loss": 2.022416830062866, | |
| "eval_runtime": 39.3644, | |
| "eval_samples_per_second": 448.654, | |
| "eval_steps_per_second": 0.457, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 3.292682926829268, | |
| "grad_norm": 0.4382960796356201, | |
| "learning_rate": 1.974656548437613e-06, | |
| "loss": 2.0011, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.292682926829268, | |
| "eval_loss": 2.013012647628784, | |
| "eval_runtime": 39.333, | |
| "eval_samples_per_second": 449.012, | |
| "eval_steps_per_second": 0.458, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.4146341463414633, | |
| "grad_norm": 0.4359953999519348, | |
| "learning_rate": 1.9696746595432827e-06, | |
| "loss": 1.9985, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.4146341463414633, | |
| "eval_loss": 2.004154920578003, | |
| "eval_runtime": 39.3656, | |
| "eval_samples_per_second": 448.641, | |
| "eval_steps_per_second": 0.457, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.5365853658536586, | |
| "grad_norm": 0.4334374964237213, | |
| "learning_rate": 1.964253495453141e-06, | |
| "loss": 1.9824, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 3.5365853658536586, | |
| "eval_loss": 1.995658040046692, | |
| "eval_runtime": 39.3661, | |
| "eval_samples_per_second": 448.635, | |
| "eval_steps_per_second": 0.457, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 3.658536585365854, | |
| "grad_norm": 0.45431938767433167, | |
| "learning_rate": 1.9583955120248236e-06, | |
| "loss": 1.9834, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.658536585365854, | |
| "eval_loss": 1.987615704536438, | |
| "eval_runtime": 39.3423, | |
| "eval_samples_per_second": 448.907, | |
| "eval_steps_per_second": 0.458, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.7804878048780486, | |
| "grad_norm": 0.4443369507789612, | |
| "learning_rate": 1.9521033630007928e-06, | |
| "loss": 1.9771, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.7804878048780486, | |
| "eval_loss": 1.9800046682357788, | |
| "eval_runtime": 39.3384, | |
| "eval_samples_per_second": 448.951, | |
| "eval_steps_per_second": 0.458, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.902439024390244, | |
| "grad_norm": 0.41829970479011536, | |
| "learning_rate": 1.945379898806153e-06, | |
| "loss": 1.9685, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.902439024390244, | |
| "eval_loss": 1.9727787971496582, | |
| "eval_runtime": 39.3267, | |
| "eval_samples_per_second": 449.084, | |
| "eval_steps_per_second": 0.458, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 4.024390243902439, | |
| "grad_norm": 0.43398183584213257, | |
| "learning_rate": 1.9382281652573785e-06, | |
| "loss": 1.9591, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 4.024390243902439, | |
| "eval_loss": 1.9658193588256836, | |
| "eval_runtime": 39.3633, | |
| "eval_samples_per_second": 448.666, | |
| "eval_steps_per_second": 0.457, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 4.146341463414634, | |
| "grad_norm": 0.39597055315971375, | |
| "learning_rate": 1.9306514021825116e-06, | |
| "loss": 1.9487, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 4.146341463414634, | |
| "eval_loss": 1.9592342376708984, | |
| "eval_runtime": 39.3471, | |
| "eval_samples_per_second": 448.852, | |
| "eval_steps_per_second": 0.457, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 4.2682926829268295, | |
| "grad_norm": 0.4240054190158844, | |
| "learning_rate": 1.922653041953483e-06, | |
| "loss": 1.9454, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 4.2682926829268295, | |
| "eval_loss": 1.9528599977493286, | |
| "eval_runtime": 39.4079, | |
| "eval_samples_per_second": 448.159, | |
| "eval_steps_per_second": 0.457, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 4.390243902439025, | |
| "grad_norm": 0.3959615230560303, | |
| "learning_rate": 1.914236707931202e-06, | |
| "loss": 1.9361, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.390243902439025, | |
| "eval_loss": 1.9468114376068115, | |
| "eval_runtime": 39.3624, | |
| "eval_samples_per_second": 448.676, | |
| "eval_steps_per_second": 0.457, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.512195121951219, | |
| "grad_norm": 0.3723958432674408, | |
| "learning_rate": 1.905406212824126e-06, | |
| "loss": 1.9389, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 4.512195121951219, | |
| "eval_loss": 1.940889596939087, | |
| "eval_runtime": 39.3409, | |
| "eval_samples_per_second": 448.922, | |
| "eval_steps_per_second": 0.458, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 4.634146341463414, | |
| "grad_norm": 0.37107619643211365, | |
| "learning_rate": 1.8961655569610556e-06, | |
| "loss": 1.9279, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 4.634146341463414, | |
| "eval_loss": 1.9352220296859741, | |
| "eval_runtime": 39.3794, | |
| "eval_samples_per_second": 448.483, | |
| "eval_steps_per_second": 0.457, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 4.7560975609756095, | |
| "grad_norm": 0.37607431411743164, | |
| "learning_rate": 1.8865189264789318e-06, | |
| "loss": 1.9212, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 4.7560975609756095, | |
| "eval_loss": 1.9297622442245483, | |
| "eval_runtime": 39.3451, | |
| "eval_samples_per_second": 448.875, | |
| "eval_steps_per_second": 0.457, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 4.878048780487805, | |
| "grad_norm": 0.371985524892807, | |
| "learning_rate": 1.8764706914264633e-06, | |
| "loss": 1.9142, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 4.878048780487805, | |
| "eval_loss": 1.9244760274887085, | |
| "eval_runtime": 39.3136, | |
| "eval_samples_per_second": 449.234, | |
| "eval_steps_per_second": 0.458, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.4083458185195923, | |
| "learning_rate": 1.8660254037844386e-06, | |
| "loss": 1.9087, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 1.9192752838134766, | |
| "eval_runtime": 39.3184, | |
| "eval_samples_per_second": 449.18, | |
| "eval_steps_per_second": 0.458, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 5.121951219512195, | |
| "grad_norm": 0.36457160115242004, | |
| "learning_rate": 1.8551877954036162e-06, | |
| "loss": 1.9061, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 5.121951219512195, | |
| "eval_loss": 1.9141229391098022, | |
| "eval_runtime": 39.3307, | |
| "eval_samples_per_second": 449.038, | |
| "eval_steps_per_second": 0.458, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 5.2439024390243905, | |
| "grad_norm": 0.4809107482433319, | |
| "learning_rate": 1.8439627758611382e-06, | |
| "loss": 1.9013, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 5.2439024390243905, | |
| "eval_loss": 1.908936619758606, | |
| "eval_runtime": 39.3459, | |
| "eval_samples_per_second": 448.865, | |
| "eval_steps_per_second": 0.457, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 5.365853658536586, | |
| "grad_norm": 0.5720298290252686, | |
| "learning_rate": 1.832355430236427e-06, | |
| "loss": 1.8953, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 5.365853658536586, | |
| "eval_loss": 1.9032728672027588, | |
| "eval_runtime": 39.357, | |
| "eval_samples_per_second": 448.739, | |
| "eval_steps_per_second": 0.457, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 5.487804878048781, | |
| "grad_norm": 0.40899941325187683, | |
| "learning_rate": 1.8203710168075784e-06, | |
| "loss": 1.8877, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 5.487804878048781, | |
| "eval_loss": 1.8964674472808838, | |
| "eval_runtime": 39.3338, | |
| "eval_samples_per_second": 449.003, | |
| "eval_steps_per_second": 0.458, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 5.609756097560975, | |
| "grad_norm": 0.7770607471466064, | |
| "learning_rate": 1.8080149646692928e-06, | |
| "loss": 1.8794, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 5.609756097560975, | |
| "eval_loss": 1.8876992464065552, | |
| "eval_runtime": 39.345, | |
| "eval_samples_per_second": 448.876, | |
| "eval_steps_per_second": 0.457, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 5.7317073170731705, | |
| "grad_norm": 1.4756108522415161, | |
| "learning_rate": 1.7952928712734265e-06, | |
| "loss": 1.8732, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 5.7317073170731705, | |
| "eval_loss": 1.8765217065811157, | |
| "eval_runtime": 39.3534, | |
| "eval_samples_per_second": 448.78, | |
| "eval_steps_per_second": 0.457, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 5.853658536585366, | |
| "grad_norm": 3.089818239212036, | |
| "learning_rate": 1.7822104998932711e-06, | |
| "loss": 1.8652, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 5.853658536585366, | |
| "eval_loss": 1.8682845830917358, | |
| "eval_runtime": 39.3351, | |
| "eval_samples_per_second": 448.988, | |
| "eval_steps_per_second": 0.458, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 5.975609756097561, | |
| "grad_norm": 1.618462085723877, | |
| "learning_rate": 1.7687737770127184e-06, | |
| "loss": 1.8513, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 5.975609756097561, | |
| "eval_loss": 1.8599414825439453, | |
| "eval_runtime": 39.3081, | |
| "eval_samples_per_second": 449.297, | |
| "eval_steps_per_second": 0.458, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 6.097560975609756, | |
| "grad_norm": 1.5310617685317993, | |
| "learning_rate": 1.754988789641485e-06, | |
| "loss": 1.8501, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 6.097560975609756, | |
| "eval_loss": 1.8557490110397339, | |
| "eval_runtime": 39.3877, | |
| "eval_samples_per_second": 448.389, | |
| "eval_steps_per_second": 0.457, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 6.219512195121951, | |
| "grad_norm": 1.2465336322784424, | |
| "learning_rate": 1.7408617825576177e-06, | |
| "loss": 1.8475, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 6.219512195121951, | |
| "eval_loss": 1.85213303565979, | |
| "eval_runtime": 39.3619, | |
| "eval_samples_per_second": 448.682, | |
| "eval_steps_per_second": 0.457, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 6.341463414634147, | |
| "grad_norm": 0.8563424944877625, | |
| "learning_rate": 1.7263991554785288e-06, | |
| "loss": 1.8349, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 6.341463414634147, | |
| "eval_loss": 1.8481909036636353, | |
| "eval_runtime": 39.3592, | |
| "eval_samples_per_second": 448.714, | |
| "eval_steps_per_second": 0.457, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 6.463414634146342, | |
| "grad_norm": 0.47906893491744995, | |
| "learning_rate": 1.7116074601618415e-06, | |
| "loss": 1.8369, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 6.463414634146342, | |
| "eval_loss": 1.8447495698928833, | |
| "eval_runtime": 39.3461, | |
| "eval_samples_per_second": 448.863, | |
| "eval_steps_per_second": 0.457, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 6.585365853658536, | |
| "grad_norm": 0.3527175188064575, | |
| "learning_rate": 1.696493397437357e-06, | |
| "loss": 1.8288, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 6.585365853658536, | |
| "eval_loss": 1.8413718938827515, | |
| "eval_runtime": 39.3552, | |
| "eval_samples_per_second": 448.759, | |
| "eval_steps_per_second": 0.457, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 6.7073170731707314, | |
| "grad_norm": 0.37900474667549133, | |
| "learning_rate": 1.6810638141714932e-06, | |
| "loss": 1.8271, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 6.7073170731707314, | |
| "eval_loss": 1.8382277488708496, | |
| "eval_runtime": 39.3397, | |
| "eval_samples_per_second": 448.936, | |
| "eval_steps_per_second": 0.458, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 6.829268292682927, | |
| "grad_norm": 0.3187570869922638, | |
| "learning_rate": 1.665325700165565e-06, | |
| "loss": 1.8296, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 6.829268292682927, | |
| "eval_loss": 1.8352141380310059, | |
| "eval_runtime": 39.3555, | |
| "eval_samples_per_second": 448.755, | |
| "eval_steps_per_second": 0.457, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 6.951219512195122, | |
| "grad_norm": 0.36600008606910706, | |
| "learning_rate": 1.6492861849893147e-06, | |
| "loss": 1.8257, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 6.951219512195122, | |
| "eval_loss": 1.832342505455017, | |
| "eval_runtime": 39.336, | |
| "eval_samples_per_second": 448.978, | |
| "eval_steps_per_second": 0.458, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 7.073170731707317, | |
| "grad_norm": 0.288286954164505, | |
| "learning_rate": 1.6329525347511218e-06, | |
| "loss": 1.8238, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 7.073170731707317, | |
| "eval_loss": 1.8295822143554688, | |
| "eval_runtime": 39.3243, | |
| "eval_samples_per_second": 449.112, | |
| "eval_steps_per_second": 0.458, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 7.195121951219512, | |
| "grad_norm": 0.3237072825431824, | |
| "learning_rate": 1.6163321488063635e-06, | |
| "loss": 1.8174, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 7.195121951219512, | |
| "eval_loss": 1.8268990516662598, | |
| "eval_runtime": 39.3498, | |
| "eval_samples_per_second": 448.82, | |
| "eval_steps_per_second": 0.457, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 7.317073170731708, | |
| "grad_norm": 0.36146941781044006, | |
| "learning_rate": 1.599432556405412e-06, | |
| "loss": 1.8141, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 7.317073170731708, | |
| "eval_loss": 1.8243464231491089, | |
| "eval_runtime": 39.3676, | |
| "eval_samples_per_second": 448.617, | |
| "eval_steps_per_second": 0.457, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 7.439024390243903, | |
| "grad_norm": 0.3055365979671478, | |
| "learning_rate": 1.5822614132827836e-06, | |
| "loss": 1.8141, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 7.439024390243903, | |
| "eval_loss": 1.8218821287155151, | |
| "eval_runtime": 39.3324, | |
| "eval_samples_per_second": 449.019, | |
| "eval_steps_per_second": 0.458, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 7.560975609756097, | |
| "grad_norm": 0.2906692624092102, | |
| "learning_rate": 1.5648264981889934e-06, | |
| "loss": 1.8096, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 7.560975609756097, | |
| "eval_loss": 1.8194576501846313, | |
| "eval_runtime": 39.3325, | |
| "eval_samples_per_second": 449.018, | |
| "eval_steps_per_second": 0.458, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 7.682926829268292, | |
| "grad_norm": 0.3652225434780121, | |
| "learning_rate": 1.5471357093666804e-06, | |
| "loss": 1.8119, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 7.682926829268292, | |
| "eval_loss": 1.8171180486679077, | |
| "eval_runtime": 39.3377, | |
| "eval_samples_per_second": 448.958, | |
| "eval_steps_per_second": 0.458, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 7.804878048780488, | |
| "grad_norm": 0.3688996732234955, | |
| "learning_rate": 1.5291970609726005e-06, | |
| "loss": 1.8042, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 7.804878048780488, | |
| "eval_loss": 1.8148137331008911, | |
| "eval_runtime": 39.3463, | |
| "eval_samples_per_second": 448.86, | |
| "eval_steps_per_second": 0.457, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 7.926829268292683, | |
| "grad_norm": 0.28809812664985657, | |
| "learning_rate": 1.5110186794471103e-06, | |
| "loss": 1.7979, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 7.926829268292683, | |
| "eval_loss": 1.8126047849655151, | |
| "eval_runtime": 39.3619, | |
| "eval_samples_per_second": 448.682, | |
| "eval_steps_per_second": 0.457, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 8.048780487804878, | |
| "grad_norm": 0.2660142481327057, | |
| "learning_rate": 1.4926087998327837e-06, | |
| "loss": 1.804, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 8.048780487804878, | |
| "eval_loss": 1.8104569911956787, | |
| "eval_runtime": 39.3952, | |
| "eval_samples_per_second": 448.303, | |
| "eval_steps_per_second": 0.457, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 8.170731707317072, | |
| "grad_norm": 0.281999796628952, | |
| "learning_rate": 1.4739757620438307e-06, | |
| "loss": 1.7987, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 8.170731707317072, | |
| "eval_loss": 1.8083666563034058, | |
| "eval_runtime": 39.3346, | |
| "eval_samples_per_second": 448.994, | |
| "eval_steps_per_second": 0.458, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 8.292682926829269, | |
| "grad_norm": 0.2869739234447479, | |
| "learning_rate": 1.4551280070880087e-06, | |
| "loss": 1.7954, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 8.292682926829269, | |
| "eval_loss": 1.8063015937805176, | |
| "eval_runtime": 39.3428, | |
| "eval_samples_per_second": 448.9, | |
| "eval_steps_per_second": 0.458, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 8.414634146341463, | |
| "grad_norm": 0.2752714157104492, | |
| "learning_rate": 1.4360740732427365e-06, | |
| "loss": 1.797, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 8.414634146341463, | |
| "eval_loss": 1.804310917854309, | |
| "eval_runtime": 39.3238, | |
| "eval_samples_per_second": 449.118, | |
| "eval_steps_per_second": 0.458, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 8.536585365853659, | |
| "grad_norm": 0.4099307358264923, | |
| "learning_rate": 1.416822592187143e-06, | |
| "loss": 1.791, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 8.536585365853659, | |
| "eval_loss": 1.802320122718811, | |
| "eval_runtime": 39.3277, | |
| "eval_samples_per_second": 449.073, | |
| "eval_steps_per_second": 0.458, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 8.658536585365853, | |
| "grad_norm": 0.3235901892185211, | |
| "learning_rate": 1.3973822850918054e-06, | |
| "loss": 1.7893, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 8.658536585365853, | |
| "eval_loss": 1.8004404306411743, | |
| "eval_runtime": 39.6357, | |
| "eval_samples_per_second": 445.583, | |
| "eval_steps_per_second": 0.454, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 8.78048780487805, | |
| "grad_norm": 0.3761025071144104, | |
| "learning_rate": 1.3777619586679457e-06, | |
| "loss": 1.787, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 8.78048780487805, | |
| "eval_loss": 1.7985868453979492, | |
| "eval_runtime": 39.2931, | |
| "eval_samples_per_second": 449.468, | |
| "eval_steps_per_second": 0.458, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 8.902439024390244, | |
| "grad_norm": 0.2766464054584503, | |
| "learning_rate": 1.3579705011778765e-06, | |
| "loss": 1.7899, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 8.902439024390244, | |
| "eval_loss": 1.7967922687530518, | |
| "eval_runtime": 39.3376, | |
| "eval_samples_per_second": 448.96, | |
| "eval_steps_per_second": 0.458, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 9.024390243902438, | |
| "grad_norm": 0.3136584162712097, | |
| "learning_rate": 1.3380168784085026e-06, | |
| "loss": 1.7917, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 9.024390243902438, | |
| "eval_loss": 1.7949668169021606, | |
| "eval_runtime": 39.3251, | |
| "eval_samples_per_second": 449.102, | |
| "eval_steps_per_second": 0.458, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 9.146341463414634, | |
| "grad_norm": 0.268803209066391, | |
| "learning_rate": 1.3179101296097033e-06, | |
| "loss": 1.7838, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 9.146341463414634, | |
| "eval_loss": 1.793213129043579, | |
| "eval_runtime": 39.3253, | |
| "eval_samples_per_second": 449.101, | |
| "eval_steps_per_second": 0.458, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 9.268292682926829, | |
| "grad_norm": 0.2907431125640869, | |
| "learning_rate": 1.2976593633994346e-06, | |
| "loss": 1.7803, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 9.268292682926829, | |
| "eval_loss": 1.7914844751358032, | |
| "eval_runtime": 39.3189, | |
| "eval_samples_per_second": 449.173, | |
| "eval_steps_per_second": 0.458, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 9.390243902439025, | |
| "grad_norm": 0.3980807960033417, | |
| "learning_rate": 1.2772737536374078e-06, | |
| "loss": 1.7789, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 9.390243902439025, | |
| "eval_loss": 1.7898335456848145, | |
| "eval_runtime": 39.3451, | |
| "eval_samples_per_second": 448.875, | |
| "eval_steps_per_second": 0.457, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 9.512195121951219, | |
| "grad_norm": 0.30676034092903137, | |
| "learning_rate": 1.2567625352692126e-06, | |
| "loss": 1.7811, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 9.512195121951219, | |
| "eval_loss": 1.7882270812988281, | |
| "eval_runtime": 39.3352, | |
| "eval_samples_per_second": 448.987, | |
| "eval_steps_per_second": 0.458, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 9.634146341463415, | |
| "grad_norm": 0.24213315546512604, | |
| "learning_rate": 1.2361350001427649e-06, | |
| "loss": 1.7791, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 9.634146341463415, | |
| "eval_loss": 1.786568522453308, | |
| "eval_runtime": 39.3646, | |
| "eval_samples_per_second": 448.652, | |
| "eval_steps_per_second": 0.457, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 9.75609756097561, | |
| "grad_norm": 0.2722227871417999, | |
| "learning_rate": 1.2154004927989813e-06, | |
| "loss": 1.7742, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 9.75609756097561, | |
| "eval_loss": 1.784982681274414, | |
| "eval_runtime": 39.3395, | |
| "eval_samples_per_second": 448.938, | |
| "eval_steps_per_second": 0.458, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 9.878048780487806, | |
| "grad_norm": 0.2399929314851761, | |
| "learning_rate": 1.19456840623858e-06, | |
| "loss": 1.7717, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 9.878048780487806, | |
| "eval_loss": 1.7834330797195435, | |
| "eval_runtime": 39.2984, | |
| "eval_samples_per_second": 449.408, | |
| "eval_steps_per_second": 0.458, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.24806931614875793, | |
| "learning_rate": 1.1736481776669305e-06, | |
| "loss": 1.7788, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 1.7818834781646729, | |
| "eval_runtime": 39.2977, | |
| "eval_samples_per_second": 449.415, | |
| "eval_steps_per_second": 0.458, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 10.121951219512194, | |
| "grad_norm": 0.32369279861450195, | |
| "learning_rate": 1.1526492842188744e-06, | |
| "loss": 1.7719, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 10.121951219512194, | |
| "eval_loss": 1.7803385257720947, | |
| "eval_runtime": 39.3284, | |
| "eval_samples_per_second": 449.064, | |
| "eval_steps_per_second": 0.458, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 10.24390243902439, | |
| "grad_norm": 0.3276310861110687, | |
| "learning_rate": 1.1315812386654649e-06, | |
| "loss": 1.7675, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 10.24390243902439, | |
| "eval_loss": 1.7788329124450684, | |
| "eval_runtime": 39.674, | |
| "eval_samples_per_second": 445.153, | |
| "eval_steps_per_second": 0.454, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 10.365853658536585, | |
| "grad_norm": 0.2850521504878998, | |
| "learning_rate": 1.1104535851045538e-06, | |
| "loss": 1.7725, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 10.365853658536585, | |
| "eval_loss": 1.7772928476333618, | |
| "eval_runtime": 39.3302, | |
| "eval_samples_per_second": 449.045, | |
| "eval_steps_per_second": 0.458, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 10.487804878048781, | |
| "grad_norm": 0.27776798605918884, | |
| "learning_rate": 1.0892758946371942e-06, | |
| "loss": 1.7648, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 10.487804878048781, | |
| "eval_loss": 1.7757339477539062, | |
| "eval_runtime": 39.3544, | |
| "eval_samples_per_second": 448.768, | |
| "eval_steps_per_second": 0.457, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 10.609756097560975, | |
| "grad_norm": 0.5228049755096436, | |
| "learning_rate": 1.0680577610318071e-06, | |
| "loss": 1.7609, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 10.609756097560975, | |
| "eval_loss": 1.7741671800613403, | |
| "eval_runtime": 39.3709, | |
| "eval_samples_per_second": 448.581, | |
| "eval_steps_per_second": 0.457, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 10.731707317073171, | |
| "grad_norm": 0.3848848044872284, | |
| "learning_rate": 1.0468087963780787e-06, | |
| "loss": 1.7636, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 10.731707317073171, | |
| "eval_loss": 1.77255380153656, | |
| "eval_runtime": 39.3539, | |
| "eval_samples_per_second": 448.773, | |
| "eval_steps_per_second": 0.457, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 10.853658536585366, | |
| "grad_norm": 0.5747771263122559, | |
| "learning_rate": 1.0255386267325602e-06, | |
| "loss": 1.7598, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 10.853658536585366, | |
| "eval_loss": 1.770812749862671, | |
| "eval_runtime": 39.3689, | |
| "eval_samples_per_second": 448.603, | |
| "eval_steps_per_second": 0.457, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 10.975609756097562, | |
| "grad_norm": 1.4300990104675293, | |
| "learning_rate": 1.0042568877579387e-06, | |
| "loss": 1.7651, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 10.975609756097562, | |
| "eval_loss": 1.7690285444259644, | |
| "eval_runtime": 39.3685, | |
| "eval_samples_per_second": 448.607, | |
| "eval_steps_per_second": 0.457, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 11.097560975609756, | |
| "grad_norm": 0.43552011251449585, | |
| "learning_rate": 9.829732203579585e-07, | |
| "loss": 1.7598, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 11.097560975609756, | |
| "eval_loss": 1.7672632932662964, | |
| "eval_runtime": 39.3224, | |
| "eval_samples_per_second": 449.133, | |
| "eval_steps_per_second": 0.458, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 11.21951219512195, | |
| "grad_norm": 4.467670917510986, | |
| "learning_rate": 9.616972663099646e-07, | |
| "loss": 1.7585, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 11.21951219512195, | |
| "eval_loss": 1.768608570098877, | |
| "eval_runtime": 39.3441, | |
| "eval_samples_per_second": 448.886, | |
| "eval_steps_per_second": 0.458, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 11.341463414634147, | |
| "grad_norm": 4.179907321929932, | |
| "learning_rate": 9.40438663897054e-07, | |
| "loss": 1.7572, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 11.341463414634147, | |
| "eval_loss": 1.7685447931289673, | |
| "eval_runtime": 39.3059, | |
| "eval_samples_per_second": 449.322, | |
| "eval_steps_per_second": 0.458, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 11.463414634146341, | |
| "grad_norm": 4.369534492492676, | |
| "learning_rate": 9.192070435418078e-07, | |
| "loss": 1.7485, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 11.463414634146341, | |
| "eval_loss": 1.7640718221664429, | |
| "eval_runtime": 39.3093, | |
| "eval_samples_per_second": 449.283, | |
| "eval_steps_per_second": 0.458, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 11.585365853658537, | |
| "grad_norm": 1.9561405181884766, | |
| "learning_rate": 8.980120234435848e-07, | |
| "loss": 1.7528, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 11.585365853658537, | |
| "eval_loss": 1.763830542564392, | |
| "eval_runtime": 39.3238, | |
| "eval_samples_per_second": 449.117, | |
| "eval_steps_per_second": 0.458, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 11.707317073170731, | |
| "grad_norm": 2.5773379802703857, | |
| "learning_rate": 8.768632052213531e-07, | |
| "loss": 1.7545, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 11.707317073170731, | |
| "eval_loss": 1.7629334926605225, | |
| "eval_runtime": 39.3328, | |
| "eval_samples_per_second": 449.014, | |
| "eval_steps_per_second": 0.458, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 11.829268292682928, | |
| "grad_norm": 3.903297185897827, | |
| "learning_rate": 8.557701695640321e-07, | |
| "loss": 1.7508, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 11.829268292682928, | |
| "eval_loss": 1.760330319404602, | |
| "eval_runtime": 39.3191, | |
| "eval_samples_per_second": 449.171, | |
| "eval_steps_per_second": 0.458, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 11.951219512195122, | |
| "grad_norm": 2.7617976665496826, | |
| "learning_rate": 8.347424718903151e-07, | |
| "loss": 1.7542, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 11.951219512195122, | |
| "eval_loss": 1.759660243988037, | |
| "eval_runtime": 39.3037, | |
| "eval_samples_per_second": 449.347, | |
| "eval_steps_per_second": 0.458, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 12.073170731707316, | |
| "grad_norm": 3.2472615242004395, | |
| "learning_rate": 8.137896380199421e-07, | |
| "loss": 1.752, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 12.073170731707316, | |
| "eval_loss": 1.7598719596862793, | |
| "eval_runtime": 39.6916, | |
| "eval_samples_per_second": 444.956, | |
| "eval_steps_per_second": 0.453, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 12.195121951219512, | |
| "grad_norm": 4.050698757171631, | |
| "learning_rate": 7.929211598583793e-07, | |
| "loss": 1.7487, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 12.195121951219512, | |
| "eval_loss": 1.7570974826812744, | |
| "eval_runtime": 39.3566, | |
| "eval_samples_per_second": 448.743, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 12.317073170731707, | |
| "grad_norm": 2.279803991317749, | |
| "learning_rate": 7.721464910968626e-07, | |
| "loss": 1.7454, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 12.317073170731707, | |
| "eval_loss": 1.7562564611434937, | |
| "eval_runtime": 39.3538, | |
| "eval_samples_per_second": 448.775, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 12.439024390243903, | |
| "grad_norm": 3.100792407989502, | |
| "learning_rate": 7.514750429297527e-07, | |
| "loss": 1.7472, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 12.439024390243903, | |
| "eval_loss": 1.7561583518981934, | |
| "eval_runtime": 39.3529, | |
| "eval_samples_per_second": 448.786, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 12.560975609756097, | |
| "grad_norm": 3.9019737243652344, | |
| "learning_rate": 7.30916179791144e-07, | |
| "loss": 1.7411, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 12.560975609756097, | |
| "eval_loss": 1.7533916234970093, | |
| "eval_runtime": 39.3587, | |
| "eval_samples_per_second": 448.719, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 12.682926829268293, | |
| "grad_norm": 2.5269076824188232, | |
| "learning_rate": 7.104792151126514e-07, | |
| "loss": 1.7441, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 12.682926829268293, | |
| "eval_loss": 1.7524149417877197, | |
| "eval_runtime": 39.3483, | |
| "eval_samples_per_second": 448.838, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 12.804878048780488, | |
| "grad_norm": 3.1689910888671875, | |
| "learning_rate": 6.901734071043071e-07, | |
| "loss": 1.7391, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 12.804878048780488, | |
| "eval_loss": 1.752366542816162, | |
| "eval_runtime": 39.3518, | |
| "eval_samples_per_second": 448.798, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 12.926829268292684, | |
| "grad_norm": 3.6410083770751953, | |
| "learning_rate": 6.700079545604707e-07, | |
| "loss": 1.7441, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 12.926829268292684, | |
| "eval_loss": 1.749656319618225, | |
| "eval_runtime": 39.3618, | |
| "eval_samples_per_second": 448.684, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 13.048780487804878, | |
| "grad_norm": 1.9339579343795776, | |
| "learning_rate": 6.499919926926565e-07, | |
| "loss": 1.7415, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 13.048780487804878, | |
| "eval_loss": 1.7485558986663818, | |
| "eval_runtime": 39.3581, | |
| "eval_samples_per_second": 448.726, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 13.170731707317072, | |
| "grad_norm": 2.8899059295654297, | |
| "learning_rate": 6.301345889911636e-07, | |
| "loss": 1.7352, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 13.170731707317072, | |
| "eval_loss": 1.7480661869049072, | |
| "eval_runtime": 39.3461, | |
| "eval_samples_per_second": 448.863, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 13.292682926829269, | |
| "grad_norm": 3.037234306335449, | |
| "learning_rate": 6.104447391173858e-07, | |
| "loss": 1.7395, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 13.292682926829269, | |
| "eval_loss": 1.7456430196762085, | |
| "eval_runtime": 39.3861, | |
| "eval_samples_per_second": 448.407, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 13.414634146341463, | |
| "grad_norm": 1.7524123191833496, | |
| "learning_rate": 5.9093136282866e-07, | |
| "loss": 1.7317, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 13.414634146341463, | |
| "eval_loss": 1.7444212436676025, | |
| "eval_runtime": 39.3868, | |
| "eval_samples_per_second": 448.399, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 13.536585365853659, | |
| "grad_norm": 1.582607388496399, | |
| "learning_rate": 5.716032999375006e-07, | |
| "loss": 1.7356, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 13.536585365853659, | |
| "eval_loss": 1.743189811706543, | |
| "eval_runtime": 39.3597, | |
| "eval_samples_per_second": 448.707, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 13.658536585365853, | |
| "grad_norm": 0.7700549364089966, | |
| "learning_rate": 5.524693063070492e-07, | |
| "loss": 1.7347, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 13.658536585365853, | |
| "eval_loss": 1.7409182786941528, | |
| "eval_runtime": 39.3602, | |
| "eval_samples_per_second": 448.702, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 13.78048780487805, | |
| "grad_norm": 0.8657609820365906, | |
| "learning_rate": 5.335380498845559e-07, | |
| "loss": 1.7291, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 13.78048780487805, | |
| "eval_loss": 1.7383273839950562, | |
| "eval_runtime": 39.3511, | |
| "eval_samples_per_second": 448.806, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 13.902439024390244, | |
| "grad_norm": 0.5521230101585388, | |
| "learning_rate": 5.148181067746861e-07, | |
| "loss": 1.7238, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 13.902439024390244, | |
| "eval_loss": 1.7357066869735718, | |
| "eval_runtime": 39.3644, | |
| "eval_samples_per_second": 448.654, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 14.024390243902438, | |
| "grad_norm": 0.9353064894676208, | |
| "learning_rate": 4.963179573544356e-07, | |
| "loss": 1.7238, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 14.024390243902438, | |
| "eval_loss": 1.7331624031066895, | |
| "eval_runtime": 39.355, | |
| "eval_samples_per_second": 448.761, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 14.146341463414634, | |
| "grad_norm": 0.4633055329322815, | |
| "learning_rate": 4.780459824314066e-07, | |
| "loss": 1.7234, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 14.146341463414634, | |
| "eval_loss": 1.7308125495910645, | |
| "eval_runtime": 39.4238, | |
| "eval_samples_per_second": 447.978, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 14.268292682926829, | |
| "grad_norm": 0.5228179693222046, | |
| "learning_rate": 4.6001045944719594e-07, | |
| "loss": 1.7165, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 14.268292682926829, | |
| "eval_loss": 1.7286032438278198, | |
| "eval_runtime": 39.3556, | |
| "eval_samples_per_second": 448.755, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 14.390243902439025, | |
| "grad_norm": 0.3939041495323181, | |
| "learning_rate": 4.4221955872760573e-07, | |
| "loss": 1.7171, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 14.390243902439025, | |
| "eval_loss": 1.72659432888031, | |
| "eval_runtime": 39.3559, | |
| "eval_samples_per_second": 448.75, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 14.512195121951219, | |
| "grad_norm": 0.3697729706764221, | |
| "learning_rate": 4.246813397813794e-07, | |
| "loss": 1.7153, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 14.512195121951219, | |
| "eval_loss": 1.7247569561004639, | |
| "eval_runtime": 39.3585, | |
| "eval_samples_per_second": 448.722, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 14.634146341463415, | |
| "grad_norm": 0.38930952548980713, | |
| "learning_rate": 4.074037476491413e-07, | |
| "loss": 1.7147, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 14.634146341463415, | |
| "eval_loss": 1.7230459451675415, | |
| "eval_runtime": 39.3707, | |
| "eval_samples_per_second": 448.582, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 14.75609756097561, | |
| "grad_norm": 0.5216050148010254, | |
| "learning_rate": 3.9039460930418767e-07, | |
| "loss": 1.7093, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 14.75609756097561, | |
| "eval_loss": 1.721459984779358, | |
| "eval_runtime": 39.4124, | |
| "eval_samples_per_second": 448.108, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 14.878048780487806, | |
| "grad_norm": 0.6067308187484741, | |
| "learning_rate": 3.736616301067693e-07, | |
| "loss": 1.7114, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 14.878048780487806, | |
| "eval_loss": 1.7200278043746948, | |
| "eval_runtime": 39.3411, | |
| "eval_samples_per_second": 448.92, | |
| "eval_steps_per_second": 0.458, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.528874933719635, | |
| "learning_rate": 3.5721239031346063e-07, | |
| "loss": 1.7074, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 1.7186657190322876, | |
| "eval_runtime": 39.4208, | |
| "eval_samples_per_second": 448.012, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 15.121951219512194, | |
| "grad_norm": 0.4690570831298828, | |
| "learning_rate": 3.410543416432069e-07, | |
| "loss": 1.7068, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 15.121951219512194, | |
| "eval_loss": 1.7174080610275269, | |
| "eval_runtime": 39.3531, | |
| "eval_samples_per_second": 448.783, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 15.24390243902439, | |
| "grad_norm": 0.4555855989456177, | |
| "learning_rate": 3.2519480390159804e-07, | |
| "loss": 1.7067, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 15.24390243902439, | |
| "eval_loss": 1.7162292003631592, | |
| "eval_runtime": 39.3524, | |
| "eval_samples_per_second": 448.791, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 15.365853658536585, | |
| "grad_norm": 0.8303574919700623, | |
| "learning_rate": 3.096409616649023e-07, | |
| "loss": 1.7034, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 15.365853658536585, | |
| "eval_loss": 1.7151583433151245, | |
| "eval_runtime": 39.3553, | |
| "eval_samples_per_second": 448.758, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 15.487804878048781, | |
| "grad_norm": 0.5495628714561462, | |
| "learning_rate": 2.943998610253604e-07, | |
| "loss": 1.7075, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 15.487804878048781, | |
| "eval_loss": 1.7141631841659546, | |
| "eval_runtime": 39.3687, | |
| "eval_samples_per_second": 448.606, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 15.609756097560975, | |
| "grad_norm": 0.361331582069397, | |
| "learning_rate": 2.7947840639921303e-07, | |
| "loss": 1.7002, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 15.609756097560975, | |
| "eval_loss": 1.7131644487380981, | |
| "eval_runtime": 39.4083, | |
| "eval_samples_per_second": 448.155, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 15.731707317073171, | |
| "grad_norm": 0.4098544418811798, | |
| "learning_rate": 2.648833573989118e-07, | |
| "loss": 1.7055, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 15.731707317073171, | |
| "eval_loss": 1.712282657623291, | |
| "eval_runtime": 39.3799, | |
| "eval_samples_per_second": 448.478, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 15.853658536585366, | |
| "grad_norm": 0.5131831765174866, | |
| "learning_rate": 2.50621325770927e-07, | |
| "loss": 1.6976, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 15.853658536585366, | |
| "eval_loss": 1.7114192247390747, | |
| "eval_runtime": 39.3871, | |
| "eval_samples_per_second": 448.395, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 15.975609756097562, | |
| "grad_norm": 0.4333685338497162, | |
| "learning_rate": 2.3669877240054037e-07, | |
| "loss": 1.7002, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 15.975609756097562, | |
| "eval_loss": 1.710659146308899, | |
| "eval_runtime": 39.3698, | |
| "eval_samples_per_second": 448.593, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 16.097560975609756, | |
| "grad_norm": 0.36955586075782776, | |
| "learning_rate": 2.231220043849804e-07, | |
| "loss": 1.7015, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 16.097560975609756, | |
| "eval_loss": 1.7099283933639526, | |
| "eval_runtime": 39.3604, | |
| "eval_samples_per_second": 448.699, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 16.21951219512195, | |
| "grad_norm": 0.37774789333343506, | |
| "learning_rate": 2.0989717217622648e-07, | |
| "loss": 1.6987, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 16.21951219512195, | |
| "eval_loss": 1.70924973487854, | |
| "eval_runtime": 39.3549, | |
| "eval_samples_per_second": 448.763, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 16.341463414634145, | |
| "grad_norm": 0.389635294675827, | |
| "learning_rate": 1.9703026679477252e-07, | |
| "loss": 1.6985, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 16.341463414634145, | |
| "eval_loss": 1.7086195945739746, | |
| "eval_runtime": 39.7499, | |
| "eval_samples_per_second": 444.303, | |
| "eval_steps_per_second": 0.453, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 16.463414634146343, | |
| "grad_norm": 0.4067881405353546, | |
| "learning_rate": 1.845271171156184e-07, | |
| "loss": 1.6986, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 16.463414634146343, | |
| "eval_loss": 1.7080307006835938, | |
| "eval_runtime": 39.3203, | |
| "eval_samples_per_second": 449.157, | |
| "eval_steps_per_second": 0.458, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 16.585365853658537, | |
| "grad_norm": 0.33628836274147034, | |
| "learning_rate": 1.7239338722771324e-07, | |
| "loss": 1.6993, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 16.585365853658537, | |
| "eval_loss": 1.707476019859314, | |
| "eval_runtime": 39.357, | |
| "eval_samples_per_second": 448.739, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 16.70731707317073, | |
| "grad_norm": 0.31285569071769714, | |
| "learning_rate": 1.6063457386805003e-07, | |
| "loss": 1.6946, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 16.70731707317073, | |
| "eval_loss": 1.7069728374481201, | |
| "eval_runtime": 39.3881, | |
| "eval_samples_per_second": 448.384, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 16.829268292682926, | |
| "grad_norm": 0.3229863941669464, | |
| "learning_rate": 1.4925600393157322e-07, | |
| "loss": 1.6934, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 16.829268292682926, | |
| "eval_loss": 1.7064942121505737, | |
| "eval_runtime": 39.3862, | |
| "eval_samples_per_second": 448.406, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 16.951219512195124, | |
| "grad_norm": 0.32283350825309753, | |
| "learning_rate": 1.3826283205802424e-07, | |
| "loss": 1.6936, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 16.951219512195124, | |
| "eval_loss": 1.706059217453003, | |
| "eval_runtime": 39.374, | |
| "eval_samples_per_second": 448.544, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 17.073170731707318, | |
| "grad_norm": 0.2510131597518921, | |
| "learning_rate": 1.2766003829682504e-07, | |
| "loss": 1.6972, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 17.073170731707318, | |
| "eval_loss": 1.705664873123169, | |
| "eval_runtime": 39.3448, | |
| "eval_samples_per_second": 448.877, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 17.195121951219512, | |
| "grad_norm": 0.2857695519924164, | |
| "learning_rate": 1.1745242585104953e-07, | |
| "loss": 1.6923, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 17.195121951219512, | |
| "eval_loss": 1.7052934169769287, | |
| "eval_runtime": 39.3548, | |
| "eval_samples_per_second": 448.763, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 17.317073170731707, | |
| "grad_norm": 0.39086970686912537, | |
| "learning_rate": 1.0764461890151111e-07, | |
| "loss": 1.6943, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 17.317073170731707, | |
| "eval_loss": 1.704952359199524, | |
| "eval_runtime": 39.366, | |
| "eval_samples_per_second": 448.635, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 17.4390243902439, | |
| "grad_norm": 0.2526913583278656, | |
| "learning_rate": 9.824106051194858e-08, | |
| "loss": 1.6944, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 17.4390243902439, | |
| "eval_loss": 1.7046380043029785, | |
| "eval_runtime": 39.3509, | |
| "eval_samples_per_second": 448.809, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 17.5609756097561, | |
| "grad_norm": 0.2990001142024994, | |
| "learning_rate": 8.924601061626048e-08, | |
| "loss": 1.6929, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 17.5609756097561, | |
| "eval_loss": 1.7043615579605103, | |
| "eval_runtime": 39.3593, | |
| "eval_samples_per_second": 448.712, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 17.682926829268293, | |
| "grad_norm": 0.24767932295799255, | |
| "learning_rate": 8.066354408870047e-08, | |
| "loss": 1.6926, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 17.682926829268293, | |
| "eval_loss": 1.7040989398956299, | |
| "eval_runtime": 39.3568, | |
| "eval_samples_per_second": 448.741, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 17.804878048780488, | |
| "grad_norm": 0.3169814348220825, | |
| "learning_rate": 7.249754889790538e-08, | |
| "loss": 1.6926, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 17.804878048780488, | |
| "eval_loss": 1.703873634338379, | |
| "eval_runtime": 39.4, | |
| "eval_samples_per_second": 448.249, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 17.926829268292682, | |
| "grad_norm": 0.3294218182563782, | |
| "learning_rate": 6.475172434559573e-08, | |
| "loss": 1.6932, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 17.926829268292682, | |
| "eval_loss": 1.703667163848877, | |
| "eval_runtime": 39.4234, | |
| "eval_samples_per_second": 447.983, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 18.048780487804876, | |
| "grad_norm": 0.2851867079734802, | |
| "learning_rate": 5.742957939074411e-08, | |
| "loss": 1.6927, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 18.048780487804876, | |
| "eval_loss": 1.7034906148910522, | |
| "eval_runtime": 39.3855, | |
| "eval_samples_per_second": 448.414, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 18.170731707317074, | |
| "grad_norm": 0.2505706250667572, | |
| "learning_rate": 5.053443105997068e-08, | |
| "loss": 1.6905, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 18.170731707317074, | |
| "eval_loss": 1.7033272981643677, | |
| "eval_runtime": 39.3764, | |
| "eval_samples_per_second": 448.517, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 18.29268292682927, | |
| "grad_norm": 0.2556091248989105, | |
| "learning_rate": 4.4069402944887704e-08, | |
| "loss": 1.6928, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 18.29268292682927, | |
| "eval_loss": 1.703181505203247, | |
| "eval_runtime": 39.3582, | |
| "eval_samples_per_second": 448.725, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 18.414634146341463, | |
| "grad_norm": 0.2573912739753723, | |
| "learning_rate": 3.803742378707198e-08, | |
| "loss": 1.6944, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 18.414634146341463, | |
| "eval_loss": 1.703063726425171, | |
| "eval_runtime": 39.3337, | |
| "eval_samples_per_second": 449.004, | |
| "eval_steps_per_second": 0.458, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 18.536585365853657, | |
| "grad_norm": 0.24173639714717865, | |
| "learning_rate": 3.24412261513064e-08, | |
| "loss": 1.6925, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 18.536585365853657, | |
| "eval_loss": 1.7029577493667603, | |
| "eval_runtime": 39.3737, | |
| "eval_samples_per_second": 448.549, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 18.658536585365855, | |
| "grad_norm": 0.24515186250209808, | |
| "learning_rate": 2.7283345187693264e-08, | |
| "loss": 1.6944, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 18.658536585365855, | |
| "eval_loss": 1.7028616666793823, | |
| "eval_runtime": 39.3701, | |
| "eval_samples_per_second": 448.589, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 18.78048780487805, | |
| "grad_norm": 0.25829750299453735, | |
| "learning_rate": 2.256611748319792e-08, | |
| "loss": 1.6897, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 18.78048780487805, | |
| "eval_loss": 1.7027884721755981, | |
| "eval_runtime": 39.4047, | |
| "eval_samples_per_second": 448.195, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 18.902439024390244, | |
| "grad_norm": 0.2337442934513092, | |
| "learning_rate": 1.8291680003145073e-08, | |
| "loss": 1.6915, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 18.902439024390244, | |
| "eval_loss": 1.702728033065796, | |
| "eval_runtime": 39.8565, | |
| "eval_samples_per_second": 443.115, | |
| "eval_steps_per_second": 0.452, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 19.024390243902438, | |
| "grad_norm": 0.24271942675113678, | |
| "learning_rate": 1.4461969123145457e-08, | |
| "loss": 1.6891, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 19.024390243902438, | |
| "eval_loss": 1.7026790380477905, | |
| "eval_runtime": 39.3468, | |
| "eval_samples_per_second": 448.854, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 19.146341463414632, | |
| "grad_norm": 0.2199811339378357, | |
| "learning_rate": 1.107871975189234e-08, | |
| "loss": 1.6884, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 19.146341463414632, | |
| "eval_loss": 1.702639102935791, | |
| "eval_runtime": 39.3619, | |
| "eval_samples_per_second": 448.683, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 19.26829268292683, | |
| "grad_norm": 0.24018193781375885, | |
| "learning_rate": 8.143464545226297e-09, | |
| "loss": 1.6962, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 19.26829268292683, | |
| "eval_loss": 1.7026113271713257, | |
| "eval_runtime": 39.2823, | |
| "eval_samples_per_second": 449.591, | |
| "eval_steps_per_second": 0.458, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 19.390243902439025, | |
| "grad_norm": 0.23089687526226044, | |
| "learning_rate": 5.657533211820941e-09, | |
| "loss": 1.6918, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 19.390243902439025, | |
| "eval_loss": 1.7025905847549438, | |
| "eval_runtime": 39.2847, | |
| "eval_samples_per_second": 449.564, | |
| "eval_steps_per_second": 0.458, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 19.51219512195122, | |
| "grad_norm": 0.219436913728714, | |
| "learning_rate": 3.6220519108086654e-09, | |
| "loss": 1.6906, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 19.51219512195122, | |
| "eval_loss": 1.7025744915008545, | |
| "eval_runtime": 39.3227, | |
| "eval_samples_per_second": 449.13, | |
| "eval_steps_per_second": 0.458, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 19.634146341463413, | |
| "grad_norm": 0.21289722621440887, | |
| "learning_rate": 2.037942741615617e-09, | |
| "loss": 1.691, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 19.634146341463413, | |
| "eval_loss": 1.7025699615478516, | |
| "eval_runtime": 39.3676, | |
| "eval_samples_per_second": 448.617, | |
| "eval_steps_per_second": 0.457, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 19.75609756097561, | |
| "grad_norm": 0.2050682008266449, | |
| "learning_rate": 9.059233262386224e-10, | |
| "loss": 1.6963, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 19.75609756097561, | |
| "eval_loss": 1.7025647163391113, | |
| "eval_runtime": 39.2816, | |
| "eval_samples_per_second": 449.599, | |
| "eval_steps_per_second": 0.458, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 19.878048780487806, | |
| "grad_norm": 0.2104637622833252, | |
| "learning_rate": 2.265064841533437e-10, | |
| "loss": 1.69, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 19.878048780487806, | |
| "eval_loss": 1.7025623321533203, | |
| "eval_runtime": 39.3024, | |
| "eval_samples_per_second": 449.362, | |
| "eval_steps_per_second": 0.458, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.24021713435649872, | |
| "learning_rate": 0.0, | |
| "loss": 1.6877, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 1.7025611400604248, | |
| "eval_runtime": 39.2695, | |
| "eval_samples_per_second": 449.738, | |
| "eval_steps_per_second": 0.458, | |
| "step": 1640 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1640, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 20, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.5456972789249475e+19, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |