| { |
| "best_metric": 1.0354996919631958, |
| "best_model_checkpoint": "./results/checkpoint-1100", |
| "epoch": 20.0, |
| "eval_steps": 500, |
| "global_step": 1100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.18181818181818182, |
| "grad_norm": 59.80698776245117, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 14.2827, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.36363636363636365, |
| "grad_norm": 56.797889709472656, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 14.071, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.5454545454545454, |
| "grad_norm": 57.82393264770508, |
| "learning_rate": 3e-06, |
| "loss": 13.2518, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "grad_norm": 59.039188385009766, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 12.7538, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.9090909090909091, |
| "grad_norm": 56.73802947998047, |
| "learning_rate": 5e-06, |
| "loss": 12.0862, |
| "step": 50 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 11.018159866333008, |
| "eval_runtime": 0.597, |
| "eval_samples_per_second": 162.49, |
| "eval_steps_per_second": 11.726, |
| "step": 55 |
| }, |
| { |
| "epoch": 1.0909090909090908, |
| "grad_norm": 51.90390396118164, |
| "learning_rate": 6e-06, |
| "loss": 11.2102, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.2727272727272727, |
| "grad_norm": 50.65810012817383, |
| "learning_rate": 7.000000000000001e-06, |
| "loss": 9.8171, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.4545454545454546, |
| "grad_norm": 47.3421630859375, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 8.5561, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.6363636363636362, |
| "grad_norm": 44.67892837524414, |
| "learning_rate": 9e-06, |
| "loss": 7.0233, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.8181818181818183, |
| "grad_norm": 28.591655731201172, |
| "learning_rate": 1e-05, |
| "loss": 5.5566, |
| "step": 100 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 28.30428695678711, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 4.5989, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 3.789405345916748, |
| "eval_runtime": 0.5933, |
| "eval_samples_per_second": 163.488, |
| "eval_steps_per_second": 11.798, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.1818181818181817, |
| "grad_norm": 10.207085609436035, |
| "learning_rate": 1.2e-05, |
| "loss": 3.8885, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.3636363636363638, |
| "grad_norm": 3.9164113998413086, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 3.4224, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.5454545454545454, |
| "grad_norm": 2.935734272003174, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 3.2076, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.7272727272727275, |
| "grad_norm": 3.6059088706970215, |
| "learning_rate": 1.5e-05, |
| "loss": 2.8546, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.909090909090909, |
| "grad_norm": 2.061573028564453, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 2.7753, |
| "step": 160 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 2.1805596351623535, |
| "eval_runtime": 0.5943, |
| "eval_samples_per_second": 163.223, |
| "eval_steps_per_second": 11.779, |
| "step": 165 |
| }, |
| { |
| "epoch": 3.090909090909091, |
| "grad_norm": 1.548179268836975, |
| "learning_rate": 1.7000000000000003e-05, |
| "loss": 2.4559, |
| "step": 170 |
| }, |
| { |
| "epoch": 3.2727272727272725, |
| "grad_norm": 1.3137719631195068, |
| "learning_rate": 1.8e-05, |
| "loss": 2.4184, |
| "step": 180 |
| }, |
| { |
| "epoch": 3.4545454545454546, |
| "grad_norm": 1.1977788209915161, |
| "learning_rate": 1.9e-05, |
| "loss": 2.4158, |
| "step": 190 |
| }, |
| { |
| "epoch": 3.6363636363636362, |
| "grad_norm": 1.3094607591629028, |
| "learning_rate": 2e-05, |
| "loss": 2.3414, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.8181818181818183, |
| "grad_norm": 1.8832674026489258, |
| "learning_rate": 2.1e-05, |
| "loss": 2.2682, |
| "step": 210 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 1.4545402526855469, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 2.2063, |
| "step": 220 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 1.8433746099472046, |
| "eval_runtime": 0.5925, |
| "eval_samples_per_second": 163.707, |
| "eval_steps_per_second": 11.814, |
| "step": 220 |
| }, |
| { |
| "epoch": 4.181818181818182, |
| "grad_norm": 1.1338486671447754, |
| "learning_rate": 2.3000000000000003e-05, |
| "loss": 2.1248, |
| "step": 230 |
| }, |
| { |
| "epoch": 4.363636363636363, |
| "grad_norm": 1.7253062725067139, |
| "learning_rate": 2.4e-05, |
| "loss": 2.0733, |
| "step": 240 |
| }, |
| { |
| "epoch": 4.545454545454545, |
| "grad_norm": 1.2330889701843262, |
| "learning_rate": 2.5e-05, |
| "loss": 2.1109, |
| "step": 250 |
| }, |
| { |
| "epoch": 4.7272727272727275, |
| "grad_norm": 1.1504757404327393, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 2.0596, |
| "step": 260 |
| }, |
| { |
| "epoch": 4.909090909090909, |
| "grad_norm": 0.8018497824668884, |
| "learning_rate": 2.7000000000000002e-05, |
| "loss": 1.95, |
| "step": 270 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 1.6294249296188354, |
| "eval_runtime": 0.5937, |
| "eval_samples_per_second": 163.37, |
| "eval_steps_per_second": 11.79, |
| "step": 275 |
| }, |
| { |
| "epoch": 5.090909090909091, |
| "grad_norm": 0.9103118181228638, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 1.9405, |
| "step": 280 |
| }, |
| { |
| "epoch": 5.2727272727272725, |
| "grad_norm": 0.7254413366317749, |
| "learning_rate": 2.9e-05, |
| "loss": 1.7494, |
| "step": 290 |
| }, |
| { |
| "epoch": 5.454545454545454, |
| "grad_norm": 0.8356502652168274, |
| "learning_rate": 3e-05, |
| "loss": 1.9466, |
| "step": 300 |
| }, |
| { |
| "epoch": 5.636363636363637, |
| "grad_norm": 0.9298439025878906, |
| "learning_rate": 3.1e-05, |
| "loss": 1.8615, |
| "step": 310 |
| }, |
| { |
| "epoch": 5.818181818181818, |
| "grad_norm": 0.8624694347381592, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 1.8345, |
| "step": 320 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 2.2793524265289307, |
| "learning_rate": 3.3e-05, |
| "loss": 1.8421, |
| "step": 330 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 1.492344856262207, |
| "eval_runtime": 0.5945, |
| "eval_samples_per_second": 163.169, |
| "eval_steps_per_second": 11.775, |
| "step": 330 |
| }, |
| { |
| "epoch": 6.181818181818182, |
| "grad_norm": 0.9516413807868958, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 1.7457, |
| "step": 340 |
| }, |
| { |
| "epoch": 6.363636363636363, |
| "grad_norm": 0.9872556924819946, |
| "learning_rate": 3.5e-05, |
| "loss": 1.7445, |
| "step": 350 |
| }, |
| { |
| "epoch": 6.545454545454545, |
| "grad_norm": 0.7271709442138672, |
| "learning_rate": 3.6e-05, |
| "loss": 1.6672, |
| "step": 360 |
| }, |
| { |
| "epoch": 6.7272727272727275, |
| "grad_norm": 0.9995086193084717, |
| "learning_rate": 3.7e-05, |
| "loss": 1.7882, |
| "step": 370 |
| }, |
| { |
| "epoch": 6.909090909090909, |
| "grad_norm": 0.6862213611602783, |
| "learning_rate": 3.8e-05, |
| "loss": 1.7476, |
| "step": 380 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 1.3870813846588135, |
| "eval_runtime": 0.5928, |
| "eval_samples_per_second": 163.634, |
| "eval_steps_per_second": 11.809, |
| "step": 385 |
| }, |
| { |
| "epoch": 7.090909090909091, |
| "grad_norm": 0.7572046518325806, |
| "learning_rate": 3.9000000000000006e-05, |
| "loss": 1.619, |
| "step": 390 |
| }, |
| { |
| "epoch": 7.2727272727272725, |
| "grad_norm": 0.797691822052002, |
| "learning_rate": 4e-05, |
| "loss": 1.6172, |
| "step": 400 |
| }, |
| { |
| "epoch": 7.454545454545454, |
| "grad_norm": 0.8255596160888672, |
| "learning_rate": 4.1e-05, |
| "loss": 1.6527, |
| "step": 410 |
| }, |
| { |
| "epoch": 7.636363636363637, |
| "grad_norm": 0.911715030670166, |
| "learning_rate": 4.2e-05, |
| "loss": 1.6293, |
| "step": 420 |
| }, |
| { |
| "epoch": 7.818181818181818, |
| "grad_norm": 0.969050943851471, |
| "learning_rate": 4.3e-05, |
| "loss": 1.6089, |
| "step": 430 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 1.2061145305633545, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 1.6103, |
| "step": 440 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 1.3084237575531006, |
| "eval_runtime": 0.5959, |
| "eval_samples_per_second": 162.78, |
| "eval_steps_per_second": 11.747, |
| "step": 440 |
| }, |
| { |
| "epoch": 8.181818181818182, |
| "grad_norm": 0.8220289349555969, |
| "learning_rate": 4.5e-05, |
| "loss": 1.6498, |
| "step": 450 |
| }, |
| { |
| "epoch": 8.363636363636363, |
| "grad_norm": 0.8548042178153992, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 1.5546, |
| "step": 460 |
| }, |
| { |
| "epoch": 8.545454545454545, |
| "grad_norm": 0.9596685767173767, |
| "learning_rate": 4.7e-05, |
| "loss": 1.4974, |
| "step": 470 |
| }, |
| { |
| "epoch": 8.727272727272727, |
| "grad_norm": 1.1037862300872803, |
| "learning_rate": 4.8e-05, |
| "loss": 1.4494, |
| "step": 480 |
| }, |
| { |
| "epoch": 8.909090909090908, |
| "grad_norm": 0.8066275119781494, |
| "learning_rate": 4.9e-05, |
| "loss": 1.5523, |
| "step": 490 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 1.242799162864685, |
| "eval_runtime": 0.5947, |
| "eval_samples_per_second": 163.098, |
| "eval_steps_per_second": 11.77, |
| "step": 495 |
| }, |
| { |
| "epoch": 9.090909090909092, |
| "grad_norm": 0.8395436406135559, |
| "learning_rate": 5e-05, |
| "loss": 1.5186, |
| "step": 500 |
| }, |
| { |
| "epoch": 9.272727272727273, |
| "grad_norm": 0.7990550994873047, |
| "learning_rate": 4.9166666666666665e-05, |
| "loss": 1.4607, |
| "step": 510 |
| }, |
| { |
| "epoch": 9.454545454545455, |
| "grad_norm": 1.9658387899398804, |
| "learning_rate": 4.8333333333333334e-05, |
| "loss": 1.5294, |
| "step": 520 |
| }, |
| { |
| "epoch": 9.636363636363637, |
| "grad_norm": 0.7331168055534363, |
| "learning_rate": 4.75e-05, |
| "loss": 1.4599, |
| "step": 530 |
| }, |
| { |
| "epoch": 9.818181818181818, |
| "grad_norm": 0.8598091006278992, |
| "learning_rate": 4.666666666666667e-05, |
| "loss": 1.4911, |
| "step": 540 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 1.3404613733291626, |
| "learning_rate": 4.5833333333333334e-05, |
| "loss": 1.3831, |
| "step": 550 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 1.192116141319275, |
| "eval_runtime": 0.5956, |
| "eval_samples_per_second": 162.849, |
| "eval_steps_per_second": 11.752, |
| "step": 550 |
| }, |
| { |
| "epoch": 10.181818181818182, |
| "grad_norm": 0.6602842211723328, |
| "learning_rate": 4.5e-05, |
| "loss": 1.4571, |
| "step": 560 |
| }, |
| { |
| "epoch": 10.363636363636363, |
| "grad_norm": 0.7361099123954773, |
| "learning_rate": 4.4166666666666665e-05, |
| "loss": 1.4129, |
| "step": 570 |
| }, |
| { |
| "epoch": 10.545454545454545, |
| "grad_norm": 0.5836505889892578, |
| "learning_rate": 4.3333333333333334e-05, |
| "loss": 1.4046, |
| "step": 580 |
| }, |
| { |
| "epoch": 10.727272727272727, |
| "grad_norm": 0.7169276475906372, |
| "learning_rate": 4.25e-05, |
| "loss": 1.3453, |
| "step": 590 |
| }, |
| { |
| "epoch": 10.909090909090908, |
| "grad_norm": 0.6864651441574097, |
| "learning_rate": 4.166666666666667e-05, |
| "loss": 1.3774, |
| "step": 600 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 1.1549417972564697, |
| "eval_runtime": 0.5937, |
| "eval_samples_per_second": 163.384, |
| "eval_steps_per_second": 11.791, |
| "step": 605 |
| }, |
| { |
| "epoch": 11.090909090909092, |
| "grad_norm": 0.5845214128494263, |
| "learning_rate": 4.0833333333333334e-05, |
| "loss": 1.4287, |
| "step": 610 |
| }, |
| { |
| "epoch": 11.272727272727273, |
| "grad_norm": 0.6631967425346375, |
| "learning_rate": 4e-05, |
| "loss": 1.412, |
| "step": 620 |
| }, |
| { |
| "epoch": 11.454545454545455, |
| "grad_norm": 0.7297359704971313, |
| "learning_rate": 3.9166666666666665e-05, |
| "loss": 1.3722, |
| "step": 630 |
| }, |
| { |
| "epoch": 11.636363636363637, |
| "grad_norm": 0.9224486947059631, |
| "learning_rate": 3.8333333333333334e-05, |
| "loss": 1.3108, |
| "step": 640 |
| }, |
| { |
| "epoch": 11.818181818181818, |
| "grad_norm": 0.7744407057762146, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 1.338, |
| "step": 650 |
| }, |
| { |
| "epoch": 12.0, |
| "grad_norm": 1.0474393367767334, |
| "learning_rate": 3.6666666666666666e-05, |
| "loss": 1.3958, |
| "step": 660 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 1.1199445724487305, |
| "eval_runtime": 0.5916, |
| "eval_samples_per_second": 163.952, |
| "eval_steps_per_second": 11.832, |
| "step": 660 |
| }, |
| { |
| "epoch": 12.181818181818182, |
| "grad_norm": 0.9611796140670776, |
| "learning_rate": 3.5833333333333335e-05, |
| "loss": 1.3588, |
| "step": 670 |
| }, |
| { |
| "epoch": 12.363636363636363, |
| "grad_norm": 0.5708998441696167, |
| "learning_rate": 3.5e-05, |
| "loss": 1.3708, |
| "step": 680 |
| }, |
| { |
| "epoch": 12.545454545454545, |
| "grad_norm": 0.6570747494697571, |
| "learning_rate": 3.4166666666666666e-05, |
| "loss": 1.3801, |
| "step": 690 |
| }, |
| { |
| "epoch": 12.727272727272727, |
| "grad_norm": 1.4075642824172974, |
| "learning_rate": 3.3333333333333335e-05, |
| "loss": 1.294, |
| "step": 700 |
| }, |
| { |
| "epoch": 12.909090909090908, |
| "grad_norm": 0.7119397521018982, |
| "learning_rate": 3.2500000000000004e-05, |
| "loss": 1.2247, |
| "step": 710 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 1.0983611345291138, |
| "eval_runtime": 0.5921, |
| "eval_samples_per_second": 163.826, |
| "eval_steps_per_second": 11.823, |
| "step": 715 |
| }, |
| { |
| "epoch": 13.090909090909092, |
| "grad_norm": 1.2503626346588135, |
| "learning_rate": 3.1666666666666666e-05, |
| "loss": 1.3526, |
| "step": 720 |
| }, |
| { |
| "epoch": 13.272727272727273, |
| "grad_norm": 0.7760170698165894, |
| "learning_rate": 3.0833333333333335e-05, |
| "loss": 1.256, |
| "step": 730 |
| }, |
| { |
| "epoch": 13.454545454545455, |
| "grad_norm": 0.7515042424201965, |
| "learning_rate": 3e-05, |
| "loss": 1.2808, |
| "step": 740 |
| }, |
| { |
| "epoch": 13.636363636363637, |
| "grad_norm": 1.301086664199829, |
| "learning_rate": 2.916666666666667e-05, |
| "loss": 1.3106, |
| "step": 750 |
| }, |
| { |
| "epoch": 13.818181818181818, |
| "grad_norm": 2.129178285598755, |
| "learning_rate": 2.8333333333333335e-05, |
| "loss": 1.3353, |
| "step": 760 |
| }, |
| { |
| "epoch": 14.0, |
| "grad_norm": 1.301637053489685, |
| "learning_rate": 2.7500000000000004e-05, |
| "loss": 1.3556, |
| "step": 770 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 1.079917550086975, |
| "eval_runtime": 0.593, |
| "eval_samples_per_second": 163.562, |
| "eval_steps_per_second": 11.803, |
| "step": 770 |
| }, |
| { |
| "epoch": 14.181818181818182, |
| "grad_norm": 1.0607006549835205, |
| "learning_rate": 2.6666666666666667e-05, |
| "loss": 1.2479, |
| "step": 780 |
| }, |
| { |
| "epoch": 14.363636363636363, |
| "grad_norm": 0.6330237984657288, |
| "learning_rate": 2.5833333333333336e-05, |
| "loss": 1.2676, |
| "step": 790 |
| }, |
| { |
| "epoch": 14.545454545454545, |
| "grad_norm": 0.5833084583282471, |
| "learning_rate": 2.5e-05, |
| "loss": 1.266, |
| "step": 800 |
| }, |
| { |
| "epoch": 14.727272727272727, |
| "grad_norm": 0.6804158687591553, |
| "learning_rate": 2.4166666666666667e-05, |
| "loss": 1.2527, |
| "step": 810 |
| }, |
| { |
| "epoch": 14.909090909090908, |
| "grad_norm": 0.612727165222168, |
| "learning_rate": 2.3333333333333336e-05, |
| "loss": 1.3262, |
| "step": 820 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 1.0633071660995483, |
| "eval_runtime": 0.5948, |
| "eval_samples_per_second": 163.077, |
| "eval_steps_per_second": 11.768, |
| "step": 825 |
| }, |
| { |
| "epoch": 15.090909090909092, |
| "grad_norm": 0.6649700999259949, |
| "learning_rate": 2.25e-05, |
| "loss": 1.2947, |
| "step": 830 |
| }, |
| { |
| "epoch": 15.272727272727273, |
| "grad_norm": 0.7356764674186707, |
| "learning_rate": 2.1666666666666667e-05, |
| "loss": 1.324, |
| "step": 840 |
| }, |
| { |
| "epoch": 15.454545454545455, |
| "grad_norm": 0.7462002038955688, |
| "learning_rate": 2.0833333333333336e-05, |
| "loss": 1.2928, |
| "step": 850 |
| }, |
| { |
| "epoch": 15.636363636363637, |
| "grad_norm": 0.6588531136512756, |
| "learning_rate": 2e-05, |
| "loss": 1.2479, |
| "step": 860 |
| }, |
| { |
| "epoch": 15.818181818181818, |
| "grad_norm": 0.7770337462425232, |
| "learning_rate": 1.9166666666666667e-05, |
| "loss": 1.1583, |
| "step": 870 |
| }, |
| { |
| "epoch": 16.0, |
| "grad_norm": 1.0388455390930176, |
| "learning_rate": 1.8333333333333333e-05, |
| "loss": 1.3213, |
| "step": 880 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 1.054055094718933, |
| "eval_runtime": 0.5914, |
| "eval_samples_per_second": 164.013, |
| "eval_steps_per_second": 11.836, |
| "step": 880 |
| }, |
| { |
| "epoch": 16.181818181818183, |
| "grad_norm": 0.7560206055641174, |
| "learning_rate": 1.75e-05, |
| "loss": 1.2206, |
| "step": 890 |
| }, |
| { |
| "epoch": 16.363636363636363, |
| "grad_norm": 0.6592769026756287, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 1.2581, |
| "step": 900 |
| }, |
| { |
| "epoch": 16.545454545454547, |
| "grad_norm": 1.3577500581741333, |
| "learning_rate": 1.5833333333333333e-05, |
| "loss": 1.2142, |
| "step": 910 |
| }, |
| { |
| "epoch": 16.727272727272727, |
| "grad_norm": 0.699577808380127, |
| "learning_rate": 1.5e-05, |
| "loss": 1.2416, |
| "step": 920 |
| }, |
| { |
| "epoch": 16.90909090909091, |
| "grad_norm": 0.6473222970962524, |
| "learning_rate": 1.4166666666666668e-05, |
| "loss": 1.294, |
| "step": 930 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 1.0457794666290283, |
| "eval_runtime": 0.594, |
| "eval_samples_per_second": 163.302, |
| "eval_steps_per_second": 11.785, |
| "step": 935 |
| }, |
| { |
| "epoch": 17.09090909090909, |
| "grad_norm": 0.6135825514793396, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 1.2185, |
| "step": 940 |
| }, |
| { |
| "epoch": 17.272727272727273, |
| "grad_norm": 0.764563798904419, |
| "learning_rate": 1.25e-05, |
| "loss": 1.1457, |
| "step": 950 |
| }, |
| { |
| "epoch": 17.454545454545453, |
| "grad_norm": 1.0261220932006836, |
| "learning_rate": 1.1666666666666668e-05, |
| "loss": 1.2785, |
| "step": 960 |
| }, |
| { |
| "epoch": 17.636363636363637, |
| "grad_norm": 0.7169294953346252, |
| "learning_rate": 1.0833333333333334e-05, |
| "loss": 1.2914, |
| "step": 970 |
| }, |
| { |
| "epoch": 17.818181818181817, |
| "grad_norm": 0.6962844133377075, |
| "learning_rate": 1e-05, |
| "loss": 1.2887, |
| "step": 980 |
| }, |
| { |
| "epoch": 18.0, |
| "grad_norm": 0.8768235445022583, |
| "learning_rate": 9.166666666666666e-06, |
| "loss": 1.1882, |
| "step": 990 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 1.0396397113800049, |
| "eval_runtime": 0.5941, |
| "eval_samples_per_second": 163.267, |
| "eval_steps_per_second": 11.782, |
| "step": 990 |
| }, |
| { |
| "epoch": 18.181818181818183, |
| "grad_norm": 0.699731707572937, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 1.2342, |
| "step": 1000 |
| }, |
| { |
| "epoch": 18.363636363636363, |
| "grad_norm": 0.6250368356704712, |
| "learning_rate": 7.5e-06, |
| "loss": 1.2437, |
| "step": 1010 |
| }, |
| { |
| "epoch": 18.545454545454547, |
| "grad_norm": 1.1902947425842285, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 1.2064, |
| "step": 1020 |
| }, |
| { |
| "epoch": 18.727272727272727, |
| "grad_norm": 0.562523603439331, |
| "learning_rate": 5.833333333333334e-06, |
| "loss": 1.2225, |
| "step": 1030 |
| }, |
| { |
| "epoch": 18.90909090909091, |
| "grad_norm": 1.156785011291504, |
| "learning_rate": 5e-06, |
| "loss": 1.3008, |
| "step": 1040 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 1.036926507949829, |
| "eval_runtime": 0.593, |
| "eval_samples_per_second": 163.562, |
| "eval_steps_per_second": 11.803, |
| "step": 1045 |
| }, |
| { |
| "epoch": 19.09090909090909, |
| "grad_norm": 0.8822602033615112, |
| "learning_rate": 4.166666666666667e-06, |
| "loss": 1.1771, |
| "step": 1050 |
| }, |
| { |
| "epoch": 19.272727272727273, |
| "grad_norm": 0.6822894811630249, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 1.1692, |
| "step": 1060 |
| }, |
| { |
| "epoch": 19.454545454545453, |
| "grad_norm": 0.7295346260070801, |
| "learning_rate": 2.5e-06, |
| "loss": 1.2821, |
| "step": 1070 |
| }, |
| { |
| "epoch": 19.636363636363637, |
| "grad_norm": 0.7395732998847961, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 1.1773, |
| "step": 1080 |
| }, |
| { |
| "epoch": 19.818181818181817, |
| "grad_norm": 0.6997842192649841, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 1.2195, |
| "step": 1090 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 1.0376482009887695, |
| "learning_rate": 0.0, |
| "loss": 1.3129, |
| "step": 1100 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 1.0354996919631958, |
| "eval_runtime": 0.593, |
| "eval_samples_per_second": 163.573, |
| "eval_steps_per_second": 11.804, |
| "step": 1100 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1100, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2354947345612800.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|