{
  "best_global_step": 7210,
  "best_metric": 2.6293044090270996,
  "best_model_checkpoint": "/workspace/AI/Tunning/FFT/_cpu_save/Llama-3.2-1B-ins-korean_best2/checkpoint-7210",
  "epoch": 8.0,
  "eval_steps": 103,
  "global_step": 8280,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00966533768273529,
      "grad_norm": 22.400630950927734,
      "learning_rate": 2.1739130434782606e-08,
      "loss": 3.6412,
      "step": 10
    },
    {
      "epoch": 0.01933067536547058,
      "grad_norm": 21.54340934753418,
      "learning_rate": 4.589371980676329e-08,
      "loss": 3.6599,
      "step": 20
    },
    {
      "epoch": 0.02899601304820587,
      "grad_norm": 17.44480323791504,
      "learning_rate": 7.004830917874397e-08,
      "loss": 3.6637,
      "step": 30
    },
    {
      "epoch": 0.03866135073094116,
      "grad_norm": 14.80634593963623,
      "learning_rate": 9.420289855072464e-08,
      "loss": 3.6108,
      "step": 40
    },
    {
      "epoch": 0.048326688413676454,
      "grad_norm": 11.775601387023926,
      "learning_rate": 1.1835748792270531e-07,
      "loss": 3.6227,
      "step": 50
    },
    {
      "epoch": 0.05799202609641174,
      "grad_norm": 17.34461212158203,
      "learning_rate": 1.4251207729468598e-07,
      "loss": 3.611,
      "step": 60
    },
    {
      "epoch": 0.06765736377914704,
      "grad_norm": 9.619295120239258,
      "learning_rate": 1.6666666666666665e-07,
      "loss": 3.5769,
      "step": 70
    },
    {
      "epoch": 0.07732270146188232,
      "grad_norm": 8.995287895202637,
      "learning_rate": 1.9082125603864732e-07,
      "loss": 3.5653,
      "step": 80
    },
    {
      "epoch": 0.08698803914461761,
      "grad_norm": 7.953859329223633,
      "learning_rate": 2.1497584541062802e-07,
      "loss": 3.5337,
      "step": 90
    },
    {
      "epoch": 0.09665337682735291,
      "grad_norm": 7.673056125640869,
      "learning_rate": 2.391304347826087e-07,
      "loss": 3.5483,
      "step": 100
    },
    {
      "epoch": 0.0995529781321735,
      "eval_loss": 3.5090503692626953,
      "eval_runtime": 217.1473,
      "eval_samples_per_second": 101.636,
      "eval_steps_per_second": 4.237,
      "step": 103
    },
    {
      "epoch": 0.1063187145100882,
      "grad_norm": 7.639675617218018,
      "learning_rate": 2.6328502415458936e-07,
      "loss": 3.5037,
      "step": 110
    },
    {
      "epoch": 0.11598405219282348,
      "grad_norm": 7.536130428314209,
      "learning_rate": 2.8743961352657006e-07,
      "loss": 3.5268,
      "step": 120
    },
    {
      "epoch": 0.12564938987555876,
      "grad_norm": 7.575876235961914,
      "learning_rate": 3.115942028985507e-07,
      "loss": 3.4664,
      "step": 130
    },
    {
      "epoch": 0.13531472755829407,
      "grad_norm": 7.081434726715088,
      "learning_rate": 3.357487922705314e-07,
      "loss": 3.4784,
      "step": 140
    },
    {
      "epoch": 0.14498006524102935,
      "grad_norm": 7.422059059143066,
      "learning_rate": 3.5990338164251205e-07,
      "loss": 3.475,
      "step": 150
    },
    {
      "epoch": 0.15464540292376464,
      "grad_norm": 7.345850467681885,
      "learning_rate": 3.8405797101449274e-07,
      "loss": 3.4595,
      "step": 160
    },
    {
      "epoch": 0.16431074060649994,
      "grad_norm": 7.430078029632568,
      "learning_rate": 4.082125603864734e-07,
      "loss": 3.4618,
      "step": 170
    },
    {
      "epoch": 0.17397607828923523,
      "grad_norm": 7.277656078338623,
      "learning_rate": 4.323671497584541e-07,
      "loss": 3.4182,
      "step": 180
    },
    {
      "epoch": 0.18364141597197053,
      "grad_norm": 7.314350605010986,
      "learning_rate": 4.5652173913043473e-07,
      "loss": 3.4357,
      "step": 190
    },
    {
      "epoch": 0.19330675365470582,
      "grad_norm": 7.0931267738342285,
      "learning_rate": 4.806763285024155e-07,
      "loss": 3.359,
      "step": 200
    },
    {
      "epoch": 0.199105956264347,
      "eval_loss": 3.377324342727661,
      "eval_runtime": 216.8839,
      "eval_samples_per_second": 101.759,
      "eval_steps_per_second": 4.242,
      "step": 206
    },
    {
      "epoch": 0.2029720913374411,
      "grad_norm": 7.167511940002441,
      "learning_rate": 5.048309178743962e-07,
      "loss": 3.3954,
      "step": 210
    },
    {
      "epoch": 0.2126374290201764,
      "grad_norm": 6.995325565338135,
      "learning_rate": 5.289855072463768e-07,
      "loss": 3.3675,
      "step": 220
    },
    {
      "epoch": 0.22230276670291169,
      "grad_norm": 7.191945552825928,
      "learning_rate": 5.531400966183575e-07,
      "loss": 3.333,
      "step": 230
    },
    {
      "epoch": 0.23196810438564697,
      "grad_norm": 7.308806419372559,
      "learning_rate": 5.772946859903382e-07,
      "loss": 3.3752,
      "step": 240
    },
    {
      "epoch": 0.24163344206838228,
      "grad_norm": 7.029453277587891,
      "learning_rate": 6.014492753623189e-07,
      "loss": 3.3537,
      "step": 250
    },
    {
      "epoch": 0.25129877975111753,
      "grad_norm": 7.261192798614502,
      "learning_rate": 6.256038647342995e-07,
      "loss": 3.3393,
      "step": 260
    },
    {
      "epoch": 0.26096411743385284,
      "grad_norm": 7.451327800750732,
      "learning_rate": 6.497584541062802e-07,
      "loss": 3.3266,
      "step": 270
    },
    {
      "epoch": 0.27062945511658815,
      "grad_norm": 7.12129020690918,
      "learning_rate": 6.739130434782609e-07,
      "loss": 3.293,
      "step": 280
    },
    {
      "epoch": 0.2802947927993234,
      "grad_norm": 7.322017192840576,
      "learning_rate": 6.980676328502416e-07,
      "loss": 3.3214,
      "step": 290
    },
    {
      "epoch": 0.2899601304820587,
      "grad_norm": 7.200961589813232,
      "learning_rate": 7.222222222222221e-07,
      "loss": 3.298,
      "step": 300
    },
    {
      "epoch": 0.29865893439652047,
      "eval_loss": 3.266552448272705,
      "eval_runtime": 217.0856,
      "eval_samples_per_second": 101.665,
      "eval_steps_per_second": 4.238,
      "step": 309
    },
    {
      "epoch": 0.299625468164794,
      "grad_norm": 7.080708026885986,
      "learning_rate": 7.463768115942028e-07,
      "loss": 3.2466,
      "step": 310
    },
    {
      "epoch": 0.30929080584752927,
      "grad_norm": 7.21260929107666,
      "learning_rate": 7.705314009661835e-07,
      "loss": 3.2607,
      "step": 320
    },
    {
      "epoch": 0.3189561435302646,
      "grad_norm": 7.206089973449707,
      "learning_rate": 7.946859903381642e-07,
      "loss": 3.2278,
      "step": 330
    },
    {
      "epoch": 0.3286214812129999,
      "grad_norm": 7.031430244445801,
      "learning_rate": 8.188405797101448e-07,
      "loss": 3.2724,
      "step": 340
    },
    {
      "epoch": 0.3382868188957352,
      "grad_norm": 7.544980525970459,
      "learning_rate": 8.429951690821255e-07,
      "loss": 3.2514,
      "step": 350
    },
    {
      "epoch": 0.34795215657847045,
      "grad_norm": 7.031979084014893,
      "learning_rate": 8.671497584541062e-07,
      "loss": 3.223,
      "step": 360
    },
    {
      "epoch": 0.35761749426120576,
      "grad_norm": 7.519832611083984,
      "learning_rate": 8.913043478260869e-07,
      "loss": 3.2113,
      "step": 370
    },
    {
      "epoch": 0.36728283194394107,
      "grad_norm": 7.309943675994873,
      "learning_rate": 9.154589371980675e-07,
      "loss": 3.2434,
      "step": 380
    },
    {
      "epoch": 0.3769481696266763,
      "grad_norm": 7.484668254852295,
      "learning_rate": 9.396135265700482e-07,
      "loss": 3.1784,
      "step": 390
    },
    {
      "epoch": 0.38661350730941163,
      "grad_norm": 7.425693988800049,
      "learning_rate": 9.637681159420288e-07,
      "loss": 3.224,
      "step": 400
    },
    {
      "epoch": 0.39627884499214694,
      "grad_norm": 7.163578987121582,
      "learning_rate": 9.879227053140095e-07,
      "loss": 3.1703,
      "step": 410
    },
    {
      "epoch": 0.398211912528694,
      "eval_loss": 3.1767160892486572,
      "eval_runtime": 217.0763,
      "eval_samples_per_second": 101.669,
      "eval_steps_per_second": 4.238,
      "step": 412
    },
    {
      "epoch": 0.4059441826748822,
      "grad_norm": 7.424842357635498,
      "learning_rate": 1.0120772946859904e-06,
      "loss": 3.1604,
      "step": 420
    },
    {
      "epoch": 0.4156095203576175,
      "grad_norm": 7.412405967712402,
      "learning_rate": 1.036231884057971e-06,
      "loss": 3.1459,
      "step": 430
    },
    {
      "epoch": 0.4252748580403528,
      "grad_norm": 7.094534397125244,
      "learning_rate": 1.0603864734299516e-06,
      "loss": 3.1701,
      "step": 440
    },
    {
      "epoch": 0.43494019572308806,
      "grad_norm": 7.796970844268799,
      "learning_rate": 1.0845410628019323e-06,
      "loss": 3.1437,
      "step": 450
    },
    {
      "epoch": 0.44460553340582337,
      "grad_norm": 7.447244644165039,
      "learning_rate": 1.108695652173913e-06,
      "loss": 3.1425,
      "step": 460
    },
    {
      "epoch": 0.4542708710885587,
      "grad_norm": 7.39740514755249,
      "learning_rate": 1.1328502415458937e-06,
      "loss": 3.1273,
      "step": 470
    },
    {
      "epoch": 0.46393620877129393,
      "grad_norm": 7.5719428062438965,
      "learning_rate": 1.1570048309178744e-06,
      "loss": 3.1535,
      "step": 480
    },
    {
      "epoch": 0.47360154645402924,
      "grad_norm": 7.861938953399658,
      "learning_rate": 1.1811594202898549e-06,
      "loss": 3.1265,
      "step": 490
    },
    {
      "epoch": 0.48326688413676455,
      "grad_norm": 7.544646263122559,
      "learning_rate": 1.2053140096618358e-06,
      "loss": 3.1087,
      "step": 500
    },
    {
      "epoch": 0.4929322218194998,
      "grad_norm": 7.783000469207764,
      "learning_rate": 1.2294685990338163e-06,
      "loss": 3.1148,
      "step": 510
    },
    {
      "epoch": 0.4977648906608675,
      "eval_loss": 3.1075122356414795,
      "eval_runtime": 216.8998,
      "eval_samples_per_second": 101.752,
      "eval_steps_per_second": 4.242,
      "step": 515
    },
    {
      "epoch": 0.5025975595022351,
      "grad_norm": 7.453518390655518,
      "learning_rate": 1.253623188405797e-06,
      "loss": 3.1099,
      "step": 520
    },
    {
      "epoch": 0.5122628971849704,
      "grad_norm": 7.3546600341796875,
      "learning_rate": 1.2777777777777777e-06,
      "loss": 3.0977,
      "step": 530
    },
    {
      "epoch": 0.5219282348677057,
      "grad_norm": 7.336857318878174,
      "learning_rate": 1.3019323671497584e-06,
      "loss": 3.1053,
      "step": 540
    },
    {
      "epoch": 0.5315935725504409,
      "grad_norm": 7.137737274169922,
      "learning_rate": 1.326086956521739e-06,
      "loss": 3.1258,
      "step": 550
    },
    {
      "epoch": 0.5412589102331763,
      "grad_norm": 7.388270854949951,
      "learning_rate": 1.3502415458937198e-06,
      "loss": 3.0867,
      "step": 560
    },
    {
      "epoch": 0.5509242479159115,
      "grad_norm": 7.798702239990234,
      "learning_rate": 1.3743961352657005e-06,
      "loss": 3.0653,
      "step": 570
    },
    {
      "epoch": 0.5605895855986468,
      "grad_norm": 7.419099807739258,
      "learning_rate": 1.3985507246376811e-06,
      "loss": 3.0953,
      "step": 580
    },
    {
      "epoch": 0.5702549232813822,
      "grad_norm": 7.186514377593994,
      "learning_rate": 1.4227053140096618e-06,
      "loss": 3.0377,
      "step": 590
    },
    {
      "epoch": 0.5799202609641174,
      "grad_norm": 7.876040935516357,
      "learning_rate": 1.4468599033816423e-06,
      "loss": 3.0786,
      "step": 600
    },
    {
      "epoch": 0.5895855986468527,
      "grad_norm": 7.537592887878418,
      "learning_rate": 1.4710144927536232e-06,
      "loss": 3.0611,
      "step": 610
    },
    {
      "epoch": 0.5973178687930409,
      "eval_loss": 3.0519590377807617,
      "eval_runtime": 217.0226,
      "eval_samples_per_second": 101.694,
      "eval_steps_per_second": 4.239,
      "step": 618
    },
    {
      "epoch": 0.599250936329588,
      "grad_norm": 7.553813457489014,
      "learning_rate": 1.4951690821256037e-06,
      "loss": 3.0642,
      "step": 620
    },
    {
      "epoch": 0.6089162740123233,
      "grad_norm": 7.489668369293213,
      "learning_rate": 1.5193236714975846e-06,
      "loss": 3.0466,
      "step": 630
    },
    {
      "epoch": 0.6185816116950585,
      "grad_norm": 7.333913803100586,
      "learning_rate": 1.5434782608695651e-06,
      "loss": 3.0133,
      "step": 640
    },
    {
      "epoch": 0.6282469493777939,
      "grad_norm": 7.396905422210693,
      "learning_rate": 1.5676328502415458e-06,
      "loss": 3.0207,
      "step": 650
    },
    {
      "epoch": 0.6379122870605292,
      "grad_norm": 7.6018266677856445,
      "learning_rate": 1.5917874396135265e-06,
      "loss": 3.0175,
      "step": 660
    },
    {
      "epoch": 0.6475776247432644,
      "grad_norm": 7.45857048034668,
      "learning_rate": 1.6159420289855072e-06,
      "loss": 3.0618,
      "step": 670
    },
    {
      "epoch": 0.6572429624259998,
      "grad_norm": 7.37148904800415,
      "learning_rate": 1.6400966183574877e-06,
      "loss": 3.0365,
      "step": 680
    },
    {
      "epoch": 0.666908300108735,
      "grad_norm": 7.367425918579102,
      "learning_rate": 1.6642512077294686e-06,
      "loss": 3.0245,
      "step": 690
    },
    {
      "epoch": 0.6765736377914704,
      "grad_norm": 7.418240547180176,
      "learning_rate": 1.688405797101449e-06,
      "loss": 3.0269,
      "step": 700
    },
    {
      "epoch": 0.6862389754742056,
      "grad_norm": 7.5704426765441895,
      "learning_rate": 1.71256038647343e-06,
      "loss": 3.0419,
      "step": 710
    },
    {
      "epoch": 0.6959043131569409,
      "grad_norm": 7.671561241149902,
      "learning_rate": 1.7367149758454105e-06,
      "loss": 3.0498,
      "step": 720
    },
    {
      "epoch": 0.6968708469252145,
      "eval_loss": 3.005807876586914,
      "eval_runtime": 216.8392,
      "eval_samples_per_second": 101.781,
      "eval_steps_per_second": 4.243,
      "step": 721
    },
    {
      "epoch": 0.7055696508396763,
      "grad_norm": 7.509618759155273,
      "learning_rate": 1.7608695652173912e-06,
      "loss": 3.0009,
      "step": 730
    },
    {
      "epoch": 0.7152349885224115,
      "grad_norm": 7.517166614532471,
      "learning_rate": 1.7850241545893719e-06,
      "loss": 2.992,
      "step": 740
    },
    {
      "epoch": 0.7249003262051468,
      "grad_norm": 7.742889881134033,
      "learning_rate": 1.8091787439613526e-06,
      "loss": 2.9995,
      "step": 750
    },
    {
      "epoch": 0.7345656638878821,
      "grad_norm": 7.825319290161133,
      "learning_rate": 1.833333333333333e-06,
      "loss": 3.021,
      "step": 760
    },
    {
      "epoch": 0.7442310015706174,
      "grad_norm": 7.782094478607178,
      "learning_rate": 1.857487922705314e-06,
      "loss": 3.0349,
      "step": 770
    },
    {
      "epoch": 0.7538963392533526,
      "grad_norm": 7.751500606536865,
      "learning_rate": 1.8816425120772945e-06,
      "loss": 2.9727,
      "step": 780
    },
    {
      "epoch": 0.763561676936088,
      "grad_norm": 7.506228923797607,
      "learning_rate": 1.9057971014492754e-06,
      "loss": 2.9381,
      "step": 790
    },
    {
      "epoch": 0.7732270146188233,
      "grad_norm": 7.2402849197387695,
      "learning_rate": 1.9299516908212557e-06,
      "loss": 2.9831,
      "step": 800
    },
    {
      "epoch": 0.7828923523015585,
      "grad_norm": 7.474483013153076,
      "learning_rate": 1.9541062801932366e-06,
      "loss": 3.0057,
      "step": 810
    },
    {
      "epoch": 0.7925576899842939,
      "grad_norm": 7.3539934158325195,
      "learning_rate": 1.978260869565217e-06,
      "loss": 2.969,
      "step": 820
    },
    {
      "epoch": 0.796423825057388,
      "eval_loss": 2.966066360473633,
      "eval_runtime": 216.579,
      "eval_samples_per_second": 101.903,
      "eval_steps_per_second": 4.248,
      "step": 824
    },
    {
      "epoch": 0.8022230276670291,
      "grad_norm": 7.149211406707764,
      "learning_rate": 1.99999991113637e-06,
      "loss": 2.974,
      "step": 830
    },
    {
      "epoch": 0.8118883653497644,
      "grad_norm": 7.528937339782715,
      "learning_rate": 1.9999892475198546e-06,
      "loss": 2.9405,
      "step": 840
    },
    {
      "epoch": 0.8215537030324997,
      "grad_norm": 7.3311614990234375,
      "learning_rate": 1.9999608113944566e-06,
      "loss": 2.9328,
      "step": 850
    },
    {
      "epoch": 0.831219040715235,
      "grad_norm": 7.876272678375244,
      "learning_rate": 1.9999146032655633e-06,
      "loss": 2.9703,
      "step": 860
    },
    {
      "epoch": 0.8408843783979703,
      "grad_norm": 7.37180757522583,
      "learning_rate": 1.999850623954417e-06,
      "loss": 2.9538,
      "step": 870
    },
    {
      "epoch": 0.8505497160807056,
      "grad_norm": 7.159989833831787,
      "learning_rate": 1.999768874598104e-06,
      "loss": 2.934,
      "step": 880
    },
    {
      "epoch": 0.8602150537634409,
      "grad_norm": 7.683533191680908,
      "learning_rate": 1.9996693566495293e-06,
      "loss": 2.9945,
      "step": 890
    },
    {
      "epoch": 0.8698803914461761,
      "grad_norm": 7.5594353675842285,
      "learning_rate": 1.999552071877397e-06,
      "loss": 2.9649,
      "step": 900
    },
    {
      "epoch": 0.8795457291289115,
      "grad_norm": 7.358302116394043,
      "learning_rate": 1.999417022366174e-06,
      "loss": 2.9244,
      "step": 910
    },
    {
      "epoch": 0.8892110668116467,
      "grad_norm": 7.427350997924805,
      "learning_rate": 1.9992642105160544e-06,
      "loss": 2.9356,
      "step": 920
    },
    {
      "epoch": 0.8959768031895614,
      "eval_loss": 2.9331393241882324,
      "eval_runtime": 216.9209,
      "eval_samples_per_second": 101.742,
      "eval_steps_per_second": 4.241,
      "step": 927
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 7.108737945556641,
      "learning_rate": 1.9990936390429174e-06,
      "loss": 2.8991,
      "step": 930
    },
    {
      "epoch": 0.9085417421771174,
      "grad_norm": 7.447180271148682,
      "learning_rate": 1.9989053109782786e-06,
      "loss": 2.9311,
      "step": 940
    },
    {
      "epoch": 0.9182070798598526,
      "grad_norm": 7.615334987640381,
      "learning_rate": 1.998699229669236e-06,
      "loss": 2.9204,
      "step": 950
    },
    {
      "epoch": 0.9278724175425879,
      "grad_norm": 7.724397659301758,
      "learning_rate": 1.9984753987784118e-06,
      "loss": 2.9048,
      "step": 960
    },
    {
      "epoch": 0.9375377552253232,
      "grad_norm": 7.22221565246582,
      "learning_rate": 1.9982338222838842e-06,
      "loss": 2.9372,
      "step": 970
    },
    {
      "epoch": 0.9472030929080585,
      "grad_norm": 7.188429832458496,
      "learning_rate": 1.9979745044791204e-06,
      "loss": 2.9471,
      "step": 980
    },
    {
      "epoch": 0.9568684305907937,
      "grad_norm": 7.243603706359863,
      "learning_rate": 1.9976974499728977e-06,
      "loss": 2.9325,
      "step": 990
    },
    {
      "epoch": 0.9665337682735291,
      "grad_norm": 7.491036891937256,
      "learning_rate": 1.9974026636892224e-06,
      "loss": 2.9511,
      "step": 1000
    },
    {
      "epoch": 0.9761991059562644,
      "grad_norm": 7.160167217254639,
      "learning_rate": 1.9970901508672435e-06,
      "loss": 2.9265,
      "step": 1010
    },
    {
      "epoch": 0.9858644436389996,
      "grad_norm": 7.02191686630249,
      "learning_rate": 1.9967599170611565e-06,
      "loss": 2.9277,
      "step": 1020
    },
    {
      "epoch": 0.995529781321735,
      "grad_norm": 7.492205619812012,
      "learning_rate": 1.996411968140109e-06,
      "loss": 2.9185,
      "step": 1030
    },
    {
      "epoch": 0.995529781321735,
      "eval_loss": 2.905233383178711,
      "eval_runtime": 216.909,
      "eval_samples_per_second": 101.748,
      "eval_steps_per_second": 4.241,
      "step": 1030
    },
    {
      "epoch": 1.0048326688413676,
      "grad_norm": 7.374047756195068,
      "learning_rate": 1.996046310288092e-06,
      "loss": 2.9196,
      "step": 1040
    },
    {
      "epoch": 1.014498006524103,
      "grad_norm": 7.255711555480957,
      "learning_rate": 1.995662950003833e-06,
      "loss": 2.8683,
      "step": 1050
    },
    {
      "epoch": 1.0241633442068383,
      "grad_norm": 7.150922775268555,
      "learning_rate": 1.9952618941006794e-06,
      "loss": 2.8806,
      "step": 1060
    },
    {
      "epoch": 1.0338286818895734,
      "grad_norm": 7.3063883781433105,
      "learning_rate": 1.9948431497064772e-06,
      "loss": 2.8827,
      "step": 1070
    },
    {
      "epoch": 1.0434940195723088,
      "grad_norm": 7.305731773376465,
      "learning_rate": 1.994406724263445e-06,
      "loss": 2.8813,
      "step": 1080
    },
    {
      "epoch": 1.0531593572550442,
      "grad_norm": 7.360948085784912,
      "learning_rate": 1.9939526255280415e-06,
      "loss": 2.8888,
      "step": 1090
    },
    {
      "epoch": 1.0628246949377793,
      "grad_norm": 7.436374187469482,
      "learning_rate": 1.993480861570827e-06,
      "loss": 2.8621,
      "step": 1100
    },
    {
      "epoch": 1.0724900326205147,
      "grad_norm": 7.136722564697266,
      "learning_rate": 1.9929914407763206e-06,
      "loss": 2.8604,
      "step": 1110
    },
    {
      "epoch": 1.08215537030325,
      "grad_norm": 7.82803201675415,
      "learning_rate": 1.9924843718428517e-06,
      "loss": 2.8565,
      "step": 1120
    },
    {
      "epoch": 1.0918207079859852,
      "grad_norm": 7.341372489929199,
      "learning_rate": 1.9919596637824044e-06,
      "loss": 2.8914,
      "step": 1130
    },
    {
      "epoch": 1.0947203092908058,
      "eval_loss": 2.883575201034546,
      "eval_runtime": 216.9173,
      "eval_samples_per_second": 101.744,
      "eval_steps_per_second": 4.241,
      "step": 1133
    },
    {
      "epoch": 1.1014860456687205,
      "grad_norm": 7.483231544494629,
      "learning_rate": 1.991417325920457e-06,
      "loss": 2.8729,
      "step": 1140
    },
    {
      "epoch": 1.111151383351456,
      "grad_norm": 7.345571041107178,
      "learning_rate": 1.990857367895818e-06,
      "loss": 2.8265,
      "step": 1150
    },
    {
      "epoch": 1.120816721034191,
      "grad_norm": 7.247069835662842,
      "learning_rate": 1.9902797996604535e-06,
      "loss": 2.8562,
      "step": 1160
    },
    {
      "epoch": 1.1304820587169264,
      "grad_norm": 7.213609218597412,
      "learning_rate": 1.9896846314793106e-06,
      "loss": 2.8768,
      "step": 1170
    },
    {
      "epoch": 1.1401473963996618,
      "grad_norm": 7.366117477416992,
      "learning_rate": 1.989071873930134e-06,
      "loss": 2.8697,
      "step": 1180
    },
    {
      "epoch": 1.149812734082397,
      "grad_norm": 7.163483619689941,
      "learning_rate": 1.9884415379032807e-06,
      "loss": 2.8694,
      "step": 1190
    },
    {
      "epoch": 1.1594780717651323,
      "grad_norm": 7.064028739929199,
      "learning_rate": 1.9877936346015232e-06,
      "loss": 2.8417,
      "step": 1200
    },
    {
      "epoch": 1.1691434094478677,
      "grad_norm": 7.20824670791626,
      "learning_rate": 1.987128175539853e-06,
      "loss": 2.8613,
      "step": 1210
    },
    {
      "epoch": 1.1788087471306028,
      "grad_norm": 7.681613922119141,
      "learning_rate": 1.9864451725452738e-06,
      "loss": 2.8694,
      "step": 1220
    },
    {
      "epoch": 1.1884740848133382,
      "grad_norm": 7.184121131896973,
      "learning_rate": 1.985744637756593e-06,
      "loss": 2.8297,
      "step": 1230
    },
    {
      "epoch": 1.1942732874229793,
      "eval_loss": 2.863403081893921,
      "eval_runtime": 216.6561,
      "eval_samples_per_second": 101.867,
      "eval_steps_per_second": 4.246,
      "step": 1236
    },
    {
      "epoch": 1.1981394224960735,
      "grad_norm": 7.362229347229004,
      "learning_rate": 1.985026583624206e-06,
      "loss": 2.876,
      "step": 1240
    },
    {
      "epoch": 1.2078047601788087,
      "grad_norm": 6.909084320068359,
      "learning_rate": 1.9842910229098727e-06,
      "loss": 2.8253,
      "step": 1250
    },
    {
      "epoch": 1.217470097861544,
      "grad_norm": 7.297941207885742,
      "learning_rate": 1.983537968686493e-06,
      "loss": 2.832,
      "step": 1260
    },
    {
      "epoch": 1.2271354355442794,
      "grad_norm": 7.0146164894104,
      "learning_rate": 1.982767434337874e-06,
      "loss": 2.8104,
      "step": 1270
    },
    {
      "epoch": 1.2368007732270145,
      "grad_norm": 7.416932582855225,
      "learning_rate": 1.9819794335584913e-06,
      "loss": 2.8184,
      "step": 1280
    },
    {
      "epoch": 1.24646611090975,
      "grad_norm": 7.231618881225586,
      "learning_rate": 1.981173980353246e-06,
      "loss": 2.8503,
      "step": 1290
    },
    {
      "epoch": 1.2561314485924853,
      "grad_norm": 7.189469337463379,
      "learning_rate": 1.9803510890372174e-06,
      "loss": 2.8199,
      "step": 1300
    },
    {
      "epoch": 1.2657967862752204,
      "grad_norm": 7.181421279907227,
      "learning_rate": 1.9795107742354046e-06,
      "loss": 2.8475,
      "step": 1310
    },
    {
      "epoch": 1.2754621239579558,
      "grad_norm": 7.317245960235596,
      "learning_rate": 1.9786530508824715e-06,
      "loss": 2.8784,
      "step": 1320
    },
    {
      "epoch": 1.2851274616406911,
      "grad_norm": 7.532864570617676,
      "learning_rate": 1.9777779342224776e-06,
      "loss": 2.8484,
      "step": 1330
    },
    {
      "epoch": 1.293826265555153,
      "eval_loss": 2.8460726737976074,
      "eval_runtime": 216.904,
      "eval_samples_per_second": 101.75,
      "eval_steps_per_second": 4.242,
      "step": 1339
    },
    {
      "epoch": 1.2947927993234263,
      "grad_norm": 6.849309921264648,
      "learning_rate": 1.9768854398086084e-06,
      "loss": 2.8427,
      "step": 1340
    },
    {
      "epoch": 1.3044581370061616,
      "grad_norm": 7.047804355621338,
      "learning_rate": 1.9759755835029e-06,
      "loss": 2.8141,
      "step": 1350
    },
    {
      "epoch": 1.314123474688897,
      "grad_norm": 7.175754547119141,
      "learning_rate": 1.975048381475955e-06,
      "loss": 2.8576,
      "step": 1360
    },
    {
      "epoch": 1.3237888123716322,
      "grad_norm": 7.228835105895996,
      "learning_rate": 1.9741038502066566e-06,
      "loss": 2.8288,
      "step": 1370
    },
    {
      "epoch": 1.3334541500543675,
      "grad_norm": 7.117552757263184,
      "learning_rate": 1.9731420064818765e-06,
      "loss": 2.8667,
      "step": 1380
    },
    {
      "epoch": 1.3431194877371029,
      "grad_norm": 7.141602039337158,
      "learning_rate": 1.9721628673961743e-06,
      "loss": 2.8021,
      "step": 1390
    },
    {
      "epoch": 1.352784825419838,
      "grad_norm": 7.065165042877197,
      "learning_rate": 1.971166450351495e-06,
      "loss": 2.8376,
      "step": 1400
    },
    {
      "epoch": 1.3624501631025734,
      "grad_norm": 7.072872638702393,
      "learning_rate": 1.97015277305686e-06,
      "loss": 2.8373,
      "step": 1410
    },
    {
      "epoch": 1.3721155007853088,
      "grad_norm": 6.841920375823975,
      "learning_rate": 1.9691218535280517e-06,
      "loss": 2.8458,
      "step": 1420
    },
    {
      "epoch": 1.381780838468044,
      "grad_norm": 7.044722557067871,
      "learning_rate": 1.968073710087294e-06,
      "loss": 2.8373,
      "step": 1430
    },
    {
      "epoch": 1.3914461761507793,
      "grad_norm": 7.605813980102539,
      "learning_rate": 1.9670083613629245e-06,
      "loss": 2.8032,
      "step": 1440
    },
    {
      "epoch": 1.3933792436873262,
      "eval_loss": 2.8305492401123047,
      "eval_runtime": 216.7183,
      "eval_samples_per_second": 101.837,
      "eval_steps_per_second": 4.245,
      "step": 1442
    },
    {
      "epoch": 1.4011115138335146,
      "grad_norm": 7.031517505645752,
      "learning_rate": 1.965925826289068e-06,
      "loss": 2.8395,
      "step": 1450
    },
    {
      "epoch": 1.4107768515162498,
      "grad_norm": 7.169219970703125,
      "learning_rate": 1.9648261241052952e-06,
      "loss": 2.8244,
      "step": 1460
    },
    {
      "epoch": 1.4204421891989851,
      "grad_norm": 7.192759037017822,
      "learning_rate": 1.9637092743562823e-06,
      "loss": 2.8123,
      "step": 1470
    },
    {
      "epoch": 1.4301075268817205,
      "grad_norm": 7.061282634735107,
      "learning_rate": 1.9625752968914656e-06,
      "loss": 2.7968,
      "step": 1480
    },
    {
      "epoch": 1.4397728645644556,
      "grad_norm": 7.095070838928223,
      "learning_rate": 1.9614242118646858e-06,
      "loss": 2.8005,
      "step": 1490
    },
    {
      "epoch": 1.449438202247191,
      "grad_norm": 6.964037895202637,
      "learning_rate": 1.9602560397338325e-06,
      "loss": 2.7837,
      "step": 1500
    },
    {
      "epoch": 1.4591035399299264,
      "grad_norm": 6.997467517852783,
      "learning_rate": 1.959070801260478e-06,
      "loss": 2.8355,
      "step": 1510
    },
    {
      "epoch": 1.4687688776126615,
      "grad_norm": 6.82883882522583,
      "learning_rate": 1.9578685175095098e-06,
      "loss": 2.817,
      "step": 1520
    },
    {
      "epoch": 1.4784342152953969,
      "grad_norm": 6.964229583740234,
      "learning_rate": 1.9566492098487572e-06,
      "loss": 2.7913,
      "step": 1530
    },
    {
      "epoch": 1.4880995529781322,
      "grad_norm": 6.858983516693115,
      "learning_rate": 1.9554128999486085e-06,
      "loss": 2.7997,
      "step": 1540
    },
    {
      "epoch": 1.4929322218194998,
      "eval_loss": 2.816601037979126,
      "eval_runtime": 217.0118,
      "eval_samples_per_second": 101.7,
      "eval_steps_per_second": 4.239,
      "step": 1545
    },
    {
      "epoch": 1.4977648906608674,
      "grad_norm": 6.959181308746338,
      "learning_rate": 1.954159609781629e-06,
      "loss": 2.7839,
      "step": 1550
    },
    {
      "epoch": 1.5074302283436027,
      "grad_norm": 6.717589855194092,
      "learning_rate": 1.952889361622169e-06,
      "loss": 2.7969,
      "step": 1560
    },
    {
      "epoch": 1.517095566026338,
      "grad_norm": 6.919014930725098,
      "learning_rate": 1.9516021780459678e-06,
      "loss": 2.7675,
      "step": 1570
    },
    {
      "epoch": 1.5267609037090732,
      "grad_norm": 7.024904251098633,
      "learning_rate": 1.950298081929753e-06,
      "loss": 2.7854,
      "step": 1580
    },
    {
      "epoch": 1.5364262413918086,
      "grad_norm": 7.037530422210693,
      "learning_rate": 1.9489770964508335e-06,
      "loss": 2.7911,
      "step": 1590
    },
    {
      "epoch": 1.546091579074544,
      "grad_norm": 6.958497524261475,
      "learning_rate": 1.947639245086688e-06,
      "loss": 2.801,
      "step": 1600
    },
    {
      "epoch": 1.5557569167572791,
      "grad_norm": 6.978152751922607,
      "learning_rate": 1.9462845516145473e-06,
      "loss": 2.7855,
      "step": 1610
    },
    {
      "epoch": 1.5654222544400145,
      "grad_norm": 6.826775550842285,
      "learning_rate": 1.944913040110972e-06,
      "loss": 2.7614,
      "step": 1620
    },
    {
      "epoch": 1.5750875921227498,
      "grad_norm": 6.996805191040039,
      "learning_rate": 1.943524734951425e-06,
      "loss": 2.8176,
      "step": 1630
    },
    {
      "epoch": 1.584752929805485,
      "grad_norm": 7.008688449859619,
      "learning_rate": 1.942119660809837e-06,
      "loss": 2.7718,
      "step": 1640
    },
    {
      "epoch": 1.5924851999516734,
      "eval_loss": 2.8026342391967773,
      "eval_runtime": 216.9019,
      "eval_samples_per_second": 101.751,
      "eval_steps_per_second": 4.242,
      "step": 1648
    },
    {
      "epoch": 1.5944182674882204,
      "grad_norm": 7.19225549697876,
      "learning_rate": 1.940697842658169e-06,
      "loss": 2.7797,
      "step": 1650
    },
    {
      "epoch": 1.6040836051709557,
      "grad_norm": 7.009806156158447,
      "learning_rate": 1.939259305765969e-06,
      "loss": 2.7773,
      "step": 1660
    },
    {
      "epoch": 1.6137489428536909,
      "grad_norm": 6.812699317932129,
      "learning_rate": 1.937804075699921e-06,
      "loss": 2.7684,
      "step": 1670
    },
    {
      "epoch": 1.6234142805364262,
      "grad_norm": 7.124710559844971,
      "learning_rate": 1.936332178323393e-06,
      "loss": 2.789,
      "step": 1680
    },
    {
      "epoch": 1.6330796182191616,
      "grad_norm": 7.376682758331299,
      "learning_rate": 1.934843639795975e-06,
      "loss": 2.801,
      "step": 1690
    },
    {
      "epoch": 1.6427449559018967,
      "grad_norm": 6.929679870605469,
      "learning_rate": 1.933338486573016e-06,
      "loss": 2.8004,
      "step": 1700
    },
    {
      "epoch": 1.652410293584632,
      "grad_norm": 7.01241397857666,
      "learning_rate": 1.9318167454051523e-06,
      "loss": 2.7923,
      "step": 1710
    },
    {
      "epoch": 1.6620756312673675,
      "grad_norm": 6.667963981628418,
      "learning_rate": 1.930278443337833e-06,
      "loss": 2.7892,
      "step": 1720
    },
    {
      "epoch": 1.6717409689501026,
      "grad_norm": 6.875312805175781,
      "learning_rate": 1.92872360771084e-06,
      "loss": 2.7754,
      "step": 1730
    },
    {
      "epoch": 1.681406306632838,
      "grad_norm": 6.666006565093994,
      "learning_rate": 1.9271522661577995e-06,
      "loss": 2.7922,
      "step": 1740
    },
    {
      "epoch": 1.6910716443155733,
      "grad_norm": 7.197988986968994,
      "learning_rate": 1.925564446605694e-06,
      "loss": 2.7951,
      "step": 1750
    },
    {
      "epoch": 1.692038178083847,
      "eval_loss": 2.791018009185791,
      "eval_runtime": 216.9287,
      "eval_samples_per_second": 101.739,
      "eval_steps_per_second": 4.241,
      "step": 1751
    },
    {
      "epoch": 1.7007369819983085,
      "grad_norm": 6.819860935211182,
      "learning_rate": 1.9239601772743645e-06,
      "loss": 2.7526,
      "step": 1760
    },
    {
      "epoch": 1.7104023196810438,
      "grad_norm": 6.862004280090332,
      "learning_rate": 1.9223394866760073e-06,
      "loss": 2.7449,
      "step": 1770
    },
    {
      "epoch": 1.7200676573637792,
      "grad_norm": 6.870265007019043,
      "learning_rate": 1.920702403614671e-06,
      "loss": 2.7771,
      "step": 1780
    },
    {
      "epoch": 1.7297329950465143,
      "grad_norm": 7.154959678649902,
      "learning_rate": 1.9190489571857423e-06,
      "loss": 2.8067,
      "step": 1790
    },
    {
      "epoch": 1.7393983327292497,
      "grad_norm": 6.809077739715576,
      "learning_rate": 1.9173791767754272e-06,
      "loss": 2.7733,
      "step": 1800
    },
    {
      "epoch": 1.749063670411985,
      "grad_norm": 6.7708282470703125,
      "learning_rate": 1.915693092060232e-06,
      "loss": 2.7649,
      "step": 1810
    },
    {
      "epoch": 1.7587290080947202,
      "grad_norm": 7.031125068664551,
      "learning_rate": 1.913990733006436e-06,
      "loss": 2.7686,
      "step": 1820
    },
    {
      "epoch": 1.7683943457774556,
      "grad_norm": 6.831878662109375,
      "learning_rate": 1.912272129869554e-06,
      "loss": 2.7496,
      "step": 1830
    },
    {
      "epoch": 1.778059683460191,
      "grad_norm": 6.8769097328186035,
      "learning_rate": 1.9105373131938057e-06,
      "loss": 2.7351,
      "step": 1840
    },
    {
      "epoch": 1.787725021142926,
      "grad_norm": 6.5916266441345215,
      "learning_rate": 1.9087863138115665e-06,
      "loss": 2.7503,
      "step": 1850
    },
    {
      "epoch": 1.7915911562160203,
      "eval_loss": 2.7797155380249023,
      "eval_runtime": 216.9502,
      "eval_samples_per_second": 101.728,
      "eval_steps_per_second": 4.241,
      "step": 1854
    },
    {
      "epoch": 1.7973903588256614,
      "grad_norm": 6.822347164154053,
      "learning_rate": 1.907019162842823e-06,
      "loss": 2.7729,
      "step": 1860
    },
    {
      "epoch": 1.8070556965083968,
      "grad_norm": 7.012885093688965,
      "learning_rate": 1.9052358916946192e-06,
      "loss": 2.7429,
      "step": 1870
    },
    {
      "epoch": 1.816721034191132,
      "grad_norm": 6.920743942260742,
      "learning_rate": 1.9034365320604987e-06,
      "loss": 2.7963,
      "step": 1880
    },
    {
      "epoch": 1.8263863718738673,
      "grad_norm": 7.041555404663086,
      "learning_rate": 1.90162111591994e-06,
      "loss": 2.752,
      "step": 1890
    },
    {
      "epoch": 1.8360517095566027,
      "grad_norm": 6.854340076446533,
      "learning_rate": 1.8997896755377898e-06,
      "loss": 2.786,
      "step": 1900
    },
    {
      "epoch": 1.8457170472393378,
      "grad_norm": 7.011345386505127,
      "learning_rate": 1.897942243463688e-06,
      "loss": 2.7627,
      "step": 1910
    },
    {
      "epoch": 1.8553823849220732,
      "grad_norm": 6.391646385192871,
      "learning_rate": 1.8960788525314912e-06,
      "loss": 2.7706,
      "step": 1920
    },
    {
      "epoch": 1.8650477226048086,
      "grad_norm": 6.750019550323486,
      "learning_rate": 1.8941995358586874e-06,
      "loss": 2.7245,
      "step": 1930
    },
    {
      "epoch": 1.8747130602875437,
      "grad_norm": 6.734500408172607,
      "learning_rate": 1.8923043268458074e-06,
      "loss": 2.7494,
      "step": 1940
    },
    {
      "epoch": 1.884378397970279,
      "grad_norm": 6.839148998260498,
      "learning_rate": 1.8903932591758328e-06,
      "loss": 2.7551,
      "step": 1950
    },
    {
      "epoch": 1.8911441343481938,
      "eval_loss": 2.7689898014068604,
      "eval_runtime": 216.9508,
      "eval_samples_per_second": 101.728,
      "eval_steps_per_second": 4.241,
      "step": 1957
    },
    {
      "epoch": 1.8940437356530144,
      "grad_norm": 6.925171852111816,
      "learning_rate": 1.888466366813596e-06,
      "loss": 2.7679,
      "step": 1960
    },
    {
      "epoch": 1.9037090733357496,
      "grad_norm": 7.017655849456787,
      "learning_rate": 1.8865236840051769e-06,
      "loss": 2.7386,
      "step": 1970
    },
    {
      "epoch": 1.913374411018485,
      "grad_norm": 6.900379180908203,
      "learning_rate": 1.8845652452772938e-06,
      "loss": 2.7741,
      "step": 1980
    },
    {
      "epoch": 1.9230397487012203,
      "grad_norm": 6.903608798980713,
      "learning_rate": 1.8825910854366912e-06,
      "loss": 2.7476,
      "step": 1990
    },
    {
      "epoch": 1.9327050863839554,
      "grad_norm": 6.967295169830322,
      "learning_rate": 1.8806012395695193e-06,
      "loss": 2.7653,
      "step": 2000
    },
    {
      "epoch": 1.9423704240666908,
      "grad_norm": 6.658292770385742,
      "learning_rate": 1.8785957430407116e-06,
      "loss": 2.7692,
      "step": 2010
    },
    {
      "epoch": 1.9520357617494262,
      "grad_norm": 6.8208909034729,
      "learning_rate": 1.8765746314933564e-06,
      "loss": 2.7523,
      "step": 2020
    },
    {
      "epoch": 1.9617010994321613,
      "grad_norm": 6.817293643951416,
      "learning_rate": 1.8745379408480631e-06,
      "loss": 2.7417,
      "step": 2030
    },
    {
      "epoch": 1.9713664371148967,
      "grad_norm": 6.492961406707764,
      "learning_rate": 1.8724857073023232e-06,
      "loss": 2.7474,
      "step": 2040
    },
    {
      "epoch": 1.981031774797632,
      "grad_norm": 6.4757585525512695,
      "learning_rate": 1.8704179673298678e-06,
      "loss": 2.7271,
      "step": 2050
    },
    {
      "epoch": 1.9906971124803672,
      "grad_norm": 6.683932781219482,
      "learning_rate": 1.8683347576800194e-06,
      "loss": 2.763,
      "step": 2060
    },
    {
      "epoch": 1.9906971124803672,
      "eval_loss": 2.759366750717163,
      "eval_runtime": 216.6345,
      "eval_samples_per_second": 101.877,
      "eval_steps_per_second": 4.247,
      "step": 2060
    },
    {
      "epoch": 2.0,
      "grad_norm": 9.310201644897461,
      "learning_rate": 1.8662361153770377e-06,
      "loss": 2.7428,
      "step": 2070
    },
    {
      "epoch": 2.009665337682735,
      "grad_norm": 6.855327129364014,
      "learning_rate": 1.8641220777194628e-06,
      "loss": 2.6795,
      "step": 2080
    },
    {
      "epoch": 2.0193306753654707,
      "grad_norm": 6.812604904174805,
      "learning_rate": 1.8619926822794519e-06,
      "loss": 2.6903,
      "step": 2090
    },
    {
      "epoch": 2.028996013048206,
      "grad_norm": 6.613392353057861,
      "learning_rate": 1.8598479669021114e-06,
      "loss": 2.7386,
      "step": 2100
    },
    {
      "epoch": 2.038661350730941,
      "grad_norm": 6.695821762084961,
      "learning_rate": 1.857687969704824e-06,
      "loss": 2.6984,
      "step": 2110
    },
    {
      "epoch": 2.0483266884136766,
      "grad_norm": 6.728259563446045,
      "learning_rate": 1.8555127290765717e-06,
      "loss": 2.7247,
      "step": 2120
    },
    {
      "epoch": 2.0579920260964117,
      "grad_norm": 6.941239833831787,
      "learning_rate": 1.853322283677254e-06,
      "loss": 2.6862,
      "step": 2130
    },
    {
      "epoch": 2.067657363779147,
      "grad_norm": 6.898434162139893,
      "learning_rate": 1.8511166724369996e-06,
      "loss": 2.6707,
      "step": 2140
    },
    {
      "epoch": 2.0773227014618825,
      "grad_norm": 6.764381408691406,
      "learning_rate": 1.8488959345554754e-06,
      "loss": 2.6758,
      "step": 2150
    },
    {
      "epoch": 2.0869880391446176,
      "grad_norm": 6.8859639167785645,
      "learning_rate": 1.8466601095011893e-06,
      "loss": 2.7344,
      "step": 2160
    },
    {
      "epoch": 2.0898876404494384,
      "eval_loss": 2.7534408569335938,
      "eval_runtime": 216.6942,
      "eval_samples_per_second": 101.849,
      "eval_steps_per_second": 4.246,
      "step": 2163
    },
    {
      "epoch": 2.0966533768273528,
      "grad_norm": 6.86688756942749,
      "learning_rate": 1.8444092370107897e-06,
      "loss": 2.7162,
      "step": 2170
    },
    {
      "epoch": 2.1063187145100883,
      "grad_norm": 6.992837905883789,
      "learning_rate": 1.8421433570883576e-06,
      "loss": 2.7357,
      "step": 2180
    },
    {
      "epoch": 2.1159840521928235,
      "grad_norm": 6.954036712646484,
      "learning_rate": 1.8398625100046971e-06,
      "loss": 2.7045,
      "step": 2190
    },
    {
      "epoch": 2.1256493898755586,
      "grad_norm": 6.828469753265381,
      "learning_rate": 1.8375667362966193e-06,
      "loss": 2.6887,
      "step": 2200
    },
    {
      "epoch": 2.135314727558294,
      "grad_norm": 6.695276260375977,
      "learning_rate": 1.8352560767662217e-06,
      "loss": 2.6672,
      "step": 2210
    },
    {
      "epoch": 2.1449800652410294,
      "grad_norm": 6.662326335906982,
      "learning_rate": 1.8329305724801624e-06,
      "loss": 2.6651,
      "step": 2220
    },
    {
      "epoch": 2.1546454029237645,
      "grad_norm": 6.725667953491211,
      "learning_rate": 1.830590264768932e-06,
      "loss": 2.6712,
      "step": 2230
    },
    {
      "epoch": 2.1643107406065,
      "grad_norm": 6.9403395652771,
      "learning_rate": 1.8282351952261167e-06,
      "loss": 2.6914,
      "step": 2240
    },
    {
      "epoch": 2.1739760782892352,
      "grad_norm": 6.7621636390686035,
      "learning_rate": 1.8258654057076614e-06,
      "loss": 2.7119,
      "step": 2250
    },
    {
      "epoch": 2.1836414159719704,
      "grad_norm": 6.9445719718933105,
      "learning_rate": 1.823480938331124e-06,
      "loss": 2.7022,
      "step": 2260
    },
    {
      "epoch": 2.1894406185816115,
      "eval_loss": 2.7453184127807617,
      "eval_runtime": 216.9137,
      "eval_samples_per_second": 101.746,
      "eval_steps_per_second": 4.241,
      "step": 2266
    },
    {
      "epoch": 2.193306753654706,
      "grad_norm": 6.655174255371094,
      "learning_rate": 1.8210818354749279e-06,
      "loss": 2.7272,
      "step": 2270
    },
    {
      "epoch": 2.202972091337441,
      "grad_norm": 6.8438262939453125,
      "learning_rate": 1.818668139777608e-06,
      "loss": 2.707,
      "step": 2280
    },
    {
      "epoch": 2.2126374290201762,
      "grad_norm": 6.986091613769531,
      "learning_rate": 1.8162398941370545e-06,
      "loss": 2.7106,
      "step": 2290
    },
    {
      "epoch": 2.222302766702912,
      "grad_norm": 6.6938347816467285,
      "learning_rate": 1.813797141709748e-06,
      "loss": 2.7015,
      "step": 2300
    },
    {
      "epoch": 2.231968104385647,
      "grad_norm": 6.803264617919922,
      "learning_rate": 1.811339925909995e-06,
      "loss": 2.7209,
      "step": 2310
    },
    {
      "epoch": 2.241633442068382,
      "grad_norm": 6.701619625091553,
      "learning_rate": 1.8088682904091543e-06,
      "loss": 2.6589,
      "step": 2320
    },
    {
      "epoch": 2.2512987797511177,
      "grad_norm": 6.809988498687744,
      "learning_rate": 1.8063822791348624e-06,
      "loss": 2.7053,
      "step": 2330
    },
    {
      "epoch": 2.260964117433853,
      "grad_norm": 6.671308517456055,
      "learning_rate": 1.803881936270252e-06,
      "loss": 2.7065,
      "step": 2340
    },
    {
      "epoch": 2.270629455116588,
      "grad_norm": 6.887976169586182,
      "learning_rate": 1.8013673062531663e-06,
      "loss": 2.712,
      "step": 2350
    },
    {
      "epoch": 2.2802947927993236,
      "grad_norm": 6.573399543762207,
      "learning_rate": 1.7988384337753702e-06,
      "loss": 2.6855,
      "step": 2360
    },
    {
      "epoch": 2.288993596713785,
      "eval_loss": 2.737840175628662,
      "eval_runtime": 217.0075,
      "eval_samples_per_second": 101.702,
      "eval_steps_per_second": 4.239,
      "step": 2369
    },
    {
      "epoch": 2.2899601304820587,
      "grad_norm": 6.888051509857178,
      "learning_rate": 1.7962953637817556e-06,
      "loss": 2.6826,
      "step": 2370
    },
    {
      "epoch": 2.299625468164794,
      "grad_norm": 6.976341724395752,
      "learning_rate": 1.7937381414695428e-06,
      "loss": 2.688,
      "step": 2380
    },
    {
      "epoch": 2.3092908058475294,
      "grad_norm": 6.785758018493652,
      "learning_rate": 1.7911668122874764e-06,
      "loss": 2.6971,
      "step": 2390
    },
    {
      "epoch": 2.3189561435302646,
      "grad_norm": 6.791547775268555,
      "learning_rate": 1.7885814219350187e-06,
      "loss": 2.7279,
      "step": 2400
    },
    {
      "epoch": 2.3286214812129997,
      "grad_norm": 6.7897820472717285,
      "learning_rate": 1.7859820163615367e-06,
      "loss": 2.697,
      "step": 2410
    },
    {
      "epoch": 2.3382868188957353,
      "grad_norm": 6.9951395988464355,
      "learning_rate": 1.7833686417654857e-06,
      "loss": 2.6994,
      "step": 2420
    },
    {
      "epoch": 2.3479521565784705,
      "grad_norm": 6.830650806427002,
      "learning_rate": 1.7807413445935886e-06,
      "loss": 2.6833,
      "step": 2430
    },
    {
      "epoch": 2.3576174942612056,
      "grad_norm": 6.746628761291504,
      "learning_rate": 1.7781001715400094e-06,
      "loss": 2.699,
      "step": 2440
    },
    {
      "epoch": 2.367282831943941,
      "grad_norm": 6.582501411437988,
      "learning_rate": 1.7754451695455248e-06,
      "loss": 2.6798,
      "step": 2450
    },
    {
      "epoch": 2.3769481696266763,
      "grad_norm": 6.743264675140381,
      "learning_rate": 1.7727763857966887e-06,
      "loss": 2.6569,
      "step": 2460
    },
    {
      "epoch": 2.3866135073094115,
      "grad_norm": 6.615793228149414,
      "learning_rate": 1.7700938677249934e-06,
      "loss": 2.6972,
      "step": 2470
    },
    {
      "epoch": 2.3885465748459587,
      "eval_loss": 2.7308976650238037,
      "eval_runtime": 217.0492,
      "eval_samples_per_second": 101.682,
      "eval_steps_per_second": 4.239,
      "step": 2472
    },
    {
      "epoch": 2.396278844992147,
      "grad_norm": 6.7577948570251465,
      "learning_rate": 1.7673976630060287e-06,
      "loss": 2.6751,
      "step": 2480
    },
    {
      "epoch": 2.405944182674882,
      "grad_norm": 6.334952354431152,
      "learning_rate": 1.7646878195586311e-06,
      "loss": 2.679,
      "step": 2490
    },
    {
      "epoch": 2.4156095203576173,
      "grad_norm": 7.024070739746094,
      "learning_rate": 1.7619643855440362e-06,
      "loss": 2.6967,
      "step": 2500
    },
    {
      "epoch": 2.425274858040353,
      "grad_norm": 6.562467575073242,
      "learning_rate": 1.7592274093650191e-06,
      "loss": 2.6722,
      "step": 2510
    },
    {
      "epoch": 2.434940195723088,
      "grad_norm": 6.788969993591309,
      "learning_rate": 1.7564769396650366e-06,
      "loss": 2.6808,
      "step": 2520
    },
    {
      "epoch": 2.444605533405823,
      "grad_norm": 6.727795124053955,
      "learning_rate": 1.753713025327361e-06,
      "loss": 2.6603,
      "step": 2530
    },
    {
      "epoch": 2.454270871088559,
      "grad_norm": 6.815580368041992,
      "learning_rate": 1.7509357154742132e-06,
      "loss": 2.662,
      "step": 2540
    },
    {
      "epoch": 2.463936208771294,
      "grad_norm": 6.863372802734375,
      "learning_rate": 1.7481450594658873e-06,
      "loss": 2.678,
      "step": 2550
    },
    {
      "epoch": 2.473601546454029,
      "grad_norm": 6.5276641845703125,
      "learning_rate": 1.7453411068998762e-06,
      "loss": 2.6925,
      "step": 2560
    },
    {
      "epoch": 2.4832668841367647,
      "grad_norm": 6.627063274383545,
      "learning_rate": 1.7425239076099866e-06,
      "loss": 2.6452,
      "step": 2570
    },
    {
      "epoch": 2.4880995529781322,
      "eval_loss": 2.7234959602355957,
      "eval_runtime": 216.9128,
      "eval_samples_per_second": 101.746,
      "eval_steps_per_second": 4.241,
      "step": 2575
    },
    {
      "epoch": 2.4929322218195,
      "grad_norm": 6.625070095062256,
      "learning_rate": 1.739693511665457e-06,
      "loss": 2.6579,
      "step": 2580
    },
    {
      "epoch": 2.502597559502235,
      "grad_norm": 6.858110427856445,
      "learning_rate": 1.7368499693700652e-06,
      "loss": 2.6886,
      "step": 2590
    },
    {
      "epoch": 2.5122628971849705,
      "grad_norm": 6.678401470184326,
      "learning_rate": 1.7339933312612352e-06,
      "loss": 2.6669,
      "step": 2600
    },
    {
      "epoch": 2.5219282348677057,
      "grad_norm": 6.686957359313965,
      "learning_rate": 1.731123648109139e-06,
      "loss": 2.7024,
      "step": 2610
    },
    {
      "epoch": 2.531593572550441,
      "grad_norm": 6.644192695617676,
      "learning_rate": 1.7282409709157947e-06,
      "loss": 2.6749,
      "step": 2620
    },
    {
      "epoch": 2.5412589102331764,
      "grad_norm": 6.746331214904785,
      "learning_rate": 1.7253453509141586e-06,
      "loss": 2.6588,
      "step": 2630
    },
    {
      "epoch": 2.5509242479159115,
      "grad_norm": 6.613391876220703,
      "learning_rate": 1.7224368395672166e-06,
      "loss": 2.6554,
      "step": 2640
    },
    {
      "epoch": 2.5605895855986467,
      "grad_norm": 6.989151477813721,
      "learning_rate": 1.7195154885670683e-06,
      "loss": 2.6336,
      "step": 2650
    },
    {
      "epoch": 2.5702549232813823,
      "grad_norm": 6.685927867889404,
      "learning_rate": 1.7165813498340088e-06,
      "loss": 2.6931,
      "step": 2660
    },
    {
      "epoch": 2.5799202609641174,
      "grad_norm": 6.835086345672607,
      "learning_rate": 1.7136344755156048e-06,
      "loss": 2.6702,
      "step": 2670
    },
    {
      "epoch": 2.587652531110306,
      "eval_loss": 2.716982126235962,
      "eval_runtime": 216.725,
      "eval_samples_per_second": 101.834,
      "eval_steps_per_second": 4.245,
      "step": 2678
    },
    {
      "epoch": 2.5895855986468526,
      "grad_norm": 6.729923248291016,
      "learning_rate": 1.7106749179857701e-06,
      "loss": 2.6708,
      "step": 2680
    },
    {
      "epoch": 2.599250936329588,
      "grad_norm": 6.472827911376953,
      "learning_rate": 1.7077027298438327e-06,
      "loss": 2.6505,
      "step": 2690
    },
    {
      "epoch": 2.6089162740123233,
      "grad_norm": 6.623499393463135,
      "learning_rate": 1.7047179639136e-06,
      "loss": 2.666,
      "step": 2700
    },
    {
      "epoch": 2.6185816116950584,
      "grad_norm": 6.723398208618164,
      "learning_rate": 1.7017206732424226e-06,
      "loss": 2.6724,
      "step": 2710
    },
    {
      "epoch": 2.628246949377794,
      "grad_norm": 6.559345245361328,
      "learning_rate": 1.6987109111002474e-06,
      "loss": 2.663,
      "step": 2720
    },
    {
      "epoch": 2.637912287060529,
      "grad_norm": 6.853271484375,
      "learning_rate": 1.6956887309786743e-06,
      "loss": 2.6874,
      "step": 2730
    },
    {
      "epoch": 2.6475776247432643,
      "grad_norm": 7.0866312980651855,
      "learning_rate": 1.692654186590004e-06,
      "loss": 2.6809,
      "step": 2740
    },
    {
      "epoch": 2.657242962426,
      "grad_norm": 6.323083877563477,
      "learning_rate": 1.6896073318662834e-06,
      "loss": 2.6679,
      "step": 2750
    },
    {
      "epoch": 2.666908300108735,
      "grad_norm": 6.670552730560303,
      "learning_rate": 1.6865482209583473e-06,
      "loss": 2.6977,
      "step": 2760
    },
    {
      "epoch": 2.6765736377914706,
      "grad_norm": 6.876490592956543,
      "learning_rate": 1.6834769082348563e-06,
      "loss": 2.6711,
      "step": 2770
    },
    {
      "epoch": 2.6862389754742058,
      "grad_norm": 6.621648788452148,
      "learning_rate": 1.6803934482813297e-06,
      "loss": 2.6526,
      "step": 2780
    },
    {
      "epoch": 2.687205509242479,
      "eval_loss": 2.711458444595337,
      "eval_runtime": 216.9564,
      "eval_samples_per_second": 101.725,
      "eval_steps_per_second": 4.24,
      "step": 2781
    },
    {
      "epoch": 2.695904313156941,
      "grad_norm": 6.591291904449463,
      "learning_rate": 1.6772978958991766e-06,
      "loss": 2.6639,
      "step": 2790
    },
    {
      "epoch": 2.705569650839676,
      "grad_norm": 6.6821608543396,
      "learning_rate": 1.6741903061047201e-06,
      "loss": 2.6498,
      "step": 2800
    },
    {
      "epoch": 2.7152349885224116,
      "grad_norm": 6.35032320022583,
      "learning_rate": 1.6710707341282223e-06,
      "loss": 2.6378,
      "step": 2810
    },
    {
      "epoch": 2.7249003262051468,
      "grad_norm": 6.479367733001709,
      "learning_rate": 1.667939235412899e-06,
      "loss": 2.6458,
      "step": 2820
    },
    {
      "epoch": 2.7345656638878824,
      "grad_norm": 6.6215009689331055,
      "learning_rate": 1.6647958656139376e-06,
      "loss": 2.6879,
      "step": 2830
    },
    {
      "epoch": 2.7442310015706175,
      "grad_norm": 6.726338863372803,
      "learning_rate": 1.6616406805975069e-06,
      "loss": 2.6433,
      "step": 2840
    },
    {
      "epoch": 2.7538963392533526,
      "grad_norm": 6.45868444442749,
      "learning_rate": 1.6584737364397637e-06,
      "loss": 2.6362,
      "step": 2850
    },
    {
      "epoch": 2.763561676936088,
      "grad_norm": 6.664327144622803,
      "learning_rate": 1.6552950894258555e-06,
      "loss": 2.6657,
      "step": 2860
    },
    {
      "epoch": 2.7732270146188234,
      "grad_norm": 6.683444976806641,
      "learning_rate": 1.6521047960489228e-06,
      "loss": 2.6648,
      "step": 2870
    },
    {
      "epoch": 2.7828923523015585,
      "grad_norm": 6.655669212341309,
      "learning_rate": 1.6489029130090921e-06,
      "loss": 2.6701,
      "step": 2880
    },
    {
      "epoch": 2.7867584873746525,
      "eval_loss": 2.705122470855713,
      "eval_runtime": 217.1999,
      "eval_samples_per_second": 101.611,
      "eval_steps_per_second": 4.236,
      "step": 2884
    },
    {
      "epoch": 2.792557689984294,
      "grad_norm": 6.554884910583496,
      "learning_rate": 1.6456894972124707e-06,
      "loss": 2.646,
      "step": 2890
    },
    {
      "epoch": 2.8022230276670292,
      "grad_norm": 6.624351978302002,
      "learning_rate": 1.6424646057701325e-06,
      "loss": 2.6591,
      "step": 2900
    },
    {
      "epoch": 2.8118883653497644,
      "grad_norm": 6.485014915466309,
      "learning_rate": 1.6392282959971068e-06,
      "loss": 2.6668,
      "step": 2910
    },
    {
      "epoch": 2.8215537030324995,
      "grad_norm": 6.348880767822266,
      "learning_rate": 1.6359806254113554e-06,
      "loss": 2.6663,
      "step": 2920
    },
    {
      "epoch": 2.831219040715235,
      "grad_norm": 6.690762519836426,
      "learning_rate": 1.6327216517327535e-06,
      "loss": 2.6598,
      "step": 2930
    },
    {
      "epoch": 2.8408843783979703,
      "grad_norm": 6.5298309326171875,
      "learning_rate": 1.6294514328820629e-06,
      "loss": 2.6988,
      "step": 2940
    },
    {
      "epoch": 2.850549716080706,
      "grad_norm": 6.622351169586182,
      "learning_rate": 1.6261700269799014e-06,
      "loss": 2.6194,
      "step": 2950
    },
    {
      "epoch": 2.860215053763441,
      "grad_norm": 6.835710048675537,
      "learning_rate": 1.622877492345712e-06,
      "loss": 2.6718,
      "step": 2960
    },
    {
      "epoch": 2.869880391446176,
      "grad_norm": 6.526007175445557,
      "learning_rate": 1.6195738874967249e-06,
      "loss": 2.6562,
      "step": 2970
    },
    {
      "epoch": 2.8795457291289113,
      "grad_norm": 6.33076286315918,
      "learning_rate": 1.6162592711469181e-06,
      "loss": 2.6509,
      "step": 2980
    },
    {
      "epoch": 2.886311465506826,
      "eval_loss": 2.699232578277588,
      "eval_runtime": 217.0251,
      "eval_samples_per_second": 101.693,
      "eval_steps_per_second": 4.239,
      "step": 2987
    },
    {
      "epoch": 2.889211066811647,
      "grad_norm": 6.612087249755859,
      "learning_rate": 1.6129337022059733e-06,
      "loss": 2.6695,
      "step": 2990
    },
    {
      "epoch": 2.898876404494382,
      "grad_norm": 6.748770236968994,
      "learning_rate": 1.6095972397782298e-06,
      "loss": 2.6739,
      "step": 3000
    },
    {
      "epoch": 2.9085417421771176,
      "grad_norm": 6.782710552215576,
      "learning_rate": 1.6062499431616331e-06,
      "loss": 2.6703,
      "step": 3010
    },
    {
      "epoch": 2.9182070798598527,
      "grad_norm": 6.288000106811523,
      "learning_rate": 1.602891871846682e-06,
      "loss": 2.6443,
      "step": 3020
    },
    {
      "epoch": 2.927872417542588,
      "grad_norm": 6.792652130126953,
      "learning_rate": 1.59952308551537e-06,
      "loss": 2.6951,
      "step": 3030
    },
    {
      "epoch": 2.937537755225323,
      "grad_norm": 6.761292934417725,
| "learning_rate": 1.5961436440401267e-06, | |
| "loss": 2.6623, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.9472030929080586, | |
| "grad_norm": 6.930576324462891, | |
| "learning_rate": 1.5927536074827503e-06, | |
| "loss": 2.6476, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.9568684305907937, | |
| "grad_norm": 6.642486095428467, | |
| "learning_rate": 1.5893530360933448e-06, | |
| "loss": 2.6679, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.9665337682735293, | |
| "grad_norm": 6.4895501136779785, | |
| "learning_rate": 1.5859419903092445e-06, | |
| "loss": 2.6638, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.9761991059562645, | |
| "grad_norm": 6.47406005859375, | |
| "learning_rate": 1.582520530753943e-06, | |
| "loss": 2.6515, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.9858644436389996, | |
| "grad_norm": 6.589229106903076, | |
| "learning_rate": 1.5790887182360143e-06, | |
| "loss": 2.671, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.9858644436389996, | |
| "eval_loss": 2.694453239440918, | |
| "eval_runtime": 216.9988, | |
| "eval_samples_per_second": 101.706, | |
| "eval_steps_per_second": 4.24, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.9955297813217348, | |
| "grad_norm": 6.626533508300781, | |
| "learning_rate": 1.575646613748033e-06, | |
| "loss": 2.6769, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 3.0048326688413676, | |
| "grad_norm": 6.644468307495117, | |
| "learning_rate": 1.5721942784654892e-06, | |
| "loss": 2.6425, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 3.0144980065241027, | |
| "grad_norm": 6.786801338195801, | |
| "learning_rate": 1.5687317737457023e-06, | |
| "loss": 2.6216, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 3.0241633442068383, | |
| "grad_norm": 6.773380756378174, | |
| "learning_rate": 1.5652591611267293e-06, | |
| "loss": 2.6165, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 3.0338286818895734, | |
| "grad_norm": 6.535521507263184, | |
| "learning_rate": 1.5617765023262734e-06, | |
| "loss": 2.6002, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 3.043494019572309, | |
| "grad_norm": 6.627861499786377, | |
| "learning_rate": 1.5582838592405838e-06, | |
| "loss": 2.6218, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 3.053159357255044, | |
| "grad_norm": 6.7703938484191895, | |
| "learning_rate": 1.5547812939433587e-06, | |
| "loss": 2.6333, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 3.0628246949377793, | |
| "grad_norm": 6.72239875793457, | |
| "learning_rate": 1.5512688686846402e-06, | |
| "loss": 2.6356, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 3.0724900326205145, | |
| "grad_norm": 6.52492618560791, | |
| "learning_rate": 1.5477466458897083e-06, | |
| "loss": 2.6048, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 3.08215537030325, | |
| "grad_norm": 6.695172309875488, | |
| "learning_rate": 1.5442146881579726e-06, | |
| "loss": 2.5957, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 3.0850549716080704, | |
| "eval_loss": 2.6920766830444336, | |
| "eval_runtime": 216.947, | |
| "eval_samples_per_second": 101.73, | |
| "eval_steps_per_second": 4.241, | |
| "step": 3193 | |
| }, | |
| { | |
| "epoch": 3.091820707985985, | |
| "grad_norm": 6.52951717376709, | |
| "learning_rate": 1.5406730582618575e-06, | |
| "loss": 2.5731, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 3.1014860456687208, | |
| "grad_norm": 6.658172130584717, | |
| "learning_rate": 1.5371218191456892e-06, | |
| "loss": 2.599, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 3.111151383351456, | |
| "grad_norm": 6.837924003601074, | |
| "learning_rate": 1.5335610339245748e-06, | |
| "loss": 2.6061, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 3.120816721034191, | |
| "grad_norm": 6.788025856018066, | |
| "learning_rate": 1.5299907658832817e-06, | |
| "loss": 2.593, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 3.130482058716926, | |
| "grad_norm": 6.745009899139404, | |
| "learning_rate": 1.526411078475113e-06, | |
| "loss": 2.6085, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 3.140147396399662, | |
| "grad_norm": 6.781652927398682, | |
| "learning_rate": 1.5228220353207784e-06, | |
| "loss": 2.6148, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 3.149812734082397, | |
| "grad_norm": 6.939257621765137, | |
| "learning_rate": 1.5192237002072656e-06, | |
| "loss": 2.6116, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 3.1594780717651325, | |
| "grad_norm": 6.560710430145264, | |
| "learning_rate": 1.5156161370867052e-06, | |
| "loss": 2.6266, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 3.1691434094478677, | |
| "grad_norm": 6.522158145904541, | |
| "learning_rate": 1.5119994100752337e-06, | |
| "loss": 2.6049, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 3.178808747130603, | |
| "grad_norm": 6.933811187744141, | |
| "learning_rate": 1.5083735834518555e-06, | |
| "loss": 2.6168, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 3.184607949740244, | |
| "eval_loss": 2.68808650970459, | |
| "eval_runtime": 216.8361, | |
| "eval_samples_per_second": 101.782, | |
| "eval_steps_per_second": 4.243, | |
| "step": 3296 | |
| }, | |
| { | |
| "epoch": 3.188474084813338, | |
| "grad_norm": 6.851550579071045, | |
| "learning_rate": 1.5047387216572994e-06, | |
| "loss": 2.6326, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 3.1981394224960735, | |
| "grad_norm": 6.676671504974365, | |
| "learning_rate": 1.5010948892928732e-06, | |
| "loss": 2.5766, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 3.2078047601788087, | |
| "grad_norm": 6.791378974914551, | |
| "learning_rate": 1.4974421511193164e-06, | |
| "loss": 2.5921, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 3.2174700978615443, | |
| "grad_norm": 6.412371635437012, | |
| "learning_rate": 1.493780572055649e-06, | |
| "loss": 2.6231, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 3.2271354355442794, | |
| "grad_norm": 6.870903968811035, | |
| "learning_rate": 1.4901102171780174e-06, | |
| "loss": 2.6049, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 3.2368007732270145, | |
| "grad_norm": 6.820573806762695, | |
| "learning_rate": 1.4864311517185369e-06, | |
| "loss": 2.592, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 3.2464661109097497, | |
| "grad_norm": 6.859145641326904, | |
| "learning_rate": 1.482743441064134e-06, | |
| "loss": 2.6487, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 3.2561314485924853, | |
| "grad_norm": 6.515082359313965, | |
| "learning_rate": 1.4790471507553847e-06, | |
| "loss": 2.6272, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 3.2657967862752204, | |
| "grad_norm": 6.639008045196533, | |
| "learning_rate": 1.4753423464853473e-06, | |
| "loss": 2.597, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 3.275462123957956, | |
| "grad_norm": 6.76662540435791, | |
| "learning_rate": 1.4716290940983965e-06, | |
| "loss": 2.6137, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 3.2841609278724175, | |
| "eval_loss": 2.6844358444213867, | |
| "eval_runtime": 216.9567, | |
| "eval_samples_per_second": 101.725, | |
| "eval_steps_per_second": 4.24, | |
| "step": 3399 | |
| }, | |
| { | |
| "epoch": 3.285127461640691, | |
| "grad_norm": 6.6699347496032715, | |
| "learning_rate": 1.4679074595890532e-06, | |
| "loss": 2.5737, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 3.2947927993234263, | |
| "grad_norm": 6.792168617248535, | |
| "learning_rate": 1.4641775091008106e-06, | |
| "loss": 2.6094, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 3.3044581370061614, | |
| "grad_norm": 6.676168441772461, | |
| "learning_rate": 1.4604393089249599e-06, | |
| "loss": 2.6151, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 3.314123474688897, | |
| "grad_norm": 6.540125846862793, | |
| "learning_rate": 1.456692925499411e-06, | |
| "loss": 2.626, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 3.323788812371632, | |
| "grad_norm": 6.471897602081299, | |
| "learning_rate": 1.4529384254075133e-06, | |
| "loss": 2.6007, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 3.3334541500543677, | |
| "grad_norm": 6.628961086273193, | |
| "learning_rate": 1.4491758753768694e-06, | |
| "loss": 2.6175, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 3.343119487737103, | |
| "grad_norm": 6.617523670196533, | |
| "learning_rate": 1.4454053422781532e-06, | |
| "loss": 2.6116, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 3.352784825419838, | |
| "grad_norm": 6.7695722579956055, | |
| "learning_rate": 1.4416268931239177e-06, | |
| "loss": 2.5806, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 3.362450163102573, | |
| "grad_norm": 6.675383567810059, | |
| "learning_rate": 1.4378405950674065e-06, | |
| "loss": 2.5972, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 3.3721155007853088, | |
| "grad_norm": 6.70187520980835, | |
| "learning_rate": 1.4340465154013585e-06, | |
| "loss": 2.5809, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 3.381780838468044, | |
| "grad_norm": 6.520016670227051, | |
| "learning_rate": 1.4302447215568136e-06, | |
| "loss": 2.6, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 3.383713906004591, | |
| "eval_loss": 2.6802666187286377, | |
| "eval_runtime": 216.9945, | |
| "eval_samples_per_second": 101.708, | |
| "eval_steps_per_second": 4.24, | |
| "step": 3502 | |
| }, | |
| { | |
| "epoch": 3.3914461761507795, | |
| "grad_norm": 6.802120208740234, | |
| "learning_rate": 1.426435281101913e-06, | |
| "loss": 2.6175, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 3.4011115138335146, | |
| "grad_norm": 6.581682205200195, | |
| "learning_rate": 1.4226182617406994e-06, | |
| "loss": 2.6121, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 3.4107768515162498, | |
| "grad_norm": 6.738428115844727, | |
| "learning_rate": 1.4187937313119124e-06, | |
| "loss": 2.6015, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 3.420442189198985, | |
| "grad_norm": 6.7162675857543945, | |
| "learning_rate": 1.4149617577877841e-06, | |
| "loss": 2.6161, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 3.4301075268817205, | |
| "grad_norm": 6.830008029937744, | |
| "learning_rate": 1.41112240927283e-06, | |
| "loss": 2.6125, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 3.4397728645644556, | |
| "grad_norm": 6.8902435302734375, | |
| "learning_rate": 1.4072757540026396e-06, | |
| "loss": 2.6181, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 3.449438202247191, | |
| "grad_norm": 6.644567012786865, | |
| "learning_rate": 1.403421860342663e-06, | |
| "loss": 2.5965, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 3.4591035399299264, | |
| "grad_norm": 6.674381732940674, | |
| "learning_rate": 1.3995607967869963e-06, | |
| "loss": 2.6186, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 3.4687688776126615, | |
| "grad_norm": 6.683381080627441, | |
| "learning_rate": 1.3956926319571628e-06, | |
| "loss": 2.6271, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 3.4784342152953966, | |
| "grad_norm": 6.642363548278809, | |
| "learning_rate": 1.3918174346008963e-06, | |
| "loss": 2.6047, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 3.4832668841367647, | |
| "eval_loss": 2.6766762733459473, | |
| "eval_runtime": 216.9538, | |
| "eval_samples_per_second": 101.727, | |
| "eval_steps_per_second": 4.241, | |
| "step": 3605 | |
| }, | |
| { | |
| "epoch": 3.4880995529781322, | |
| "grad_norm": 6.7301106452941895, | |
| "learning_rate": 1.3879352735909163e-06, | |
| "loss": 2.6016, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 3.4977648906608674, | |
| "grad_norm": 6.565543174743652, | |
| "learning_rate": 1.3840462179237058e-06, | |
| "loss": 2.6413, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 3.507430228343603, | |
| "grad_norm": 6.726136207580566, | |
| "learning_rate": 1.3801503367182846e-06, | |
| "loss": 2.5709, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 3.517095566026338, | |
| "grad_norm": 6.778688907623291, | |
| "learning_rate": 1.3762476992149803e-06, | |
| "loss": 2.5772, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 3.5267609037090732, | |
| "grad_norm": 6.823915004730225, | |
| "learning_rate": 1.3723383747741988e-06, | |
| "loss": 2.6417, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 3.5364262413918084, | |
| "grad_norm": 6.821535110473633, | |
| "learning_rate": 1.3684224328751904e-06, | |
| "loss": 2.6092, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 3.546091579074544, | |
| "grad_norm": 6.66444730758667, | |
| "learning_rate": 1.364499943114815e-06, | |
| "loss": 2.5744, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 3.555756916757279, | |
| "grad_norm": 6.7145161628723145, | |
| "learning_rate": 1.3605709752063072e-06, | |
| "loss": 2.6028, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 3.5654222544400147, | |
| "grad_norm": 6.7199931144714355, | |
| "learning_rate": 1.356635598978034e-06, | |
| "loss": 2.5938, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 3.57508759212275, | |
| "grad_norm": 6.810650825500488, | |
| "learning_rate": 1.352693884372257e-06, | |
| "loss": 2.5833, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 3.582819862268938, | |
| "eval_loss": 2.67258882522583, | |
| "eval_runtime": 216.9499, | |
| "eval_samples_per_second": 101.729, | |
| "eval_steps_per_second": 4.241, | |
| "step": 3708 | |
| }, | |
| { | |
| "epoch": 3.584752929805485, | |
| "grad_norm": 6.88958740234375, | |
| "learning_rate": 1.3487459014438875e-06, | |
| "loss": 2.6103, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 3.59441826748822, | |
| "grad_norm": 6.527899742126465, | |
| "learning_rate": 1.344791720359241e-06, | |
| "loss": 2.5896, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 3.6040836051709557, | |
| "grad_norm": 6.705896854400635, | |
| "learning_rate": 1.340831411394792e-06, | |
| "loss": 2.5731, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 3.613748942853691, | |
| "grad_norm": 6.65834903717041, | |
| "learning_rate": 1.3368650449359237e-06, | |
| "loss": 2.5992, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 3.6234142805364264, | |
| "grad_norm": 6.513383865356445, | |
| "learning_rate": 1.3328926914756764e-06, | |
| "loss": 2.59, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 3.6330796182191616, | |
| "grad_norm": 6.5526652336120605, | |
| "learning_rate": 1.3289144216134973e-06, | |
| "loss": 2.5497, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 3.6427449559018967, | |
| "grad_norm": 6.795760631561279, | |
| "learning_rate": 1.324930306053983e-06, | |
| "loss": 2.6445, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 3.652410293584632, | |
| "grad_norm": 6.859256744384766, | |
| "learning_rate": 1.3209404156056234e-06, | |
| "loss": 2.5714, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 3.6620756312673675, | |
| "grad_norm": 6.862630367279053, | |
| "learning_rate": 1.316944821179545e-06, | |
| "loss": 2.5888, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 3.6717409689501026, | |
| "grad_norm": 6.7294511795043945, | |
| "learning_rate": 1.3129435937882485e-06, | |
| "loss": 2.5981, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 3.681406306632838, | |
| "grad_norm": 6.428034782409668, | |
| "learning_rate": 1.3089368045443478e-06, | |
| "loss": 2.6094, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 3.6823728404011113, | |
| "eval_loss": 2.6688151359558105, | |
| "eval_runtime": 216.9738, | |
| "eval_samples_per_second": 101.717, | |
| "eval_steps_per_second": 4.24, | |
| "step": 3811 | |
| }, | |
| { | |
| "epoch": 3.6910716443155733, | |
| "grad_norm": 6.683334827423096, | |
| "learning_rate": 1.3049245246593063e-06, | |
| "loss": 2.5955, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 3.7007369819983085, | |
| "grad_norm": 6.5523481369018555, | |
| "learning_rate": 1.3009068254421707e-06, | |
| "loss": 2.5874, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 3.7104023196810436, | |
| "grad_norm": 6.8417253494262695, | |
| "learning_rate": 1.2968837782983032e-06, | |
| "loss": 2.5853, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 3.720067657363779, | |
| "grad_norm": 6.959075450897217, | |
| "learning_rate": 1.292855454728115e-06, | |
| "loss": 2.5751, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 3.7297329950465143, | |
| "grad_norm": 6.557989120483398, | |
| "learning_rate": 1.288821926325791e-06, | |
| "loss": 2.6214, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 3.73939833272925, | |
| "grad_norm": 6.578183650970459, | |
| "learning_rate": 1.2847832647780218e-06, | |
| "loss": 2.5918, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 3.749063670411985, | |
| "grad_norm": 6.700976371765137, | |
| "learning_rate": 1.2807395418627277e-06, | |
| "loss": 2.5793, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 3.75872900809472, | |
| "grad_norm": 6.564661502838135, | |
| "learning_rate": 1.2766908294477826e-06, | |
| "loss": 2.5966, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 3.7683943457774554, | |
| "grad_norm": 6.6865410804748535, | |
| "learning_rate": 1.2726371994897376e-06, | |
| "loss": 2.5844, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 3.778059683460191, | |
| "grad_norm": 6.696436882019043, | |
| "learning_rate": 1.2685787240325417e-06, | |
| "loss": 2.5761, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 3.781925818533285, | |
| "eval_loss": 2.665092706680298, | |
| "eval_runtime": 216.965, | |
| "eval_samples_per_second": 101.721, | |
| "eval_steps_per_second": 4.24, | |
| "step": 3914 | |
| }, | |
| { | |
| "epoch": 3.787725021142926, | |
| "grad_norm": 6.7038702964782715, | |
| "learning_rate": 1.264515475206262e-06, | |
| "loss": 2.5867, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 3.7973903588256617, | |
| "grad_norm": 6.5760064125061035, | |
| "learning_rate": 1.2604475252258005e-06, | |
| "loss": 2.5927, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 3.807055696508397, | |
| "grad_norm": 6.533830642700195, | |
| "learning_rate": 1.2563749463896116e-06, | |
| "loss": 2.5828, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 3.816721034191132, | |
| "grad_norm": 6.66879415512085, | |
| "learning_rate": 1.2522978110784177e-06, | |
| "loss": 2.5703, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 3.826386371873867, | |
| "grad_norm": 6.830549716949463, | |
| "learning_rate": 1.2482161917539209e-06, | |
| "loss": 2.6076, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 3.8360517095566027, | |
| "grad_norm": 6.542690753936768, | |
| "learning_rate": 1.2441301609575172e-06, | |
| "loss": 2.5879, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 3.845717047239338, | |
| "grad_norm": 6.628766059875488, | |
| "learning_rate": 1.2400397913090059e-06, | |
| "loss": 2.6014, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 3.8553823849220734, | |
| "grad_norm": 6.663356781005859, | |
| "learning_rate": 1.2359451555052997e-06, | |
| "loss": 2.5364, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 3.8650477226048086, | |
| "grad_norm": 6.802213191986084, | |
| "learning_rate": 1.2318463263191323e-06, | |
| "loss": 2.5678, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 3.8747130602875437, | |
| "grad_norm": 6.468794345855713, | |
| "learning_rate": 1.227743376597765e-06, | |
| "loss": 2.5762, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 3.8814787966654585, | |
| "eval_loss": 2.6623401641845703, | |
| "eval_runtime": 217.0567, | |
| "eval_samples_per_second": 101.678, | |
| "eval_steps_per_second": 4.239, | |
| "step": 4017 | |
| }, | |
| { | |
| "epoch": 3.884378397970279, | |
| "grad_norm": 6.709383964538574, | |
| "learning_rate": 1.2236363792616923e-06, | |
| "loss": 2.6092, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 3.8940437356530144, | |
| "grad_norm": 6.665591239929199, | |
| "learning_rate": 1.2195254073033455e-06, | |
| "loss": 2.5627, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 3.9037090733357496, | |
| "grad_norm": 6.547743797302246, | |
| "learning_rate": 1.2154105337857963e-06, | |
| "loss": 2.6063, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 3.913374411018485, | |
| "grad_norm": 6.582630634307861, | |
| "learning_rate": 1.2112918318414572e-06, | |
| "loss": 2.5997, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 3.9230397487012203, | |
| "grad_norm": 6.448917865753174, | |
| "learning_rate": 1.2071693746707812e-06, | |
| "loss": 2.5644, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 3.9327050863839554, | |
| "grad_norm": 6.690165042877197, | |
| "learning_rate": 1.203043235540964e-06, | |
| "loss": 2.6261, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 3.9423704240666906, | |
| "grad_norm": 6.645413398742676, | |
| "learning_rate": 1.1989134877846377e-06, | |
| "loss": 2.6075, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 3.952035761749426, | |
| "grad_norm": 6.471231460571289, | |
| "learning_rate": 1.1947802047985701e-06, | |
| "loss": 2.5694, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 3.9617010994321613, | |
| "grad_norm": 6.488655090332031, | |
| "learning_rate": 1.1906434600423605e-06, | |
| "loss": 2.5728, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 3.971366437114897, | |
| "grad_norm": 6.397697448730469, | |
| "learning_rate": 1.1865033270371317e-06, | |
| "loss": 2.6184, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 3.981031774797632, | |
| "grad_norm": 6.57921838760376, | |
| "learning_rate": 1.1823598793642256e-06, | |
| "loss": 2.5828, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 3.981031774797632, | |
| "eval_loss": 2.658400774002075, | |
| "eval_runtime": 216.9957, | |
| "eval_samples_per_second": 101.707, | |
| "eval_steps_per_second": 4.24, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 3.990697112480367, | |
| "grad_norm": 6.8846306800842285, | |
| "learning_rate": 1.1782131906638947e-06, | |
| "loss": 2.5556, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 8.866212844848633, | |
| "learning_rate": 1.174063334633993e-06, | |
| "loss": 2.5868, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 4.009665337682735, | |
| "grad_norm": 6.461270332336426, | |
| "learning_rate": 1.1699103850286667e-06, | |
| "loss": 2.5343, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 4.01933067536547, | |
| "grad_norm": 6.403049945831299, | |
| "learning_rate": 1.1657544156570433e-06, | |
| "loss": 2.5403, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 4.028996013048205, | |
| "grad_norm": 6.411298751831055, | |
| "learning_rate": 1.1615955003819195e-06, | |
| "loss": 2.5484, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 4.0386613507309415, | |
| "grad_norm": 6.594933032989502, | |
| "learning_rate": 1.157433713118449e-06, | |
| "loss": 2.5664, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 4.048326688413677, | |
| "grad_norm": 6.8718342781066895, | |
| "learning_rate": 1.1532691278328282e-06, | |
| "loss": 2.5654, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 4.057992026096412, | |
| "grad_norm": 6.683941841125488, | |
| "learning_rate": 1.1491018185409813e-06, | |
| "loss": 2.532, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 4.067657363779147, | |
| "grad_norm": 6.692328929901123, | |
| "learning_rate": 1.1449318593072465e-06, | |
| "loss": 2.5486, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 4.077322701461882, | |
| "grad_norm": 6.605168342590332, | |
| "learning_rate": 1.1407593242430586e-06, | |
| "loss": 2.5821, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 4.080222302766703, | |
| "eval_loss": 2.6606152057647705, | |
| "eval_runtime": 217.0244, | |
| "eval_samples_per_second": 101.694, | |
| "eval_steps_per_second": 4.239, | |
| "step": 4223 | |
| }, | |
| { | |
| "epoch": 4.086988039144618, | |
| "grad_norm": 6.7313337326049805, | |
| "learning_rate": 1.1365842875056311e-06, | |
| "loss": 2.554, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 4.096653376827353, | |
| "grad_norm": 6.845920085906982, | |
| "learning_rate": 1.1324068232966392e-06, | |
| "loss": 2.5234, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 4.106318714510088, | |
| "grad_norm": 6.765261650085449, | |
| "learning_rate": 1.1282270058609013e-06, | |
| "loss": 2.547, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 4.1159840521928235, | |
| "grad_norm": 6.755435943603516, | |
| "learning_rate": 1.1240449094850584e-06, | |
| "loss": 2.5661, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 4.125649389875559, | |
| "grad_norm": 6.7944655418396, | |
| "learning_rate": 1.1198606084962547e-06, | |
| "loss": 2.5423, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 4.135314727558294, | |
| "grad_norm": 6.660946369171143, | |
| "learning_rate": 1.1156741772608165e-06, | |
| "loss": 2.5314, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 4.144980065241029, | |
| "grad_norm": 6.597288608551025, | |
| "learning_rate": 1.11148569018293e-06, | |
| "loss": 2.5404, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 4.154645402923765, | |
| "grad_norm": 6.629675388336182, | |
| "learning_rate": 1.1072952217033195e-06, | |
| "loss": 2.5392, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 4.1643107406065, | |
| "grad_norm": 6.65186882019043, | |
| "learning_rate": 1.103102846297924e-06, | |
| "loss": 2.6007, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 4.173976078289235, | |
| "grad_norm": 6.881357669830322, | |
| "learning_rate": 1.0989086384765737e-06, | |
| "loss": 2.5721, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 4.179775280898877, | |
| "eval_loss": 2.6572110652923584, | |
| "eval_runtime": 216.941, | |
| "eval_samples_per_second": 101.733, | |
| "eval_steps_per_second": 4.241, | |
| "step": 4326 | |
| }, | |
| { | |
| "epoch": 4.18364141597197, | |
| "grad_norm": 6.678983688354492, | |
| "learning_rate": 1.0947126727816665e-06, | |
| "loss": 2.5719, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 4.1933067536547055, | |
| "grad_norm": 6.7589945793151855, | |
| "learning_rate": 1.090515023786841e-06, | |
| "loss": 2.5502, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 4.2029720913374415, | |
| "grad_norm": 6.886190414428711, | |
| "learning_rate": 1.0863157660956538e-06, | |
| "loss": 2.5236, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 4.212637429020177, | |
| "grad_norm": 6.677691459655762, | |
| "learning_rate": 1.082114974340252e-06, | |
| "loss": 2.5518, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 4.222302766702912, | |
| "grad_norm": 6.715696811676025, | |
| "learning_rate": 1.0779127231800474e-06, | |
| "loss": 2.5375, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 4.231968104385647, | |
| "grad_norm": 6.802489757537842, | |
| "learning_rate": 1.0737090873003884e-06, | |
| "loss": 2.5263, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 4.241633442068382, | |
| "grad_norm": 7.024548053741455, | |
| "learning_rate": 1.069504141411235e-06, | |
| "loss": 2.5676, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 4.251298779751117, | |
| "grad_norm": 6.528923034667969, | |
| "learning_rate": 1.0652979602458287e-06, | |
| "loss": 2.5676, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 4.260964117433852, | |
| "grad_norm": 6.7343244552612305, | |
| "learning_rate": 1.0610906185593653e-06, | |
| "loss": 2.5323, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 4.270629455116588, | |
| "grad_norm": 6.739745140075684, | |
| "learning_rate": 1.0568821911276668e-06, | |
| "loss": 2.5333, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 4.2793282590310495, | |
| "eval_loss": 2.6547646522521973, | |
| "eval_runtime": 217.1095, | |
| "eval_samples_per_second": 101.654, | |
| "eval_steps_per_second": 4.237, | |
| "step": 4429 | |
| }, | |
| { | |
| "epoch": 4.280294792799324, | |
| "grad_norm": 6.441568851470947, | |
| "learning_rate": 1.0526727527458508e-06, | |
| "loss": 2.561, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 4.289960130482059, | |
| "grad_norm": 6.605863571166992, | |
| "learning_rate": 1.048462378227003e-06, | |
| "loss": 2.5611, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 4.299625468164794, | |
| "grad_norm": 6.771341323852539, | |
| "learning_rate": 1.0442511424008464e-06, | |
| "loss": 2.5735, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 4.309290805847529, | |
| "grad_norm": 6.989775657653809, | |
| "learning_rate": 1.040039120112412e-06, | |
| "loss": 2.5473, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 4.318956143530265, | |
| "grad_norm": 7.052975654602051, | |
| "learning_rate": 1.0358263862207083e-06, | |
| "loss": 2.5443, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 4.328621481213, | |
| "grad_norm": 6.5822858810424805, | |
| "learning_rate": 1.031613015597391e-06, | |
| "loss": 2.5585, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 4.338286818895735, | |
| "grad_norm": 6.736408710479736, | |
| "learning_rate": 1.0273990831254319e-06, | |
| "loss": 2.5436, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 4.3479521565784705, | |
| "grad_norm": 6.6356120109558105, | |
| "learning_rate": 1.0231846636977882e-06, | |
| "loss": 2.5533, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 4.357617494261206, | |
| "grad_norm": 6.671817779541016, | |
| "learning_rate": 1.0189698322160732e-06, | |
| "loss": 2.5469, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 4.367282831943941, | |
| "grad_norm": 7.152904987335205, | |
| "learning_rate": 1.0147546635892209e-06, | |
| "loss": 2.537, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 4.376948169626676, | |
| "grad_norm": 6.494625568389893, | |
| "learning_rate": 1.010539232732159e-06, | |
| "loss": 2.5045, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 4.378881237163223, | |
| "eval_loss": 2.653103828430176, | |
| "eval_runtime": 217.106, | |
| "eval_samples_per_second": 101.655, | |
| "eval_steps_per_second": 4.238, | |
| "step": 4532 | |
| }, | |
| { | |
| "epoch": 4.386613507309412, | |
| "grad_norm": 6.714130401611328, | |
| "learning_rate": 1.0063236145644762e-06, | |
| "loss": 2.5592, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 4.396278844992147, | |
| "grad_norm": 6.6748528480529785, | |
| "learning_rate": 1.0021078840090886e-06, | |
| "loss": 2.5473, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 4.405944182674882, | |
| "grad_norm": 6.548516273498535, | |
| "learning_rate": 9.978921159909113e-07, | |
| "loss": 2.5382, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 4.415609520357617, | |
| "grad_norm": 6.959436416625977, | |
| "learning_rate": 9.93676385435524e-07, | |
| "loss": 2.537, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 4.4252748580403525, | |
| "grad_norm": 6.954132556915283, | |
| "learning_rate": 9.894607672678408e-07, | |
| "loss": 2.5511, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 4.4349401957230885, | |
| "grad_norm": 6.731428146362305, | |
| "learning_rate": 9.852453364107792e-07, | |
| "loss": 2.522, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 4.444605533405824, | |
| "grad_norm": 6.622025489807129, | |
| "learning_rate": 9.81030167783927e-07, | |
| "loss": 2.5363, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 4.454270871088559, | |
| "grad_norm": 6.871068954467773, | |
| "learning_rate": 9.768153363022115e-07, | |
| "loss": 2.5361, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 4.463936208771294, | |
| "grad_norm": 6.612332820892334, | |
| "learning_rate": 9.72600916874568e-07, | |
| "loss": 2.5385, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 4.473601546454029, | |
| "grad_norm": 6.64678955078125, | |
| "learning_rate": 9.683869844026089e-07, | |
| "loss": 2.5374, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 4.478434215295397, | |
| "eval_loss": 2.650099515914917, | |
| "eval_runtime": 216.8756, | |
| "eval_samples_per_second": 101.763, | |
| "eval_steps_per_second": 4.242, | |
| "step": 4635 | |
| }, | |
| { | |
| "epoch": 4.483266884136764, | |
| "grad_norm": 6.953402996063232, | |
| "learning_rate": 9.641736137792914e-07, | |
| "loss": 2.5646, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 4.492932221819499, | |
| "grad_norm": 6.753168106079102, | |
| "learning_rate": 9.59960879887588e-07, | |
| "loss": 2.545, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 4.502597559502235, | |
| "grad_norm": 6.863531589508057, | |
| "learning_rate": 9.557488575991537e-07, | |
| "loss": 2.5303, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 4.5122628971849705, | |
| "grad_norm": 6.904908657073975, | |
| "learning_rate": 9.515376217729971e-07, | |
| "loss": 2.5381, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 4.521928234867706, | |
| "grad_norm": 6.753827095031738, | |
| "learning_rate": 9.473272472541492e-07, | |
| "loss": 2.5243, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 4.531593572550441, | |
| "grad_norm": 6.280510902404785, | |
| "learning_rate": 9.431178088723333e-07, | |
| "loss": 2.5566, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 4.541258910233176, | |
| "grad_norm": 6.862277030944824, | |
| "learning_rate": 9.389093814406347e-07, | |
| "loss": 2.5286, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 4.550924247915912, | |
| "grad_norm": 6.67894172668457, | |
| "learning_rate": 9.347020397541714e-07, | |
| "loss": 2.5297, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 4.560589585598647, | |
| "grad_norm": 6.786202907562256, | |
| "learning_rate": 9.30495858588765e-07, | |
| "loss": 2.565, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 4.570254923281382, | |
| "grad_norm": 6.705028533935547, | |
| "learning_rate": 9.262909126996116e-07, | |
| "loss": 2.5141, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 4.57798719342757, | |
| "eval_loss": 2.6482667922973633, | |
| "eval_runtime": 216.9673, | |
| "eval_samples_per_second": 101.72, | |
| "eval_steps_per_second": 4.24, | |
| "step": 4738 | |
| }, | |
| { | |
| "epoch": 4.579920260964117, | |
| "grad_norm": 6.745487213134766, | |
| "learning_rate": 9.220872768199527e-07, | |
| "loss": 2.5456, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 4.589585598646853, | |
| "grad_norm": 6.562654495239258, | |
| "learning_rate": 9.178850256597481e-07, | |
| "loss": 2.5743, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 4.599250936329588, | |
| "grad_norm": 6.653303623199463, | |
| "learning_rate": 9.136842339043463e-07, | |
| "loss": 2.522, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 4.608916274012323, | |
| "grad_norm": 7.028505325317383, | |
| "learning_rate": 9.09484976213159e-07, | |
| "loss": 2.5192, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 4.618581611695059, | |
| "grad_norm": 6.768604278564453, | |
| "learning_rate": 9.052873272183335e-07, | |
| "loss": 2.5391, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 4.628246949377794, | |
| "grad_norm": 7.153252601623535, | |
| "learning_rate": 9.01091361523426e-07, | |
| "loss": 2.5597, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 4.637912287060529, | |
| "grad_norm": 6.854784965515137, | |
| "learning_rate": 8.968971537020757e-07, | |
| "loss": 2.553, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 4.647577624743264, | |
| "grad_norm": 6.819455146789551, | |
| "learning_rate": 8.927047782966806e-07, | |
| "loss": 2.5752, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 4.657242962425999, | |
| "grad_norm": 6.677375793457031, | |
| "learning_rate": 8.8851430981707e-07, | |
| "loss": 2.562, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 4.6669083001087355, | |
| "grad_norm": 6.6845245361328125, | |
| "learning_rate": 8.843258227391834e-07, | |
| "loss": 2.5613, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 4.676573637791471, | |
| "grad_norm": 6.7105607986450195, | |
| "learning_rate": 8.801393915037456e-07, | |
| "loss": 2.558, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 4.677540171559744, | |
| "eval_loss": 2.6455490589141846, | |
| "eval_runtime": 217.0811, | |
| "eval_samples_per_second": 101.667, | |
| "eval_steps_per_second": 4.238, | |
| "step": 4841 | |
| }, | |
| { | |
| "epoch": 4.686238975474206, | |
| "grad_norm": 6.720954418182373, | |
| "learning_rate": 8.759550905149419e-07, | |
| "loss": 2.5752, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 4.695904313156941, | |
| "grad_norm": 6.768190383911133, | |
| "learning_rate": 8.717729941390988e-07, | |
| "loss": 2.5586, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 4.705569650839676, | |
| "grad_norm": 6.576201915740967, | |
| "learning_rate": 8.675931767033609e-07, | |
| "loss": 2.5396, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 4.715234988522411, | |
| "grad_norm": 6.709794521331787, | |
| "learning_rate": 8.63415712494369e-07, | |
| "loss": 2.5761, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 4.724900326205146, | |
| "grad_norm": 6.836709022521973, | |
| "learning_rate": 8.592406757569415e-07, | |
| "loss": 2.5602, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 4.734565663887882, | |
| "grad_norm": 6.63275146484375, | |
| "learning_rate": 8.550681406927533e-07, | |
| "loss": 2.5477, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 4.7442310015706175, | |
| "grad_norm": 6.481536865234375, | |
| "learning_rate": 8.508981814590188e-07, | |
| "loss": 2.5443, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 4.753896339253353, | |
| "grad_norm": 6.737277507781982, | |
| "learning_rate": 8.46730872167172e-07, | |
| "loss": 2.5516, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 4.763561676936088, | |
| "grad_norm": 6.8111138343811035, | |
| "learning_rate": 8.425662868815509e-07, | |
| "loss": 2.5538, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 4.773227014618823, | |
| "grad_norm": 6.9275078773498535, | |
| "learning_rate": 8.384044996180805e-07, | |
| "loss": 2.4995, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 4.777093149691917, | |
| "eval_loss": 2.6439967155456543, | |
| "eval_runtime": 217.1401, | |
| "eval_samples_per_second": 101.639, | |
| "eval_steps_per_second": 4.237, | |
| "step": 4944 | |
| }, | |
| { | |
| "epoch": 4.782892352301559, | |
| "grad_norm": 6.729687690734863, | |
| "learning_rate": 8.342455843429568e-07, | |
| "loss": 2.5391, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 4.792557689984294, | |
| "grad_norm": 6.650339126586914, | |
| "learning_rate": 8.300896149713334e-07, | |
| "loss": 2.5402, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 4.802223027667029, | |
| "grad_norm": 6.950220108032227, | |
| "learning_rate": 8.259366653660071e-07, | |
| "loss": 2.5354, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 4.811888365349764, | |
| "grad_norm": 6.6570305824279785, | |
| "learning_rate": 8.217868093361053e-07, | |
| "loss": 2.5529, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 4.8215537030324995, | |
| "grad_norm": 6.657363414764404, | |
| "learning_rate": 8.176401206357742e-07, | |
| "loss": 2.5434, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 4.831219040715235, | |
| "grad_norm": 6.5188398361206055, | |
| "learning_rate": 8.134966729628683e-07, | |
| "loss": 2.5375, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 4.84088437839797, | |
| "grad_norm": 6.585175037384033, | |
| "learning_rate": 8.093565399576394e-07, | |
| "loss": 2.537, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 4.850549716080706, | |
| "grad_norm": 6.604196548461914, | |
| "learning_rate": 8.052197952014296e-07, | |
| "loss": 2.5458, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 4.860215053763441, | |
| "grad_norm": 6.6913909912109375, | |
| "learning_rate": 8.010865122153627e-07, | |
| "loss": 2.5668, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 4.869880391446176, | |
| "grad_norm": 6.809241771697998, | |
| "learning_rate": 7.969567644590365e-07, | |
| "loss": 2.5216, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 4.876646127824091, | |
| "eval_loss": 2.641507863998413, | |
| "eval_runtime": 217.12, | |
| "eval_samples_per_second": 101.649, | |
| "eval_steps_per_second": 4.237, | |
| "step": 5047 | |
| }, | |
| { | |
| "epoch": 4.879545729128911, | |
| "grad_norm": 6.640186309814453, | |
| "learning_rate": 7.928306253292189e-07, | |
| "loss": 2.556, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 4.889211066811646, | |
| "grad_norm": 6.924322605133057, | |
| "learning_rate": 7.887081681585432e-07, | |
| "loss": 2.5384, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 4.898876404494382, | |
| "grad_norm": 6.675876140594482, | |
| "learning_rate": 7.845894662142037e-07, | |
| "loss": 2.4959, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 4.908541742177118, | |
| "grad_norm": 6.73758602142334, | |
| "learning_rate": 7.804745926966546e-07, | |
| "loss": 2.5323, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 4.918207079859853, | |
| "grad_norm": 6.658973217010498, | |
| "learning_rate": 7.763636207383079e-07, | |
| "loss": 2.5538, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 4.927872417542588, | |
| "grad_norm": 6.829054355621338, | |
| "learning_rate": 7.722566234022351e-07, | |
| "loss": 2.539, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 4.937537755225323, | |
| "grad_norm": 6.643087863922119, | |
| "learning_rate": 7.681536736808678e-07, | |
| "loss": 2.5454, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 4.947203092908058, | |
| "grad_norm": 6.710460186004639, | |
| "learning_rate": 7.640548444947003e-07, | |
| "loss": 2.5398, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 4.956868430590793, | |
| "grad_norm": 7.006237030029297, | |
| "learning_rate": 7.599602086909942e-07, | |
| "loss": 2.5618, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 4.966533768273529, | |
| "grad_norm": 6.898321628570557, | |
| "learning_rate": 7.558698390424829e-07, | |
| "loss": 2.5254, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 4.9761991059562645, | |
| "grad_norm": 6.923776149749756, | |
| "learning_rate": 7.51783808246079e-07, | |
| "loss": 2.5391, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 4.9761991059562645, | |
| "eval_loss": 2.640099048614502, | |
| "eval_runtime": 216.9681, | |
| "eval_samples_per_second": 101.72, | |
| "eval_steps_per_second": 4.24, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 4.985864443639, | |
| "grad_norm": 6.776469707489014, | |
| "learning_rate": 7.477021889215822e-07, | |
| "loss": 2.5117, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 4.995529781321735, | |
| "grad_norm": 6.805541515350342, | |
| "learning_rate": 7.43625053610388e-07, | |
| "loss": 2.5361, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 5.004832668841368, | |
| "grad_norm": 6.649566173553467, | |
| "learning_rate": 7.395524747741995e-07, | |
| "loss": 2.5263, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 5.014498006524103, | |
| "grad_norm": 6.7946977615356445, | |
| "learning_rate": 7.35484524793738e-07, | |
| "loss": 2.5104, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 5.024163344206838, | |
| "grad_norm": 6.8785624504089355, | |
| "learning_rate": 7.314212759674581e-07, | |
| "loss": 2.5303, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 5.033828681889574, | |
| "grad_norm": 6.662807464599609, | |
| "learning_rate": 7.273628005102628e-07, | |
| "loss": 2.5041, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 5.043494019572309, | |
| "grad_norm": 6.747055530548096, | |
| "learning_rate": 7.233091705522179e-07, | |
| "loss": 2.5294, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 5.053159357255044, | |
| "grad_norm": 6.868143558502197, | |
| "learning_rate": 7.192604581372726e-07, | |
| "loss": 2.4964, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 5.062824694937779, | |
| "grad_norm": 6.930590629577637, | |
| "learning_rate": 7.152167352219783e-07, | |
| "loss": 2.5184, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 5.0724900326205145, | |
| "grad_norm": 6.698469638824463, | |
| "learning_rate": 7.111780736742093e-07, | |
| "loss": 2.4956, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 5.075389633925335, | |
| "eval_loss": 2.642467498779297, | |
| "eval_runtime": 217.038, | |
| "eval_samples_per_second": 101.687, | |
| "eval_steps_per_second": 4.239, | |
| "step": 5253 | |
| }, | |
| { | |
| "epoch": 5.08215537030325, | |
| "grad_norm": 6.79097318649292, | |
| "learning_rate": 7.071445452718852e-07, | |
| "loss": 2.5243, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 5.091820707985986, | |
| "grad_norm": 6.664259910583496, | |
| "learning_rate": 7.031162217016966e-07, | |
| "loss": 2.5117, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 5.101486045668721, | |
| "grad_norm": 6.928717613220215, | |
| "learning_rate": 6.990931745578295e-07, | |
| "loss": 2.5242, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 5.111151383351456, | |
| "grad_norm": 6.610539436340332, | |
| "learning_rate": 6.950754753406937e-07, | |
| "loss": 2.474, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 5.120816721034191, | |
| "grad_norm": 6.7854790687561035, | |
| "learning_rate": 6.910631954556522e-07, | |
| "loss": 2.4971, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 5.130482058716926, | |
| "grad_norm": 6.860823154449463, | |
| "learning_rate": 6.870564062117517e-07, | |
| "loss": 2.5279, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 5.140147396399661, | |
| "grad_norm": 6.848371982574463, | |
| "learning_rate": 6.830551788204551e-07, | |
| "loss": 2.5232, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 5.149812734082397, | |
| "grad_norm": 6.649808883666992, | |
| "learning_rate": 6.790595843943768e-07, | |
| "loss": 2.4639, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 5.1594780717651325, | |
| "grad_norm": 6.732810974121094, | |
| "learning_rate": 6.750696939460172e-07, | |
| "loss": 2.5131, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 5.169143409447868, | |
| "grad_norm": 6.754150867462158, | |
| "learning_rate": 6.710855783865025e-07, | |
| "loss": 2.5343, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 5.174942612057508, | |
| "eval_loss": 2.640747547149658, | |
| "eval_runtime": 216.8589, | |
| "eval_samples_per_second": 101.771, | |
| "eval_steps_per_second": 4.242, | |
| "step": 5356 | |
| }, | |
| { | |
| "epoch": 5.178808747130603, | |
| "grad_norm": 6.860896587371826, | |
| "learning_rate": 6.671073085243233e-07, | |
| "loss": 2.5313, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 5.188474084813338, | |
| "grad_norm": 6.547597408294678, | |
| "learning_rate": 6.631349550640764e-07, | |
| "loss": 2.5394, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 5.198139422496073, | |
| "grad_norm": 6.75359582901001, | |
| "learning_rate": 6.591685886052079e-07, | |
| "loss": 2.5012, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 5.207804760178809, | |
| "grad_norm": 6.7049455642700195, | |
| "learning_rate": 6.552082796407589e-07, | |
| "loss": 2.5162, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 5.217470097861544, | |
| "grad_norm": 6.600377082824707, | |
| "learning_rate": 6.512540985561125e-07, | |
| "loss": 2.5347, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 5.227135435544279, | |
| "grad_norm": 6.767351150512695, | |
| "learning_rate": 6.473061156277431e-07, | |
| "loss": 2.4873, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 5.2368007732270145, | |
| "grad_norm": 6.60963773727417, | |
| "learning_rate": 6.433644010219661e-07, | |
| "loss": 2.4962, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 5.24646611090975, | |
| "grad_norm": 6.64891242980957, | |
| "learning_rate": 6.394290247936931e-07, | |
| "loss": 2.5101, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 5.256131448592485, | |
| "grad_norm": 6.72605037689209, | |
| "learning_rate": 6.355000568851849e-07, | |
| "loss": 2.4919, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 5.265796786275221, | |
| "grad_norm": 7.019696235656738, | |
| "learning_rate": 6.315775671248098e-07, | |
| "loss": 2.4964, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 5.274495590189682, | |
| "eval_loss": 2.639216184616089, | |
| "eval_runtime": 216.9692, | |
| "eval_samples_per_second": 101.719, | |
| "eval_steps_per_second": 4.24, | |
| "step": 5459 | |
| }, | |
| { | |
| "epoch": 5.275462123957956, | |
| "grad_norm": 6.810911178588867, | |
| "learning_rate": 6.276616252258014e-07, | |
| "loss": 2.499, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 5.285127461640691, | |
| "grad_norm": 6.777184009552002, | |
| "learning_rate": 6.237523007850196e-07, | |
| "loss": 2.5246, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 5.294792799323426, | |
| "grad_norm": 6.932974338531494, | |
| "learning_rate": 6.198496632817154e-07, | |
| "loss": 2.5384, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 5.304458137006161, | |
| "grad_norm": 6.819568634033203, | |
| "learning_rate": 6.159537820762942e-07, | |
| "loss": 2.5205, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 5.314123474688897, | |
| "grad_norm": 6.879855632781982, | |
| "learning_rate": 6.120647264090838e-07, | |
| "loss": 2.5193, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 5.323788812371633, | |
| "grad_norm": 6.756879806518555, | |
| "learning_rate": 6.081825653991037e-07, | |
| "loss": 2.4839, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 5.333454150054368, | |
| "grad_norm": 6.868306636810303, | |
| "learning_rate": 6.043073680428372e-07, | |
| "loss": 2.5402, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 5.343119487737103, | |
| "grad_norm": 6.817827224731445, | |
| "learning_rate": 6.004392032130039e-07, | |
| "loss": 2.5098, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 5.352784825419838, | |
| "grad_norm": 6.814731597900391, | |
| "learning_rate": 5.965781396573367e-07, | |
| "loss": 2.5142, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 5.362450163102573, | |
| "grad_norm": 7.052011013031006, | |
| "learning_rate": 5.9272424599736e-07, | |
| "loss": 2.5233, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 5.372115500785308, | |
| "grad_norm": 6.802794456481934, | |
| "learning_rate": 5.888775907271696e-07, | |
| "loss": 2.5157, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 5.3740485683218555, | |
| "eval_loss": 2.6385231018066406, | |
| "eval_runtime": 216.8661, | |
| "eval_samples_per_second": 101.768, | |
| "eval_steps_per_second": 4.242, | |
| "step": 5562 | |
| }, | |
| { | |
| "epoch": 5.381780838468044, | |
| "grad_norm": 6.800307273864746, | |
| "learning_rate": 5.850382422122159e-07, | |
| "loss": 2.5085, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 5.3914461761507795, | |
| "grad_norm": 6.743267059326172, | |
| "learning_rate": 5.812062686880879e-07, | |
| "loss": 2.5179, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 5.401111513833515, | |
| "grad_norm": 6.773606300354004, | |
| "learning_rate": 5.773817382593007e-07, | |
| "loss": 2.5203, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 5.41077685151625, | |
| "grad_norm": 7.061532497406006, | |
| "learning_rate": 5.735647188980871e-07, | |
| "loss": 2.5013, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 5.420442189198985, | |
| "grad_norm": 6.810518264770508, | |
| "learning_rate": 5.697552784431865e-07, | |
| "loss": 2.4812, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 5.43010752688172, | |
| "grad_norm": 6.675147533416748, | |
| "learning_rate": 5.659534845986417e-07, | |
| "loss": 2.5223, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 5.439772864564456, | |
| "grad_norm": 6.812685489654541, | |
| "learning_rate": 5.621594049325939e-07, | |
| "loss": 2.4554, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 5.449438202247191, | |
| "grad_norm": 6.6521992683410645, | |
| "learning_rate": 5.583731068760822e-07, | |
| "loss": 2.5003, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 5.459103539929926, | |
| "grad_norm": 6.802224159240723, | |
| "learning_rate": 5.545946577218469e-07, | |
| "loss": 2.509, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 5.4687688776126615, | |
| "grad_norm": 6.921814441680908, | |
| "learning_rate": 5.508241246231303e-07, | |
| "loss": 2.5157, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 5.473601546454029, | |
| "eval_loss": 2.6369357109069824, | |
| "eval_runtime": 217.0718, | |
| "eval_samples_per_second": 101.671, | |
| "eval_steps_per_second": 4.238, | |
| "step": 5665 | |
| }, | |
| { | |
| "epoch": 5.478434215295397, | |
| "grad_norm": 6.57248592376709, | |
| "learning_rate": 5.470615745924869e-07, | |
| "loss": 2.5003, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 5.488099552978132, | |
| "grad_norm": 6.756587028503418, | |
| "learning_rate": 5.433070745005889e-07, | |
| "loss": 2.4746, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 5.497764890660868, | |
| "grad_norm": 6.835014343261719, | |
| "learning_rate": 5.395606910750401e-07, | |
| "loss": 2.4871, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 5.507430228343603, | |
| "grad_norm": 7.088858604431152, | |
| "learning_rate": 5.358224908991895e-07, | |
| "loss": 2.4799, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 5.517095566026338, | |
| "grad_norm": 7.005571365356445, | |
| "learning_rate": 5.320925404109466e-07, | |
| "loss": 2.5439, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 5.526760903709073, | |
| "grad_norm": 6.776179313659668, | |
| "learning_rate": 5.283709059016033e-07, | |
| "loss": 2.5226, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 5.536426241391808, | |
| "grad_norm": 6.528167247772217, | |
| "learning_rate": 5.246576535146523e-07, | |
| "loss": 2.516, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 5.5460915790745435, | |
| "grad_norm": 6.705376625061035, | |
| "learning_rate": 5.20952849244615e-07, | |
| "loss": 2.5408, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 5.55575691675728, | |
| "grad_norm": 6.841287136077881, | |
| "learning_rate": 5.172565589358658e-07, | |
| "loss": 2.4921, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 5.565422254440015, | |
| "grad_norm": 6.934755802154541, | |
| "learning_rate": 5.13568848281463e-07, | |
| "loss": 2.5019, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 5.573154524586203, | |
| "eval_loss": 2.6358206272125244, | |
| "eval_runtime": 216.9617, | |
| "eval_samples_per_second": 101.723, | |
| "eval_steps_per_second": 4.24, | |
| "step": 5768 | |
| }, | |
| { | |
| "epoch": 5.57508759212275, | |
| "grad_norm": 6.7719621658325195, | |
| "learning_rate": 5.09889782821983e-07, | |
| "loss": 2.5053, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 5.584752929805485, | |
| "grad_norm": 6.658035755157471, | |
| "learning_rate": 5.062194279443508e-07, | |
| "loss": 2.5161, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 5.59441826748822, | |
| "grad_norm": 7.038604736328125, | |
| "learning_rate": 5.025578488806836e-07, | |
| "loss": 2.5208, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 5.604083605170955, | |
| "grad_norm": 6.811711311340332, | |
| "learning_rate": 4.989051107071268e-07, | |
| "loss": 2.5461, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 5.613748942853691, | |
| "grad_norm": 7.0844526290893555, | |
| "learning_rate": 4.952612783427008e-07, | |
| "loss": 2.5076, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 5.6234142805364264, | |
| "grad_norm": 6.744603157043457, | |
| "learning_rate": 4.916264165481448e-07, | |
| "loss": 2.4963, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 5.633079618219162, | |
| "grad_norm": 6.718221187591553, | |
| "learning_rate": 4.880005899247663e-07, | |
| "loss": 2.4961, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 5.642744955901897, | |
| "grad_norm": 7.078195095062256, | |
| "learning_rate": 4.843838629132949e-07, | |
| "loss": 2.5171, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 5.652410293584632, | |
| "grad_norm": 6.604617118835449, | |
| "learning_rate": 4.80776299792734e-07, | |
| "loss": 2.5146, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 5.662075631267367, | |
| "grad_norm": 6.939282417297363, | |
| "learning_rate": 4.771779646792216e-07, | |
| "loss": 2.4872, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 5.671740968950103, | |
| "grad_norm": 7.01817512512207, | |
| "learning_rate": 4.7358892152488726e-07, | |
| "loss": 2.487, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 5.672707502718376, | |
| "eval_loss": 2.6341195106506348, | |
| "eval_runtime": 216.8979, | |
| "eval_samples_per_second": 101.753, | |
| "eval_steps_per_second": 4.242, | |
| "step": 5871 | |
| }, | |
| { | |
| "epoch": 5.681406306632838, | |
| "grad_norm": 7.151918888092041, | |
| "learning_rate": 4.700092341167182e-07, | |
| "loss": 2.5233, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 5.691071644315573, | |
| "grad_norm": 6.588758945465088, | |
| "learning_rate": 4.664389660754253e-07, | |
| "loss": 2.5248, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 5.7007369819983085, | |
| "grad_norm": 6.761035442352295, | |
| "learning_rate": 4.6287818085431064e-07, | |
| "loss": 2.4915, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 5.710402319681044, | |
| "grad_norm": 6.56200647354126, | |
| "learning_rate": 4.5932694173814246e-07, | |
| "loss": 2.5467, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 5.72006765736378, | |
| "grad_norm": 6.924496650695801, | |
| "learning_rate": 4.5578531184202726e-07, | |
| "loss": 2.5266, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 5.729732995046515, | |
| "grad_norm": 6.611005783081055, | |
| "learning_rate": 4.522533541102914e-07, | |
| "loss": 2.5247, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 5.73939833272925, | |
| "grad_norm": 6.712198734283447, | |
| "learning_rate": 4.487311313153598e-07, | |
| "loss": 2.4902, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 5.749063670411985, | |
| "grad_norm": 6.897911071777344, | |
| "learning_rate": 4.452187060566409e-07, | |
| "loss": 2.4757, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 5.75872900809472, | |
| "grad_norm": 6.599294185638428, | |
| "learning_rate": 4.417161407594163e-07, | |
| "loss": 2.5007, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 5.768394345777455, | |
| "grad_norm": 6.836195945739746, | |
| "learning_rate": 4.3822349767372667e-07, | |
| "loss": 2.511, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 5.77226048085055, | |
| "eval_loss": 2.633843421936035, | |
| "eval_runtime": 216.737, | |
| "eval_samples_per_second": 101.828, | |
| "eval_steps_per_second": 4.245, | |
| "step": 5974 | |
| }, | |
| { | |
| "epoch": 5.7780596834601905, | |
| "grad_norm": 6.8379974365234375, | |
| "learning_rate": 4.3474083887327076e-07, | |
| "loss": 2.5229, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 5.7877250211429265, | |
| "grad_norm": 6.931849002838135, | |
| "learning_rate": 4.312682262542978e-07, | |
| "loss": 2.4955, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 5.797390358825662, | |
| "grad_norm": 6.655295372009277, | |
| "learning_rate": 4.278057215345109e-07, | |
| "loss": 2.5219, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 5.807055696508397, | |
| "grad_norm": 6.884836673736572, | |
| "learning_rate": 4.2435338625196727e-07, | |
| "loss": 2.5298, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 5.816721034191132, | |
| "grad_norm": 6.96956205368042, | |
| "learning_rate": 4.209112817639856e-07, | |
| "loss": 2.5129, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 5.826386371873867, | |
| "grad_norm": 6.961551666259766, | |
| "learning_rate": 4.174794692460571e-07, | |
| "loss": 2.4823, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 5.836051709556603, | |
| "grad_norm": 6.782859802246094, | |
| "learning_rate": 4.1405800969075534e-07, | |
| "loss": 2.5208, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 5.845717047239338, | |
| "grad_norm": 6.812822341918945, | |
| "learning_rate": 4.106469639066552e-07, | |
| "loss": 2.4742, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 5.855382384922073, | |
| "grad_norm": 6.870484352111816, | |
| "learning_rate": 4.072463925172497e-07, | |
| "loss": 2.5023, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 5.8650477226048086, | |
| "grad_norm": 6.649935722351074, | |
| "learning_rate": 4.0385635595987344e-07, | |
| "loss": 2.5102, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 5.871813458982723, | |
| "eval_loss": 2.6324048042297363, | |
| "eval_runtime": 216.9994, | |
| "eval_samples_per_second": 101.705, | |
| "eval_steps_per_second": 4.24, | |
| "step": 6077 | |
| }, | |
| { | |
| "epoch": 5.874713060287544, | |
| "grad_norm": 6.996982097625732, | |
| "learning_rate": 4.004769144846299e-07, | |
| "loss": 2.523, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 5.884378397970279, | |
| "grad_norm": 6.676599979400635, | |
| "learning_rate": 3.9710812815331797e-07, | |
| "loss": 2.4997, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 5.894043735653014, | |
| "grad_norm": 6.77655029296875, | |
| "learning_rate": 3.9375005683836683e-07, | |
| "loss": 2.498, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 5.90370907333575, | |
| "grad_norm": 6.703573703765869, | |
| "learning_rate": 3.9040276022176996e-07, | |
| "loss": 2.5095, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 5.913374411018485, | |
| "grad_norm": 6.955379486083984, | |
| "learning_rate": 3.870662977940264e-07, | |
| "loss": 2.5226, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 5.92303974870122, | |
| "grad_norm": 6.614869117736816, | |
| "learning_rate": 3.8374072885308184e-07, | |
| "loss": 2.5237, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 5.932705086383955, | |
| "grad_norm": 6.875811576843262, | |
| "learning_rate": 3.8042611250327516e-07, | |
| "loss": 2.551, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 5.942370424066691, | |
| "grad_norm": 6.832152843475342, | |
| "learning_rate": 3.7712250765428824e-07, | |
| "loss": 2.4825, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 5.952035761749427, | |
| "grad_norm": 6.9608683586120605, | |
| "learning_rate": 3.738299730200987e-07, | |
| "loss": 2.4832, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 5.961701099432162, | |
| "grad_norm": 6.698019504547119, | |
| "learning_rate": 3.7054856711793736e-07, | |
| "loss": 2.5244, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 5.971366437114897, | |
| "grad_norm": 7.041402816772461, | |
| "learning_rate": 3.6727834826724634e-07, | |
| "loss": 2.5294, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 5.971366437114897, | |
| "eval_loss": 2.631476879119873, | |
| "eval_runtime": 216.9835, | |
| "eval_samples_per_second": 101.713, | |
| "eval_steps_per_second": 4.24, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 5.981031774797632, | |
| "grad_norm": 6.974484443664551, | |
| "learning_rate": 3.640193745886446e-07, | |
| "loss": 2.5173, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 5.990697112480367, | |
| "grad_norm": 6.792074680328369, | |
| "learning_rate": 3.6077170400289337e-07, | |
| "loss": 2.5276, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 9.531786918640137, | |
| "learning_rate": 3.5753539422986725e-07, | |
| "loss": 2.5002, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 6.009665337682735, | |
| "grad_norm": 6.820946216583252, | |
| "learning_rate": 3.543105027875296e-07, | |
| "loss": 2.5161, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 6.01933067536547, | |
| "grad_norm": 6.973489284515381, | |
| "learning_rate": 3.5109708699090777e-07, | |
| "loss": 2.5012, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 6.028996013048205, | |
| "grad_norm": 7.010828495025635, | |
| "learning_rate": 3.478952039510774e-07, | |
| "loss": 2.4784, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 6.0386613507309415, | |
| "grad_norm": 6.972498416900635, | |
| "learning_rate": 3.4470491057414475e-07, | |
| "loss": 2.5262, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 6.048326688413677, | |
| "grad_norm": 6.622044563293457, | |
| "learning_rate": 3.415262635602364e-07, | |
| "loss": 2.4681, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 6.057992026096412, | |
| "grad_norm": 7.308681011199951, | |
| "learning_rate": 3.3835931940249294e-07, | |
| "loss": 2.5143, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 6.067657363779147, | |
| "grad_norm": 6.736526966094971, | |
| "learning_rate": 3.352041343860621e-07, | |
| "loss": 2.4933, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 6.070556965083967, | |
| "eval_loss": 2.6330554485321045, | |
| "eval_runtime": 216.8848, | |
| "eval_samples_per_second": 101.759, | |
| "eval_steps_per_second": 4.242, | |
| "step": 6283 | |
| }, | |
| { | |
| "epoch": 6.077322701461882, | |
| "grad_norm": 6.952606201171875, | |
| "learning_rate": 3.320607645871011e-07, | |
| "loss": 2.4915, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 6.086988039144618, | |
| "grad_norm": 6.871399402618408, | |
| "learning_rate": 3.289292658717776e-07, | |
| "loss": 2.4658, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 6.096653376827353, | |
| "grad_norm": 6.710361480712891, | |
| "learning_rate": 3.2580969389527955e-07, | |
| "loss": 2.4651, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 6.106318714510088, | |
| "grad_norm": 6.776742458343506, | |
| "learning_rate": 3.2270210410082345e-07, | |
| "loss": 2.5001, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 6.1159840521928235, | |
| "grad_norm": 6.880600452423096, | |
| "learning_rate": 3.1960655171867037e-07, | |
| "loss": 2.467, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 6.125649389875559, | |
| "grad_norm": 6.831501483917236, | |
| "learning_rate": 3.1652309176514405e-07, | |
| "loss": 2.4905, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 6.135314727558294, | |
| "grad_norm": 6.918708324432373, | |
| "learning_rate": 3.134517790416528e-07, | |
| "loss": 2.4513, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 6.144980065241029, | |
| "grad_norm": 6.8542561531066895, | |
| "learning_rate": 3.103926681337168e-07, | |
| "loss": 2.5004, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 6.154645402923765, | |
| "grad_norm": 6.689946174621582, | |
| "learning_rate": 3.07345813409996e-07, | |
| "loss": 2.4817, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 6.1643107406065, | |
| "grad_norm": 6.745342254638672, | |
| "learning_rate": 3.0431126902132575e-07, | |
| "loss": 2.492, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 6.170109943216141, | |
| "eval_loss": 2.632904291152954, | |
| "eval_runtime": 216.9357, | |
| "eval_samples_per_second": 101.735, | |
| "eval_steps_per_second": 4.241, | |
| "step": 6386 | |
| }, | |
| { | |
| "epoch": 6.173976078289235, | |
| "grad_norm": 6.870169639587402, | |
| "learning_rate": 3.012890888997528e-07, | |
| "loss": 2.4701, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 6.18364141597197, | |
| "grad_norm": 6.764638900756836, | |
| "learning_rate": 2.982793267575775e-07, | |
| "loss": 2.4718, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 6.1933067536547055, | |
| "grad_norm": 7.040791988372803, | |
| "learning_rate": 2.952820360863999e-07, | |
| "loss": 2.4883, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 6.2029720913374415, | |
| "grad_norm": 6.948042392730713, | |
| "learning_rate": 2.9229727015616735e-07, | |
| "loss": 2.4951, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 6.212637429020177, | |
| "grad_norm": 6.7030463218688965, | |
| "learning_rate": 2.893250820142299e-07, | |
| "loss": 2.4897, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 6.222302766702912, | |
| "grad_norm": 6.751480579376221, | |
| "learning_rate": 2.863655244843949e-07, | |
| "loss": 2.5016, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 6.231968104385647, | |
| "grad_norm": 6.898858070373535, | |
| "learning_rate": 2.834186501659912e-07, | |
| "loss": 2.5046, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 6.241633442068382, | |
| "grad_norm": 6.5952301025390625, | |
| "learning_rate": 2.804845114329316e-07, | |
| "loss": 2.5032, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 6.251298779751117, | |
| "grad_norm": 6.756124019622803, | |
| "learning_rate": 2.7756316043278315e-07, | |
| "loss": 2.5008, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 6.260964117433852, | |
| "grad_norm": 6.867387771606445, | |
| "learning_rate": 2.7465464908584135e-07, | |
| "loss": 2.5096, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 6.269662921348314, | |
| "eval_loss": 2.6323742866516113, | |
| "eval_runtime": 216.8606, | |
| "eval_samples_per_second": 101.77, | |
| "eval_steps_per_second": 4.242, | |
| "step": 6489 | |
| }, | |
| { | |
| "epoch": 6.270629455116588, | |
| "grad_norm": 6.6687703132629395, | |
| "learning_rate": 2.717590290842051e-07, | |
| "loss": 2.5012, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 6.280294792799324, | |
| "grad_norm": 7.008709907531738, | |
| "learning_rate": 2.6887635189086077e-07, | |
| "loss": 2.4931, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 6.289960130482059, | |
| "grad_norm": 6.839788436889648, | |
| "learning_rate": 2.6600666873876474e-07, | |
| "loss": 2.4933, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 6.299625468164794, | |
| "grad_norm": 6.8228654861450195, | |
| "learning_rate": 2.631500306299349e-07, | |
| "loss": 2.4999, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 6.309290805847529, | |
| "grad_norm": 6.8285393714904785, | |
| "learning_rate": 2.6030648833454307e-07, | |
| "loss": 2.4577, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 6.318956143530265, | |
| "grad_norm": 6.882188320159912, | |
| "learning_rate": 2.574760923900133e-07, | |
| "loss": 2.4612, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 6.328621481213, | |
| "grad_norm": 6.814681053161621, | |
| "learning_rate": 2.54658893100124e-07, | |
| "loss": 2.4773, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 6.338286818895735, | |
| "grad_norm": 6.943098068237305, | |
| "learning_rate": 2.518549405341125e-07, | |
| "loss": 2.5, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 6.3479521565784705, | |
| "grad_norm": 6.890428066253662, | |
| "learning_rate": 2.49064284525787e-07, | |
| "loss": 2.5145, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 6.357617494261206, | |
| "grad_norm": 6.88997220993042, | |
| "learning_rate": 2.4628697467263913e-07, | |
| "loss": 2.486, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 6.367282831943941, | |
| "grad_norm": 6.790536403656006, | |
| "learning_rate": 2.435230603349635e-07, | |
| "loss": 2.4943, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 6.369215899480488, | |
| "eval_loss": 2.6316308975219727, | |
| "eval_runtime": 216.8902, | |
| "eval_samples_per_second": 101.757, | |
| "eval_steps_per_second": 4.242, | |
| "step": 6592 | |
| }, | |
| { | |
| "epoch": 6.376948169626676, | |
| "grad_norm": 6.717286109924316, | |
| "learning_rate": 2.4077259063498087e-07, | |
| "loss": 2.4916, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 6.386613507309412, | |
| "grad_norm": 6.9318647384643555, | |
| "learning_rate": 2.3803561445596366e-07, | |
| "loss": 2.5037, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 6.396278844992147, | |
| "grad_norm": 7.279837608337402, | |
| "learning_rate": 2.353121804413687e-07, | |
| "loss": 2.4524, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 6.405944182674882, | |
| "grad_norm": 6.827029705047607, | |
| "learning_rate": 2.3260233699397126e-07, | |
| "loss": 2.4705, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 6.415609520357617, | |
| "grad_norm": 6.849607467651367, | |
| "learning_rate": 2.2990613227500645e-07, | |
| "loss": 2.4501, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 6.4252748580403525, | |
| "grad_norm": 6.832201957702637, | |
| "learning_rate": 2.272236142033115e-07, | |
| "loss": 2.4941, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 6.4349401957230885, | |
| "grad_norm": 7.062440395355225, | |
| "learning_rate": 2.2455483045447498e-07, | |
| "loss": 2.4879, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 6.444605533405824, | |
| "grad_norm": 7.106632709503174, | |
| "learning_rate": 2.2189982845999057e-07, | |
| "loss": 2.5048, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 6.454270871088559, | |
| "grad_norm": 6.965383052825928, | |
| "learning_rate": 2.1925865540641132e-07, | |
| "loss": 2.5375, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 6.463936208771294, | |
| "grad_norm": 6.780632972717285, | |
| "learning_rate": 2.1663135823451418e-07, | |
| "loss": 2.4805, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 6.4687688776126615, | |
| "eval_loss": 2.63108229637146, | |
| "eval_runtime": 216.9071, | |
| "eval_samples_per_second": 101.749, | |
| "eval_steps_per_second": 4.241, | |
| "step": 6695 | |
| }, | |
| { | |
| "epoch": 6.473601546454029, | |
| "grad_norm": 6.964837074279785, | |
| "learning_rate": 2.1401798363846336e-07, | |
| "loss": 2.5012, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 6.483266884136764, | |
| "grad_norm": 6.848756313323975, | |
| "learning_rate": 2.1141857806498143e-07, | |
| "loss": 2.4931, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 6.492932221819499, | |
| "grad_norm": 6.881086349487305, | |
| "learning_rate": 2.088331877125238e-07, | |
| "loss": 2.5012, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 6.502597559502235, | |
| "grad_norm": 6.669610500335693, | |
| "learning_rate": 2.062618585304573e-07, | |
| "loss": 2.488, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 6.5122628971849705, | |
| "grad_norm": 6.833005905151367, | |
| "learning_rate": 2.037046362182444e-07, | |
| "loss": 2.4838, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 6.521928234867706, | |
| "grad_norm": 6.958085060119629, | |
| "learning_rate": 2.0116156622462977e-07, | |
| "loss": 2.4601, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 6.531593572550441, | |
| "grad_norm": 6.800379276275635, | |
| "learning_rate": 1.986326937468339e-07, | |
| "loss": 2.4848, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 6.541258910233176, | |
| "grad_norm": 6.915721416473389, | |
| "learning_rate": 1.9611806372974816e-07, | |
| "loss": 2.4784, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 6.550924247915912, | |
| "grad_norm": 6.845888614654541, | |
| "learning_rate": 1.936177208651374e-07, | |
| "loss": 2.5028, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 6.560589585598647, | |
| "grad_norm": 6.784244060516357, | |
| "learning_rate": 1.9113170959084569e-07, | |
| "loss": 2.4622, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 6.568321855744835, | |
| "eval_loss": 2.6306145191192627, | |
| "eval_runtime": 216.9083, | |
| "eval_samples_per_second": 101.748, | |
| "eval_steps_per_second": 4.241, | |
| "step": 6798 | |
| }, | |
| { | |
| "epoch": 6.570254923281382, | |
| "grad_norm": 6.741705894470215, | |
| "learning_rate": 1.8866007409000495e-07, | |
| "loss": 2.4956, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 6.579920260964117, | |
| "grad_norm": 6.939857006072998, | |
| "learning_rate": 1.8620285829025196e-07, | |
| "loss": 2.4728, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 6.589585598646853, | |
| "grad_norm": 6.82913064956665, | |
| "learning_rate": 1.8376010586294542e-07, | |
| "loss": 2.457, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 6.599250936329588, | |
| "grad_norm": 6.889610290527344, | |
| "learning_rate": 1.8133186022239188e-07, | |
| "loss": 2.4735, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 6.608916274012323, | |
| "grad_norm": 6.8231120109558105, | |
| "learning_rate": 1.7891816452507236e-07, | |
| "loss": 2.4836, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 6.618581611695059, | |
| "grad_norm": 6.723885536193848, | |
| "learning_rate": 1.7651906166887598e-07, | |
| "loss": 2.4752, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 6.628246949377794, | |
| "grad_norm": 6.732489585876465, | |
| "learning_rate": 1.7413459429233857e-07, | |
| "loss": 2.4853, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 6.637912287060529, | |
| "grad_norm": 6.842257022857666, | |
| "learning_rate": 1.71764804773883e-07, | |
| "loss": 2.5025, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 6.647577624743264, | |
| "grad_norm": 6.585004806518555, | |
| "learning_rate": 1.6940973523106794e-07, | |
| "loss": 2.4823, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 6.657242962425999, | |
| "grad_norm": 6.8570966720581055, | |
| "learning_rate": 1.6706942751983745e-07, | |
| "loss": 2.5025, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 6.6669083001087355, | |
| "grad_norm": 6.777892112731934, | |
| "learning_rate": 1.6474392323377828e-07, | |
| "loss": 2.4994, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 6.667874833877009, | |
| "eval_loss": 2.6304829120635986, | |
| "eval_runtime": 216.9988, | |
| "eval_samples_per_second": 101.706, | |
| "eval_steps_per_second": 4.24, | |
| "step": 6901 | |
| }, | |
| { | |
| "epoch": 6.676573637791471, | |
| "grad_norm": 7.168168544769287, | |
| "learning_rate": 1.6243326370338062e-07, | |
| "loss": 2.5063, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 6.686238975474206, | |
| "grad_norm": 6.96233606338501, | |
| "learning_rate": 1.6013748999530276e-07, | |
| "loss": 2.4905, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 6.695904313156941, | |
| "grad_norm": 6.856019496917725, | |
| "learning_rate": 1.5785664291164246e-07, | |
| "loss": 2.4624, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 6.705569650839676, | |
| "grad_norm": 6.714591979980469, | |
| "learning_rate": 1.5559076298921025e-07, | |
| "loss": 2.4877, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 6.715234988522411, | |
| "grad_norm": 6.764096736907959, | |
| "learning_rate": 1.5333989049881058e-07, | |
| "loss": 2.4733, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 6.724900326205146, | |
| "grad_norm": 6.856327056884766, | |
| "learning_rate": 1.511040654445247e-07, | |
| "loss": 2.4986, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 6.734565663887882, | |
| "grad_norm": 6.840289115905762, | |
| "learning_rate": 1.4888332756300027e-07, | |
| "loss": 2.4862, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 6.7442310015706175, | |
| "grad_norm": 7.013392448425293, | |
| "learning_rate": 1.4667771632274596e-07, | |
| "loss": 2.5031, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 6.753896339253353, | |
| "grad_norm": 6.860467433929443, | |
| "learning_rate": 1.4448727092342816e-07, | |
| "loss": 2.511, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 6.763561676936088, | |
| "grad_norm": 6.86760139465332, | |
| "learning_rate": 1.4231203029517615e-07, | |
| "loss": 2.5187, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 6.767427812009182, | |
| "eval_loss": 2.6297874450683594, | |
| "eval_runtime": 216.6476, | |
| "eval_samples_per_second": 101.871, | |
| "eval_steps_per_second": 4.247, | |
| "step": 7004 | |
| }, | |
| { | |
| "epoch": 6.773227014618823, | |
| "grad_norm": 6.677191734313965, | |
| "learning_rate": 1.4015203309788848e-07, | |
| "loss": 2.4913, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 6.782892352301559, | |
| "grad_norm": 6.7386155128479, | |
| "learning_rate": 1.3800731772054796e-07, | |
| "loss": 2.4563, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 6.792557689984294, | |
| "grad_norm": 6.725878715515137, | |
| "learning_rate": 1.3587792228053718e-07, | |
| "loss": 2.4936, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 6.802223027667029, | |
| "grad_norm": 6.603059768676758, | |
| "learning_rate": 1.3376388462296217e-07, | |
| "loss": 2.4484, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 6.811888365349764, | |
| "grad_norm": 6.6211042404174805, | |
| "learning_rate": 1.3166524231998055e-07, | |
| "loss": 2.4887, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 6.8215537030324995, | |
| "grad_norm": 6.715404510498047, | |
| "learning_rate": 1.295820326701319e-07, | |
| "loss": 2.5034, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 6.831219040715235, | |
| "grad_norm": 6.930505275726318, | |
| "learning_rate": 1.2751429269767667e-07, | |
| "loss": 2.4997, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 6.84088437839797, | |
| "grad_norm": 6.84429931640625, | |
| "learning_rate": 1.2546205915193687e-07, | |
| "loss": 2.5128, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 6.850549716080706, | |
| "grad_norm": 7.0082478523254395, | |
| "learning_rate": 1.2342536850664354e-07, | |
| "loss": 2.4893, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 6.860215053763441, | |
| "grad_norm": 6.885166645050049, | |
| "learning_rate": 1.2140425695928858e-07, | |
| "loss": 2.5144, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 6.866980790141356, | |
| "eval_loss": 2.629671335220337, | |
| "eval_runtime": 216.8264, | |
| "eval_samples_per_second": 101.786, | |
| "eval_steps_per_second": 4.243, | |
| "step": 7107 | |
| }, | |
| { | |
| "epoch": 6.869880391446176, | |
| "grad_norm": 7.1056599617004395, | |
| "learning_rate": 1.193987604304809e-07, | |
| "loss": 2.4867, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 6.879545729128911, | |
| "grad_norm": 6.84068489074707, | |
| "learning_rate": 1.1740891456330892e-07, | |
| "loss": 2.4736, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 6.889211066811646, | |
| "grad_norm": 6.737675189971924, | |
| "learning_rate": 1.1543475472270613e-07, | |
| "loss": 2.4826, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 6.898876404494382, | |
| "grad_norm": 6.845552444458008, | |
| "learning_rate": 1.1347631599482321e-07, | |
| "loss": 2.5346, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 6.908541742177118, | |
| "grad_norm": 6.8293776512146, | |
| "learning_rate": 1.1153363318640396e-07, | |
| "loss": 2.4788, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 6.918207079859853, | |
| "grad_norm": 6.818983554840088, | |
| "learning_rate": 1.096067408241671e-07, | |
| "loss": 2.5074, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 6.927872417542588, | |
| "grad_norm": 6.84577751159668, | |
| "learning_rate": 1.076956731541927e-07, | |
| "loss": 2.494, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 6.937537755225323, | |
| "grad_norm": 6.472375869750977, | |
| "learning_rate": 1.0580046414131261e-07, | |
| "loss": 2.5109, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 6.947203092908058, | |
| "grad_norm": 7.070675373077393, | |
| "learning_rate": 1.0392114746850867e-07, | |
| "loss": 2.4984, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 6.956868430590793, | |
| "grad_norm": 7.004439353942871, | |
| "learning_rate": 1.0205775653631176e-07, | |
| "loss": 2.4856, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 6.966533768273529, | |
| "grad_norm": 6.824222564697266, | |
| "learning_rate": 1.0021032446221023e-07, | |
| "loss": 2.507, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 6.966533768273529, | |
| "eval_loss": 2.6293044090270996, | |
| "eval_runtime": 216.6368, | |
| "eval_samples_per_second": 101.876, | |
| "eval_steps_per_second": 4.247, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 6.9761991059562645, | |
| "grad_norm": 6.812296390533447, | |
| "learning_rate": 9.837888408006e-08, | |
| "loss": 2.5143, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 6.985864443639, | |
| "grad_norm": 6.687375068664551, | |
| "learning_rate": 9.65634679395011e-08, | |
| "loss": 2.4881, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 6.995529781321735, | |
| "grad_norm": 6.915345668792725, | |
| "learning_rate": 9.476410830538063e-08, | |
| "loss": 2.4649, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 7.004832668841368, | |
| "grad_norm": 7.076071739196777, | |
| "learning_rate": 9.298083715717686e-08, | |
| "loss": 2.4633, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 7.014498006524103, | |
| "grad_norm": 6.80445671081543, | |
| "learning_rate": 9.121368618843361e-08, | |
| "loss": 2.4815, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 7.024163344206838, | |
| "grad_norm": 6.910451412200928, | |
| "learning_rate": 8.946268680619407e-08, | |
| "loss": 2.5297, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 7.033828681889574, | |
| "grad_norm": 6.937068462371826, | |
| "learning_rate": 8.772787013044558e-08, | |
| "loss": 2.4856, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 7.043494019572309, | |
| "grad_norm": 6.77278995513916, | |
| "learning_rate": 8.600926699356414e-08, | |
| "loss": 2.4626, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 7.053159357255044, | |
| "grad_norm": 6.759533882141113, | |
| "learning_rate": 8.430690793976758e-08, | |
| "loss": 2.4845, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 7.062824694937779, | |
| "grad_norm": 6.756450653076172, | |
| "learning_rate": 8.262082322457297e-08, | |
| "loss": 2.4992, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 7.0657242962426, | |
| "eval_loss": 2.6299219131469727, | |
| "eval_runtime": 216.8528, | |
| "eval_samples_per_second": 101.774, | |
| "eval_steps_per_second": 4.243, | |
| "step": 7313 | |
| }, | |
| { | |
| "epoch": 7.0724900326205145, | |
| "grad_norm": 7.033835411071777, | |
| "learning_rate": 8.09510428142578e-08, | |
| "loss": 2.4857, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 7.08215537030325, | |
| "grad_norm": 6.858207702636719, | |
| "learning_rate": 7.929759638532851e-08, | |
| "loss": 2.4614, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 7.091820707985986, | |
| "grad_norm": 6.827333927154541, | |
| "learning_rate": 7.766051332399226e-08, | |
| "loss": 2.4479, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 7.101486045668721, | |
| "grad_norm": 7.060365676879883, | |
| "learning_rate": 7.603982272563547e-08, | |
| "loss": 2.5128, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 7.111151383351456, | |
| "grad_norm": 6.903797626495361, | |
| "learning_rate": 7.443555339430573e-08, | |
| "loss": 2.4585, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 7.120816721034191, | |
| "grad_norm": 6.946906566619873, | |
| "learning_rate": 7.284773384220034e-08, | |
| "loss": 2.4955, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 7.130482058716926, | |
| "grad_norm": 7.100877285003662, | |
| "learning_rate": 7.127639228916004e-08, | |
| "loss": 2.4804, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 7.140147396399661, | |
| "grad_norm": 6.856292724609375, | |
| "learning_rate": 6.972155666216684e-08, | |
| "loss": 2.4788, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 7.149812734082397, | |
| "grad_norm": 7.044023036956787, | |
| "learning_rate": 6.818325459484786e-08, | |
| "loss": 2.5149, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 7.1594780717651325, | |
| "grad_norm": 6.6180572509765625, | |
| "learning_rate": 6.666151342698412e-08, | |
| "loss": 2.4784, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 7.165277274374773, | |
| "eval_loss": 2.6300928592681885, | |
| "eval_runtime": 216.7445, | |
| "eval_samples_per_second": 101.825, | |
| "eval_steps_per_second": 4.245, | |
| "step": 7416 | |
| }, | |
| { | |
| "epoch": 7.169143409447868, | |
| "grad_norm": 6.7274932861328125, | |
| "learning_rate": 6.515636020402481e-08, | |
| "loss": 2.4571, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 7.178808747130603, | |
| "grad_norm": 6.896090507507324, | |
| "learning_rate": 6.36678216766069e-08, | |
| "loss": 2.4801, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 7.188474084813338, | |
| "grad_norm": 6.6245622634887695, | |
| "learning_rate": 6.219592430007869e-08, | |
| "loss": 2.4743, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 7.198139422496073, | |
| "grad_norm": 6.647139549255371, | |
| "learning_rate": 6.074069423403105e-08, | |
| "loss": 2.4691, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 7.207804760178809, | |
| "grad_norm": 6.740390777587891, | |
| "learning_rate": 5.9302157341830864e-08, | |
| "loss": 2.4892, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 7.217470097861544, | |
| "grad_norm": 6.634116172790527, | |
| "learning_rate": 5.788033919016311e-08, | |
| "loss": 2.4693, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 7.227135435544279, | |
| "grad_norm": 6.560644149780273, | |
| "learning_rate": 5.647526504857514e-08, | |
| "loss": 2.4722, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 7.2368007732270145, | |
| "grad_norm": 6.656121730804443, | |
| "learning_rate": 5.5086959889027894e-08, | |
| "loss": 2.4589, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 7.24646611090975, | |
| "grad_norm": 6.90827751159668, | |
| "learning_rate": 5.371544838545283e-08, | |
| "loss": 2.4813, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 7.256131448592485, | |
| "grad_norm": 6.8788323402404785, | |
| "learning_rate": 5.236075491331205e-08, | |
| "loss": 2.4721, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 7.264830252506947, | |
| "eval_loss": 2.629812479019165, | |
| "eval_runtime": 216.6204, | |
| "eval_samples_per_second": 101.883, | |
| "eval_steps_per_second": 4.247, | |
| "step": 7519 | |
| }, | |
| { | |
| "epoch": 7.265796786275221, | |
| "grad_norm": 6.811087131500244, | |
| "learning_rate": 5.10229035491665e-08, | |
| "loss": 2.4821, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 7.275462123957956, | |
| "grad_norm": 6.634406089782715, | |
| "learning_rate": 4.970191807024693e-08, | |
| "loss": 2.501, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 7.285127461640691, | |
| "grad_norm": 6.741237640380859, | |
| "learning_rate": 4.8397821954032194e-08, | |
| "loss": 2.4787, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 7.294792799323426, | |
| "grad_norm": 6.966506481170654, | |
| "learning_rate": 4.7110638377831025e-08, | |
| "loss": 2.4442, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 7.304458137006161, | |
| "grad_norm": 6.796621799468994, | |
| "learning_rate": 4.584039021837094e-08, | |
| "loss": 2.4723, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 7.314123474688897, | |
| "grad_norm": 6.76340913772583, | |
| "learning_rate": 4.4587100051391547e-08, | |
| "loss": 2.4815, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 7.323788812371633, | |
| "grad_norm": 6.656253337860107, | |
| "learning_rate": 4.3350790151242876e-08, | |
| "loss": 2.5071, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 7.333454150054368, | |
| "grad_norm": 6.851405620574951, | |
| "learning_rate": 4.2131482490490035e-08, | |
| "loss": 2.4624, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 7.343119487737103, | |
| "grad_norm": 6.677999973297119, | |
| "learning_rate": 4.092919873952205e-08, | |
| "loss": 2.4766, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 7.352784825419838, | |
| "grad_norm": 6.938327312469482, | |
| "learning_rate": 3.9743960266167334e-08, | |
| "loss": 2.4837, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 7.362450163102573, | |
| "grad_norm": 6.797394275665283, | |
| "learning_rate": 3.857578813531392e-08, | |
| "loss": 2.4926, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 7.36438323063912, | |
| "eval_loss": 2.6299386024475098, | |
| "eval_runtime": 216.5913, | |
| "eval_samples_per_second": 101.897, | |
| "eval_steps_per_second": 4.248, | |
| "step": 7622 | |
| }, | |
| { | |
| "epoch": 7.372115500785308, | |
| "grad_norm": 6.87264347076416, | |
| "learning_rate": 3.742470310853441e-08, | |
| "loss": 2.4897, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 7.381780838468044, | |
| "grad_norm": 7.126553535461426, | |
| "learning_rate": 3.6290725643717715e-08, | |
| "loss": 2.4782, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 7.3914461761507795, | |
| "grad_norm": 6.758020401000977, | |
| "learning_rate": 3.5173875894704886e-08, | |
| "loss": 2.4454, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 7.401111513833515, | |
| "grad_norm": 7.062627792358398, | |
| "learning_rate": 3.4074173710931796e-08, | |
| "loss": 2.4876, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 7.41077685151625, | |
| "grad_norm": 6.85789680480957, | |
| "learning_rate": 3.299163863707522e-08, | |
| "loss": 2.4866, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 7.420442189198985, | |
| "grad_norm": 6.748703956604004, | |
| "learning_rate": 3.1926289912706185e-08, | |
| "loss": 2.5081, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 7.43010752688172, | |
| "grad_norm": 6.617964744567871, | |
| "learning_rate": 3.08781464719482e-08, | |
| "loss": 2.4899, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 7.439772864564456, | |
| "grad_norm": 6.949032306671143, | |
| "learning_rate": 2.98472269431399e-08, | |
| "loss": 2.4634, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 7.449438202247191, | |
| "grad_norm": 6.690010070800781, | |
| "learning_rate": 2.883354964850493e-08, | |
| "loss": 2.489, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 7.459103539929926, | |
| "grad_norm": 6.775691509246826, | |
| "learning_rate": 2.7837132603825696e-08, | |
| "loss": 2.4834, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 7.463936208771294, | |
| "eval_loss": 2.629732370376587, | |
| "eval_runtime": 216.725, | |
| "eval_samples_per_second": 101.834, | |
| "eval_steps_per_second": 4.245, | |
| "step": 7725 | |
| }, | |
| { | |
| "epoch": 7.4687688776126615, | |
| "grad_norm": 7.173056602478027, | |
| "learning_rate": 2.6857993518123455e-08, | |
| "loss": 2.4805, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 7.478434215295397, | |
| "grad_norm": 6.82067346572876, | |
| "learning_rate": 2.5896149793343423e-08, | |
| "loss": 2.4992, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 7.488099552978132, | |
| "grad_norm": 6.860231876373291, | |
| "learning_rate": 2.495161852404526e-08, | |
| "loss": 2.4786, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 7.497764890660868, | |
| "grad_norm": 6.8974432945251465, | |
| "learning_rate": 2.4024416497100298e-08, | |
| "loss": 2.5135, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 7.507430228343603, | |
| "grad_norm": 6.890160083770752, | |
| "learning_rate": 2.3114560191391575e-08, | |
| "loss": 2.4968, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 7.517095566026338, | |
| "grad_norm": 6.756450176239014, | |
| "learning_rate": 2.22220657775225e-08, | |
| "loss": 2.5055, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 7.526760903709073, | |
| "grad_norm": 6.810698986053467, | |
| "learning_rate": 2.1346949117528435e-08, | |
| "loss": 2.4775, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 7.536426241391808, | |
| "grad_norm": 6.732553482055664, | |
| "learning_rate": 2.048922576459522e-08, | |
| "loss": 2.4679, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 7.5460915790745435, | |
| "grad_norm": 7.055318355560303, | |
| "learning_rate": 1.964891096278276e-08, | |
| "loss": 2.4829, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 7.55575691675728, | |
| "grad_norm": 6.638191223144531, | |
| "learning_rate": 1.882601964675379e-08, | |
| "loss": 2.4568, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 7.5634891869034675, | |
| "eval_loss": 2.6296987533569336, | |
| "eval_runtime": 216.739, | |
| "eval_samples_per_second": 101.828, | |
| "eval_steps_per_second": 4.245, | |
| "step": 7828 | |
| }, | |
| { | |
| "epoch": 7.565422254440015, | |
| "grad_norm": 6.956434726715088, | |
| "learning_rate": 1.8020566441508843e-08, | |
| "loss": 2.4692, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 7.57508759212275, | |
| "grad_norm": 6.698721885681152, | |
| "learning_rate": 1.7232565662126164e-08, | |
| "loss": 2.4487, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 7.584752929805485, | |
| "grad_norm": 6.7048726081848145, | |
| "learning_rate": 1.6462031313507096e-08, | |
| "loss": 2.4678, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 7.59441826748822, | |
| "grad_norm": 6.834412097930908, | |
| "learning_rate": 1.5708977090127417e-08, | |
| "loss": 2.4789, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 7.604083605170955, | |
| "grad_norm": 6.895366191864014, | |
| "learning_rate": 1.4973416375793967e-08, | |
| "loss": 2.4942, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 7.613748942853691, | |
| "grad_norm": 7.1303324699401855, | |
| "learning_rate": 1.4255362243406621e-08, | |
| "loss": 2.5303, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 7.6234142805364264, | |
| "grad_norm": 6.808503150939941, | |
| "learning_rate": 1.3554827454726136e-08, | |
| "loss": 2.4786, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 7.633079618219162, | |
| "grad_norm": 6.925174713134766, | |
| "learning_rate": 1.2871824460147007e-08, | |
| "loss": 2.4547, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 7.642744955901897, | |
| "grad_norm": 6.883964538574219, | |
| "learning_rate": 1.2206365398476637e-08, | |
| "loss": 2.506, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 7.652410293584632, | |
| "grad_norm": 6.8656086921691895, | |
| "learning_rate": 1.155846209671918e-08, | |
| "loss": 2.5117, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 7.662075631267367, | |
| "grad_norm": 6.914687633514404, | |
| "learning_rate": 1.0928126069865818e-08, | |
| "loss": 2.4916, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 7.663042165035641, | |
| "eval_loss": 2.629610061645508, | |
| "eval_runtime": 216.9358, | |
| "eval_samples_per_second": 101.735, | |
| "eval_steps_per_second": 4.241, | |
| "step": 7931 | |
| }, | |
| { | |
| "epoch": 7.671740968950103, | |
| "grad_norm": 6.8651323318481445, | |
| "learning_rate": 1.0315368520689372e-08, | |
| "loss": 2.4809, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 7.681406306632838, | |
| "grad_norm": 6.866594314575195, | |
| "learning_rate": 9.720200339546236e-09, | |
| "loss": 2.4871, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 7.691071644315573, | |
| "grad_norm": 6.732547283172607, | |
| "learning_rate": 9.142632104181648e-09, | |
| "loss": 2.4786, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 7.7007369819983085, | |
| "grad_norm": 6.74143123626709, | |
| "learning_rate": 8.58267407954283e-09, | |
| "loss": 2.4774, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 7.710402319681044, | |
| "grad_norm": 6.904804706573486, | |
| "learning_rate": 8.040336217595588e-09, | |
| "loss": 2.4869, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 7.72006765736378, | |
| "grad_norm": 6.923266410827637, | |
| "learning_rate": 7.515628157148012e-09, | |
| "loss": 2.4933, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 7.729732995046515, | |
| "grad_norm": 7.009883880615234, | |
| "learning_rate": 7.008559223679156e-09, | |
| "loss": 2.4392, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 7.73939833272925, | |
| "grad_norm": 6.632232189178467, | |
| "learning_rate": 6.519138429172954e-09, | |
| "loss": 2.4594, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 7.749063670411985, | |
| "grad_norm": 6.914027214050293, | |
| "learning_rate": 6.047374471958466e-09, | |
| "loss": 2.4873, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 7.75872900809472, | |
| "grad_norm": 7.000319957733154, | |
| "learning_rate": 5.59327573655477e-09, | |
| "loss": 2.5232, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 7.762595143167815, | |
| "eval_loss": 2.6296308040618896, | |
| "eval_runtime": 216.9179, | |
| "eval_samples_per_second": 101.744, | |
| "eval_steps_per_second": 4.241, | |
| "step": 8034 | |
| }, | |
| { | |
| "epoch": 7.768394345777455, | |
| "grad_norm": 6.82133674621582, | |
| "learning_rate": 5.156850293522752e-09, | |
| "loss": 2.4846, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 7.7780596834601905, | |
| "grad_norm": 6.938842296600342, | |
| "learning_rate": 4.7381058993205545e-09, | |
| "loss": 2.474, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 7.7877250211429265, | |
| "grad_norm": 6.969207286834717, | |
| "learning_rate": 4.3370499961667975e-09, | |
| "loss": 2.4701, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 7.797390358825662, | |
| "grad_norm": 6.929446220397949, | |
| "learning_rate": 3.953689711907792e-09, | |
| "loss": 2.4944, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 7.807055696508397, | |
| "grad_norm": 6.946097373962402, | |
| "learning_rate": 3.588031859890761e-09, | |
| "loss": 2.4888, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 7.816721034191132, | |
| "grad_norm": 6.968674182891846, | |
| "learning_rate": 3.240082938843147e-09, | |
| "loss": 2.4995, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 7.826386371873867, | |
| "grad_norm": 6.90951681137085, | |
| "learning_rate": 2.9098491327564924e-09, | |
| "loss": 2.4757, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 7.836051709556603, | |
| "grad_norm": 6.8854756355285645, | |
| "learning_rate": 2.5973363107774095e-09, | |
| "loss": 2.4781, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 7.845717047239338, | |
| "grad_norm": 7.083747386932373, | |
| "learning_rate": 2.3025500271023346e-09, | |
| "loss": 2.4496, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 7.855382384922073, | |
| "grad_norm": 6.7878851890563965, | |
| "learning_rate": 2.0254955208794944e-09, | |
| "loss": 2.4695, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 7.862148121299988, | |
| "eval_loss": 2.629627227783203, | |
| "eval_runtime": 216.9855, | |
| "eval_samples_per_second": 101.712, | |
| "eval_steps_per_second": 4.24, | |
| "step": 8137 | |
| }, | |
| { | |
| "epoch": 7.8650477226048086, | |
| "grad_norm": 6.706069469451904, | |
| "learning_rate": 1.7661777161156467e-09, | |
| "loss": 2.4856, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 7.874713060287544, | |
| "grad_norm": 6.6752705574035645, | |
| "learning_rate": 1.524601221588151e-09, | |
| "loss": 2.4755, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 7.884378397970279, | |
| "grad_norm": 6.708074569702148, | |
| "learning_rate": 1.3007703307635897e-09, | |
| "loss": 2.4643, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 7.894043735653014, | |
| "grad_norm": 7.023872375488281, | |
| "learning_rate": 1.0946890217213844e-09, | |
| "loss": 2.499, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 7.90370907333575, | |
| "grad_norm": 6.72566032409668, | |
| "learning_rate": 9.063609570826302e-10, | |
| "loss": 2.4384, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 7.913374411018485, | |
| "grad_norm": 6.935588836669922, | |
| "learning_rate": 7.357894839457035e-10, | |
| "loss": 2.4758, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 7.92303974870122, | |
| "grad_norm": 6.855502128601074, | |
| "learning_rate": 5.829776338259762e-10, | |
| "loss": 2.4676, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 7.932705086383955, | |
| "grad_norm": 6.738182544708252, | |
| "learning_rate": 4.479281226028586e-10, | |
| "loss": 2.4874, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 7.942370424066691, | |
| "grad_norm": 6.814088821411133, | |
| "learning_rate": 3.3064335047061633e-10, | |
| "loss": 2.5022, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 7.952035761749427, | |
| "grad_norm": 6.909735202789307, | |
| "learning_rate": 2.3112540189640372e-10, | |
| "loss": 2.4729, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 7.961701099432162, | |
| "grad_norm": 6.548181533813477, | |
| "learning_rate": 1.4937604558284967e-10, | |
| "loss": 2.4591, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 7.961701099432162, | |
| "eval_loss": 2.629610061645508, | |
| "eval_runtime": 217.2357, | |
| "eval_samples_per_second": 101.595, | |
| "eval_steps_per_second": 4.235, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 7.971366437114897, | |
| "grad_norm": 6.895495891571045, | |
| "learning_rate": 8.539673443686002e-11, | |
| "loss": 2.4801, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 7.981031774797632, | |
| "grad_norm": 6.714371204376221, | |
| "learning_rate": 3.91886055433055e-11, | |
| "loss": 2.4589, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 7.990697112480367, | |
| "grad_norm": 6.664721965789795, | |
| "learning_rate": 1.0752480145481691e-11, | |
| "loss": 2.511, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 9.770947456359863, | |
| "learning_rate": 8.886363023208332e-14, | |
| "loss": 2.4974, | |
| "step": 8280 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 8280, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 8, | |
| "save_steps": 103, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.118170148932321e+18, | |
| "train_batch_size": 24, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
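
The block above is the `trainer_state.json` that the Hugging Face `transformers` `Trainer` writes alongside each checkpoint. Below is a minimal sketch of how one might consume it; the filename and the re-derivation step are illustrative assumptions, not part of the log itself:

```python
import json

# Load the Trainer state shown above. Assumes it has been saved as
# trainer_state.json, as transformers.Trainer does inside each checkpoint dir.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training entries (keyed by "loss") with evaluation
# entries (keyed by "eval_loss"); split them apart.
train_logs = [e for e in state["log_history"]
              if "loss" in e and "eval_loss" not in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

# The best checkpoint is recorded directly in the state...
print("best step:      ", state["best_global_step"])
print("best eval loss: ", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])

# ...and can be re-derived from the evaluation history as a sanity check.
best = min(eval_logs, key=lambda e: e["eval_loss"])
print("min eval_loss in log:", best["eval_loss"], "at step", best["step"])

# Final training-loss entry (step == max_steps when training ran to the end).
last = train_logs[-1]
print("final train loss:", last["loss"], "at step", last["step"])
```

Consistent with the recorded `best_global_step` of 7210, the evaluation loss in the log reaches its minimum of 2.6293 at step 7210 and stays essentially flat through the final evaluation at step 8240; training then ends at `max_steps` 8280 with `should_training_stop` set to true.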