| { |
| "best_global_step": null, |
| "best_metric": 1.6212482452392578, |
| "best_model_checkpoint": null, |
| "epoch": 1.0749359130859375, |
| "eval_steps": 5000, |
| "global_step": 524288, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "base_loss": 0.30045702931284907, |
| "epoch": 0.00095367431640625, |
| "grad_norm": 0.2715776264667511, |
| "learning_rate": 4.995241165161133e-05, |
| "lookahead_loss": 11.636415706634521, |
| "loss": 5.9684, |
| "step": 500 |
| }, |
| { |
| "base_loss": 0.30115938156843186, |
| "epoch": 0.0019073486328125, |
| "grad_norm": 0.23746199905872345, |
| "learning_rate": 4.990472793579102e-05, |
| "lookahead_loss": 10.842903160095215, |
| "loss": 5.572, |
| "step": 1000 |
| }, |
| { |
| "base_loss": 0.30052139541506767, |
| "epoch": 0.00286102294921875, |
| "grad_norm": 0.2835578918457031, |
| "learning_rate": 4.98570442199707e-05, |
| "lookahead_loss": 10.13417279434204, |
| "loss": 5.2173, |
| "step": 1500 |
| }, |
| { |
| "base_loss": 0.30269933369755747, |
| "epoch": 0.003814697265625, |
| "grad_norm": 0.24823836982250214, |
| "learning_rate": 4.9809360504150393e-05, |
| "lookahead_loss": 9.486023275375366, |
| "loss": 4.8944, |
| "step": 2000 |
| }, |
| { |
| "base_loss": 0.3041781492829323, |
| "epoch": 0.00476837158203125, |
| "grad_norm": 0.20649601519107819, |
| "learning_rate": 4.9761676788330084e-05, |
| "lookahead_loss": 8.79746763420105, |
| "loss": 4.5508, |
| "step": 2500 |
| }, |
| { |
| "base_loss": 0.31303099401295187, |
| "epoch": 0.0057220458984375, |
| "grad_norm": 0.1906881481409073, |
| "learning_rate": 4.971399307250977e-05, |
| "lookahead_loss": 8.270847107887269, |
| "loss": 4.2919, |
| "step": 3000 |
| }, |
| { |
| "base_loss": 0.33166604954004286, |
| "epoch": 0.00667572021484375, |
| "grad_norm": 0.1698817014694214, |
| "learning_rate": 4.966630935668946e-05, |
| "lookahead_loss": 7.874463705062866, |
| "loss": 4.1031, |
| "step": 3500 |
| }, |
| { |
| "base_loss": 0.31333299943804743, |
| "epoch": 0.00762939453125, |
| "grad_norm": 0.174786776304245, |
| "learning_rate": 4.961862564086914e-05, |
| "lookahead_loss": 7.546367056846619, |
| "loss": 3.9299, |
| "step": 4000 |
| }, |
| { |
| "base_loss": 0.31245614659786225, |
| "epoch": 0.00858306884765625, |
| "grad_norm": 0.14247459173202515, |
| "learning_rate": 4.957094192504883e-05, |
| "lookahead_loss": 7.2693215188980105, |
| "loss": 3.7909, |
| "step": 4500 |
| }, |
| { |
| "base_loss": 0.292913170427084, |
| "epoch": 0.0095367431640625, |
| "grad_norm": 0.1335776448249817, |
| "learning_rate": 4.952325820922852e-05, |
| "lookahead_loss": 7.0413890533447265, |
| "loss": 3.6672, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.0095367431640625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 7.667783941650391, |
| "eval_lookahead_perplexity": 2138.337510077118, |
| "eval_loss": 3.8992421627044678, |
| "eval_perplexity": 49.36502426986664, |
| "eval_runtime": 486.7779, |
| "eval_samples_per_second": 20.543, |
| "eval_steps_per_second": 1.284, |
| "step": 5000 |
| }, |
| { |
| "base_loss": 0.30041549998521805, |
| "epoch": 0.01049041748046875, |
| "grad_norm": 0.14120376110076904, |
| "learning_rate": 4.9475574493408205e-05, |
| "lookahead_loss": 6.863395688056946, |
| "loss": 3.5819, |
| "step": 5500 |
| }, |
| { |
| "base_loss": 0.29533531844615935, |
| "epoch": 0.011444091796875, |
| "grad_norm": 0.14177614450454712, |
| "learning_rate": 4.9427890777587895e-05, |
| "lookahead_loss": 6.756114417076111, |
| "loss": 3.5257, |
| "step": 6000 |
| }, |
| { |
| "base_loss": 0.29999259182810784, |
| "epoch": 0.01239776611328125, |
| "grad_norm": 0.15401475131511688, |
| "learning_rate": 4.938020706176758e-05, |
| "lookahead_loss": 6.607378579139709, |
| "loss": 3.4537, |
| "step": 6500 |
| }, |
| { |
| "base_loss": 0.29886582669615747, |
| "epoch": 0.0133514404296875, |
| "grad_norm": 0.17274411022663116, |
| "learning_rate": 4.933252334594727e-05, |
| "lookahead_loss": 6.502672654151916, |
| "loss": 3.4008, |
| "step": 7000 |
| }, |
| { |
| "base_loss": 0.3034250964820385, |
| "epoch": 0.01430511474609375, |
| "grad_norm": 0.16889062523841858, |
| "learning_rate": 4.928483963012696e-05, |
| "lookahead_loss": 6.298652252197265, |
| "loss": 3.301, |
| "step": 7500 |
| }, |
| { |
| "base_loss": 0.3204497399777174, |
| "epoch": 0.0152587890625, |
| "grad_norm": 0.1580396145582199, |
| "learning_rate": 4.923715591430664e-05, |
| "lookahead_loss": 6.183374300003051, |
| "loss": 3.2519, |
| "step": 8000 |
| }, |
| { |
| "base_loss": 0.31960979211330415, |
| "epoch": 0.01621246337890625, |
| "grad_norm": 0.19017435610294342, |
| "learning_rate": 4.918947219848633e-05, |
| "lookahead_loss": 6.093465467453003, |
| "loss": 3.2065, |
| "step": 8500 |
| }, |
| { |
| "base_loss": 0.3001296965777874, |
| "epoch": 0.0171661376953125, |
| "grad_norm": 0.1364353597164154, |
| "learning_rate": 4.9141788482666016e-05, |
| "lookahead_loss": 6.014976463317871, |
| "loss": 3.1576, |
| "step": 9000 |
| }, |
| { |
| "base_loss": 0.32021681547164915, |
| "epoch": 0.01811981201171875, |
| "grad_norm": 0.24822595715522766, |
| "learning_rate": 4.9094104766845706e-05, |
| "lookahead_loss": 5.915456521987915, |
| "loss": 3.1178, |
| "step": 9500 |
| }, |
| { |
| "base_loss": 0.29040670284628867, |
| "epoch": 0.019073486328125, |
| "grad_norm": 0.19323401153087616, |
| "learning_rate": 4.9046421051025396e-05, |
| "lookahead_loss": 5.8278450555801395, |
| "loss": 3.0591, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.019073486328125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 6.077848429870605, |
| "eval_lookahead_perplexity": 436.08990659185014, |
| "eval_loss": 3.1042745113372803, |
| "eval_perplexity": 22.293039759463593, |
| "eval_runtime": 483.2744, |
| "eval_samples_per_second": 20.692, |
| "eval_steps_per_second": 1.293, |
| "step": 10000 |
| }, |
| { |
| "base_loss": 0.3004113952517509, |
| "epoch": 0.02002716064453125, |
| "grad_norm": 0.16262881457805634, |
| "learning_rate": 4.899873733520508e-05, |
| "lookahead_loss": 5.826037519454956, |
| "loss": 3.0632, |
| "step": 10500 |
| }, |
| { |
| "base_loss": 0.29663331305980684, |
| "epoch": 0.0209808349609375, |
| "grad_norm": 0.15377278625965118, |
| "learning_rate": 4.895105361938477e-05, |
| "lookahead_loss": 5.7655581150054935, |
| "loss": 3.0311, |
| "step": 11000 |
| }, |
| { |
| "base_loss": 0.30206509011983873, |
| "epoch": 0.02193450927734375, |
| "grad_norm": 0.6924819350242615, |
| "learning_rate": 4.890336990356445e-05, |
| "lookahead_loss": 5.726267605781556, |
| "loss": 3.0142, |
| "step": 11500 |
| }, |
| { |
| "base_loss": 0.30140692415833475, |
| "epoch": 0.02288818359375, |
| "grad_norm": 0.17533016204833984, |
| "learning_rate": 4.8855686187744143e-05, |
| "lookahead_loss": 5.6108699903488155, |
| "loss": 2.9561, |
| "step": 12000 |
| }, |
| { |
| "base_loss": 0.31570343241095544, |
| "epoch": 0.02384185791015625, |
| "grad_norm": 0.14791598916053772, |
| "learning_rate": 4.8808002471923834e-05, |
| "lookahead_loss": 5.538150405883789, |
| "loss": 2.9269, |
| "step": 12500 |
| }, |
| { |
| "base_loss": 0.33028968888521193, |
| "epoch": 0.0247955322265625, |
| "grad_norm": 0.12097828835248947, |
| "learning_rate": 4.876031875610352e-05, |
| "lookahead_loss": 5.539091997146606, |
| "loss": 2.9347, |
| "step": 13000 |
| }, |
| { |
| "base_loss": 0.3067899467945099, |
| "epoch": 0.02574920654296875, |
| "grad_norm": 0.1595277637243271, |
| "learning_rate": 4.871263504028321e-05, |
| "lookahead_loss": 5.434582984924316, |
| "loss": 2.8707, |
| "step": 13500 |
| }, |
| { |
| "base_loss": 0.3104507875740528, |
| "epoch": 0.026702880859375, |
| "grad_norm": 0.16813045740127563, |
| "learning_rate": 4.866495132446289e-05, |
| "lookahead_loss": 5.4183082113265995, |
| "loss": 2.8644, |
| "step": 14000 |
| }, |
| { |
| "base_loss": 0.295670255869627, |
| "epoch": 0.02765655517578125, |
| "grad_norm": 0.23366433382034302, |
| "learning_rate": 4.861726760864258e-05, |
| "lookahead_loss": 5.329585377693176, |
| "loss": 2.8126, |
| "step": 14500 |
| }, |
| { |
| "base_loss": 0.3073807775378227, |
| "epoch": 0.0286102294921875, |
| "grad_norm": 0.15339840948581696, |
| "learning_rate": 4.856958389282227e-05, |
| "lookahead_loss": 5.379592286109924, |
| "loss": 2.8435, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.0286102294921875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 5.3830560668945315, |
| "eval_lookahead_perplexity": 217.68652449406255, |
| "eval_loss": 2.75687837600708, |
| "eval_perplexity": 15.750598680461218, |
| "eval_runtime": 488.4895, |
| "eval_samples_per_second": 20.471, |
| "eval_steps_per_second": 1.279, |
| "step": 15000 |
| }, |
| { |
| "base_loss": 0.296496417850256, |
| "epoch": 0.02956390380859375, |
| "grad_norm": 0.16044539213180542, |
| "learning_rate": 4.8521900177001955e-05, |
| "lookahead_loss": 5.344062633514405, |
| "loss": 2.8203, |
| "step": 15500 |
| }, |
| { |
| "base_loss": 0.29590288090705874, |
| "epoch": 0.030517578125, |
| "grad_norm": 0.16459447145462036, |
| "learning_rate": 4.8474216461181645e-05, |
| "lookahead_loss": 5.331672690868378, |
| "loss": 2.8138, |
| "step": 16000 |
| }, |
| { |
| "base_loss": 0.3003334278166294, |
| "epoch": 0.03147125244140625, |
| "grad_norm": 0.14433036744594574, |
| "learning_rate": 4.842653274536133e-05, |
| "lookahead_loss": 5.245349229812622, |
| "loss": 2.7728, |
| "step": 16500 |
| }, |
| { |
| "base_loss": 0.3256162821352482, |
| "epoch": 0.0324249267578125, |
| "grad_norm": 0.17356151342391968, |
| "learning_rate": 4.837884902954102e-05, |
| "lookahead_loss": 5.219405631065369, |
| "loss": 2.7725, |
| "step": 17000 |
| }, |
| { |
| "base_loss": 0.3199668276309967, |
| "epoch": 0.03337860107421875, |
| "grad_norm": 0.15259094536304474, |
| "learning_rate": 4.833116531372071e-05, |
| "lookahead_loss": 5.178223248481751, |
| "loss": 2.7491, |
| "step": 17500 |
| }, |
| { |
| "base_loss": 0.29680381083488466, |
| "epoch": 0.034332275390625, |
| "grad_norm": 0.20254507660865784, |
| "learning_rate": 4.828348159790039e-05, |
| "lookahead_loss": 5.133180852890015, |
| "loss": 2.715, |
| "step": 18000 |
| }, |
| { |
| "base_loss": 0.30402446049451826, |
| "epoch": 0.03528594970703125, |
| "grad_norm": 0.14859794080257416, |
| "learning_rate": 4.823579788208008e-05, |
| "lookahead_loss": 5.102789646148682, |
| "loss": 2.7034, |
| "step": 18500 |
| }, |
| { |
| "base_loss": 0.2954226844608784, |
| "epoch": 0.0362396240234375, |
| "grad_norm": 0.1865054816007614, |
| "learning_rate": 4.8188114166259766e-05, |
| "lookahead_loss": 5.056313884735108, |
| "loss": 2.6759, |
| "step": 19000 |
| }, |
| { |
| "base_loss": 0.30284518826007845, |
| "epoch": 0.03719329833984375, |
| "grad_norm": 0.1533517986536026, |
| "learning_rate": 4.8140430450439456e-05, |
| "lookahead_loss": 5.113425381660462, |
| "loss": 2.7081, |
| "step": 19500 |
| }, |
| { |
| "base_loss": 0.293648807734251, |
| "epoch": 0.03814697265625, |
| "grad_norm": 0.12334468960762024, |
| "learning_rate": 4.8092746734619146e-05, |
| "lookahead_loss": 5.080516023635864, |
| "loss": 2.6871, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.03814697265625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 4.964036189270019, |
| "eval_lookahead_perplexity": 143.1704944953996, |
| "eval_loss": 2.547368288040161, |
| "eval_perplexity": 12.773443485997294, |
| "eval_runtime": 505.4082, |
| "eval_samples_per_second": 19.786, |
| "eval_steps_per_second": 1.237, |
| "step": 20000 |
| }, |
| { |
| "base_loss": 0.30936932846903803, |
| "epoch": 0.03910064697265625, |
| "grad_norm": 0.14457817375659943, |
| "learning_rate": 4.804506301879883e-05, |
| "lookahead_loss": 5.047291966438293, |
| "loss": 2.6783, |
| "step": 20500 |
| }, |
| { |
| "base_loss": 0.29837565070390704, |
| "epoch": 0.0400543212890625, |
| "grad_norm": 0.21649453043937683, |
| "learning_rate": 4.799737930297852e-05, |
| "lookahead_loss": 4.948660229682923, |
| "loss": 2.6235, |
| "step": 21000 |
| }, |
| { |
| "base_loss": 0.327464056879282, |
| "epoch": 0.04100799560546875, |
| "grad_norm": 0.1511124223470688, |
| "learning_rate": 4.79496955871582e-05, |
| "lookahead_loss": 5.014459494590759, |
| "loss": 2.671, |
| "step": 21500 |
| }, |
| { |
| "base_loss": 0.3259224636852741, |
| "epoch": 0.041961669921875, |
| "grad_norm": 0.18786948919296265, |
| "learning_rate": 4.7902011871337893e-05, |
| "lookahead_loss": 4.958053824424744, |
| "loss": 2.642, |
| "step": 22000 |
| }, |
| { |
| "base_loss": 0.30848885998129844, |
| "epoch": 0.04291534423828125, |
| "grad_norm": 0.18608908355236053, |
| "learning_rate": 4.7854328155517584e-05, |
| "lookahead_loss": 4.910871742725372, |
| "loss": 2.6097, |
| "step": 22500 |
| }, |
| { |
| "base_loss": 0.2953577929735184, |
| "epoch": 0.0438690185546875, |
| "grad_norm": 0.13473840057849884, |
| "learning_rate": 4.780664443969727e-05, |
| "lookahead_loss": 4.861847942352295, |
| "loss": 2.5786, |
| "step": 23000 |
| }, |
| { |
| "base_loss": 0.3016613866984844, |
| "epoch": 0.04482269287109375, |
| "grad_norm": 0.12197423726320267, |
| "learning_rate": 4.775896072387696e-05, |
| "lookahead_loss": 4.872081851005555, |
| "loss": 2.5869, |
| "step": 23500 |
| }, |
| { |
| "base_loss": 0.2971103771924973, |
| "epoch": 0.0457763671875, |
| "grad_norm": 0.16922320425510406, |
| "learning_rate": 4.771127700805664e-05, |
| "lookahead_loss": 4.889053022384643, |
| "loss": 2.5931, |
| "step": 24000 |
| }, |
| { |
| "base_loss": 0.300103415876627, |
| "epoch": 0.04673004150390625, |
| "grad_norm": 0.16374553740024567, |
| "learning_rate": 4.766359329223633e-05, |
| "lookahead_loss": 4.88103800201416, |
| "loss": 2.5906, |
| "step": 24500 |
| }, |
| { |
| "base_loss": 0.3028304523229599, |
| "epoch": 0.0476837158203125, |
| "grad_norm": 0.1691102385520935, |
| "learning_rate": 4.761590957641602e-05, |
| "lookahead_loss": 4.8170537824630735, |
| "loss": 2.5599, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.0476837158203125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 4.681251558685303, |
| "eval_lookahead_perplexity": 107.90503758244142, |
| "eval_loss": 2.4059760570526123, |
| "eval_perplexity": 11.08924874105288, |
| "eval_runtime": 483.5965, |
| "eval_samples_per_second": 20.678, |
| "eval_steps_per_second": 1.292, |
| "step": 25000 |
| }, |
| { |
| "base_loss": 0.3226933609247208, |
| "epoch": 0.04863739013671875, |
| "grad_norm": 0.3987274169921875, |
| "learning_rate": 4.7568225860595705e-05, |
| "lookahead_loss": 4.831066390037536, |
| "loss": 2.5769, |
| "step": 25500 |
| }, |
| { |
| "base_loss": 0.3246669633388519, |
| "epoch": 0.049591064453125, |
| "grad_norm": 0.1506359577178955, |
| "learning_rate": 4.7520542144775395e-05, |
| "lookahead_loss": 4.826577740669251, |
| "loss": 2.5756, |
| "step": 26000 |
| }, |
| { |
| "base_loss": 0.31835618990659714, |
| "epoch": 0.05054473876953125, |
| "grad_norm": 0.2562532126903534, |
| "learning_rate": 4.747285842895508e-05, |
| "lookahead_loss": 4.776721940994262, |
| "loss": 2.5475, |
| "step": 26500 |
| }, |
| { |
| "base_loss": 0.3007115146815777, |
| "epoch": 0.0514984130859375, |
| "grad_norm": 0.18583890795707703, |
| "learning_rate": 4.742517471313477e-05, |
| "lookahead_loss": 4.746668409347534, |
| "loss": 2.5237, |
| "step": 27000 |
| }, |
| { |
| "base_loss": 0.30024259850382806, |
| "epoch": 0.05245208740234375, |
| "grad_norm": 0.1737774759531021, |
| "learning_rate": 4.737749099731446e-05, |
| "lookahead_loss": 4.724320489406586, |
| "loss": 2.5123, |
| "step": 27500 |
| }, |
| { |
| "base_loss": 0.30464168420433996, |
| "epoch": 0.05340576171875, |
| "grad_norm": 0.18554258346557617, |
| "learning_rate": 4.732980728149414e-05, |
| "lookahead_loss": 4.769838083267212, |
| "loss": 2.5372, |
| "step": 28000 |
| }, |
| { |
| "base_loss": 0.2989484859406948, |
| "epoch": 0.05435943603515625, |
| "grad_norm": 0.24365681409835815, |
| "learning_rate": 4.728212356567383e-05, |
| "lookahead_loss": 4.73219411945343, |
| "loss": 2.5156, |
| "step": 28500 |
| }, |
| { |
| "base_loss": 0.315606110394001, |
| "epoch": 0.0553131103515625, |
| "grad_norm": 0.16112400591373444, |
| "learning_rate": 4.7234439849853516e-05, |
| "lookahead_loss": 4.720495784759522, |
| "loss": 2.5181, |
| "step": 29000 |
| }, |
| { |
| "base_loss": 0.323923219949007, |
| "epoch": 0.05626678466796875, |
| "grad_norm": 0.14975038170814514, |
| "learning_rate": 4.7186756134033206e-05, |
| "lookahead_loss": 4.705821178436279, |
| "loss": 2.5149, |
| "step": 29500 |
| }, |
| { |
| "base_loss": 0.3346382395327091, |
| "epoch": 0.057220458984375, |
| "grad_norm": 0.12785978615283966, |
| "learning_rate": 4.7139072418212896e-05, |
| "lookahead_loss": 4.724789978027344, |
| "loss": 2.5297, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.057220458984375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 4.474773878097534, |
| "eval_lookahead_perplexity": 87.77475037203264, |
| "eval_loss": 2.30273699760437, |
| "eval_perplexity": 10.00151916148414, |
| "eval_runtime": 489.7644, |
| "eval_samples_per_second": 20.418, |
| "eval_steps_per_second": 1.276, |
| "step": 30000 |
| }, |
| { |
| "base_loss": 0.30464139559865, |
| "epoch": 0.05817413330078125, |
| "grad_norm": 0.16110007464885712, |
| "learning_rate": 4.709138870239258e-05, |
| "lookahead_loss": 4.650503679275513, |
| "loss": 2.4776, |
| "step": 30500 |
| }, |
| { |
| "base_loss": 0.29905567806959155, |
| "epoch": 0.0591278076171875, |
| "grad_norm": 0.15517863631248474, |
| "learning_rate": 4.704370498657227e-05, |
| "lookahead_loss": 4.638624626159668, |
| "loss": 2.4688, |
| "step": 31000 |
| }, |
| { |
| "base_loss": 0.2991310947537422, |
| "epoch": 0.06008148193359375, |
| "grad_norm": 0.23121874034404755, |
| "learning_rate": 4.699602127075195e-05, |
| "lookahead_loss": 4.616633594036102, |
| "loss": 2.4579, |
| "step": 31500 |
| }, |
| { |
| "base_loss": 0.3083145318031311, |
| "epoch": 0.06103515625, |
| "grad_norm": 0.2543278932571411, |
| "learning_rate": 4.6948337554931643e-05, |
| "lookahead_loss": 4.696196820259094, |
| "loss": 2.5023, |
| "step": 32000 |
| }, |
| { |
| "base_loss": 0.30308397909998896, |
| "epoch": 0.06198883056640625, |
| "grad_norm": 0.13134761154651642, |
| "learning_rate": 4.6900653839111334e-05, |
| "lookahead_loss": 4.6643982214927675, |
| "loss": 2.4837, |
| "step": 32500 |
| }, |
| { |
| "base_loss": 0.3018778342306614, |
| "epoch": 0.0629425048828125, |
| "grad_norm": 0.1542576104402542, |
| "learning_rate": 4.685297012329102e-05, |
| "lookahead_loss": 4.626801939964294, |
| "loss": 2.4643, |
| "step": 33000 |
| }, |
| { |
| "base_loss": 0.3218779897689819, |
| "epoch": 0.06389617919921875, |
| "grad_norm": 0.13860082626342773, |
| "learning_rate": 4.680528640747071e-05, |
| "lookahead_loss": 4.630942379951477, |
| "loss": 2.4764, |
| "step": 33500 |
| }, |
| { |
| "base_loss": 0.32255707490444185, |
| "epoch": 0.064849853515625, |
| "grad_norm": 0.21500709652900696, |
| "learning_rate": 4.675760269165039e-05, |
| "lookahead_loss": 4.614894771099091, |
| "loss": 2.4687, |
| "step": 34000 |
| }, |
| { |
| "base_loss": 0.3057953714132309, |
| "epoch": 0.06580352783203125, |
| "grad_norm": 0.15437884628772736, |
| "learning_rate": 4.670991897583008e-05, |
| "lookahead_loss": 4.562792086601258, |
| "loss": 2.4343, |
| "step": 34500 |
| }, |
| { |
| "base_loss": 0.2983690336048603, |
| "epoch": 0.0667572021484375, |
| "grad_norm": 0.17949432134628296, |
| "learning_rate": 4.666223526000977e-05, |
| "lookahead_loss": 4.535246185302734, |
| "loss": 2.4168, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.0667572021484375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 4.3161673141479495, |
| "eval_lookahead_perplexity": 74.90100543624399, |
| "eval_loss": 2.2234339714050293, |
| "eval_perplexity": 9.23900292614348, |
| "eval_runtime": 498.5159, |
| "eval_samples_per_second": 20.06, |
| "eval_steps_per_second": 1.254, |
| "step": 35000 |
| }, |
| { |
| "base_loss": 0.3004172631800175, |
| "epoch": 0.06771087646484375, |
| "grad_norm": 0.14633019268512726, |
| "learning_rate": 4.6614551544189455e-05, |
| "lookahead_loss": 4.555401055812836, |
| "loss": 2.4279, |
| "step": 35500 |
| }, |
| { |
| "base_loss": 0.3054021218121052, |
| "epoch": 0.06866455078125, |
| "grad_norm": 0.1640414297580719, |
| "learning_rate": 4.6566867828369145e-05, |
| "lookahead_loss": 4.591926307678222, |
| "loss": 2.4487, |
| "step": 36000 |
| }, |
| { |
| "base_loss": 0.30075133538246157, |
| "epoch": 0.06961822509765625, |
| "grad_norm": 0.14715056121349335, |
| "learning_rate": 4.651918411254883e-05, |
| "lookahead_loss": 4.545757569789886, |
| "loss": 2.4233, |
| "step": 36500 |
| }, |
| { |
| "base_loss": 0.3224307889938354, |
| "epoch": 0.0705718994140625, |
| "grad_norm": 0.1614302396774292, |
| "learning_rate": 4.647150039672852e-05, |
| "lookahead_loss": 4.540217909812927, |
| "loss": 2.4313, |
| "step": 37000 |
| }, |
| { |
| "base_loss": 0.3294345450103283, |
| "epoch": 0.07152557373046875, |
| "grad_norm": 0.15803970396518707, |
| "learning_rate": 4.642381668090821e-05, |
| "lookahead_loss": 4.554252586364746, |
| "loss": 2.4418, |
| "step": 37500 |
| }, |
| { |
| "base_loss": 0.3225139188170433, |
| "epoch": 0.072479248046875, |
| "grad_norm": 0.1647147685289383, |
| "learning_rate": 4.637613296508789e-05, |
| "lookahead_loss": 4.5207296891212465, |
| "loss": 2.4216, |
| "step": 38000 |
| }, |
| { |
| "base_loss": 0.312881602704525, |
| "epoch": 0.07343292236328125, |
| "grad_norm": 0.1871267408132553, |
| "learning_rate": 4.632844924926758e-05, |
| "lookahead_loss": 4.507406691551209, |
| "loss": 2.4101, |
| "step": 38500 |
| }, |
| { |
| "base_loss": 0.3021739726960659, |
| "epoch": 0.0743865966796875, |
| "grad_norm": 0.1738116592168808, |
| "learning_rate": 4.6280765533447266e-05, |
| "lookahead_loss": 4.469052557945251, |
| "loss": 2.3856, |
| "step": 39000 |
| }, |
| { |
| "base_loss": 0.30172099885344505, |
| "epoch": 0.07534027099609375, |
| "grad_norm": 0.16887560486793518, |
| "learning_rate": 4.6233081817626956e-05, |
| "lookahead_loss": 4.467407505512238, |
| "loss": 2.3846, |
| "step": 39500 |
| }, |
| { |
| "base_loss": 0.30044360157847405, |
| "epoch": 0.0762939453125, |
| "grad_norm": 0.26040539145469666, |
| "learning_rate": 4.6185398101806646e-05, |
| "lookahead_loss": 4.4797073764801025, |
| "loss": 2.3901, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.0762939453125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 4.1904504711151125, |
| "eval_lookahead_perplexity": 66.05253902049455, |
| "eval_loss": 2.1605753898620605, |
| "eval_perplexity": 8.676128378831928, |
| "eval_runtime": 487.0689, |
| "eval_samples_per_second": 20.531, |
| "eval_steps_per_second": 1.283, |
| "step": 40000 |
| }, |
| { |
| "base_loss": 0.3015633761882782, |
| "epoch": 0.07724761962890625, |
| "grad_norm": 0.21266193687915802, |
| "learning_rate": 4.613771438598633e-05, |
| "lookahead_loss": 4.494357872486114, |
| "loss": 2.398, |
| "step": 40500 |
| }, |
| { |
| "base_loss": 0.31515628564357756, |
| "epoch": 0.0782012939453125, |
| "grad_norm": 0.16350935399532318, |
| "learning_rate": 4.609003067016602e-05, |
| "lookahead_loss": 4.463918738365173, |
| "loss": 2.3895, |
| "step": 41000 |
| }, |
| { |
| "base_loss": 0.3230645119249821, |
| "epoch": 0.07915496826171875, |
| "grad_norm": 0.14223527908325195, |
| "learning_rate": 4.60423469543457e-05, |
| "lookahead_loss": 4.479482789039611, |
| "loss": 2.4013, |
| "step": 41500 |
| }, |
| { |
| "base_loss": 0.31478354924917223, |
| "epoch": 0.080108642578125, |
| "grad_norm": 0.21286998689174652, |
| "learning_rate": 4.5994663238525393e-05, |
| "lookahead_loss": 4.4453483581542965, |
| "loss": 2.3801, |
| "step": 42000 |
| }, |
| { |
| "base_loss": 0.3150367656648159, |
| "epoch": 0.08106231689453125, |
| "grad_norm": 0.17431187629699707, |
| "learning_rate": 4.5946979522705084e-05, |
| "lookahead_loss": 4.43265785074234, |
| "loss": 2.3738, |
| "step": 42500 |
| }, |
| { |
| "base_loss": 0.2954510691165924, |
| "epoch": 0.0820159912109375, |
| "grad_norm": 0.1371452659368515, |
| "learning_rate": 4.589929580688477e-05, |
| "lookahead_loss": 4.397426519393921, |
| "loss": 2.3464, |
| "step": 43000 |
| }, |
| { |
| "base_loss": 0.29434001427888873, |
| "epoch": 0.08296966552734375, |
| "grad_norm": 0.13817064464092255, |
| "learning_rate": 4.585161209106446e-05, |
| "lookahead_loss": 4.442314762115479, |
| "loss": 2.3683, |
| "step": 43500 |
| }, |
| { |
| "base_loss": 0.2983709729015827, |
| "epoch": 0.08392333984375, |
| "grad_norm": 0.17395979166030884, |
| "learning_rate": 4.580392837524414e-05, |
| "lookahead_loss": 4.447078158378601, |
| "loss": 2.3727, |
| "step": 44000 |
| }, |
| { |
| "base_loss": 0.29992626640200615, |
| "epoch": 0.08487701416015625, |
| "grad_norm": 0.17240917682647705, |
| "learning_rate": 4.575624465942383e-05, |
| "lookahead_loss": 4.4108530750274655, |
| "loss": 2.3554, |
| "step": 44500 |
| }, |
| { |
| "base_loss": 0.32140542250871657, |
| "epoch": 0.0858306884765625, |
| "grad_norm": 0.2007725089788437, |
| "learning_rate": 4.570856094360352e-05, |
| "lookahead_loss": 4.3953737797737125, |
| "loss": 2.3584, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.0858306884765625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 4.088313439559936, |
| "eval_lookahead_perplexity": 59.639221683003754, |
| "eval_loss": 2.109506607055664, |
| "eval_perplexity": 8.244172664370447, |
| "eval_runtime": 486.2971, |
| "eval_samples_per_second": 20.564, |
| "eval_steps_per_second": 1.285, |
| "step": 45000 |
| }, |
| { |
| "base_loss": 0.32445472630858424, |
| "epoch": 0.08678436279296875, |
| "grad_norm": 0.22776088118553162, |
| "learning_rate": 4.5660877227783205e-05, |
| "lookahead_loss": 4.4375409073829655, |
| "loss": 2.381, |
| "step": 45500 |
| }, |
| { |
| "base_loss": 0.33727448362112045, |
| "epoch": 0.087738037109375, |
| "grad_norm": 0.20219068229198456, |
| "learning_rate": 4.5613193511962895e-05, |
| "lookahead_loss": 4.402894349098205, |
| "loss": 2.3701, |
| "step": 46000 |
| }, |
| { |
| "base_loss": 0.2958546592593193, |
| "epoch": 0.08869171142578125, |
| "grad_norm": 0.13857534527778625, |
| "learning_rate": 4.556550979614258e-05, |
| "lookahead_loss": 4.363266070842743, |
| "loss": 2.3296, |
| "step": 46500 |
| }, |
| { |
| "base_loss": 0.2990704481303692, |
| "epoch": 0.0896453857421875, |
| "grad_norm": 0.17887870967388153, |
| "learning_rate": 4.551782608032227e-05, |
| "lookahead_loss": 4.336292638778686, |
| "loss": 2.3177, |
| "step": 47000 |
| }, |
| { |
| "base_loss": 0.3050165086686611, |
| "epoch": 0.09059906005859375, |
| "grad_norm": 0.14284111559391022, |
| "learning_rate": 4.547014236450196e-05, |
| "lookahead_loss": 4.395085669517517, |
| "loss": 2.3501, |
| "step": 47500 |
| }, |
| { |
| "base_loss": 0.307517321318388, |
| "epoch": 0.091552734375, |
| "grad_norm": 0.14320409297943115, |
| "learning_rate": 4.542245864868164e-05, |
| "lookahead_loss": 4.417602932453155, |
| "loss": 2.3626, |
| "step": 48000 |
| }, |
| { |
| "base_loss": 0.3067179475426674, |
| "epoch": 0.09250640869140625, |
| "grad_norm": 0.14618393778800964, |
| "learning_rate": 4.537477493286133e-05, |
| "lookahead_loss": 4.324424335956573, |
| "loss": 2.3156, |
| "step": 48500 |
| }, |
| { |
| "base_loss": 0.34884196099638937, |
| "epoch": 0.0934600830078125, |
| "grad_norm": 0.14412052929401398, |
| "learning_rate": 4.5327091217041016e-05, |
| "lookahead_loss": 4.406433558940887, |
| "loss": 2.3776, |
| "step": 49000 |
| }, |
| { |
| "base_loss": 0.3168534035682678, |
| "epoch": 0.09441375732421875, |
| "grad_norm": 0.15117081999778748, |
| "learning_rate": 4.5279407501220706e-05, |
| "lookahead_loss": 4.37341592168808, |
| "loss": 2.3451, |
| "step": 49500 |
| }, |
| { |
| "base_loss": 0.3365150539577007, |
| "epoch": 0.095367431640625, |
| "grad_norm": 0.29998552799224854, |
| "learning_rate": 4.523172378540039e-05, |
| "lookahead_loss": 4.387516963481903, |
| "loss": 2.362, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.095367431640625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 4.00546947631836, |
| "eval_lookahead_perplexity": 54.897591469211974, |
| "eval_loss": 2.068084955215454, |
| "eval_perplexity": 7.909661249131722, |
| "eval_runtime": 486.3798, |
| "eval_samples_per_second": 20.56, |
| "eval_steps_per_second": 1.285, |
| "step": 50000 |
| }, |
| { |
| "base_loss": 0.29778670057654383, |
| "epoch": 0.09632110595703125, |
| "grad_norm": 0.15356530249118805, |
| "learning_rate": 4.518404006958008e-05, |
| "lookahead_loss": 4.299853836059571, |
| "loss": 2.2988, |
| "step": 50500 |
| }, |
| { |
| "base_loss": 0.2963479610979557, |
| "epoch": 0.0972747802734375, |
| "grad_norm": 0.17061887681484222, |
| "learning_rate": 4.513635635375977e-05, |
| "lookahead_loss": 4.333408058643341, |
| "loss": 2.3149, |
| "step": 51000 |
| }, |
| { |
| "base_loss": 0.3014124562442303, |
| "epoch": 0.09822845458984375, |
| "grad_norm": 0.19273534417152405, |
| "learning_rate": 4.508867263793945e-05, |
| "lookahead_loss": 4.332042829036713, |
| "loss": 2.3167, |
| "step": 51500 |
| }, |
| { |
| "base_loss": 0.3079428587257862, |
| "epoch": 0.09918212890625, |
| "grad_norm": 0.17310389876365662, |
| "learning_rate": 4.5040988922119143e-05, |
| "lookahead_loss": 4.3280877280235295, |
| "loss": 2.318, |
| "step": 52000 |
| }, |
| { |
| "base_loss": 0.3165371402204037, |
| "epoch": 0.10013580322265625, |
| "grad_norm": 0.2102889120578766, |
| "learning_rate": 4.499330520629883e-05, |
| "lookahead_loss": 4.3219221534729, |
| "loss": 2.3192, |
| "step": 52500 |
| }, |
| { |
| "base_loss": 0.3282755868136883, |
| "epoch": 0.1010894775390625, |
| "grad_norm": 0.12816853821277618, |
| "learning_rate": 4.494562149047852e-05, |
| "lookahead_loss": 4.345856199264526, |
| "loss": 2.3371, |
| "step": 53000 |
| }, |
| { |
| "base_loss": 0.3201599704921246, |
| "epoch": 0.10204315185546875, |
| "grad_norm": 0.18837909400463104, |
| "learning_rate": 4.489793777465821e-05, |
| "lookahead_loss": 4.310313300609589, |
| "loss": 2.3152, |
| "step": 53500 |
| }, |
| { |
| "base_loss": 0.29424001121521, |
| "epoch": 0.102996826171875, |
| "grad_norm": 0.20326119661331177, |
| "learning_rate": 4.485025405883789e-05, |
| "lookahead_loss": 4.260730008602143, |
| "loss": 2.2775, |
| "step": 54000 |
| }, |
| { |
| "base_loss": 0.3039598934650421, |
| "epoch": 0.10395050048828125, |
| "grad_norm": 0.19273315370082855, |
| "learning_rate": 4.480257034301758e-05, |
| "lookahead_loss": 4.278165885925293, |
| "loss": 2.2911, |
| "step": 54500 |
| }, |
| { |
| "base_loss": 0.3027501743733883, |
| "epoch": 0.1049041748046875, |
| "grad_norm": 0.31297221779823303, |
| "learning_rate": 4.4754886627197264e-05, |
| "lookahead_loss": 4.322804376602173, |
| "loss": 2.3128, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.1049041748046875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.9343780502319334, |
| "eval_lookahead_perplexity": 51.130339562617046, |
| "eval_loss": 2.0325393676757812, |
| "eval_perplexity": 7.633445893645447, |
| "eval_runtime": 484.3719, |
| "eval_samples_per_second": 20.645, |
| "eval_steps_per_second": 1.29, |
| "step": 55000 |
| }, |
| { |
| "base_loss": 0.30902848035097125, |
| "epoch": 0.10585784912109375, |
| "grad_norm": 0.14435261487960815, |
| "learning_rate": 4.4707202911376955e-05, |
| "lookahead_loss": 4.311323729991913, |
| "loss": 2.3102, |
| "step": 55500 |
| }, |
| { |
| "base_loss": 0.32790091571211816, |
| "epoch": 0.1068115234375, |
| "grad_norm": 0.14303159713745117, |
| "learning_rate": 4.4659519195556645e-05, |
| "lookahead_loss": 4.295704743385315, |
| "loss": 2.3118, |
| "step": 56000 |
| }, |
| { |
| "base_loss": 0.34225816893577576, |
| "epoch": 0.10776519775390625, |
| "grad_norm": 0.16590921580791473, |
| "learning_rate": 4.461183547973633e-05, |
| "lookahead_loss": 4.334332738399506, |
| "loss": 2.3383, |
| "step": 56500 |
| }, |
| { |
| "base_loss": 0.378170046120882, |
| "epoch": 0.1087188720703125, |
| "grad_norm": 0.13906623423099518, |
| "learning_rate": 4.456415176391602e-05, |
| "lookahead_loss": 4.338398130893707, |
| "loss": 2.3583, |
| "step": 57000 |
| }, |
| { |
| "base_loss": 0.29169481843709943, |
| "epoch": 0.10967254638671875, |
| "grad_norm": 0.13996054232120514, |
| "learning_rate": 4.45164680480957e-05, |
| "lookahead_loss": 4.234890432357788, |
| "loss": 2.2633, |
| "step": 57500 |
| }, |
| { |
| "base_loss": 0.29581671801209447, |
| "epoch": 0.110626220703125, |
| "grad_norm": 0.20492452383041382, |
| "learning_rate": 4.446878433227539e-05, |
| "lookahead_loss": 4.2333492503166195, |
| "loss": 2.2646, |
| "step": 58000 |
| }, |
| { |
| "base_loss": 0.30925117334723473, |
| "epoch": 0.11157989501953125, |
| "grad_norm": 0.15514181554317474, |
| "learning_rate": 4.442110061645508e-05, |
| "lookahead_loss": 4.287873956203461, |
| "loss": 2.2986, |
| "step": 58500 |
| }, |
| { |
| "base_loss": 0.3024054784178734, |
| "epoch": 0.1125335693359375, |
| "grad_norm": 0.13332504034042358, |
| "learning_rate": 4.4373416900634766e-05, |
| "lookahead_loss": 4.307773890495301, |
| "loss": 2.3051, |
| "step": 59000 |
| }, |
| { |
| "base_loss": 0.3064781714081764, |
| "epoch": 0.11348724365234375, |
| "grad_norm": 0.15052156150341034, |
| "learning_rate": 4.4325733184814456e-05, |
| "lookahead_loss": 4.252724995613098, |
| "loss": 2.2796, |
| "step": 59500 |
| }, |
| { |
| "base_loss": 0.33560348653793337, |
| "epoch": 0.11444091796875, |
| "grad_norm": 0.1650613248348236, |
| "learning_rate": 4.427804946899414e-05, |
| "lookahead_loss": 4.293933411121368, |
| "loss": 2.3148, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.11444091796875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.873882612991333, |
| "eval_lookahead_perplexity": 48.12888963812929, |
| "eval_loss": 2.002291679382324, |
| "eval_perplexity": 7.406008864179334, |
| "eval_runtime": 485.5371, |
| "eval_samples_per_second": 20.596, |
| "eval_steps_per_second": 1.287, |
| "step": 60000 |
| }, |
| { |
| "base_loss": 0.32748540678620336, |
| "epoch": 0.11539459228515625, |
| "grad_norm": 0.17119185626506805, |
| "learning_rate": 4.423036575317383e-05, |
| "lookahead_loss": 4.253311916828156, |
| "loss": 2.2904, |
| "step": 60500 |
| }, |
| { |
| "base_loss": 0.31137031635642054, |
| "epoch": 0.1163482666015625, |
| "grad_norm": 0.13608764111995697, |
| "learning_rate": 4.418268203735352e-05, |
| "lookahead_loss": 4.232373682975769, |
| "loss": 2.2719, |
| "step": 61000 |
| }, |
| { |
| "base_loss": 0.29493060091137885, |
| "epoch": 0.11730194091796875, |
| "grad_norm": 0.18083657324314117, |
| "learning_rate": 4.41349983215332e-05, |
| "lookahead_loss": 4.186727853775024, |
| "loss": 2.2408, |
| "step": 61500 |
| }, |
| { |
| "base_loss": 0.29388627085089686, |
| "epoch": 0.118255615234375, |
| "grad_norm": 0.1371856927871704, |
| "learning_rate": 4.4087314605712893e-05, |
| "lookahead_loss": 4.225665160179139, |
| "loss": 2.2598, |
| "step": 62000 |
| }, |
| { |
| "base_loss": 0.3018366146683693, |
| "epoch": 0.11920928955078125, |
| "grad_norm": 0.20487329363822937, |
| "learning_rate": 4.403963088989258e-05, |
| "lookahead_loss": 4.264792753696442, |
| "loss": 2.2833, |
| "step": 62500 |
| }, |
| { |
| "base_loss": 0.3014587008357048, |
| "epoch": 0.1201629638671875, |
| "grad_norm": 0.150614932179451, |
| "learning_rate": 4.399194717407227e-05, |
| "lookahead_loss": 4.226875496387482, |
| "loss": 2.2642, |
| "step": 63000 |
| }, |
| { |
| "base_loss": 0.3339847734570503, |
| "epoch": 0.12111663818359375, |
| "grad_norm": 0.14908407628536224, |
| "learning_rate": 4.394426345825196e-05, |
| "lookahead_loss": 4.266850714683533, |
| "loss": 2.3004, |
| "step": 63500 |
| }, |
| { |
| "base_loss": 0.3062296485900879, |
| "epoch": 0.1220703125, |
| "grad_norm": 0.14350071549415588, |
| "learning_rate": 4.389657974243164e-05, |
| "lookahead_loss": 4.188070932388306, |
| "loss": 2.2471, |
| "step": 64000 |
| }, |
| { |
| "base_loss": 0.30985459744930266, |
| "epoch": 0.12302398681640625, |
| "grad_norm": 0.1504562944173813, |
| "learning_rate": 4.384889602661133e-05, |
| "lookahead_loss": 4.190308025836945, |
| "loss": 2.2501, |
| "step": 64500 |
| }, |
| { |
| "base_loss": 0.2970647314786911, |
| "epoch": 0.1239776611328125, |
| "grad_norm": 0.1937413513660431, |
| "learning_rate": 4.3801212310791014e-05, |
| "lookahead_loss": 4.1838707237243655, |
| "loss": 2.2405, |
| "step": 65000 |
| }, |
| { |
| "epoch": 0.1239776611328125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.8223541236877443, |
| "eval_lookahead_perplexity": 45.71169273408033, |
| "eval_loss": 1.9765273332595825, |
| "eval_perplexity": 7.2176349735977094, |
| "eval_runtime": 484.8269, |
| "eval_samples_per_second": 20.626, |
| "eval_steps_per_second": 1.289, |
| "step": 65000 |
| }, |
| { |
| "base_loss": 0.3036232475936413, |
| "epoch": 0.12493133544921875, |
| "grad_norm": 0.1427551954984665, |
| "learning_rate": 4.3753528594970705e-05, |
| "lookahead_loss": 4.240097631454468, |
| "loss": 2.2719, |
| "step": 65500 |
| }, |
| { |
| "base_loss": 0.3062092212736607, |
| "epoch": 0.125885009765625, |
| "grad_norm": 0.1701672226190567, |
| "learning_rate": 4.3705844879150395e-05, |
| "lookahead_loss": 4.233000110626221, |
| "loss": 2.2696, |
| "step": 66000 |
| }, |
| { |
| "base_loss": 0.3083333975672722, |
| "epoch": 0.12683868408203125, |
| "grad_norm": 0.1478368043899536, |
| "learning_rate": 4.365816116333008e-05, |
| "lookahead_loss": 4.2112864146232605, |
| "loss": 2.2598, |
| "step": 66500 |
| }, |
| { |
| "base_loss": 0.32698157826066016, |
| "epoch": 0.1277923583984375, |
| "grad_norm": 0.1420971006155014, |
| "learning_rate": 4.361047744750977e-05, |
| "lookahead_loss": 4.215134252548218, |
| "loss": 2.2711, |
| "step": 67000 |
| }, |
| { |
| "base_loss": 0.3078202583193779, |
| "epoch": 0.12874603271484375, |
| "grad_norm": 0.23012402653694153, |
| "learning_rate": 4.356279373168945e-05, |
| "lookahead_loss": 4.165988350391388, |
| "loss": 2.2369, |
| "step": 67500 |
| }, |
| { |
| "base_loss": 0.3053109573423862, |
| "epoch": 0.12969970703125, |
| "grad_norm": 0.18060369789600372, |
| "learning_rate": 4.351511001586914e-05, |
| "lookahead_loss": 4.1732166509628295, |
| "loss": 2.2393, |
| "step": 68000 |
| }, |
| { |
| "base_loss": 0.3097400109171867, |
| "epoch": 0.13065338134765625, |
| "grad_norm": 0.15356962382793427, |
| "learning_rate": 4.346742630004883e-05, |
| "lookahead_loss": 4.150314831733704, |
| "loss": 2.23, |
| "step": 68500 |
| }, |
| { |
| "base_loss": 0.3077418188452721, |
| "epoch": 0.1316070556640625, |
| "grad_norm": 0.16534677147865295, |
| "learning_rate": 4.3419742584228516e-05, |
| "lookahead_loss": 4.2109439077377315, |
| "loss": 2.2593, |
| "step": 69000 |
| }, |
| { |
| "base_loss": 0.3032139558494091, |
| "epoch": 0.13256072998046875, |
| "grad_norm": 0.1445273905992508, |
| "learning_rate": 4.3372058868408206e-05, |
| "lookahead_loss": 4.206652547359466, |
| "loss": 2.2549, |
| "step": 69500 |
| }, |
| { |
| "base_loss": 0.3113168263733387, |
| "epoch": 0.133514404296875, |
| "grad_norm": 0.14419260621070862, |
| "learning_rate": 4.332437515258789e-05, |
| "lookahead_loss": 4.168332862854004, |
| "loss": 2.2398, |
| "step": 70000 |
| }, |
| { |
| "epoch": 0.133514404296875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.775430511856079, |
| "eval_lookahead_perplexity": 43.616281600719844, |
| "eval_loss": 1.9530656337738037, |
| "eval_perplexity": 7.050268024589573, |
| "eval_runtime": 481.81, |
| "eval_samples_per_second": 20.755, |
| "eval_steps_per_second": 1.297, |
| "step": 70000 |
| }, |
| { |
| "base_loss": 0.33833707132935525, |
| "epoch": 0.13446807861328125, |
| "grad_norm": 0.19232244789600372, |
| "learning_rate": 4.327669143676758e-05, |
| "lookahead_loss": 4.234199364185333, |
| "loss": 2.2863, |
| "step": 70500 |
| }, |
| { |
| "base_loss": 0.31973540037870407, |
| "epoch": 0.1354217529296875, |
| "grad_norm": 0.2945224344730377, |
| "learning_rate": 4.322900772094727e-05, |
| "lookahead_loss": 4.172246160030365, |
| "loss": 2.246, |
| "step": 71000 |
| }, |
| { |
| "base_loss": 0.3021405778825283, |
| "epoch": 0.13637542724609375, |
| "grad_norm": 0.21431593596935272, |
| "learning_rate": 4.318132400512695e-05, |
| "lookahead_loss": 4.1314473094940185, |
| "loss": 2.2168, |
| "step": 71500 |
| }, |
| { |
| "base_loss": 0.2982295399904251, |
| "epoch": 0.1373291015625, |
| "grad_norm": 0.17282553017139435, |
| "learning_rate": 4.3133640289306643e-05, |
| "lookahead_loss": 4.108474971294403, |
| "loss": 2.2034, |
| "step": 72000 |
| }, |
| { |
| "base_loss": 0.3037794386148453, |
| "epoch": 0.13828277587890625, |
| "grad_norm": 0.18910439312458038, |
| "learning_rate": 4.308595657348633e-05, |
| "lookahead_loss": 4.186692704200745, |
| "loss": 2.2452, |
| "step": 72500 |
| }, |
| { |
| "base_loss": 0.30183823220431805, |
| "epoch": 0.1392364501953125, |
| "grad_norm": 0.1664671003818512, |
| "learning_rate": 4.303827285766602e-05, |
| "lookahead_loss": 4.19723641204834, |
| "loss": 2.2495, |
| "step": 73000 |
| }, |
| { |
| "base_loss": 0.30403331050276755, |
| "epoch": 0.14019012451171875, |
| "grad_norm": 0.1586393415927887, |
| "learning_rate": 4.299058914184571e-05, |
| "lookahead_loss": 4.1403014822006226, |
| "loss": 2.2222, |
| "step": 73500 |
| }, |
| { |
| "base_loss": 0.3824539307653904, |
| "epoch": 0.141143798828125, |
| "grad_norm": 0.17738763988018036, |
| "learning_rate": 4.294290542602539e-05, |
| "lookahead_loss": 4.264650414466858, |
| "loss": 2.3236, |
| "step": 74000 |
| }, |
| { |
| "base_loss": 0.3024712265729904, |
| "epoch": 0.14209747314453125, |
| "grad_norm": 0.15253642201423645, |
| "learning_rate": 4.289522171020508e-05, |
| "lookahead_loss": 4.125599290370941, |
| "loss": 2.214, |
| "step": 74500 |
| }, |
| { |
| "base_loss": 0.3230967881381512, |
| "epoch": 0.1430511474609375, |
| "grad_norm": 0.19057177007198334, |
| "learning_rate": 4.2847537994384764e-05, |
| "lookahead_loss": 4.12642933511734, |
| "loss": 2.2248, |
| "step": 75000 |
| }, |
| { |
| "epoch": 0.1430511474609375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.7355900829315187, |
| "eval_lookahead_perplexity": 41.912750266117726, |
| "eval_loss": 1.93314528465271, |
| "eval_perplexity": 6.911213826492632, |
| "eval_runtime": 481.4425, |
| "eval_samples_per_second": 20.771, |
| "eval_steps_per_second": 1.298, |
| "step": 75000 |
| }, |
| { |
| "base_loss": 0.30004050022363665, |
| "epoch": 0.14400482177734375, |
| "grad_norm": 0.17912127077579498, |
| "learning_rate": 4.2799854278564455e-05, |
| "lookahead_loss": 4.098903262138367, |
| "loss": 2.1995, |
| "step": 75500 |
| }, |
| { |
| "base_loss": 0.31687936970591546, |
| "epoch": 0.14495849609375, |
| "grad_norm": 0.15588033199310303, |
| "learning_rate": 4.2752170562744145e-05, |
| "lookahead_loss": 4.163789962291718, |
| "loss": 2.2403, |
| "step": 76000 |
| }, |
| { |
| "base_loss": 0.3090082891881466, |
| "epoch": 0.14591217041015625, |
| "grad_norm": 0.15440773963928223, |
| "learning_rate": 4.270448684692383e-05, |
| "lookahead_loss": 4.168988561153411, |
| "loss": 2.239, |
| "step": 76500 |
| }, |
| { |
| "base_loss": 0.30597079479694367, |
| "epoch": 0.1468658447265625, |
| "grad_norm": 0.13688406348228455, |
| "learning_rate": 4.265680313110352e-05, |
| "lookahead_loss": 4.118725947856903, |
| "loss": 2.2123, |
| "step": 77000 |
| }, |
| { |
| "base_loss": 0.34478209909796714, |
| "epoch": 0.14781951904296875, |
| "grad_norm": 0.14708669483661652, |
| "learning_rate": 4.26091194152832e-05, |
| "lookahead_loss": 4.186521942615509, |
| "loss": 2.2657, |
| "step": 77500 |
| }, |
| { |
| "base_loss": 0.3085326923131943, |
| "epoch": 0.148773193359375, |
| "grad_norm": 0.1381761133670807, |
| "learning_rate": 4.256143569946289e-05, |
| "lookahead_loss": 4.102399334430695, |
| "loss": 2.2055, |
| "step": 78000 |
| }, |
| { |
| "base_loss": 0.31045393279194833, |
| "epoch": 0.14972686767578125, |
| "grad_norm": 0.18813666701316833, |
| "learning_rate": 4.251375198364258e-05, |
| "lookahead_loss": 4.1153163766860965, |
| "loss": 2.2129, |
| "step": 78500 |
| }, |
| { |
| "base_loss": 0.2935263271927834, |
| "epoch": 0.1506805419921875, |
| "grad_norm": 0.21148359775543213, |
| "learning_rate": 4.2466068267822266e-05, |
| "lookahead_loss": 4.083489650249481, |
| "loss": 2.1885, |
| "step": 79000 |
| }, |
| { |
| "base_loss": 0.3048303987979889, |
| "epoch": 0.15163421630859375, |
| "grad_norm": 0.16598311066627502, |
| "learning_rate": 4.2418384552001956e-05, |
| "lookahead_loss": 4.14640766620636, |
| "loss": 2.2256, |
| "step": 79500 |
| }, |
| { |
| "base_loss": 0.3079901858270168, |
| "epoch": 0.152587890625, |
| "grad_norm": 0.2019839733839035, |
| "learning_rate": 4.237070083618164e-05, |
| "lookahead_loss": 4.159954071044922, |
| "loss": 2.234, |
| "step": 80000 |
| }, |
| { |
| "epoch": 0.152587890625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.698641411590576, |
| "eval_lookahead_perplexity": 40.392390432330735, |
| "eval_loss": 1.9146709442138672, |
| "eval_perplexity": 6.784705882914828, |
| "eval_runtime": 485.524, |
| "eval_samples_per_second": 20.596, |
| "eval_steps_per_second": 1.287, |
| "step": 80000 |
| }, |
| { |
| "base_loss": 0.32198003405332565, |
| "epoch": 0.15354156494140625, |
| "grad_norm": 0.14169633388519287, |
| "learning_rate": 4.232301712036133e-05, |
| "lookahead_loss": 4.123985225200653, |
| "loss": 2.223, |
| "step": 80500 |
| }, |
| { |
| "base_loss": 0.3435799330174923, |
| "epoch": 0.1544952392578125, |
| "grad_norm": 0.2051265984773636, |
| "learning_rate": 4.227533340454102e-05, |
| "lookahead_loss": 4.174958533763886, |
| "loss": 2.2593, |
| "step": 81000 |
| }, |
| { |
| "base_loss": 0.3113198747932911, |
| "epoch": 0.15544891357421875, |
| "grad_norm": 0.22451823949813843, |
| "learning_rate": 4.22276496887207e-05, |
| "lookahead_loss": 4.103942976951599, |
| "loss": 2.2076, |
| "step": 81500 |
| }, |
| { |
| "base_loss": 0.2957187399119139, |
| "epoch": 0.156402587890625, |
| "grad_norm": 0.19755777716636658, |
| "learning_rate": 4.2179965972900393e-05, |
| "lookahead_loss": 4.071250169277191, |
| "loss": 2.1835, |
| "step": 82000 |
| }, |
| { |
| "base_loss": 0.29787460842728614, |
| "epoch": 0.15735626220703125, |
| "grad_norm": 0.14888489246368408, |
| "learning_rate": 4.213228225708008e-05, |
| "lookahead_loss": 4.07864601278305, |
| "loss": 2.1883, |
| "step": 82500 |
| }, |
| { |
| "base_loss": 0.3036723498404026, |
| "epoch": 0.1583099365234375, |
| "grad_norm": 0.14837269484996796, |
| "learning_rate": 4.208459854125977e-05, |
| "lookahead_loss": 4.1651582074165345, |
| "loss": 2.2344, |
| "step": 83000 |
| }, |
| { |
| "base_loss": 0.3110756404399872, |
| "epoch": 0.15926361083984375, |
| "grad_norm": 0.1456403285264969, |
| "learning_rate": 4.203691482543946e-05, |
| "lookahead_loss": 4.12848590517044, |
| "loss": 2.2198, |
| "step": 83500 |
| }, |
| { |
| "base_loss": 0.3243219917714596, |
| "epoch": 0.16021728515625, |
| "grad_norm": 0.1554984450340271, |
| "learning_rate": 4.198923110961914e-05, |
| "lookahead_loss": 4.138046524524689, |
| "loss": 2.2312, |
| "step": 84000 |
| }, |
| { |
| "base_loss": 0.3183397548496723, |
| "epoch": 0.16117095947265625, |
| "grad_norm": 0.15073776245117188, |
| "learning_rate": 4.194154739379883e-05, |
| "lookahead_loss": 4.095347220897675, |
| "loss": 2.2068, |
| "step": 84500 |
| }, |
| { |
| "base_loss": 0.330243824750185, |
| "epoch": 0.1621246337890625, |
| "grad_norm": 0.20515553653240204, |
| "learning_rate": 4.1893863677978514e-05, |
| "lookahead_loss": 4.1033834571838375, |
| "loss": 2.2168, |
| "step": 85000 |
| }, |
| { |
| "epoch": 0.1621246337890625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.6668980613708495, |
| "eval_lookahead_perplexity": 39.130337503517424, |
| "eval_loss": 1.898799180984497, |
| "eval_perplexity": 6.677870711585694, |
| "eval_runtime": 491.2668, |
| "eval_samples_per_second": 20.356, |
| "eval_steps_per_second": 1.272, |
| "step": 85000 |
| }, |
| { |
| "base_loss": 0.28920619750022886, |
| "epoch": 0.16307830810546875, |
| "grad_norm": 0.13687625527381897, |
| "learning_rate": 4.1846179962158205e-05, |
| "lookahead_loss": 4.051657072067261, |
| "loss": 2.1704, |
| "step": 85500 |
| }, |
| { |
| "base_loss": 0.3127485100328922, |
| "epoch": 0.164031982421875, |
| "grad_norm": 0.1961473971605301, |
| "learning_rate": 4.1798496246337895e-05, |
| "lookahead_loss": 4.111975946426392, |
| "loss": 2.2124, |
| "step": 86000 |
| }, |
| { |
| "base_loss": 0.3023957554399967, |
| "epoch": 0.16498565673828125, |
| "grad_norm": 0.21571995317935944, |
| "learning_rate": 4.175081253051758e-05, |
| "lookahead_loss": 4.115322601795197, |
| "loss": 2.2089, |
| "step": 86500 |
| }, |
| { |
| "base_loss": 0.3064508207142353, |
| "epoch": 0.1659393310546875, |
| "grad_norm": 0.145101398229599, |
| "learning_rate": 4.170312881469727e-05, |
| "lookahead_loss": 4.093011445045471, |
| "loss": 2.1997, |
| "step": 87000 |
| }, |
| { |
| "base_loss": 0.33141738665103915, |
| "epoch": 0.16689300537109375, |
| "grad_norm": 0.13913673162460327, |
| "learning_rate": 4.165544509887695e-05, |
| "lookahead_loss": 4.137466729640961, |
| "loss": 2.2344, |
| "step": 87500 |
| }, |
| { |
| "base_loss": 0.3255680377185345, |
| "epoch": 0.1678466796875, |
| "grad_norm": 0.1342954784631729, |
| "learning_rate": 4.160776138305664e-05, |
| "lookahead_loss": 4.086215874195099, |
| "loss": 2.2059, |
| "step": 88000 |
| }, |
| { |
| "base_loss": 0.3133760218322277, |
| "epoch": 0.16880035400390625, |
| "grad_norm": 0.15926498174667358, |
| "learning_rate": 4.156007766723633e-05, |
| "lookahead_loss": 4.077684763908386, |
| "loss": 2.1955, |
| "step": 88500 |
| }, |
| { |
| "base_loss": 0.29871078038215637, |
| "epoch": 0.1697540283203125, |
| "grad_norm": 0.25558069348335266, |
| "learning_rate": 4.1512393951416016e-05, |
| "lookahead_loss": 4.051617414474487, |
| "loss": 2.1752, |
| "step": 89000 |
| }, |
| { |
| "base_loss": 0.29546582013368605, |
| "epoch": 0.17070770263671875, |
| "grad_norm": 0.1507255733013153, |
| "learning_rate": 4.1464710235595706e-05, |
| "lookahead_loss": 4.0750104126930236, |
| "loss": 2.1852, |
| "step": 89500 |
| }, |
| { |
| "base_loss": 0.2991917096078396, |
| "epoch": 0.171661376953125, |
| "grad_norm": 0.13587379455566406, |
| "learning_rate": 4.141702651977539e-05, |
| "lookahead_loss": 4.071310368061066, |
| "loss": 2.1853, |
| "step": 90000 |
| }, |
| { |
| "epoch": 0.171661376953125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.63601548538208, |
| "eval_lookahead_perplexity": 37.94036122372511, |
| "eval_loss": 1.8833580017089844, |
| "eval_perplexity": 6.575548533346909, |
| "eval_runtime": 499.4539, |
| "eval_samples_per_second": 20.022, |
| "eval_steps_per_second": 1.251, |
| "step": 90000 |
| }, |
| { |
| "base_loss": 0.297812943726778, |
| "epoch": 0.17261505126953125, |
| "grad_norm": 0.16026511788368225, |
| "learning_rate": 4.136934280395508e-05, |
| "lookahead_loss": 4.0754151496887205, |
| "loss": 2.1866, |
| "step": 90500 |
| }, |
| { |
| "base_loss": 0.30764649564027785, |
| "epoch": 0.1735687255859375, |
| "grad_norm": 0.48659810423851013, |
| "learning_rate": 4.132165908813477e-05, |
| "lookahead_loss": 4.057919836521148, |
| "loss": 2.1828, |
| "step": 91000 |
| }, |
| { |
| "base_loss": 0.3142555268108845, |
| "epoch": 0.17452239990234375, |
| "grad_norm": 0.18999813497066498, |
| "learning_rate": 4.127397537231445e-05, |
| "lookahead_loss": 4.059666626930237, |
| "loss": 2.187, |
| "step": 91500 |
| }, |
| { |
| "base_loss": 0.3496557460427284, |
| "epoch": 0.17547607421875, |
| "grad_norm": 0.17107349634170532, |
| "learning_rate": 4.1226291656494143e-05, |
| "lookahead_loss": 4.136337936401367, |
| "loss": 2.243, |
| "step": 92000 |
| }, |
| { |
| "base_loss": 0.3016935878098011, |
| "epoch": 0.17642974853515625, |
| "grad_norm": 0.23676873743534088, |
| "learning_rate": 4.117860794067383e-05, |
| "lookahead_loss": 4.028763621330262, |
| "loss": 2.1652, |
| "step": 92500 |
| }, |
| { |
| "base_loss": 0.300347177952528, |
| "epoch": 0.1773834228515625, |
| "grad_norm": 0.1585322767496109, |
| "learning_rate": 4.113092422485352e-05, |
| "lookahead_loss": 4.0394937310218815, |
| "loss": 2.1699, |
| "step": 93000 |
| }, |
| { |
| "base_loss": 0.3073237894177437, |
| "epoch": 0.17833709716796875, |
| "grad_norm": 0.23585672676563263, |
| "learning_rate": 4.108324050903321e-05, |
| "lookahead_loss": 4.032268433570862, |
| "loss": 2.1698, |
| "step": 93500 |
| }, |
| { |
| "base_loss": 0.3024279504716396, |
| "epoch": 0.179290771484375, |
| "grad_norm": 0.17836162447929382, |
| "learning_rate": 4.103555679321289e-05, |
| "lookahead_loss": 4.024306795597076, |
| "loss": 2.1634, |
| "step": 94000 |
| }, |
| { |
| "base_loss": 0.3055584655106068, |
| "epoch": 0.18024444580078125, |
| "grad_norm": 0.14800947904586792, |
| "learning_rate": 4.098787307739258e-05, |
| "lookahead_loss": 4.064500618457794, |
| "loss": 2.185, |
| "step": 94500 |
| }, |
| { |
| "base_loss": 0.29581878417730334, |
| "epoch": 0.1811981201171875, |
| "grad_norm": 0.16330750286579132, |
| "learning_rate": 4.0940189361572264e-05, |
| "lookahead_loss": 4.053201458930969, |
| "loss": 2.1745, |
| "step": 95000 |
| }, |
| { |
| "epoch": 0.1811981201171875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.608971565246582, |
| "eval_lookahead_perplexity": 36.9280551838023, |
| "eval_loss": 1.8698359727859497, |
| "eval_perplexity": 6.487232229383202, |
| "eval_runtime": 547.0525, |
| "eval_samples_per_second": 18.28, |
| "eval_steps_per_second": 1.142, |
| "step": 95000 |
| }, |
| { |
| "base_loss": 0.2980945122539997, |
| "epoch": 0.18215179443359375, |
| "grad_norm": 0.14234571158885956, |
| "learning_rate": 4.0892505645751955e-05, |
| "lookahead_loss": 4.067723794937134, |
| "loss": 2.1829, |
| "step": 95500 |
| }, |
| { |
| "base_loss": 0.2986116451621056, |
| "epoch": 0.18310546875, |
| "grad_norm": 0.15030185878276825, |
| "learning_rate": 4.0844821929931645e-05, |
| "lookahead_loss": 4.031040393829346, |
| "loss": 2.1648, |
| "step": 96000 |
| }, |
| { |
| "base_loss": 0.31211488363146783, |
| "epoch": 0.18405914306640625, |
| "grad_norm": 0.2024800330400467, |
| "learning_rate": 4.079713821411133e-05, |
| "lookahead_loss": 4.039662356376648, |
| "loss": 2.1759, |
| "step": 96500 |
| }, |
| { |
| "base_loss": 0.3342977456152439, |
| "epoch": 0.1850128173828125, |
| "grad_norm": 0.18318872153759003, |
| "learning_rate": 4.074945449829102e-05, |
| "lookahead_loss": 4.070985550403595, |
| "loss": 2.2026, |
| "step": 97000 |
| }, |
| { |
| "base_loss": 0.31514875215291976, |
| "epoch": 0.18596649169921875, |
| "grad_norm": 0.14978346228599548, |
| "learning_rate": 4.07017707824707e-05, |
| "lookahead_loss": 4.028551621437073, |
| "loss": 2.1718, |
| "step": 97500 |
| }, |
| { |
| "base_loss": 0.3053509466052055, |
| "epoch": 0.186920166015625, |
| "grad_norm": 0.2080519199371338, |
| "learning_rate": 4.065408706665039e-05, |
| "lookahead_loss": 4.030301760673523, |
| "loss": 2.1678, |
| "step": 98000 |
| }, |
| { |
| "base_loss": 0.29078236150741577, |
| "epoch": 0.18787384033203125, |
| "grad_norm": 0.16793227195739746, |
| "learning_rate": 4.060640335083008e-05, |
| "lookahead_loss": 3.9797529973983763, |
| "loss": 2.1353, |
| "step": 98500 |
| }, |
| { |
| "base_loss": 0.29214190459251405, |
| "epoch": 0.1888275146484375, |
| "grad_norm": 0.19143177568912506, |
| "learning_rate": 4.0558719635009766e-05, |
| "lookahead_loss": 4.013256893157959, |
| "loss": 2.1527, |
| "step": 99000 |
| }, |
| { |
| "base_loss": 0.2928229000866413, |
| "epoch": 0.18978118896484375, |
| "grad_norm": 0.2626541554927826, |
| "learning_rate": 4.0511035919189456e-05, |
| "lookahead_loss": 4.013317709445953, |
| "loss": 2.1531, |
| "step": 99500 |
| }, |
| { |
| "base_loss": 0.29795382434129714, |
| "epoch": 0.19073486328125, |
| "grad_norm": 0.1662345677614212, |
| "learning_rate": 4.046335220336914e-05, |
| "lookahead_loss": 4.026696702957153, |
| "loss": 2.1623, |
| "step": 100000 |
| }, |
| { |
| "epoch": 0.19073486328125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.5836494529724123, |
| "eval_lookahead_perplexity": 36.00469882921293, |
| "eval_loss": 1.8571749925613403, |
| "eval_perplexity": 6.405615275997352, |
| "eval_runtime": 518.6821, |
| "eval_samples_per_second": 19.28, |
| "eval_steps_per_second": 1.205, |
| "step": 100000 |
| }, |
| { |
| "base_loss": 0.29810027703642844, |
| "epoch": 0.19168853759765625, |
| "grad_norm": 0.1798204481601715, |
| "learning_rate": 4.041566848754883e-05, |
| "lookahead_loss": 4.021183431148529, |
| "loss": 2.1596, |
| "step": 100500 |
| }, |
| { |
| "base_loss": 0.30463395109772684, |
| "epoch": 0.1926422119140625, |
| "grad_norm": 0.15981730818748474, |
| "learning_rate": 4.036798477172852e-05, |
| "lookahead_loss": 4.006066414356232, |
| "loss": 2.1553, |
| "step": 101000 |
| }, |
| { |
| "base_loss": 0.33558367761969565, |
| "epoch": 0.19359588623046875, |
| "grad_norm": 0.21296393871307373, |
| "learning_rate": 4.03203010559082e-05, |
| "lookahead_loss": 4.060854420661927, |
| "loss": 2.1982, |
| "step": 101500 |
| }, |
| { |
| "base_loss": 0.3199073303639889, |
| "epoch": 0.194549560546875, |
| "grad_norm": 0.1356714516878128, |
| "learning_rate": 4.0272617340087893e-05, |
| "lookahead_loss": 4.027761749267578, |
| "loss": 2.1738, |
| "step": 102000 |
| }, |
| { |
| "base_loss": 0.30011415255069734, |
| "epoch": 0.19550323486328125, |
| "grad_norm": 0.15449336171150208, |
| "learning_rate": 4.022493362426758e-05, |
| "lookahead_loss": 3.9922111649513243, |
| "loss": 2.1462, |
| "step": 102500 |
| }, |
| { |
| "base_loss": 0.2923199172616005, |
| "epoch": 0.1964569091796875, |
| "grad_norm": 0.18760551512241364, |
| "learning_rate": 4.017724990844727e-05, |
| "lookahead_loss": 3.9661373071670534, |
| "loss": 2.1292, |
| "step": 103000 |
| }, |
| { |
| "base_loss": 0.29327082937955856, |
| "epoch": 0.19741058349609375, |
| "grad_norm": 0.2747306823730469, |
| "learning_rate": 4.012956619262696e-05, |
| "lookahead_loss": 4.003834127426147, |
| "loss": 2.1486, |
| "step": 103500 |
| }, |
| { |
| "base_loss": 0.2999972744882107, |
| "epoch": 0.1983642578125, |
| "grad_norm": 0.14901545643806458, |
| "learning_rate": 4.008188247680664e-05, |
| "lookahead_loss": 4.033395376682281, |
| "loss": 2.1667, |
| "step": 104000 |
| }, |
| { |
| "base_loss": 0.2973039819002152, |
| "epoch": 0.19931793212890625, |
| "grad_norm": 0.23263269662857056, |
| "learning_rate": 4.003419876098633e-05, |
| "lookahead_loss": 4.03858584690094, |
| "loss": 2.1679, |
| "step": 104500 |
| }, |
| { |
| "base_loss": 0.3063103293478489, |
| "epoch": 0.2002716064453125, |
| "grad_norm": 0.17267969250679016, |
| "learning_rate": 3.9986515045166014e-05, |
| "lookahead_loss": 4.011103184700012, |
| "loss": 2.1587, |
| "step": 105000 |
| }, |
| { |
| "epoch": 0.2002716064453125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.5608823120117186, |
| "eval_lookahead_perplexity": 35.19423574717032, |
| "eval_loss": 1.8457914590835571, |
| "eval_perplexity": 6.333110204970354, |
| "eval_runtime": 490.4884, |
| "eval_samples_per_second": 20.388, |
| "eval_steps_per_second": 1.274, |
| "step": 105000 |
| }, |
| { |
| "base_loss": 0.3124685201644897, |
| "epoch": 0.20122528076171875, |
| "grad_norm": 0.15897978842258453, |
| "learning_rate": 3.9938831329345705e-05, |
| "lookahead_loss": 4.016992502689361, |
| "loss": 2.1647, |
| "step": 105500 |
| }, |
| { |
| "base_loss": 0.3321477827131748, |
| "epoch": 0.202178955078125, |
| "grad_norm": 0.14868295192718506, |
| "learning_rate": 3.9891147613525395e-05, |
| "lookahead_loss": 4.048239949703216, |
| "loss": 2.1902, |
| "step": 106000 |
| }, |
| { |
| "base_loss": 0.2998480386734009, |
| "epoch": 0.20313262939453125, |
| "grad_norm": 0.1452517807483673, |
| "learning_rate": 3.984346389770508e-05, |
| "lookahead_loss": 3.990150417327881, |
| "loss": 2.145, |
| "step": 106500 |
| }, |
| { |
| "base_loss": 0.30396230933070184, |
| "epoch": 0.2040863037109375, |
| "grad_norm": 0.15313097834587097, |
| "learning_rate": 3.979578018188477e-05, |
| "lookahead_loss": 3.9821047258377074, |
| "loss": 2.143, |
| "step": 107000 |
| }, |
| { |
| "base_loss": 0.30401156124472617, |
| "epoch": 0.20503997802734375, |
| "grad_norm": 0.19967354834079742, |
| "learning_rate": 3.974809646606445e-05, |
| "lookahead_loss": 3.970831639289856, |
| "loss": 2.1374, |
| "step": 107500 |
| }, |
| { |
| "base_loss": 0.29991251334547997, |
| "epoch": 0.20599365234375, |
| "grad_norm": 0.15913745760917664, |
| "learning_rate": 3.970041275024414e-05, |
| "lookahead_loss": 4.019209212779999, |
| "loss": 2.1596, |
| "step": 108000 |
| }, |
| { |
| "base_loss": 0.30137626150250435, |
| "epoch": 0.20694732666015625, |
| "grad_norm": 0.34907975792884827, |
| "learning_rate": 3.965272903442383e-05, |
| "lookahead_loss": 4.026716927051544, |
| "loss": 2.164, |
| "step": 108500 |
| }, |
| { |
| "base_loss": 0.2976263118684292, |
| "epoch": 0.2079010009765625, |
| "grad_norm": 0.1488516479730606, |
| "learning_rate": 3.9605045318603516e-05, |
| "lookahead_loss": 4.010719275474548, |
| "loss": 2.1542, |
| "step": 109000 |
| }, |
| { |
| "base_loss": 0.3038526868522167, |
| "epoch": 0.20885467529296875, |
| "grad_norm": 0.18493635952472687, |
| "learning_rate": 3.9557361602783206e-05, |
| "lookahead_loss": 3.985397423744202, |
| "loss": 2.1446, |
| "step": 109500 |
| }, |
| { |
| "base_loss": 0.3238096301853657, |
| "epoch": 0.209808349609375, |
| "grad_norm": 0.1789693385362625, |
| "learning_rate": 3.950967788696289e-05, |
| "lookahead_loss": 4.032813804626465, |
| "loss": 2.1783, |
| "step": 110000 |
| }, |
| { |
| "epoch": 0.209808349609375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.5394810447692873, |
| "eval_lookahead_perplexity": 34.44903704327508, |
| "eval_loss": 1.8350908756256104, |
| "eval_perplexity": 6.265703519291424, |
| "eval_runtime": 500.8146, |
| "eval_samples_per_second": 19.967, |
| "eval_steps_per_second": 1.248, |
| "step": 110000 |
| }, |
| { |
| "base_loss": 0.31262540474534034, |
| "epoch": 0.21076202392578125, |
| "grad_norm": 0.12846623361110687, |
| "learning_rate": 3.946199417114258e-05, |
| "lookahead_loss": 3.9918526377677916, |
| "loss": 2.1522, |
| "step": 110500 |
| }, |
| { |
| "base_loss": 0.30065977996587756, |
| "epoch": 0.2117156982421875, |
| "grad_norm": 0.18878379464149475, |
| "learning_rate": 3.941431045532227e-05, |
| "lookahead_loss": 3.9505230865478516, |
| "loss": 2.1256, |
| "step": 111000 |
| }, |
| { |
| "base_loss": 0.29354970484972, |
| "epoch": 0.21266937255859375, |
| "grad_norm": 0.15901413559913635, |
| "learning_rate": 3.936662673950195e-05, |
| "lookahead_loss": 3.9420759234428404, |
| "loss": 2.1178, |
| "step": 111500 |
| }, |
| { |
| "base_loss": 0.29862283357977865, |
| "epoch": 0.213623046875, |
| "grad_norm": 0.15839649736881256, |
| "learning_rate": 3.9318943023681643e-05, |
| "lookahead_loss": 3.988606767177582, |
| "loss": 2.1436, |
| "step": 112000 |
| }, |
| { |
| "base_loss": 0.29591704466938973, |
| "epoch": 0.21457672119140625, |
| "grad_norm": 0.13983677327632904, |
| "learning_rate": 3.927125930786133e-05, |
| "lookahead_loss": 3.9879449706077574, |
| "loss": 2.1419, |
| "step": 112500 |
| }, |
| { |
| "base_loss": 0.2976507830321789, |
| "epoch": 0.2155303955078125, |
| "grad_norm": 0.15823175013065338, |
| "learning_rate": 3.922357559204102e-05, |
| "lookahead_loss": 3.9824121689796446, |
| "loss": 2.14, |
| "step": 113000 |
| }, |
| { |
| "base_loss": 0.3091459658145905, |
| "epoch": 0.21648406982421875, |
| "grad_norm": 0.12450090050697327, |
| "learning_rate": 3.917589187622071e-05, |
| "lookahead_loss": 3.9995364723205564, |
| "loss": 2.1543, |
| "step": 113500 |
| }, |
| { |
| "base_loss": 0.33537453308701515, |
| "epoch": 0.217437744140625, |
| "grad_norm": 0.18551339209079742, |
| "learning_rate": 3.912820816040039e-05, |
| "lookahead_loss": 4.021560028076172, |
| "loss": 2.1785, |
| "step": 114000 |
| }, |
| { |
| "base_loss": 0.32016737046837807, |
| "epoch": 0.21839141845703125, |
| "grad_norm": 0.17938633263111115, |
| "learning_rate": 3.908052444458008e-05, |
| "lookahead_loss": 3.9909664607048034, |
| "loss": 2.1556, |
| "step": 114500 |
| }, |
| { |
| "base_loss": 0.30930135017633437, |
| "epoch": 0.2193450927734375, |
| "grad_norm": 0.15878070890903473, |
| "learning_rate": 3.9032840728759764e-05, |
| "lookahead_loss": 3.963454393863678, |
| "loss": 2.1364, |
| "step": 115000 |
| }, |
| { |
| "epoch": 0.2193450927734375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.520967939376831, |
| "eval_lookahead_perplexity": 33.81714557401866, |
| "eval_loss": 1.8258343935012817, |
| "eval_perplexity": 6.207972750791024, |
| "eval_runtime": 509.2129, |
| "eval_samples_per_second": 19.638, |
| "eval_steps_per_second": 1.227, |
| "step": 115000 |
| }, |
| { |
| "base_loss": 0.3047959138453007, |
| "epoch": 0.22029876708984375, |
| "grad_norm": 0.20373134315013885, |
| "learning_rate": 3.8985157012939455e-05, |
| "lookahead_loss": 3.9657868213653567, |
| "loss": 2.1353, |
| "step": 115500 |
| }, |
| { |
| "base_loss": 0.29702088606357574, |
| "epoch": 0.22125244140625, |
| "grad_norm": 0.15585345029830933, |
| "learning_rate": 3.8937473297119145e-05, |
| "lookahead_loss": 3.9414098200798033, |
| "loss": 2.1192, |
| "step": 116000 |
| }, |
| { |
| "base_loss": 0.2947344943881035, |
| "epoch": 0.22220611572265625, |
| "grad_norm": 0.1686229705810547, |
| "learning_rate": 3.888978958129883e-05, |
| "lookahead_loss": 3.985533133983612, |
| "loss": 2.1401, |
| "step": 116500 |
| }, |
| { |
| "base_loss": 0.29663796299695966, |
| "epoch": 0.2231597900390625, |
| "grad_norm": 0.5541319251060486, |
| "learning_rate": 3.884210586547852e-05, |
| "lookahead_loss": 3.9794252281188967, |
| "loss": 2.138, |
| "step": 117000 |
| }, |
| { |
| "base_loss": 0.3019692142158747, |
| "epoch": 0.22411346435546875, |
| "grad_norm": 0.1443110853433609, |
| "learning_rate": 3.87944221496582e-05, |
| "lookahead_loss": 3.977426125049591, |
| "loss": 2.1397, |
| "step": 117500 |
| }, |
| { |
| "base_loss": 0.3167287348806858, |
| "epoch": 0.225067138671875, |
| "grad_norm": 0.1557740718126297, |
| "learning_rate": 3.874673843383789e-05, |
| "lookahead_loss": 3.9934216737747192, |
| "loss": 2.1551, |
| "step": 118000 |
| }, |
| { |
| "base_loss": 0.3616081200838089, |
| "epoch": 0.22602081298828125, |
| "grad_norm": 0.1348077803850174, |
| "learning_rate": 3.869905471801758e-05, |
| "lookahead_loss": 4.030314054965973, |
| "loss": 2.196, |
| "step": 118500 |
| }, |
| { |
| "base_loss": 0.32006074047088623, |
| "epoch": 0.2269744873046875, |
| "grad_norm": 0.16196569800376892, |
| "learning_rate": 3.8651371002197266e-05, |
| "lookahead_loss": 3.9777443284988405, |
| "loss": 2.1489, |
| "step": 119000 |
| }, |
| { |
| "base_loss": 0.30098330533504486, |
| "epoch": 0.22792816162109375, |
| "grad_norm": 0.17852458357810974, |
| "learning_rate": 3.8603687286376956e-05, |
| "lookahead_loss": 3.941022171020508, |
| "loss": 2.121, |
| "step": 119500 |
| }, |
| { |
| "base_loss": 0.29374924197793006, |
| "epoch": 0.2288818359375, |
| "grad_norm": 0.15789750218391418, |
| "learning_rate": 3.855600357055664e-05, |
| "lookahead_loss": 3.9294084067344666, |
| "loss": 2.1116, |
| "step": 120000 |
| }, |
| { |
| "epoch": 0.2288818359375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.503778175354004, |
| "eval_lookahead_perplexity": 33.24080459612871, |
| "eval_loss": 1.8172394037246704, |
| "eval_perplexity": 6.154843936341777, |
| "eval_runtime": 486.0945, |
| "eval_samples_per_second": 20.572, |
| "eval_steps_per_second": 1.286, |
| "step": 120000 |
| }, |
| { |
| "base_loss": 0.30340095722675325, |
| "epoch": 0.22983551025390625, |
| "grad_norm": 0.1268227994441986, |
| "learning_rate": 3.850831985473633e-05, |
| "lookahead_loss": 3.997514929294586, |
| "loss": 2.1505, |
| "step": 120500 |
| }, |
| { |
| "base_loss": 0.2969953828752041, |
| "epoch": 0.2307891845703125, |
| "grad_norm": 0.38102588057518005, |
| "learning_rate": 3.846063613891602e-05, |
| "lookahead_loss": 3.981098453044891, |
| "loss": 2.139, |
| "step": 121000 |
| }, |
| { |
| "base_loss": 0.3105453898310661, |
| "epoch": 0.23174285888671875, |
| "grad_norm": 0.2956802248954773, |
| "learning_rate": 3.84129524230957e-05, |
| "lookahead_loss": 3.9859898743629456, |
| "loss": 2.1483, |
| "step": 121500 |
| }, |
| { |
| "base_loss": 0.312623037725687, |
| "epoch": 0.232696533203125, |
| "grad_norm": 0.20321176946163177, |
| "learning_rate": 3.8365268707275393e-05, |
| "lookahead_loss": 3.9476396222114563, |
| "loss": 2.1301, |
| "step": 122000 |
| }, |
| { |
| "base_loss": 0.3441284774243832, |
| "epoch": 0.23365020751953125, |
| "grad_norm": 0.1659804880619049, |
| "learning_rate": 3.831758499145508e-05, |
| "lookahead_loss": 4.019089301586151, |
| "loss": 2.1816, |
| "step": 122500 |
| }, |
| { |
| "base_loss": 0.3158799746334553, |
| "epoch": 0.2346038818359375, |
| "grad_norm": 0.1387973129749298, |
| "learning_rate": 3.826990127563477e-05, |
| "lookahead_loss": 3.9791625356674194, |
| "loss": 2.1475, |
| "step": 123000 |
| }, |
| { |
| "base_loss": 0.3052534331381321, |
| "epoch": 0.23555755615234375, |
| "grad_norm": 0.18591973185539246, |
| "learning_rate": 3.822221755981446e-05, |
| "lookahead_loss": 3.9717899789810183, |
| "loss": 2.1385, |
| "step": 123500 |
| }, |
| { |
| "base_loss": 0.30015348917245865, |
| "epoch": 0.23651123046875, |
| "grad_norm": 0.14250557124614716, |
| "learning_rate": 3.817453384399414e-05, |
| "lookahead_loss": 3.9358688526153562, |
| "loss": 2.118, |
| "step": 124000 |
| }, |
| { |
| "base_loss": 0.30166161328554153, |
| "epoch": 0.23746490478515625, |
| "grad_norm": 0.15538230538368225, |
| "learning_rate": 3.812685012817383e-05, |
| "lookahead_loss": 3.9643000559806825, |
| "loss": 2.133, |
| "step": 124500 |
| }, |
| { |
| "base_loss": 0.2973581215441227, |
| "epoch": 0.2384185791015625, |
| "grad_norm": 0.1487993448972702, |
| "learning_rate": 3.8079166412353514e-05, |
| "lookahead_loss": 3.9540540752410887, |
| "loss": 2.1257, |
| "step": 125000 |
| }, |
| { |
| "epoch": 0.2384185791015625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.4853565464019773, |
| "eval_lookahead_perplexity": 32.63406059268419, |
| "eval_loss": 1.8080284595489502, |
| "eval_perplexity": 6.098412305711686, |
| "eval_runtime": 483.8181, |
| "eval_samples_per_second": 20.669, |
| "eval_steps_per_second": 1.292, |
| "step": 125000 |
| }, |
| { |
| "base_loss": 0.3022316889762878, |
| "epoch": 1.0009536743164062, |
| "grad_norm": 0.160108283162117, |
| "learning_rate": 3.8031482696533205e-05, |
| "lookahead_loss": 3.9718240842819212, |
| "loss": 2.137, |
| "step": 125500 |
| }, |
| { |
| "base_loss": 0.2998015423119068, |
| "epoch": 1.0019073486328125, |
| "grad_norm": 0.1442178189754486, |
| "learning_rate": 3.7983798980712895e-05, |
| "lookahead_loss": 3.9606360173225403, |
| "loss": 2.1302, |
| "step": 126000 |
| }, |
| { |
| "base_loss": 0.299979487746954, |
| "epoch": 1.0028610229492188, |
| "grad_norm": 0.21493926644325256, |
| "learning_rate": 3.793611526489258e-05, |
| "lookahead_loss": 3.956595335960388, |
| "loss": 2.1283, |
| "step": 126500 |
| }, |
| { |
| "base_loss": 0.30338721710443495, |
| "epoch": 1.003814697265625, |
| "grad_norm": 0.24776244163513184, |
| "learning_rate": 3.788843154907227e-05, |
| "lookahead_loss": 3.9652762699127195, |
| "loss": 2.1343, |
| "step": 127000 |
| }, |
| { |
| "base_loss": 0.30518372932076454, |
| "epoch": 1.0047683715820312, |
| "grad_norm": 0.28755414485931396, |
| "learning_rate": 3.784074783325195e-05, |
| "lookahead_loss": 3.941191442966461, |
| "loss": 2.1232, |
| "step": 127500 |
| }, |
| { |
| "base_loss": 0.3134980680346489, |
| "epoch": 1.0057220458984375, |
| "grad_norm": 0.16672217845916748, |
| "learning_rate": 3.779306411743164e-05, |
| "lookahead_loss": 3.9538578872680663, |
| "loss": 2.1337, |
| "step": 128000 |
| }, |
| { |
| "base_loss": 0.3313628733754158, |
| "epoch": 1.0066757202148438, |
| "grad_norm": 0.18939979374408722, |
| "learning_rate": 3.774538040161133e-05, |
| "lookahead_loss": 3.9658765501976014, |
| "loss": 2.1486, |
| "step": 128500 |
| }, |
| { |
| "base_loss": 0.31349258169531824, |
| "epoch": 1.00762939453125, |
| "grad_norm": 0.1514354944229126, |
| "learning_rate": 3.7697696685791016e-05, |
| "lookahead_loss": 3.946809937477112, |
| "loss": 2.1302, |
| "step": 129000 |
| }, |
| { |
| "base_loss": 0.31480157864093783, |
| "epoch": 1.0085830688476562, |
| "grad_norm": 0.14762338995933533, |
| "learning_rate": 3.7650012969970706e-05, |
| "lookahead_loss": 3.94188436794281, |
| "loss": 2.1283, |
| "step": 129500 |
| }, |
| { |
| "base_loss": 0.29147451075911524, |
| "epoch": 1.0095367431640625, |
| "grad_norm": 0.13176566362380981, |
| "learning_rate": 3.760232925415039e-05, |
| "lookahead_loss": 3.9021912565231323, |
| "loss": 2.0968, |
| "step": 130000 |
| }, |
| { |
| "epoch": 1.0095367431640625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.4698605045318605, |
| "eval_lookahead_perplexity": 32.13225982748101, |
| "eval_loss": 1.8002804517745972, |
| "eval_perplexity": 6.05134433671433, |
| "eval_runtime": 488.4824, |
| "eval_samples_per_second": 20.472, |
| "eval_steps_per_second": 1.279, |
| "step": 130000 |
| }, |
| { |
| "base_loss": 0.30006001514196395, |
| "epoch": 1.0104904174804688, |
| "grad_norm": 0.14655235409736633, |
| "learning_rate": 3.755464553833008e-05, |
| "lookahead_loss": 3.9301828422546388, |
| "loss": 2.1151, |
| "step": 130500 |
| }, |
| { |
| "base_loss": 0.2960101006031036, |
| "epoch": 1.011444091796875, |
| "grad_norm": 0.1633438616991043, |
| "learning_rate": 3.750696182250977e-05, |
| "lookahead_loss": 3.952839255809784, |
| "loss": 2.1244, |
| "step": 131000 |
| }, |
| { |
| "base_loss": 0.29892793264985085, |
| "epoch": 1.0123977661132812, |
| "grad_norm": 0.18194669485092163, |
| "learning_rate": 3.745927810668945e-05, |
| "lookahead_loss": 3.958092218399048, |
| "loss": 2.1285, |
| "step": 131500 |
| }, |
| { |
| "base_loss": 0.29978542965650556, |
| "epoch": 1.0133514404296875, |
| "grad_norm": 0.17729413509368896, |
| "learning_rate": 3.7411594390869143e-05, |
| "lookahead_loss": 3.952873631000519, |
| "loss": 2.1263, |
| "step": 132000 |
| }, |
| { |
| "base_loss": 0.3069946175217628, |
| "epoch": 1.0143051147460938, |
| "grad_norm": 0.17295287549495697, |
| "learning_rate": 3.736391067504883e-05, |
| "lookahead_loss": 3.938451060295105, |
| "loss": 2.1227, |
| "step": 132500 |
| }, |
| { |
| "base_loss": 0.3185223871767521, |
| "epoch": 1.0152587890625, |
| "grad_norm": 0.17466206848621368, |
| "learning_rate": 3.731622695922852e-05, |
| "lookahead_loss": 3.9561206374168396, |
| "loss": 2.1373, |
| "step": 133000 |
| }, |
| { |
| "base_loss": 0.321269671857357, |
| "epoch": 1.0162124633789062, |
| "grad_norm": 0.17413805425167084, |
| "learning_rate": 3.726854324340821e-05, |
| "lookahead_loss": 3.9549911060333254, |
| "loss": 2.1381, |
| "step": 133500 |
| }, |
| { |
| "base_loss": 0.29983696776628493, |
| "epoch": 1.0171661376953125, |
| "grad_norm": 0.1588663011789322, |
| "learning_rate": 3.722085952758789e-05, |
| "lookahead_loss": 3.919067234992981, |
| "loss": 2.1095, |
| "step": 134000 |
| }, |
| { |
| "base_loss": 0.31978207612037657, |
| "epoch": 1.0181198120117188, |
| "grad_norm": 0.28145942091941833, |
| "learning_rate": 3.717317581176758e-05, |
| "lookahead_loss": 3.9233868684768676, |
| "loss": 2.1216, |
| "step": 134500 |
| }, |
| { |
| "base_loss": 0.29129536652565, |
| "epoch": 1.019073486328125, |
| "grad_norm": 0.1842055320739746, |
| "learning_rate": 3.7125492095947264e-05, |
| "lookahead_loss": 3.8867431244850157, |
| "loss": 2.089, |
| "step": 135000 |
| }, |
| { |
| "epoch": 1.019073486328125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.455427379989624, |
| "eval_lookahead_perplexity": 31.671821693026402, |
| "eval_loss": 1.7930638790130615, |
| "eval_perplexity": 6.007831565384858, |
| "eval_runtime": 491.2385, |
| "eval_samples_per_second": 20.357, |
| "eval_steps_per_second": 1.272, |
| "step": 135000 |
| }, |
| { |
| "base_loss": 0.30145278573036194, |
| "epoch": 1.0200271606445312, |
| "grad_norm": 0.1709691286087036, |
| "learning_rate": 3.7077808380126955e-05, |
| "lookahead_loss": 3.9409879336357116, |
| "loss": 2.1212, |
| "step": 135500 |
| }, |
| { |
| "base_loss": 0.2936854332089424, |
| "epoch": 1.0209808349609375, |
| "grad_norm": 0.1593596488237381, |
| "learning_rate": 3.7030124664306645e-05, |
| "lookahead_loss": 3.926469912528992, |
| "loss": 2.1101, |
| "step": 136000 |
| }, |
| { |
| "base_loss": 0.3014064610004425, |
| "epoch": 1.0219345092773438, |
| "grad_norm": 0.6354989409446716, |
| "learning_rate": 3.698244094848633e-05, |
| "lookahead_loss": 3.9481157250404357, |
| "loss": 2.1248, |
| "step": 136500 |
| }, |
| { |
| "base_loss": 0.30362616485357286, |
| "epoch": 1.02288818359375, |
| "grad_norm": 0.16273941099643707, |
| "learning_rate": 3.693475723266602e-05, |
| "lookahead_loss": 3.9114366030693053, |
| "loss": 2.1075, |
| "step": 137000 |
| }, |
| { |
| "base_loss": 0.3144021729230881, |
| "epoch": 1.0238418579101562, |
| "grad_norm": 0.149958074092865, |
| "learning_rate": 3.68870735168457e-05, |
| "lookahead_loss": 3.929403066635132, |
| "loss": 2.1219, |
| "step": 137500 |
| }, |
| { |
| "base_loss": 0.330579254090786, |
| "epoch": 1.0247955322265625, |
| "grad_norm": 0.12735570967197418, |
| "learning_rate": 3.683938980102539e-05, |
| "lookahead_loss": 3.9694002509117126, |
| "loss": 2.15, |
| "step": 138000 |
| }, |
| { |
| "base_loss": 0.30834819096326826, |
| "epoch": 1.0257492065429688, |
| "grad_norm": 0.15742145478725433, |
| "learning_rate": 3.679170608520508e-05, |
| "lookahead_loss": 3.903434811115265, |
| "loss": 2.1059, |
| "step": 138500 |
| }, |
| { |
| "base_loss": 0.30765692061185834, |
| "epoch": 1.026702880859375, |
| "grad_norm": 0.1584819257259369, |
| "learning_rate": 3.6744022369384766e-05, |
| "lookahead_loss": 3.905990194797516, |
| "loss": 2.1068, |
| "step": 139000 |
| }, |
| { |
| "base_loss": 0.29641868540644645, |
| "epoch": 1.0276565551757812, |
| "grad_norm": 0.2251375913619995, |
| "learning_rate": 3.6696338653564456e-05, |
| "lookahead_loss": 3.871658296585083, |
| "loss": 2.084, |
| "step": 139500 |
| }, |
| { |
| "base_loss": 0.30686542350053786, |
| "epoch": 1.0286102294921875, |
| "grad_norm": 0.15121251344680786, |
| "learning_rate": 3.664865493774414e-05, |
| "lookahead_loss": 3.9316897687911987, |
| "loss": 2.1193, |
| "step": 140000 |
| }, |
| { |
| "epoch": 1.0286102294921875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.4411208850860597, |
| "eval_lookahead_perplexity": 31.221934763309736, |
| "eval_loss": 1.785910725593567, |
| "eval_perplexity": 5.965009961818914, |
| "eval_runtime": 495.6044, |
| "eval_samples_per_second": 20.177, |
| "eval_steps_per_second": 1.261, |
| "step": 140000 |
| }, |
| { |
| "base_loss": 0.29295339247584345, |
| "epoch": 1.0295639038085938, |
| "grad_norm": 0.17040514945983887, |
| "learning_rate": 3.660097122192383e-05, |
| "lookahead_loss": 3.922786801815033, |
| "loss": 2.1079, |
| "step": 140500 |
| }, |
| { |
| "base_loss": 0.29418606969714167, |
| "epoch": 1.030517578125, |
| "grad_norm": 0.16101863980293274, |
| "learning_rate": 3.655328750610352e-05, |
| "lookahead_loss": 3.9305174765586854, |
| "loss": 2.1124, |
| "step": 141000 |
| }, |
| { |
| "base_loss": 0.30073931351304056, |
| "epoch": 1.0314712524414062, |
| "grad_norm": 0.15563958883285522, |
| "learning_rate": 3.65056037902832e-05, |
| "lookahead_loss": 3.9169350867271424, |
| "loss": 2.1088, |
| "step": 141500 |
| }, |
| { |
| "base_loss": 0.32550489193201065, |
| "epoch": 1.0324249267578125, |
| "grad_norm": 0.18198108673095703, |
| "learning_rate": 3.6457920074462893e-05, |
| "lookahead_loss": 3.949872624874115, |
| "loss": 2.1377, |
| "step": 142000 |
| }, |
| { |
| "base_loss": 0.3217253153324127, |
| "epoch": 1.0333786010742188, |
| "grad_norm": 0.14466656744480133, |
| "learning_rate": 3.641023635864258e-05, |
| "lookahead_loss": 3.943008924484253, |
| "loss": 2.1324, |
| "step": 142500 |
| }, |
| { |
| "base_loss": 0.29591422697901726, |
| "epoch": 1.034332275390625, |
| "grad_norm": 0.22209474444389343, |
| "learning_rate": 3.636255264282227e-05, |
| "lookahead_loss": 3.89274987077713, |
| "loss": 2.0943, |
| "step": 143000 |
| }, |
| { |
| "base_loss": 0.3019189378321171, |
| "epoch": 1.0352859497070312, |
| "grad_norm": 0.15056173503398895, |
| "learning_rate": 3.631486892700196e-05, |
| "lookahead_loss": 3.8982916412353514, |
| "loss": 2.1001, |
| "step": 143500 |
| }, |
| { |
| "base_loss": 0.29187374815344813, |
| "epoch": 1.0362396240234375, |
| "grad_norm": 0.1609506458044052, |
| "learning_rate": 3.626718521118164e-05, |
| "lookahead_loss": 3.876904351711273, |
| "loss": 2.0844, |
| "step": 144000 |
| }, |
| { |
| "base_loss": 0.30099624979496004, |
| "epoch": 1.0371932983398438, |
| "grad_norm": 0.15686625242233276, |
| "learning_rate": 3.621950149536133e-05, |
| "lookahead_loss": 3.9365439949035643, |
| "loss": 2.1188, |
| "step": 144500 |
| }, |
| { |
| "base_loss": 0.2944833936691284, |
| "epoch": 1.03814697265625, |
| "grad_norm": 0.12234937399625778, |
| "learning_rate": 3.6171817779541014e-05, |
| "lookahead_loss": 3.925371481895447, |
| "loss": 2.1099, |
| "step": 145000 |
| }, |
| { |
| "epoch": 1.03814697265625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.427335873413086, |
| "eval_lookahead_perplexity": 30.794492942144828, |
| "eval_loss": 1.7790181636810303, |
| "eval_perplexity": 5.924037127456275, |
| "eval_runtime": 486.0413, |
| "eval_samples_per_second": 20.574, |
| "eval_steps_per_second": 1.286, |
| "step": 145000 |
| }, |
| { |
| "base_loss": 0.3096110028922558, |
| "epoch": 1.0391006469726562, |
| "grad_norm": 0.14352020621299744, |
| "learning_rate": 3.6124134063720705e-05, |
| "lookahead_loss": 3.9274420647621153, |
| "loss": 2.1185, |
| "step": 145500 |
| }, |
| { |
| "base_loss": 0.29815685757994653, |
| "epoch": 1.0400543212890625, |
| "grad_norm": 0.19689391553401947, |
| "learning_rate": 3.6076450347900395e-05, |
| "lookahead_loss": 3.8799168334007264, |
| "loss": 2.089, |
| "step": 146000 |
| }, |
| { |
| "base_loss": 0.33026822620630264, |
| "epoch": 1.0410079956054688, |
| "grad_norm": 0.15674136579036713, |
| "learning_rate": 3.602876663208008e-05, |
| "lookahead_loss": 3.9665490646362302, |
| "loss": 2.1484, |
| "step": 146500 |
| }, |
| { |
| "base_loss": 0.32754166290163994, |
| "epoch": 1.041961669921875, |
| "grad_norm": 0.2157350480556488, |
| "learning_rate": 3.598108291625977e-05, |
| "lookahead_loss": 3.9263464074134826, |
| "loss": 2.1269, |
| "step": 147000 |
| }, |
| { |
| "base_loss": 0.308051556378603, |
| "epoch": 1.0429153442382812, |
| "grad_norm": 0.17233142256736755, |
| "learning_rate": 3.593339920043945e-05, |
| "lookahead_loss": 3.892621549129486, |
| "loss": 2.1003, |
| "step": 147500 |
| }, |
| { |
| "base_loss": 0.2963902007639408, |
| "epoch": 1.0438690185546875, |
| "grad_norm": 0.13420014083385468, |
| "learning_rate": 3.588571548461914e-05, |
| "lookahead_loss": 3.8680308771133425, |
| "loss": 2.0822, |
| "step": 148000 |
| }, |
| { |
| "base_loss": 0.303954668790102, |
| "epoch": 1.0448226928710938, |
| "grad_norm": 0.13218580186367035, |
| "learning_rate": 3.583803176879883e-05, |
| "lookahead_loss": 3.886783453464508, |
| "loss": 2.0954, |
| "step": 148500 |
| }, |
| { |
| "base_loss": 0.29838690185546873, |
| "epoch": 1.0457763671875, |
| "grad_norm": 0.16112856566905975, |
| "learning_rate": 3.5790348052978516e-05, |
| "lookahead_loss": 3.9105755825042725, |
| "loss": 2.1045, |
| "step": 149000 |
| }, |
| { |
| "base_loss": 0.2983519469201565, |
| "epoch": 1.0467300415039062, |
| "grad_norm": 0.1744556576013565, |
| "learning_rate": 3.5742664337158206e-05, |
| "lookahead_loss": 3.9085661787986754, |
| "loss": 2.1035, |
| "step": 149500 |
| }, |
| { |
| "base_loss": 0.30389092776179316, |
| "epoch": 1.0476837158203125, |
| "grad_norm": 0.15804961323738098, |
| "learning_rate": 3.569498062133789e-05, |
| "lookahead_loss": 3.8888008046150206, |
| "loss": 2.0963, |
| "step": 150000 |
| }, |
| { |
| "epoch": 1.0476837158203125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.4153005847930906, |
| "eval_lookahead_perplexity": 30.426093674257224, |
| "eval_loss": 1.7730005979537964, |
| "eval_perplexity": 5.88849588779296, |
| "eval_runtime": 485.4933, |
| "eval_samples_per_second": 20.598, |
| "eval_steps_per_second": 1.287, |
| "step": 150000 |
| }, |
| { |
| "base_loss": 0.32404559567570684, |
| "epoch": 1.0486373901367188, |
| "grad_norm": 0.3761279881000519, |
| "learning_rate": 3.564729690551758e-05, |
| "lookahead_loss": 3.9307862186431883, |
| "loss": 2.1274, |
| "step": 150500 |
| }, |
| { |
| "base_loss": 0.32713927403092385, |
| "epoch": 1.049591064453125, |
| "grad_norm": 0.16427947580814362, |
| "learning_rate": 3.559961318969727e-05, |
| "lookahead_loss": 3.9374624252319337, |
| "loss": 2.1323, |
| "step": 151000 |
| }, |
| { |
| "base_loss": 0.3163092802464962, |
| "epoch": 1.0505447387695312, |
| "grad_norm": 0.23663388192653656, |
| "learning_rate": 3.555192947387695e-05, |
| "lookahead_loss": 3.8996983790397644, |
| "loss": 2.108, |
| "step": 151500 |
| }, |
| { |
| "base_loss": 0.30190867054462434, |
| "epoch": 1.0514984130859375, |
| "grad_norm": 0.19574706256389618, |
| "learning_rate": 3.5504245758056643e-05, |
| "lookahead_loss": 3.879706892490387, |
| "loss": 2.0908, |
| "step": 152000 |
| }, |
| { |
| "base_loss": 0.2993427519798279, |
| "epoch": 1.0524520874023438, |
| "grad_norm": 0.18463152647018433, |
| "learning_rate": 3.545656204223633e-05, |
| "lookahead_loss": 3.8694649033546447, |
| "loss": 2.0844, |
| "step": 152500 |
| }, |
| { |
| "base_loss": 0.3071295386552811, |
| "epoch": 1.05340576171875, |
| "grad_norm": 0.17730920016765594, |
| "learning_rate": 3.540887832641602e-05, |
| "lookahead_loss": 3.9193247327804563, |
| "loss": 2.1132, |
| "step": 153000 |
| }, |
| { |
| "base_loss": 0.2997964630126953, |
| "epoch": 1.0543594360351562, |
| "grad_norm": 0.262504905462265, |
| "learning_rate": 3.536119461059571e-05, |
| "lookahead_loss": 3.8937467522621154, |
| "loss": 2.0968, |
| "step": 153500 |
| }, |
| { |
| "base_loss": 0.31396923500299456, |
| "epoch": 1.0553131103515625, |
| "grad_norm": 0.1696975976228714, |
| "learning_rate": 3.531351089477539e-05, |
| "lookahead_loss": 3.9104519414901735, |
| "loss": 2.1122, |
| "step": 154000 |
| }, |
| { |
| "base_loss": 0.32232173484563825, |
| "epoch": 1.0562667846679688, |
| "grad_norm": 0.14248883724212646, |
| "learning_rate": 3.526582717895508e-05, |
| "lookahead_loss": 3.9199597582817076, |
| "loss": 2.1211, |
| "step": 154500 |
| }, |
| { |
| "base_loss": 0.33595413306355476, |
| "epoch": 1.057220458984375, |
| "grad_norm": 0.13409440219402313, |
| "learning_rate": 3.5218143463134764e-05, |
| "lookahead_loss": 3.951625358104706, |
| "loss": 2.1438, |
| "step": 155000 |
| }, |
| { |
| "epoch": 1.057220458984375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.4037932720184325, |
| "eval_lookahead_perplexity": 30.077977877910172, |
| "eval_loss": 1.7672468423843384, |
| "eval_perplexity": 5.854712206507675, |
| "eval_runtime": 489.1245, |
| "eval_samples_per_second": 20.445, |
| "eval_steps_per_second": 1.278, |
| "step": 155000 |
| }, |
| { |
| "base_loss": 0.30358468553423884, |
| "epoch": 1.0581741333007812, |
| "grad_norm": 0.14868056774139404, |
| "learning_rate": 3.5170459747314455e-05, |
| "lookahead_loss": 3.875640995025635, |
| "loss": 2.0896, |
| "step": 155500 |
| }, |
| { |
| "base_loss": 0.29896630710363387, |
| "epoch": 1.0591278076171875, |
| "grad_norm": 0.15365566313266754, |
| "learning_rate": 3.5122776031494145e-05, |
| "lookahead_loss": 3.8703284606933592, |
| "loss": 2.0846, |
| "step": 156000 |
| }, |
| { |
| "base_loss": 0.3018466059863567, |
| "epoch": 1.0600814819335938, |
| "grad_norm": 0.23302938044071198, |
| "learning_rate": 3.507509231567383e-05, |
| "lookahead_loss": 3.8674917163848876, |
| "loss": 2.0847, |
| "step": 156500 |
| }, |
| { |
| "base_loss": 0.30544874557852747, |
| "epoch": 1.06103515625, |
| "grad_norm": 0.2524946928024292, |
| "learning_rate": 3.502740859985352e-05, |
| "lookahead_loss": 3.937124222278595, |
| "loss": 2.1213, |
| "step": 157000 |
| }, |
| { |
| "base_loss": 0.30323341020941735, |
| "epoch": 1.0619888305664062, |
| "grad_norm": 0.1309209018945694, |
| "learning_rate": 3.49797248840332e-05, |
| "lookahead_loss": 3.9118480677604675, |
| "loss": 2.1075, |
| "step": 157500 |
| }, |
| { |
| "base_loss": 0.30131002590060235, |
| "epoch": 1.0629425048828125, |
| "grad_norm": 0.1601804494857788, |
| "learning_rate": 3.493204116821289e-05, |
| "lookahead_loss": 3.897795463562012, |
| "loss": 2.0996, |
| "step": 158000 |
| }, |
| { |
| "base_loss": 0.31901666805148127, |
| "epoch": 1.0638961791992188, |
| "grad_norm": 0.14104294776916504, |
| "learning_rate": 3.488435745239258e-05, |
| "lookahead_loss": 3.9272995166778566, |
| "loss": 2.1232, |
| "step": 158500 |
| }, |
| { |
| "base_loss": 0.32057506546378134, |
| "epoch": 1.064849853515625, |
| "grad_norm": 0.19912464916706085, |
| "learning_rate": 3.4836673736572266e-05, |
| "lookahead_loss": 3.919455467700958, |
| "loss": 2.12, |
| "step": 159000 |
| }, |
| { |
| "base_loss": 0.3094568813741207, |
| "epoch": 1.0658035278320312, |
| "grad_norm": 0.13988551497459412, |
| "learning_rate": 3.4788990020751956e-05, |
| "lookahead_loss": 3.8748404712677003, |
| "loss": 2.0921, |
| "step": 159500 |
| }, |
| { |
| "base_loss": 0.296497131973505, |
| "epoch": 1.0667572021484375, |
| "grad_norm": 0.17116788029670715, |
| "learning_rate": 3.474130630493164e-05, |
| "lookahead_loss": 3.8537814893722535, |
| "loss": 2.0751, |
| "step": 160000 |
| }, |
| { |
| "epoch": 1.0667572021484375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.393118642807007, |
| "eval_lookahead_perplexity": 29.758614193648153, |
| "eval_loss": 1.7619096040725708, |
| "eval_perplexity": 5.823547453052975, |
| "eval_runtime": 489.6754, |
| "eval_samples_per_second": 20.422, |
| "eval_steps_per_second": 1.276, |
| "step": 160000 |
| }, |
| { |
| "base_loss": 0.2961631261408329, |
| "epoch": 1.0677108764648438, |
| "grad_norm": 0.14856025576591492, |
| "learning_rate": 3.469362258911133e-05, |
| "lookahead_loss": 3.8731155366897583, |
| "loss": 2.0846, |
| "step": 160500 |
| }, |
| { |
| "base_loss": 0.30877826517820356, |
| "epoch": 1.06866455078125, |
| "grad_norm": 0.15069471299648285, |
| "learning_rate": 3.464593887329102e-05, |
| "lookahead_loss": 3.9271927394866943, |
| "loss": 2.118, |
| "step": 161000 |
| }, |
| { |
| "base_loss": 0.3005406486093998, |
| "epoch": 1.0696182250976562, |
| "grad_norm": 0.15068742632865906, |
| "learning_rate": 3.45982551574707e-05, |
| "lookahead_loss": 3.88241503572464, |
| "loss": 2.0915, |
| "step": 161500 |
| }, |
| { |
| "base_loss": 0.32187015274167063, |
| "epoch": 1.0705718994140625, |
| "grad_norm": 0.15749786794185638, |
| "learning_rate": 3.4550571441650393e-05, |
| "lookahead_loss": 3.896246009349823, |
| "loss": 2.1091, |
| "step": 162000 |
| }, |
| { |
| "base_loss": 0.33053677862882613, |
| "epoch": 1.0715255737304688, |
| "grad_norm": 0.15866954624652863, |
| "learning_rate": 3.450288772583008e-05, |
| "lookahead_loss": 3.9282021570205687, |
| "loss": 2.1294, |
| "step": 162500 |
| }, |
| { |
| "base_loss": 0.32069788879156114, |
| "epoch": 1.072479248046875, |
| "grad_norm": 0.1594492495059967, |
| "learning_rate": 3.445520401000977e-05, |
| "lookahead_loss": 3.8940247020721435, |
| "loss": 2.1074, |
| "step": 163000 |
| }, |
| { |
| "base_loss": 0.31262470212578775, |
| "epoch": 1.0734329223632812, |
| "grad_norm": 0.17305102944374084, |
| "learning_rate": 3.440752029418946e-05, |
| "lookahead_loss": 3.8849755458831785, |
| "loss": 2.0988, |
| "step": 163500 |
| }, |
| { |
| "base_loss": 0.2981148832142353, |
| "epoch": 1.0743865966796875, |
| "grad_norm": 0.16819824278354645, |
| "learning_rate": 3.435983657836914e-05, |
| "lookahead_loss": 3.8529111161231993, |
| "loss": 2.0755, |
| "step": 164000 |
| }, |
| { |
| "base_loss": 0.30023786443471906, |
| "epoch": 1.0753402709960938, |
| "grad_norm": 0.17850428819656372, |
| "learning_rate": 3.431215286254883e-05, |
| "lookahead_loss": 3.861279788017273, |
| "loss": 2.0808, |
| "step": 164500 |
| }, |
| { |
| "base_loss": 0.3020662295222282, |
| "epoch": 1.0762939453125, |
| "grad_norm": 0.2424204796552658, |
| "learning_rate": 3.4264469146728514e-05, |
| "lookahead_loss": 3.8807444310188295, |
| "loss": 2.0914, |
| "step": 165000 |
| }, |
| { |
| "epoch": 1.0762939453125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.3821912258148195, |
| "eval_lookahead_perplexity": 29.435199668131006, |
| "eval_loss": 1.7564458847045898, |
| "eval_perplexity": 5.79181598888891, |
| "eval_runtime": 489.9893, |
| "eval_samples_per_second": 20.409, |
| "eval_steps_per_second": 1.276, |
| "step": 165000 |
| }, |
| { |
| "base_loss": 0.3027431888282299, |
| "epoch": 1.0772476196289062, |
| "grad_norm": 0.20542722940444946, |
| "learning_rate": 3.4216785430908205e-05, |
| "lookahead_loss": 3.8899433569908144, |
| "loss": 2.0963, |
| "step": 165500 |
| }, |
| { |
| "base_loss": 0.3171701425611973, |
| "epoch": 1.0782012939453125, |
| "grad_norm": 0.16201598942279816, |
| "learning_rate": 3.4169101715087895e-05, |
| "lookahead_loss": 3.8952909932136537, |
| "loss": 2.1062, |
| "step": 166000 |
| }, |
| { |
| "base_loss": 0.3263061309456825, |
| "epoch": 1.0791549682617188, |
| "grad_norm": 0.14267025887966156, |
| "learning_rate": 3.412141799926758e-05, |
| "lookahead_loss": 3.915688913345337, |
| "loss": 2.121, |
| "step": 166500 |
| }, |
| { |
| "base_loss": 0.3166075404882431, |
| "epoch": 1.080108642578125, |
| "grad_norm": 0.20565369725227356, |
| "learning_rate": 3.407373428344727e-05, |
| "lookahead_loss": 3.884951609611511, |
| "loss": 2.1008, |
| "step": 167000 |
| }, |
| { |
| "base_loss": 0.3153759800195694, |
| "epoch": 1.0810623168945312, |
| "grad_norm": 0.17952857911586761, |
| "learning_rate": 3.402605056762695e-05, |
| "lookahead_loss": 3.874412197113037, |
| "loss": 2.0949, |
| "step": 167500 |
| }, |
| { |
| "base_loss": 0.29424766221642495, |
| "epoch": 1.0820159912109375, |
| "grad_norm": 0.13297192752361298, |
| "learning_rate": 3.397836685180664e-05, |
| "lookahead_loss": 3.8378429608345033, |
| "loss": 2.066, |
| "step": 168000 |
| }, |
| { |
| "base_loss": 0.2951381744146347, |
| "epoch": 1.0829696655273438, |
| "grad_norm": 0.14463625848293304, |
| "learning_rate": 3.393068313598633e-05, |
| "lookahead_loss": 3.878976944446564, |
| "loss": 2.0871, |
| "step": 168500 |
| }, |
| { |
| "base_loss": 0.29598655554652215, |
| "epoch": 1.08392333984375, |
| "grad_norm": 0.1720210164785385, |
| "learning_rate": 3.3882999420166016e-05, |
| "lookahead_loss": 3.883972409248352, |
| "loss": 2.09, |
| "step": 169000 |
| }, |
| { |
| "base_loss": 0.3003324483036995, |
| "epoch": 1.0848770141601562, |
| "grad_norm": 0.16413024067878723, |
| "learning_rate": 3.3835315704345706e-05, |
| "lookahead_loss": 3.866434679508209, |
| "loss": 2.0834, |
| "step": 169500 |
| }, |
| { |
| "base_loss": 0.324860055655241, |
| "epoch": 1.0858306884765625, |
| "grad_norm": 0.21429774165153503, |
| "learning_rate": 3.378763198852539e-05, |
| "lookahead_loss": 3.8790123524665834, |
| "loss": 2.1019, |
| "step": 170000 |
| }, |
| { |
| "epoch": 1.0858306884765625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.3718783203125, |
| "eval_lookahead_perplexity": 29.13319717374854, |
| "eval_loss": 1.7512894868850708, |
| "eval_perplexity": 5.762027947051168, |
| "eval_runtime": 481.9044, |
| "eval_samples_per_second": 20.751, |
| "eval_steps_per_second": 1.297, |
| "step": 170000 |
| }, |
| { |
| "base_loss": 0.32848941776156426, |
| "epoch": 1.0867843627929688, |
| "grad_norm": 0.2181519716978073, |
| "learning_rate": 3.373994827270508e-05, |
| "lookahead_loss": 3.916994530200958, |
| "loss": 2.1227, |
| "step": 170500 |
| }, |
| { |
| "base_loss": 0.3349419977068901, |
| "epoch": 1.087738037109375, |
| "grad_norm": 0.21681463718414307, |
| "learning_rate": 3.369226455688477e-05, |
| "lookahead_loss": 3.889006247520447, |
| "loss": 2.112, |
| "step": 171000 |
| }, |
| { |
| "base_loss": 0.29626981797814367, |
| "epoch": 1.0886917114257812, |
| "grad_norm": 0.13979732990264893, |
| "learning_rate": 3.364458084106445e-05, |
| "lookahead_loss": 3.845555930137634, |
| "loss": 2.0709, |
| "step": 171500 |
| }, |
| { |
| "base_loss": 0.29588885527849196, |
| "epoch": 1.0896453857421875, |
| "grad_norm": 0.17025181651115417, |
| "learning_rate": 3.3596897125244143e-05, |
| "lookahead_loss": 3.825295521736145, |
| "loss": 2.0606, |
| "step": 172000 |
| }, |
| { |
| "base_loss": 0.30355440092086794, |
| "epoch": 1.0905990600585938, |
| "grad_norm": 0.13651303946971893, |
| "learning_rate": 3.354921340942383e-05, |
| "lookahead_loss": 3.8832921752929686, |
| "loss": 2.0934, |
| "step": 172500 |
| }, |
| { |
| "base_loss": 0.3085533272922039, |
| "epoch": 1.091552734375, |
| "grad_norm": 0.1449888050556183, |
| "learning_rate": 3.350152969360352e-05, |
| "lookahead_loss": 3.900917944908142, |
| "loss": 2.1047, |
| "step": 173000 |
| }, |
| { |
| "base_loss": 0.30814607721567155, |
| "epoch": 1.0925064086914062, |
| "grad_norm": 0.14506219327449799, |
| "learning_rate": 3.345384597778321e-05, |
| "lookahead_loss": 3.8408993144035337, |
| "loss": 2.0745, |
| "step": 173500 |
| }, |
| { |
| "base_loss": 0.3475791245102882, |
| "epoch": 1.0934600830078125, |
| "grad_norm": 0.14961472153663635, |
| "learning_rate": 3.340616226196289e-05, |
| "lookahead_loss": 3.9275504984855654, |
| "loss": 2.1376, |
| "step": 174000 |
| }, |
| { |
| "base_loss": 0.3188588669300079, |
| "epoch": 1.0944137573242188, |
| "grad_norm": 0.14812441170215607, |
| "learning_rate": 3.335847854614258e-05, |
| "lookahead_loss": 3.894184876918793, |
| "loss": 2.1065, |
| "step": 174500 |
| }, |
| { |
| "base_loss": 0.33637256652116776, |
| "epoch": 1.095367431640625, |
| "grad_norm": 0.31748202443122864, |
| "learning_rate": 3.3310794830322264e-05, |
| "lookahead_loss": 3.91321995306015, |
| "loss": 2.1248, |
| "step": 175000 |
| }, |
| { |
| "epoch": 1.095367431640625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.362889016342163, |
| "eval_lookahead_perplexity": 28.872483581226124, |
| "eval_loss": 1.7467947006225586, |
| "eval_perplexity": 5.736186981305101, |
| "eval_runtime": 492.3817, |
| "eval_samples_per_second": 20.309, |
| "eval_steps_per_second": 1.269, |
| "step": 175000 |
| }, |
| { |
| "base_loss": 0.29970453238487244, |
| "epoch": 1.0963211059570312, |
| "grad_norm": 0.14985507726669312, |
| "learning_rate": 3.3263111114501955e-05, |
| "lookahead_loss": 3.8269423189163208, |
| "loss": 2.0633, |
| "step": 175500 |
| }, |
| { |
| "base_loss": 0.29638376808166506, |
| "epoch": 1.0972747802734375, |
| "grad_norm": 0.15558375418186188, |
| "learning_rate": 3.3215427398681645e-05, |
| "lookahead_loss": 3.8555296115875244, |
| "loss": 2.076, |
| "step": 176000 |
| }, |
| { |
| "base_loss": 0.3054467994570732, |
| "epoch": 1.0982284545898438, |
| "grad_norm": 0.1894250363111496, |
| "learning_rate": 3.316774368286133e-05, |
| "lookahead_loss": 3.8709189410209657, |
| "loss": 2.0882, |
| "step": 176500 |
| }, |
| { |
| "base_loss": 0.3071099489927292, |
| "epoch": 1.09918212890625, |
| "grad_norm": 0.16460929811000824, |
| "learning_rate": 3.312005996704102e-05, |
| "lookahead_loss": 3.862484317779541, |
| "loss": 2.0848, |
| "step": 177000 |
| }, |
| { |
| "base_loss": 0.31426447916030886, |
| "epoch": 1.1001358032226562, |
| "grad_norm": 0.21863171458244324, |
| "learning_rate": 3.30723762512207e-05, |
| "lookahead_loss": 3.8745448336601256, |
| "loss": 2.0944, |
| "step": 177500 |
| }, |
| { |
| "base_loss": 0.32723835909366605, |
| "epoch": 1.1010894775390625, |
| "grad_norm": 0.1326635330915451, |
| "learning_rate": 3.302469253540039e-05, |
| "lookahead_loss": 3.9010628695487974, |
| "loss": 2.1142, |
| "step": 178000 |
| }, |
| { |
| "base_loss": 0.31847833314538004, |
| "epoch": 1.1020431518554688, |
| "grad_norm": 0.18970361351966858, |
| "learning_rate": 3.297700881958008e-05, |
| "lookahead_loss": 3.864170029640198, |
| "loss": 2.0913, |
| "step": 178500 |
| }, |
| { |
| "base_loss": 0.29289821565151214, |
| "epoch": 1.102996826171875, |
| "grad_norm": 0.20295552909374237, |
| "learning_rate": 3.2929325103759766e-05, |
| "lookahead_loss": 3.814200548171997, |
| "loss": 2.0535, |
| "step": 179000 |
| }, |
| { |
| "base_loss": 0.30456195056438445, |
| "epoch": 1.1039505004882812, |
| "grad_norm": 0.20044149458408356, |
| "learning_rate": 3.2881641387939456e-05, |
| "lookahead_loss": 3.842505895137787, |
| "loss": 2.0735, |
| "step": 179500 |
| }, |
| { |
| "base_loss": 0.3080640316605568, |
| "epoch": 1.1049041748046875, |
| "grad_norm": 0.2961556911468506, |
| "learning_rate": 3.283395767211914e-05, |
| "lookahead_loss": 3.883462808609009, |
| "loss": 2.0958, |
| "step": 180000 |
| }, |
| { |
| "epoch": 1.1049041748046875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.353512685394287, |
| "eval_lookahead_perplexity": 28.603030833040698, |
| "eval_loss": 1.7421066761016846, |
| "eval_perplexity": 5.709358531431193, |
| "eval_runtime": 509.3192, |
| "eval_samples_per_second": 19.634, |
| "eval_steps_per_second": 1.227, |
| "step": 180000 |
| }, |
| { |
| "base_loss": 0.31145399552583697, |
| "epoch": 1.1058578491210938, |
| "grad_norm": 0.14174073934555054, |
| "learning_rate": 3.278627395629883e-05, |
| "lookahead_loss": 3.878586507797241, |
| "loss": 2.095, |
| "step": 180500 |
| }, |
| { |
| "base_loss": 0.3290363503992558, |
| "epoch": 1.1068115234375, |
| "grad_norm": 0.14076529443264008, |
| "learning_rate": 3.273859024047852e-05, |
| "lookahead_loss": 3.87795290517807, |
| "loss": 2.1035, |
| "step": 181000 |
| }, |
| { |
| "base_loss": 0.34425261700153353, |
| "epoch": 1.1077651977539062, |
| "grad_norm": 0.16690996289253235, |
| "learning_rate": 3.26909065246582e-05, |
| "lookahead_loss": 3.9159616875648497, |
| "loss": 2.1301, |
| "step": 181500 |
| }, |
| { |
| "base_loss": 0.37842708241939543, |
| "epoch": 1.1087188720703125, |
| "grad_norm": 0.13356585800647736, |
| "learning_rate": 3.2643222808837893e-05, |
| "lookahead_loss": 3.9294180998802184, |
| "loss": 2.1539, |
| "step": 182000 |
| }, |
| { |
| "base_loss": 0.29286388018727305, |
| "epoch": 1.1096725463867188, |
| "grad_norm": 0.1421193927526474, |
| "learning_rate": 3.259553909301758e-05, |
| "lookahead_loss": 3.819779777050018, |
| "loss": 2.0563, |
| "step": 182500 |
| }, |
| { |
| "base_loss": 0.2963064706027508, |
| "epoch": 1.110626220703125, |
| "grad_norm": 0.19730889797210693, |
| "learning_rate": 3.254785537719727e-05, |
| "lookahead_loss": 3.8228342752456665, |
| "loss": 2.0596, |
| "step": 183000 |
| }, |
| { |
| "base_loss": 0.310688713490963, |
| "epoch": 1.1115798950195312, |
| "grad_norm": 0.15284715592861176, |
| "learning_rate": 3.250017166137696e-05, |
| "lookahead_loss": 3.8795133504867554, |
| "loss": 2.0951, |
| "step": 183500 |
| }, |
| { |
| "base_loss": 0.304560353577137, |
| "epoch": 1.1125335693359375, |
| "grad_norm": 0.1316368132829666, |
| "learning_rate": 3.245248794555664e-05, |
| "lookahead_loss": 3.8906916728019714, |
| "loss": 2.0976, |
| "step": 184000 |
| }, |
| { |
| "base_loss": 0.3029217945933342, |
| "epoch": 1.1134872436523438, |
| "grad_norm": 0.14683161675930023, |
| "learning_rate": 3.240480422973633e-05, |
| "lookahead_loss": 3.8557743334770205, |
| "loss": 2.0793, |
| "step": 184500 |
| }, |
| { |
| "base_loss": 0.33166853222250936, |
| "epoch": 1.11444091796875, |
| "grad_norm": 0.17274287343025208, |
| "learning_rate": 3.2357120513916014e-05, |
| "lookahead_loss": 3.903276960849762, |
| "loss": 2.1175, |
| "step": 185000 |
| }, |
| { |
| "epoch": 1.11444091796875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.3450291194915773, |
| "eval_lookahead_perplexity": 28.361401524222025, |
| "eval_loss": 1.7378648519515991, |
| "eval_perplexity": 5.685191728431611, |
| "eval_runtime": 481.6228, |
| "eval_samples_per_second": 20.763, |
| "eval_steps_per_second": 1.298, |
| "step": 185000 |
| }, |
| { |
| "base_loss": 0.3226691042780876, |
| "epoch": 1.1153945922851562, |
| "grad_norm": 0.16963951289653778, |
| "learning_rate": 3.2309436798095705e-05, |
| "lookahead_loss": 3.864322167873383, |
| "loss": 2.0935, |
| "step": 185500 |
| }, |
| { |
| "base_loss": 0.31310846722126007, |
| "epoch": 1.1163482666015625, |
| "grad_norm": 0.12234390527009964, |
| "learning_rate": 3.2261753082275395e-05, |
| "lookahead_loss": 3.8433740344047544, |
| "loss": 2.0782, |
| "step": 186000 |
| }, |
| { |
| "base_loss": 0.29350434136390685, |
| "epoch": 1.1173019409179688, |
| "grad_norm": 0.16640856862068176, |
| "learning_rate": 3.221406936645508e-05, |
| "lookahead_loss": 3.7999322633743287, |
| "loss": 2.0467, |
| "step": 186500 |
| }, |
| { |
| "base_loss": 0.2932390958070755, |
| "epoch": 1.118255615234375, |
| "grad_norm": 0.13954715430736542, |
| "learning_rate": 3.216638565063477e-05, |
| "lookahead_loss": 3.8404451036453247, |
| "loss": 2.0668, |
| "step": 187000 |
| }, |
| { |
| "base_loss": 0.30293849104642867, |
| "epoch": 1.1192092895507812, |
| "grad_norm": 0.2102259248495102, |
| "learning_rate": 3.211870193481445e-05, |
| "lookahead_loss": 3.876214940547943, |
| "loss": 2.0896, |
| "step": 187500 |
| }, |
| { |
| "base_loss": 0.3002570872604847, |
| "epoch": 1.1201629638671875, |
| "grad_norm": 0.1469457745552063, |
| "learning_rate": 3.207101821899414e-05, |
| "lookahead_loss": 3.8522539978027344, |
| "loss": 2.0763, |
| "step": 188000 |
| }, |
| { |
| "base_loss": 0.33394081115722657, |
| "epoch": 1.1211166381835938, |
| "grad_norm": 0.1487540900707245, |
| "learning_rate": 3.202333450317383e-05, |
| "lookahead_loss": 3.899791095733643, |
| "loss": 2.1169, |
| "step": 188500 |
| }, |
| { |
| "base_loss": 0.305567186832428, |
| "epoch": 1.1220703125, |
| "grad_norm": 0.13522186875343323, |
| "learning_rate": 3.1975650787353516e-05, |
| "lookahead_loss": 3.8232165246009826, |
| "loss": 2.0644, |
| "step": 189000 |
| }, |
| { |
| "base_loss": 0.3088062160909176, |
| "epoch": 1.1230239868164062, |
| "grad_norm": 0.14893706142902374, |
| "learning_rate": 3.1927967071533206e-05, |
| "lookahead_loss": 3.825707914352417, |
| "loss": 2.0673, |
| "step": 189500 |
| }, |
| { |
| "base_loss": 0.2980837540626526, |
| "epoch": 1.1239776611328125, |
| "grad_norm": 0.1944948434829712, |
| "learning_rate": 3.188028335571289e-05, |
| "lookahead_loss": 3.8185536642074585, |
| "loss": 2.0583, |
| "step": 190000 |
| }, |
| { |
| "epoch": 1.1239776611328125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.337442953109741, |
| "eval_lookahead_perplexity": 28.147061251859267, |
| "eval_loss": 1.7340717315673828, |
| "eval_perplexity": 5.663667958729687, |
| "eval_runtime": 490.8569, |
| "eval_samples_per_second": 20.373, |
| "eval_steps_per_second": 1.273, |
| "step": 190000 |
| }, |
| { |
| "base_loss": 0.3022361363172531, |
| "epoch": 1.1249313354492188, |
| "grad_norm": 0.13972483575344086, |
| "learning_rate": 3.183259963989258e-05, |
| "lookahead_loss": 3.871019568443298, |
| "loss": 2.0866, |
| "step": 190500 |
| }, |
| { |
| "base_loss": 0.3076981520354748, |
| "epoch": 1.125885009765625, |
| "grad_norm": 0.1670301854610443, |
| "learning_rate": 3.178491592407227e-05, |
| "lookahead_loss": 3.870435709476471, |
| "loss": 2.0891, |
| "step": 191000 |
| }, |
| { |
| "base_loss": 0.3076484650671482, |
| "epoch": 1.1268386840820312, |
| "grad_norm": 0.15010391175746918, |
| "learning_rate": 3.173723220825195e-05, |
| "lookahead_loss": 3.858703122615814, |
| "loss": 2.0832, |
| "step": 191500 |
| }, |
| { |
| "base_loss": 0.32482430759072306, |
| "epoch": 1.1277923583984375, |
| "grad_norm": 0.1445780098438263, |
| "learning_rate": 3.1689548492431643e-05, |
| "lookahead_loss": 3.868783023357391, |
| "loss": 2.0968, |
| "step": 192000 |
| }, |
| { |
| "base_loss": 0.3072025769650936, |
| "epoch": 1.1287460327148438, |
| "grad_norm": 0.21696054935455322, |
| "learning_rate": 3.164186477661133e-05, |
| "lookahead_loss": 3.820800142288208, |
| "loss": 2.064, |
| "step": 192500 |
| }, |
| { |
| "base_loss": 0.3038946217596531, |
| "epoch": 1.12969970703125, |
| "grad_norm": 0.18033991754055023, |
| "learning_rate": 3.159418106079102e-05, |
| "lookahead_loss": 3.8287801537513735, |
| "loss": 2.0663, |
| "step": 193000 |
| }, |
| { |
| "base_loss": 0.309920187741518, |
| "epoch": 1.1306533813476562, |
| "grad_norm": 0.14635299146175385, |
| "learning_rate": 3.154649734497071e-05, |
| "lookahead_loss": 3.809571493625641, |
| "loss": 2.0597, |
| "step": 193500 |
| }, |
| { |
| "base_loss": 0.3105273153483868, |
| "epoch": 1.1316070556640625, |
| "grad_norm": 0.16371551156044006, |
| "learning_rate": 3.149881362915039e-05, |
| "lookahead_loss": 3.8675531783103945, |
| "loss": 2.089, |
| "step": 194000 |
| }, |
| { |
| "base_loss": 0.3032006587386131, |
| "epoch": 1.1325607299804688, |
| "grad_norm": 0.1437891125679016, |
| "learning_rate": 3.145112991333008e-05, |
| "lookahead_loss": 3.859119012832642, |
| "loss": 2.0812, |
| "step": 194500 |
| }, |
| { |
| "base_loss": 0.3097851026952267, |
| "epoch": 1.133514404296875, |
| "grad_norm": 0.1416500210762024, |
| "learning_rate": 3.1403446197509764e-05, |
| "lookahead_loss": 3.838276375770569, |
| "loss": 2.074, |
| "step": 195000 |
| }, |
| { |
| "epoch": 1.133514404296875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.1307004702925682, |
| "eval_base_perplexity": 1.1396263782287075, |
| "eval_lookahead_loss": 3.3290446380615233, |
| "eval_lookahead_perplexity": 27.91166322078159, |
| "eval_loss": 1.729872703552246, |
| "eval_perplexity": 5.639935918922495, |
| "eval_runtime": 483.0611, |
| "eval_samples_per_second": 20.701, |
| "eval_steps_per_second": 1.294, |
| "step": 195000 |
| }, |
| { |
| "base_loss": 0.3019270299375057, |
| "epoch": 1.0009536743164062, |
| "grad_norm": 0.11448461562395096, |
| "learning_rate": 3.1355762481689455e-05, |
| "lookahead_loss": 3.852882830142975, |
| "loss": 2.073, |
| "step": 195500 |
| }, |
| { |
| "base_loss": 0.3022316916286945, |
| "epoch": 1.0019073486328125, |
| "grad_norm": 0.17280858755111694, |
| "learning_rate": 3.1308078765869145e-05, |
| "lookahead_loss": 3.83609538269043, |
| "loss": 2.0718, |
| "step": 196000 |
| }, |
| { |
| "base_loss": 0.3106894801259041, |
| "epoch": 1.0028610229492188, |
| "grad_norm": 0.1027756780385971, |
| "learning_rate": 3.126039505004883e-05, |
| "lookahead_loss": 3.846992799282074, |
| "loss": 2.0705, |
| "step": 196500 |
| }, |
| { |
| "base_loss": 0.3196644955575466, |
| "epoch": 1.003814697265625, |
| "grad_norm": 0.10951551049947739, |
| "learning_rate": 3.121271133422852e-05, |
| "lookahead_loss": 3.8443777050971986, |
| "loss": 2.0839, |
| "step": 197000 |
| }, |
| { |
| "base_loss": 0.30172179120779036, |
| "epoch": 1.0047683715820312, |
| "grad_norm": 0.0957549512386322, |
| "learning_rate": 3.11650276184082e-05, |
| "lookahead_loss": 3.806076729774475, |
| "loss": 2.0577, |
| "step": 197500 |
| }, |
| { |
| "base_loss": 0.2981105833351612, |
| "epoch": 1.0057220458984375, |
| "grad_norm": 0.1120913177728653, |
| "learning_rate": 3.111734390258789e-05, |
| "lookahead_loss": 3.831531247615814, |
| "loss": 2.064, |
| "step": 198000 |
| }, |
| { |
| "base_loss": 0.29635272261500356, |
| "epoch": 1.0066757202148438, |
| "grad_norm": 0.11260683089494705, |
| "learning_rate": 3.106966018676758e-05, |
| "lookahead_loss": 3.8422910799980166, |
| "loss": 2.0727, |
| "step": 198500 |
| }, |
| { |
| "base_loss": 0.313492115303874, |
| "epoch": 1.00762939453125, |
| "grad_norm": 0.12370772659778595, |
| "learning_rate": 3.1021976470947266e-05, |
| "lookahead_loss": 3.8489174375534057, |
| "loss": 2.0755, |
| "step": 199000 |
| }, |
| { |
| "base_loss": 0.31625834056735036, |
| "epoch": 1.0085830688476562, |
| "grad_norm": 0.12102854251861572, |
| "learning_rate": 3.0974292755126956e-05, |
| "lookahead_loss": 3.839761669635773, |
| "loss": 2.0706, |
| "step": 199500 |
| }, |
| { |
| "base_loss": 0.3014394761025906, |
| "epoch": 1.0095367431640625, |
| "grad_norm": 0.11906581372022629, |
| "learning_rate": 3.092660903930664e-05, |
| "lookahead_loss": 3.7962422103881837, |
| "loss": 2.0537, |
| "step": 200000 |
| }, |
| { |
| "epoch": 1.0095367431640625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.3221299815863468, |
| "eval_lookahead_perplexity": 27.71932938548797, |
| "eval_loss": 1.724821925163269, |
| "eval_perplexity": 5.611521669910201, |
| "eval_runtime": 491.8555, |
| "eval_samples_per_second": 10.166, |
| "eval_steps_per_second": 0.319, |
| "step": 200000 |
| }, |
| { |
| "base_loss": 0.29815256175398824, |
| "epoch": 1.0104904174804688, |
| "grad_norm": 0.1050342544913292, |
| "learning_rate": 3.087892532348633e-05, |
| "lookahead_loss": 3.8349307403564454, |
| "loss": 2.0643, |
| "step": 200500 |
| }, |
| { |
| "base_loss": 0.300417543143034, |
| "epoch": 1.011444091796875, |
| "grad_norm": 0.10707239806652069, |
| "learning_rate": 3.083124160766602e-05, |
| "lookahead_loss": 3.8183065605163575, |
| "loss": 2.0646, |
| "step": 201000 |
| }, |
| { |
| "base_loss": 0.3254467163980007, |
| "epoch": 1.0123977661132812, |
| "grad_norm": 0.09747885912656784, |
| "learning_rate": 3.07835578918457e-05, |
| "lookahead_loss": 3.8502072682380675, |
| "loss": 2.0876, |
| "step": 201500 |
| }, |
| { |
| "base_loss": 0.3060005504488945, |
| "epoch": 1.0133514404296875, |
| "grad_norm": 0.12023238092660904, |
| "learning_rate": 3.0735874176025393e-05, |
| "lookahead_loss": 3.8015957136154173, |
| "loss": 2.0575, |
| "step": 202000 |
| }, |
| { |
| "base_loss": 0.29856650426983833, |
| "epoch": 1.0143051147460938, |
| "grad_norm": 0.12048441171646118, |
| "learning_rate": 3.068819046020508e-05, |
| "lookahead_loss": 3.8015453901290894, |
| "loss": 2.0505, |
| "step": 202500 |
| }, |
| { |
| "base_loss": 0.2938228516280651, |
| "epoch": 1.0152587890625, |
| "grad_norm": 0.11696764826774597, |
| "learning_rate": 3.064050674438477e-05, |
| "lookahead_loss": 3.8269337430000303, |
| "loss": 2.0616, |
| "step": 203000 |
| }, |
| { |
| "base_loss": 0.31177652820944785, |
| "epoch": 1.0162124633789062, |
| "grad_norm": 0.11982905864715576, |
| "learning_rate": 3.059282302856446e-05, |
| "lookahead_loss": 3.8382421617507934, |
| "loss": 2.0762, |
| "step": 203500 |
| }, |
| { |
| "base_loss": 0.3144752032160759, |
| "epoch": 1.0171661376953125, |
| "grad_norm": 0.13648830354213715, |
| "learning_rate": 3.054513931274414e-05, |
| "lookahead_loss": 3.83622642326355, |
| "loss": 2.0702, |
| "step": 204000 |
| }, |
| { |
| "base_loss": 0.3018057085573673, |
| "epoch": 1.0181198120117188, |
| "grad_norm": 0.13782760500907898, |
| "learning_rate": 3.049745559692383e-05, |
| "lookahead_loss": 3.8047565789222717, |
| "loss": 2.0474, |
| "step": 204500 |
| }, |
| { |
| "base_loss": 0.29815602460503576, |
| "epoch": 1.019073486328125, |
| "grad_norm": 0.1043187752366066, |
| "learning_rate": 3.0449771881103518e-05, |
| "lookahead_loss": 3.838974130153656, |
| "loss": 2.0681, |
| "step": 205000 |
| }, |
| { |
| "epoch": 1.019073486328125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.311551880912659, |
| "eval_lookahead_perplexity": 27.42765691874828, |
| "eval_loss": 1.7195392847061157, |
| "eval_perplexity": 5.581956179194468, |
| "eval_runtime": 511.3254, |
| "eval_samples_per_second": 9.779, |
| "eval_steps_per_second": 0.307, |
| "step": 205000 |
| }, |
| { |
| "base_loss": 0.30170127359032634, |
| "epoch": 1.0200271606445312, |
| "grad_norm": 0.13211366534233093, |
| "learning_rate": 3.0402088165283205e-05, |
| "lookahead_loss": 3.8131870975494384, |
| "loss": 2.0591, |
| "step": 205500 |
| }, |
| { |
| "base_loss": 0.32987111946940423, |
| "epoch": 1.0209808349609375, |
| "grad_norm": 0.15636946260929108, |
| "learning_rate": 3.035440444946289e-05, |
| "lookahead_loss": 3.8654905581474304, |
| "loss": 2.0925, |
| "step": 206000 |
| }, |
| { |
| "base_loss": 0.3035789504647255, |
| "epoch": 1.0219345092773438, |
| "grad_norm": 0.10754602402448654, |
| "learning_rate": 3.0306720733642578e-05, |
| "lookahead_loss": 3.7953832607269287, |
| "loss": 2.0467, |
| "step": 206500 |
| }, |
| { |
| "base_loss": 0.2993673265874386, |
| "epoch": 1.02288818359375, |
| "grad_norm": 0.1499767005443573, |
| "learning_rate": 3.025903701782227e-05, |
| "lookahead_loss": 3.802317718505859, |
| "loss": 2.0559, |
| "step": 207000 |
| }, |
| { |
| "base_loss": 0.30179986253380775, |
| "epoch": 1.0238418579101562, |
| "grad_norm": 0.10085665434598923, |
| "learning_rate": 3.0211353302001955e-05, |
| "lookahead_loss": 3.8097751059532166, |
| "loss": 2.0572, |
| "step": 207500 |
| }, |
| { |
| "base_loss": 0.3247649165391922, |
| "epoch": 1.0247955322265625, |
| "grad_norm": 0.10755620151758194, |
| "learning_rate": 3.0163669586181642e-05, |
| "lookahead_loss": 3.8524597969055177, |
| "loss": 2.0871, |
| "step": 208000 |
| }, |
| { |
| "base_loss": 0.30702361911535264, |
| "epoch": 1.0257492065429688, |
| "grad_norm": 0.12487711757421494, |
| "learning_rate": 3.011598587036133e-05, |
| "lookahead_loss": 3.7942523493766784, |
| "loss": 2.0572, |
| "step": 208500 |
| }, |
| { |
| "base_loss": 0.30614723709225655, |
| "epoch": 1.026702880859375, |
| "grad_norm": 0.12863144278526306, |
| "learning_rate": 3.0068302154541016e-05, |
| "lookahead_loss": 3.821696516036987, |
| "loss": 2.0563, |
| "step": 209000 |
| }, |
| { |
| "base_loss": 0.30975785833597186, |
| "epoch": 1.0276565551757812, |
| "grad_norm": 0.10876427590847015, |
| "learning_rate": 3.0020618438720706e-05, |
| "lookahead_loss": 3.827836051464081, |
| "loss": 2.0651, |
| "step": 209500 |
| }, |
| { |
| "base_loss": 0.33282243901491165, |
| "epoch": 1.0286102294921875, |
| "grad_norm": 0.1228252649307251, |
| "learning_rate": 2.9972934722900393e-05, |
| "lookahead_loss": 3.8672441935539243, |
| "loss": 2.0942, |
| "step": 210000 |
| }, |
| { |
| "epoch": 1.0286102294921875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.302361313908245, |
| "eval_lookahead_perplexity": 27.176736018932548, |
| "eval_loss": 1.7149664163589478, |
| "eval_perplexity": 5.556488902099112, |
| "eval_runtime": 493.9456, |
| "eval_samples_per_second": 10.123, |
| "eval_steps_per_second": 0.318, |
| "step": 210000 |
| }, |
| { |
| "base_loss": 0.303214881747961, |
| "epoch": 1.0295639038085938, |
| "grad_norm": 0.11192873120307922, |
| "learning_rate": 2.992525100708008e-05, |
| "lookahead_loss": 3.7915321893692018, |
| "loss": 2.0473, |
| "step": 210500 |
| }, |
| { |
| "base_loss": 0.3033870562314987, |
| "epoch": 1.030517578125, |
| "grad_norm": 0.14577656984329224, |
| "learning_rate": 2.9877567291259766e-05, |
| "lookahead_loss": 3.8221058592796324, |
| "loss": 2.0646, |
| "step": 211000 |
| }, |
| { |
| "base_loss": 0.30246006432175637, |
| "epoch": 1.0314712524414062, |
| "grad_norm": 0.12820690870285034, |
| "learning_rate": 2.9829883575439453e-05, |
| "lookahead_loss": 3.830271818637848, |
| "loss": 2.0663, |
| "step": 211500 |
| }, |
| { |
| "base_loss": 0.31892248579859733, |
| "epoch": 1.0324249267578125, |
| "grad_norm": 0.13428226113319397, |
| "learning_rate": 2.9782199859619143e-05, |
| "lookahead_loss": 3.847850811481476, |
| "loss": 2.0872, |
| "step": 212000 |
| }, |
| { |
| "base_loss": 0.3053825112581253, |
| "epoch": 1.0333786010742188, |
| "grad_norm": 0.16832296550273895, |
| "learning_rate": 2.973451614379883e-05, |
| "lookahead_loss": 3.7897876076698305, |
| "loss": 2.0465, |
| "step": 212500 |
| }, |
| { |
| "base_loss": 0.3021465467214584, |
| "epoch": 1.034332275390625, |
| "grad_norm": 0.12703529000282288, |
| "learning_rate": 2.9686832427978517e-05, |
| "lookahead_loss": 3.8201312785148622, |
| "loss": 2.0634, |
| "step": 213000 |
| }, |
| { |
| "base_loss": 0.30765057054162026, |
| "epoch": 1.0352859497070312, |
| "grad_norm": 0.12233400344848633, |
| "learning_rate": 2.9639148712158204e-05, |
| "lookahead_loss": 3.8115249166488647, |
| "loss": 2.0635, |
| "step": 213500 |
| }, |
| { |
| "base_loss": 0.3246081721484661, |
| "epoch": 1.0362396240234375, |
| "grad_norm": 0.11228856444358826, |
| "learning_rate": 2.959146499633789e-05, |
| "lookahead_loss": 3.8441762342453, |
| "loss": 2.0814, |
| "step": 214000 |
| }, |
| { |
| "base_loss": 0.3049994637668133, |
| "epoch": 1.0371932983398438, |
| "grad_norm": 0.1296556442975998, |
| "learning_rate": 2.954378128051758e-05, |
| "lookahead_loss": 3.7977512683868406, |
| "loss": 2.0533, |
| "step": 214500 |
| }, |
| { |
| "base_loss": 0.30023205706477163, |
| "epoch": 1.03814697265625, |
| "grad_norm": 0.13668769598007202, |
| "learning_rate": 2.9496097564697268e-05, |
| "lookahead_loss": 3.7937655339241028, |
| "loss": 2.051, |
| "step": 215000 |
| }, |
| { |
| "epoch": 1.03814697265625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.293529276649792, |
| "eval_lookahead_perplexity": 26.93776691924059, |
| "eval_loss": 1.7105357646942139, |
| "eval_perplexity": 5.531924493648194, |
| "eval_runtime": 525.3589, |
| "eval_samples_per_second": 9.517, |
| "eval_steps_per_second": 0.299, |
| "step": 215000 |
| }, |
| { |
| "base_loss": 0.3105517603158951, |
| "epoch": 1.0391006469726562, |
| "grad_norm": 0.12106386572122574, |
| "learning_rate": 2.9448413848876955e-05, |
| "lookahead_loss": 3.8179289078712464, |
| "loss": 2.0674, |
| "step": 215500 |
| }, |
| { |
| "base_loss": 0.3201953979730606, |
| "epoch": 1.0400543212890625, |
| "grad_norm": 0.16168224811553955, |
| "learning_rate": 2.940073013305664e-05, |
| "lookahead_loss": 3.835119602203369, |
| "loss": 2.0769, |
| "step": 216000 |
| }, |
| { |
| "base_loss": 0.31106969705224036, |
| "epoch": 1.0410079956054688, |
| "grad_norm": 0.10200289636850357, |
| "learning_rate": 2.9353046417236328e-05, |
| "lookahead_loss": 3.7980401215553283, |
| "loss": 2.0487, |
| "step": 216500 |
| }, |
| { |
| "base_loss": 0.29374569734930994, |
| "epoch": 1.041961669921875, |
| "grad_norm": 0.13975954055786133, |
| "learning_rate": 2.930536270141602e-05, |
| "lookahead_loss": 3.8102884998321533, |
| "loss": 2.0536, |
| "step": 217000 |
| }, |
| { |
| "base_loss": 0.30810881498456, |
| "epoch": 1.0429153442382812, |
| "grad_norm": 0.2878585755825043, |
| "learning_rate": 2.9257678985595705e-05, |
| "lookahead_loss": 3.80524068403244, |
| "loss": 2.0591, |
| "step": 217500 |
| }, |
| { |
| "base_loss": 0.32673985859751703, |
| "epoch": 1.0438690185546875, |
| "grad_norm": 0.1362403929233551, |
| "learning_rate": 2.9209995269775392e-05, |
| "lookahead_loss": 3.832825825691223, |
| "loss": 2.0847, |
| "step": 218000 |
| }, |
| { |
| "base_loss": 0.2944556847214699, |
| "epoch": 1.0448226928710938, |
| "grad_norm": 0.18242229521274567, |
| "learning_rate": 2.916231155395508e-05, |
| "lookahead_loss": 3.7715793895721434, |
| "loss": 2.0345, |
| "step": 218500 |
| }, |
| { |
| "base_loss": 0.3020890684425831, |
| "epoch": 1.0457763671875, |
| "grad_norm": 0.09675723314285278, |
| "learning_rate": 2.9114627838134766e-05, |
| "lookahead_loss": 3.8328170766830443, |
| "loss": 2.0663, |
| "step": 219000 |
| }, |
| { |
| "base_loss": 0.32630685463547704, |
| "epoch": 1.0467300415039062, |
| "grad_norm": 0.10813385993242264, |
| "learning_rate": 2.9066944122314456e-05, |
| "lookahead_loss": 3.826720841407776, |
| "loss": 2.0744, |
| "step": 219500 |
| }, |
| { |
| "base_loss": 0.3254209460914135, |
| "epoch": 1.0476837158203125, |
| "grad_norm": 0.18731825053691864, |
| "learning_rate": 2.9019260406494143e-05, |
| "lookahead_loss": 3.837134199142456, |
| "loss": 2.0853, |
| "step": 220000 |
| }, |
| { |
| "epoch": 1.0476837158203125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.2851505911769197, |
| "eval_lookahead_perplexity": 26.71300675513349, |
| "eval_loss": 1.7063519954681396, |
| "eval_perplexity": 5.508828545937477, |
| "eval_runtime": 501.181, |
| "eval_samples_per_second": 9.976, |
| "eval_steps_per_second": 0.313, |
| "step": 220000 |
| }, |
| { |
| "base_loss": 0.29404987835884094, |
| "epoch": 1.0486373901367188, |
| "grad_norm": 0.11766321212053299, |
| "learning_rate": 2.897157669067383e-05, |
| "lookahead_loss": 3.7706230659484863, |
| "loss": 2.0379, |
| "step": 220500 |
| }, |
| { |
| "base_loss": 0.30766302010416985, |
| "epoch": 1.049591064453125, |
| "grad_norm": 0.10604788362979889, |
| "learning_rate": 2.8923892974853516e-05, |
| "lookahead_loss": 3.80825559425354, |
| "loss": 2.0572, |
| "step": 221000 |
| }, |
| { |
| "base_loss": 0.3222936154305935, |
| "epoch": 1.0505447387695312, |
| "grad_norm": 0.10031577199697495, |
| "learning_rate": 2.8876209259033203e-05, |
| "lookahead_loss": 3.831725327968597, |
| "loss": 2.0767, |
| "step": 221500 |
| }, |
| { |
| "base_loss": 0.3045852819383144, |
| "epoch": 1.0514984130859375, |
| "grad_norm": 0.1485632359981537, |
| "learning_rate": 2.8828525543212893e-05, |
| "lookahead_loss": 3.774880935192108, |
| "loss": 2.0437, |
| "step": 222000 |
| }, |
| { |
| "base_loss": 0.30816273841261865, |
| "epoch": 1.0524520874023438, |
| "grad_norm": 0.18679194152355194, |
| "learning_rate": 2.878084182739258e-05, |
| "lookahead_loss": 3.8043067450523376, |
| "loss": 2.0553, |
| "step": 222500 |
| }, |
| { |
| "base_loss": 0.32319829949736595, |
| "epoch": 1.05340576171875, |
| "grad_norm": 0.11528552323579788, |
| "learning_rate": 2.8733158111572267e-05, |
| "lookahead_loss": 3.8278827791213987, |
| "loss": 2.0701, |
| "step": 223000 |
| }, |
| { |
| "base_loss": 0.3588077034056187, |
| "epoch": 1.0543594360351562, |
| "grad_norm": 0.10808339715003967, |
| "learning_rate": 2.8685474395751954e-05, |
| "lookahead_loss": 3.8612174353599547, |
| "loss": 2.1125, |
| "step": 223500 |
| }, |
| { |
| "base_loss": 0.29601221990585325, |
| "epoch": 1.0553131103515625, |
| "grad_norm": 0.13398034870624542, |
| "learning_rate": 2.863779067993164e-05, |
| "lookahead_loss": 3.758062246322632, |
| "loss": 2.0311, |
| "step": 224000 |
| }, |
| { |
| "base_loss": 0.3043306003510952, |
| "epoch": 1.0562667846679688, |
| "grad_norm": 0.10396202653646469, |
| "learning_rate": 2.859010696411133e-05, |
| "lookahead_loss": 3.826102759838104, |
| "loss": 2.0674, |
| "step": 224500 |
| }, |
| { |
| "base_loss": 0.31856663155555726, |
| "epoch": 1.057220458984375, |
| "grad_norm": 0.12544843554496765, |
| "learning_rate": 2.8542423248291018e-05, |
| "lookahead_loss": 3.824730550289154, |
| "loss": 2.0742, |
| "step": 225000 |
| }, |
| { |
| "epoch": 1.057220458984375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.2769267330535303, |
| "eval_lookahead_perplexity": 26.494223631272767, |
| "eval_loss": 1.7022335529327393, |
| "eval_perplexity": 5.486187407250397, |
| "eval_runtime": 485.513, |
| "eval_samples_per_second": 10.298, |
| "eval_steps_per_second": 0.323, |
| "step": 225000 |
| }, |
| { |
| "base_loss": 0.32003091636300085, |
| "epoch": 1.0581741333007812, |
| "grad_norm": 0.10620646923780441, |
| "learning_rate": 2.8494739532470705e-05, |
| "lookahead_loss": 3.801748302936554, |
| "loss": 2.0581, |
| "step": 225500 |
| }, |
| { |
| "base_loss": 0.29179560819268224, |
| "epoch": 1.0591278076171875, |
| "grad_norm": 0.09537842869758606, |
| "learning_rate": 2.844705581665039e-05, |
| "lookahead_loss": 3.76057571554184, |
| "loss": 2.0317, |
| "step": 226000 |
| }, |
| { |
| "base_loss": 0.30011604171991346, |
| "epoch": 1.0600814819335938, |
| "grad_norm": 0.09911732375621796, |
| "learning_rate": 2.8399372100830078e-05, |
| "lookahead_loss": 3.809082925796509, |
| "loss": 2.0564, |
| "step": 226500 |
| }, |
| { |
| "base_loss": 0.3230703995227814, |
| "epoch": 1.06103515625, |
| "grad_norm": 0.09991439431905746, |
| "learning_rate": 2.835168838500977e-05, |
| "lookahead_loss": 3.8153861479759215, |
| "loss": 2.0645, |
| "step": 227000 |
| }, |
| { |
| "base_loss": 0.30637279444932936, |
| "epoch": 1.0619888305664062, |
| "grad_norm": 0.13179980218410492, |
| "learning_rate": 2.8304004669189455e-05, |
| "lookahead_loss": 3.786368088722229, |
| "loss": 2.0365, |
| "step": 227500 |
| }, |
| { |
| "base_loss": 0.3066646957695484, |
| "epoch": 1.0629425048828125, |
| "grad_norm": 0.1195763424038887, |
| "learning_rate": 2.8256320953369142e-05, |
| "lookahead_loss": 3.8267566895484926, |
| "loss": 2.0608, |
| "step": 228000 |
| }, |
| { |
| "base_loss": 0.31583699241280555, |
| "epoch": 1.0638961791992188, |
| "grad_norm": 0.1081557646393776, |
| "learning_rate": 2.820863723754883e-05, |
| "lookahead_loss": 3.819593190193176, |
| "loss": 2.0659, |
| "step": 228500 |
| }, |
| { |
| "base_loss": 0.30502623090147973, |
| "epoch": 1.064849853515625, |
| "grad_norm": 0.12640877068042755, |
| "learning_rate": 2.8160953521728516e-05, |
| "lookahead_loss": 3.7740664672851563, |
| "loss": 2.0424, |
| "step": 229000 |
| }, |
| { |
| "base_loss": 0.30984748020768166, |
| "epoch": 1.0658035278320312, |
| "grad_norm": 0.13270865380764008, |
| "learning_rate": 2.8113269805908206e-05, |
| "lookahead_loss": 3.7949006910324097, |
| "loss": 2.0514, |
| "step": 229500 |
| }, |
| { |
| "base_loss": 0.3079377235472202, |
| "epoch": 1.0667572021484375, |
| "grad_norm": 0.10393428802490234, |
| "learning_rate": 2.8065586090087893e-05, |
| "lookahead_loss": 3.801414387702942, |
| "loss": 2.0527, |
| "step": 230000 |
| }, |
| { |
| "epoch": 1.0667572021484375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.269460886050337, |
| "eval_lookahead_perplexity": 26.297158356119088, |
| "eval_loss": 1.6984984874725342, |
| "eval_perplexity": 5.4657343586728935, |
| "eval_runtime": 501.4886, |
| "eval_samples_per_second": 9.97, |
| "eval_steps_per_second": 0.313, |
| "step": 230000 |
| }, |
| { |
| "base_loss": 0.33075104707479475, |
| "epoch": 1.0677108764648438, |
| "grad_norm": 0.15981747210025787, |
| "learning_rate": 2.801790237426758e-05, |
| "lookahead_loss": 3.83703492307663, |
| "loss": 2.081, |
| "step": 230500 |
| }, |
| { |
| "base_loss": 0.3024148660302162, |
| "epoch": 1.06866455078125, |
| "grad_norm": 0.13739366829395294, |
| "learning_rate": 2.7970218658447266e-05, |
| "lookahead_loss": 3.754325032234192, |
| "loss": 2.0253, |
| "step": 231000 |
| }, |
| { |
| "base_loss": 0.3048691195845604, |
| "epoch": 1.0696182250976562, |
| "grad_norm": 0.12613807618618011, |
| "learning_rate": 2.7922534942626953e-05, |
| "lookahead_loss": 3.824920612812042, |
| "loss": 2.0616, |
| "step": 231500 |
| }, |
| { |
| "base_loss": 0.3433072043955326, |
| "epoch": 1.0705718994140625, |
| "grad_norm": 0.10592526942491531, |
| "learning_rate": 2.7874851226806643e-05, |
| "lookahead_loss": 3.845939799785614, |
| "loss": 2.0977, |
| "step": 232000 |
| }, |
| { |
| "base_loss": 0.3126540828049183, |
| "epoch": 1.0715255737304688, |
| "grad_norm": 0.14438621699810028, |
| "learning_rate": 2.782716751098633e-05, |
| "lookahead_loss": 3.7780807838439943, |
| "loss": 2.045, |
| "step": 232500 |
| }, |
| { |
| "base_loss": 0.30936010053753854, |
| "epoch": 1.072479248046875, |
| "grad_norm": 0.12224919348955154, |
| "learning_rate": 2.7779483795166017e-05, |
| "lookahead_loss": 3.790074597835541, |
| "loss": 2.0452, |
| "step": 233000 |
| }, |
| { |
| "base_loss": 0.3016264271736145, |
| "epoch": 1.0734329223632812, |
| "grad_norm": 0.09938216209411621, |
| "learning_rate": 2.7731800079345704e-05, |
| "lookahead_loss": 3.7917777862548827, |
| "loss": 2.0494, |
| "step": 233500 |
| }, |
| { |
| "base_loss": 0.3292314064204693, |
| "epoch": 1.0743865966796875, |
| "grad_norm": 0.10616600513458252, |
| "learning_rate": 2.768411636352539e-05, |
| "lookahead_loss": 3.8125745573043823, |
| "loss": 2.0687, |
| "step": 234000 |
| }, |
| { |
| "base_loss": 0.304562608808279, |
| "epoch": 1.0753402709960938, |
| "grad_norm": 0.12816548347473145, |
| "learning_rate": 2.763643264770508e-05, |
| "lookahead_loss": 3.7680485906600953, |
| "loss": 2.0349, |
| "step": 234500 |
| }, |
| { |
| "base_loss": 0.3058357034623623, |
| "epoch": 1.0762939453125, |
| "grad_norm": 0.16024808585643768, |
| "learning_rate": 2.7588748931884768e-05, |
| "lookahead_loss": 3.8191744446754456, |
| "loss": 2.0615, |
| "step": 235000 |
| }, |
| { |
| "epoch": 1.0762939453125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.262039216562582, |
| "eval_lookahead_perplexity": 26.102711989194106, |
| "eval_loss": 1.6948232650756836, |
| "eval_perplexity": 5.445683437708976, |
| "eval_runtime": 506.7872, |
| "eval_samples_per_second": 9.866, |
| "eval_steps_per_second": 0.31, |
| "step": 235000 |
| }, |
| { |
| "base_loss": 0.32823599469661713, |
| "epoch": 1.0772476196289062, |
| "grad_norm": 0.1520950198173523, |
| "learning_rate": 2.7541065216064455e-05, |
| "lookahead_loss": 3.822832639217377, |
| "loss": 2.0821, |
| "step": 235500 |
| }, |
| { |
| "base_loss": 0.3016549552977085, |
| "epoch": 1.0782012939453125, |
| "grad_norm": 0.12211694568395615, |
| "learning_rate": 2.749338150024414e-05, |
| "lookahead_loss": 3.756803053855896, |
| "loss": 2.0364, |
| "step": 236000 |
| }, |
| { |
| "base_loss": 0.29828149917721747, |
| "epoch": 1.0791549682617188, |
| "grad_norm": 0.12640158832073212, |
| "learning_rate": 2.7445697784423828e-05, |
| "lookahead_loss": 3.789377547264099, |
| "loss": 2.0469, |
| "step": 236500 |
| }, |
| { |
| "base_loss": 0.31368752831220625, |
| "epoch": 1.080108642578125, |
| "grad_norm": 0.11661666631698608, |
| "learning_rate": 2.739801406860352e-05, |
| "lookahead_loss": 3.815073594093323, |
| "loss": 2.0704, |
| "step": 237000 |
| }, |
| { |
| "base_loss": 0.3168468432724476, |
| "epoch": 1.0810623168945312, |
| "grad_norm": 0.15805600583553314, |
| "learning_rate": 2.7350330352783205e-05, |
| "lookahead_loss": 3.788653570652008, |
| "loss": 2.0595, |
| "step": 237500 |
| }, |
| { |
| "base_loss": 0.29921497783064843, |
| "epoch": 1.0820159912109375, |
| "grad_norm": 0.13112910091876984, |
| "learning_rate": 2.7302646636962892e-05, |
| "lookahead_loss": 3.765235338687897, |
| "loss": 2.0388, |
| "step": 238000 |
| }, |
| { |
| "base_loss": 0.3023185026049614, |
| "epoch": 1.0829696655273438, |
| "grad_norm": 0.09834863990545273, |
| "learning_rate": 2.725496292114258e-05, |
| "lookahead_loss": 3.7986365513801577, |
| "loss": 2.0509, |
| "step": 238500 |
| }, |
| { |
| "base_loss": 0.33467673206329346, |
| "epoch": 1.08392333984375, |
| "grad_norm": 0.10940661281347275, |
| "learning_rate": 2.7207279205322266e-05, |
| "lookahead_loss": 3.8246895036697386, |
| "loss": 2.0764, |
| "step": 239000 |
| }, |
| { |
| "base_loss": 0.30601534658670426, |
| "epoch": 1.0848770141601562, |
| "grad_norm": 0.1469735950231552, |
| "learning_rate": 2.7159595489501956e-05, |
| "lookahead_loss": 3.773017762184143, |
| "loss": 2.0409, |
| "step": 239500 |
| }, |
| { |
| "base_loss": 0.2985472394824028, |
| "epoch": 1.0858306884765625, |
| "grad_norm": 0.10987813770771027, |
| "learning_rate": 2.7111911773681643e-05, |
| "lookahead_loss": 3.779964041233063, |
| "loss": 2.038, |
| "step": 240000 |
| }, |
| { |
| "epoch": 1.0858306884765625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.2550360676579584, |
| "eval_lookahead_perplexity": 25.920549410649766, |
| "eval_loss": 1.6912927627563477, |
| "eval_perplexity": 5.426491338512043, |
| "eval_runtime": 484.3801, |
| "eval_samples_per_second": 10.322, |
| "eval_steps_per_second": 0.324, |
| "step": 240000 |
| }, |
| { |
| "base_loss": 0.3033053425848484, |
| "epoch": 1.0867843627929688, |
| "grad_norm": 0.4711226522922516, |
| "learning_rate": 2.706422805786133e-05, |
| "lookahead_loss": 3.782335328578949, |
| "loss": 2.0383, |
| "step": 240500 |
| }, |
| { |
| "base_loss": 0.33623868149518965, |
| "epoch": 1.087738037109375, |
| "grad_norm": 0.10267792642116547, |
| "learning_rate": 2.7016544342041016e-05, |
| "lookahead_loss": 3.8300727925300597, |
| "loss": 2.0763, |
| "step": 241000 |
| }, |
| { |
| "base_loss": 0.3016283850669861, |
| "epoch": 1.0886917114257812, |
| "grad_norm": 0.12222345918416977, |
| "learning_rate": 2.6968860626220703e-05, |
| "lookahead_loss": 3.7537764272689818, |
| "loss": 2.0287, |
| "step": 241500 |
| }, |
| { |
| "base_loss": 0.3083756065964699, |
| "epoch": 1.0896453857421875, |
| "grad_norm": 0.11414934694766998, |
| "learning_rate": 2.6921176910400393e-05, |
| "lookahead_loss": 3.752362766265869, |
| "loss": 2.0298, |
| "step": 242000 |
| }, |
| { |
| "base_loss": 0.2994114246070385, |
| "epoch": 1.0905990600585938, |
| "grad_norm": 0.10676714032888412, |
| "learning_rate": 2.687349319458008e-05, |
| "lookahead_loss": 3.7789285941123962, |
| "loss": 2.0418, |
| "step": 242500 |
| }, |
| { |
| "base_loss": 0.29944976773858073, |
| "epoch": 1.091552734375, |
| "grad_norm": 0.10604005306959152, |
| "learning_rate": 2.6825809478759767e-05, |
| "lookahead_loss": 3.779764890193939, |
| "loss": 2.0375, |
| "step": 243000 |
| }, |
| { |
| "base_loss": 0.32598793333768844, |
| "epoch": 1.0925064086914062, |
| "grad_norm": 0.16571524739265442, |
| "learning_rate": 2.6778125762939454e-05, |
| "lookahead_loss": 3.800921561717987, |
| "loss": 2.0587, |
| "step": 243500 |
| }, |
| { |
| "base_loss": 0.30856517258286476, |
| "epoch": 1.0934600830078125, |
| "grad_norm": 0.12707704305648804, |
| "learning_rate": 2.673044204711914e-05, |
| "lookahead_loss": 3.7637797536849975, |
| "loss": 2.0395, |
| "step": 244000 |
| }, |
| { |
| "base_loss": 0.2889265112578869, |
| "epoch": 1.0944137573242188, |
| "grad_norm": 0.16245336830615997, |
| "learning_rate": 2.668275833129883e-05, |
| "lookahead_loss": 3.7417610387802123, |
| "loss": 2.0145, |
| "step": 244500 |
| }, |
| { |
| "base_loss": 0.2941120155751705, |
| "epoch": 1.095367431640625, |
| "grad_norm": 0.11395586282014847, |
| "learning_rate": 2.6635074615478518e-05, |
| "lookahead_loss": 3.7530963735580443, |
| "loss": 2.0266, |
| "step": 245000 |
| }, |
| { |
| "epoch": 1.095367431640625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.2479548934168707, |
| "eval_lookahead_perplexity": 25.737649820044062, |
| "eval_loss": 1.687726616859436, |
| "eval_perplexity": 5.4071741431311375, |
| "eval_runtime": 512.0102, |
| "eval_samples_per_second": 9.765, |
| "eval_steps_per_second": 0.307, |
| "step": 245000 |
| }, |
| { |
| "base_loss": 0.3011808663010597, |
| "epoch": 1.0963211059570312, |
| "grad_norm": 0.2450723797082901, |
| "learning_rate": 2.6587390899658205e-05, |
| "lookahead_loss": 3.7636666469573976, |
| "loss": 2.0324, |
| "step": 245500 |
| }, |
| { |
| "base_loss": 0.33242677092552186, |
| "epoch": 1.0972747802734375, |
| "grad_norm": 0.11119936406612396, |
| "learning_rate": 2.653970718383789e-05, |
| "lookahead_loss": 3.8032662024497985, |
| "loss": 2.0627, |
| "step": 246000 |
| }, |
| { |
| "base_loss": 0.29240253108739855, |
| "epoch": 1.0982284545898438, |
| "grad_norm": 0.12222541123628616, |
| "learning_rate": 2.6492023468017578e-05, |
| "lookahead_loss": 3.7323589258193968, |
| "loss": 2.0156, |
| "step": 246500 |
| }, |
| { |
| "base_loss": 0.29588570061326025, |
| "epoch": 1.09918212890625, |
| "grad_norm": 0.09957096725702286, |
| "learning_rate": 2.644433975219727e-05, |
| "lookahead_loss": 3.7686120963096617, |
| "loss": 2.035, |
| "step": 247000 |
| }, |
| { |
| "base_loss": 0.2992869386672974, |
| "epoch": 1.1001358032226562, |
| "grad_norm": 0.1540381759405136, |
| "learning_rate": 2.6396656036376955e-05, |
| "lookahead_loss": 3.775865716457367, |
| "loss": 2.0415, |
| "step": 247500 |
| }, |
| { |
| "base_loss": 0.3209733834564686, |
| "epoch": 1.1010894775390625, |
| "grad_norm": 0.09748150408267975, |
| "learning_rate": 2.6348972320556642e-05, |
| "lookahead_loss": 3.805611068725586, |
| "loss": 2.059, |
| "step": 248000 |
| }, |
| { |
| "base_loss": 0.301429179161787, |
| "epoch": 1.1020431518554688, |
| "grad_norm": 0.14414113759994507, |
| "learning_rate": 2.630128860473633e-05, |
| "lookahead_loss": 3.752038255691528, |
| "loss": 2.0265, |
| "step": 248500 |
| }, |
| { |
| "base_loss": 0.2956730664372444, |
| "epoch": 1.102996826171875, |
| "grad_norm": 0.117804616689682, |
| "learning_rate": 2.6253604888916016e-05, |
| "lookahead_loss": 3.7491831588745117, |
| "loss": 2.031, |
| "step": 249000 |
| }, |
| { |
| "base_loss": 0.29523133793473244, |
| "epoch": 1.1039505004882812, |
| "grad_norm": 0.09736798703670502, |
| "learning_rate": 2.6205921173095706e-05, |
| "lookahead_loss": 3.778213514328003, |
| "loss": 2.0405, |
| "step": 249500 |
| }, |
| { |
| "base_loss": 0.31446042719483375, |
| "epoch": 1.1049041748046875, |
| "grad_norm": 0.11215147376060486, |
| "learning_rate": 2.6158237457275393e-05, |
| "lookahead_loss": 3.776704475879669, |
| "loss": 2.0502, |
| "step": 250000 |
| }, |
| { |
| "epoch": 1.1049041748046875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.241219103145904, |
| "eval_lookahead_perplexity": 25.5648689698412, |
| "eval_loss": 1.6843818426132202, |
| "eval_perplexity": 5.389118579038349, |
| "eval_runtime": 490.435, |
| "eval_samples_per_second": 10.195, |
| "eval_steps_per_second": 0.32, |
| "step": 250000 |
| }, |
| { |
| "base_loss": 0.30885917544364927, |
| "epoch": 1.1058578491210938, |
| "grad_norm": 0.14926961064338684, |
| "learning_rate": 2.611055374145508e-05, |
| "lookahead_loss": 3.7549472694396973, |
| "loss": 2.0303, |
| "step": 250500 |
| }, |
| { |
| "base_loss": 0.29513884752988817, |
| "epoch": 1.1068115234375, |
| "grad_norm": 0.11320330202579498, |
| "learning_rate": 2.6062870025634766e-05, |
| "lookahead_loss": 3.7395705704689024, |
| "loss": 2.02, |
| "step": 251000 |
| }, |
| { |
| "base_loss": 0.2974797194004059, |
| "epoch": 1.1077651977539062, |
| "grad_norm": 0.1485828310251236, |
| "learning_rate": 2.6015186309814453e-05, |
| "lookahead_loss": 3.767058692932129, |
| "loss": 2.0293, |
| "step": 251500 |
| }, |
| { |
| "base_loss": 0.3221335953772068, |
| "epoch": 1.1087188720703125, |
| "grad_norm": 0.12389354407787323, |
| "learning_rate": 2.5967502593994143e-05, |
| "lookahead_loss": 3.7966585497856142, |
| "loss": 2.0583, |
| "step": 252000 |
| }, |
| { |
| "base_loss": 0.31462463283538816, |
| "epoch": 1.1096725463867188, |
| "grad_norm": 0.12376336008310318, |
| "learning_rate": 2.591981887817383e-05, |
| "lookahead_loss": 3.75765408372879, |
| "loss": 2.0376, |
| "step": 252500 |
| }, |
| { |
| "base_loss": 0.3006181915104389, |
| "epoch": 1.110626220703125, |
| "grad_norm": 0.10401830077171326, |
| "learning_rate": 2.5872135162353517e-05, |
| "lookahead_loss": 3.7382473673820495, |
| "loss": 2.0207, |
| "step": 253000 |
| }, |
| { |
| "base_loss": 0.2966276684105396, |
| "epoch": 1.1115798950195312, |
| "grad_norm": 0.2943388521671295, |
| "learning_rate": 2.5824451446533204e-05, |
| "lookahead_loss": 3.77330969953537, |
| "loss": 2.0325, |
| "step": 253500 |
| }, |
| { |
| "base_loss": 0.30631836572289467, |
| "epoch": 1.1125335693359375, |
| "grad_norm": 0.14963500201702118, |
| "learning_rate": 2.577676773071289e-05, |
| "lookahead_loss": 3.777679774284363, |
| "loss": 2.0433, |
| "step": 254000 |
| }, |
| { |
| "base_loss": 0.3482712984383106, |
| "epoch": 1.1134872436523438, |
| "grad_norm": 0.11030125617980957, |
| "learning_rate": 2.572908401489258e-05, |
| "lookahead_loss": 3.8148307304382323, |
| "loss": 2.0723, |
| "step": 254500 |
| }, |
| { |
| "base_loss": 0.29542491587996483, |
| "epoch": 1.11444091796875, |
| "grad_norm": 0.11233729124069214, |
| "learning_rate": 2.5681400299072268e-05, |
| "lookahead_loss": 3.7286681451797485, |
| "loss": 2.014, |
| "step": 255000 |
| }, |
| { |
| "epoch": 1.11444091796875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.2360856357854777, |
| "eval_lookahead_perplexity": 25.433968822689298, |
| "eval_loss": 1.6818461418151855, |
| "eval_perplexity": 5.375470697541516, |
| "eval_runtime": 525.262, |
| "eval_samples_per_second": 9.519, |
| "eval_steps_per_second": 0.299, |
| "step": 255000 |
| }, |
| { |
| "base_loss": 0.2994180084168911, |
| "epoch": 1.1153945922851562, |
| "grad_norm": 0.3878551721572876, |
| "learning_rate": 2.5633716583251955e-05, |
| "lookahead_loss": 3.7766121606826784, |
| "loss": 2.0408, |
| "step": 255500 |
| }, |
| { |
| "base_loss": 0.3126500599086285, |
| "epoch": 1.1163482666015625, |
| "grad_norm": 0.1614830195903778, |
| "learning_rate": 2.558603286743164e-05, |
| "lookahead_loss": 3.7671957154273987, |
| "loss": 2.0408, |
| "step": 256000 |
| }, |
| { |
| "base_loss": 0.3251005619764328, |
| "epoch": 1.1173019409179688, |
| "grad_norm": 0.10149198770523071, |
| "learning_rate": 2.5538349151611328e-05, |
| "lookahead_loss": 3.8031828441619875, |
| "loss": 2.0681, |
| "step": 256500 |
| }, |
| { |
| "base_loss": 0.3045504302084446, |
| "epoch": 1.118255615234375, |
| "grad_norm": 0.10480870306491852, |
| "learning_rate": 2.549066543579102e-05, |
| "lookahead_loss": 3.7594980635643007, |
| "loss": 2.0284, |
| "step": 257000 |
| }, |
| { |
| "base_loss": 0.3033926927447319, |
| "epoch": 1.1192092895507812, |
| "grad_norm": 0.1331453174352646, |
| "learning_rate": 2.5442981719970705e-05, |
| "lookahead_loss": 3.7688667068481445, |
| "loss": 2.0333, |
| "step": 257500 |
| }, |
| { |
| "base_loss": 0.3065674279928207, |
| "epoch": 2.0009536743164062, |
| "grad_norm": 0.11469951272010803, |
| "learning_rate": 2.5395298004150392e-05, |
| "lookahead_loss": 3.7806892952919005, |
| "loss": 2.0367, |
| "step": 258000 |
| }, |
| { |
| "base_loss": 0.30196980077028274, |
| "epoch": 2.0019073486328125, |
| "grad_norm": 0.17589828372001648, |
| "learning_rate": 2.534761428833008e-05, |
| "lookahead_loss": 3.761930028438568, |
| "loss": 2.0342, |
| "step": 258500 |
| }, |
| { |
| "base_loss": 0.3121912784278393, |
| "epoch": 2.0028610229492188, |
| "grad_norm": 0.10175404697656631, |
| "learning_rate": 2.5299930572509766e-05, |
| "lookahead_loss": 3.7761128277778626, |
| "loss": 2.0353, |
| "step": 259000 |
| }, |
| { |
| "base_loss": 0.320074492007494, |
| "epoch": 2.003814697265625, |
| "grad_norm": 0.11250407993793488, |
| "learning_rate": 2.5252246856689456e-05, |
| "lookahead_loss": 3.772739490509033, |
| "loss": 2.0486, |
| "step": 259500 |
| }, |
| { |
| "base_loss": 0.3025432696044445, |
| "epoch": 2.0047683715820312, |
| "grad_norm": 0.10035926848649979, |
| "learning_rate": 2.5204563140869143e-05, |
| "lookahead_loss": 3.7355602521896363, |
| "loss": 2.0217, |
| "step": 260000 |
| }, |
| { |
| "epoch": 2.0047683715820312, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.229613203591051, |
| "eval_lookahead_perplexity": 25.269880781246492, |
| "eval_loss": 1.678591012954712, |
| "eval_perplexity": 5.358001295737592, |
| "eval_runtime": 520.5248, |
| "eval_samples_per_second": 9.606, |
| "eval_steps_per_second": 0.302, |
| "step": 260000 |
| }, |
| { |
| "base_loss": 0.29722241711616515, |
| "epoch": 2.0057220458984375, |
| "grad_norm": 0.11531388014554977, |
| "learning_rate": 2.515687942504883e-05, |
| "lookahead_loss": 3.757506604194641, |
| "loss": 2.0282, |
| "step": 260500 |
| }, |
| { |
| "base_loss": 0.29807624077796935, |
| "epoch": 2.0066757202148438, |
| "grad_norm": 0.103513203561306, |
| "learning_rate": 2.5109195709228516e-05, |
| "lookahead_loss": 3.769570269584656, |
| "loss": 2.0366, |
| "step": 261000 |
| }, |
| { |
| "base_loss": 0.3125916388332844, |
| "epoch": 2.00762939453125, |
| "grad_norm": 0.12189048528671265, |
| "learning_rate": 2.5061511993408203e-05, |
| "lookahead_loss": 3.777708933353424, |
| "loss": 2.0416, |
| "step": 261500 |
| }, |
| { |
| "base_loss": 0.31632441571354863, |
| "epoch": 2.0085830688476562, |
| "grad_norm": 0.11706750094890594, |
| "learning_rate": 2.5013828277587893e-05, |
| "lookahead_loss": 3.768247010707855, |
| "loss": 2.0345, |
| "step": 262000 |
| }, |
| { |
| "base_loss": 0.2996086142659187, |
| "epoch": 2.0095367431640625, |
| "grad_norm": 0.12055575102567673, |
| "learning_rate": 2.496614456176758e-05, |
| "lookahead_loss": 3.724477370262146, |
| "loss": 2.0186, |
| "step": 262500 |
| }, |
| { |
| "base_loss": 0.2994557471871376, |
| "epoch": 2.0104904174804688, |
| "grad_norm": 0.09918702393770218, |
| "learning_rate": 2.4918460845947267e-05, |
| "lookahead_loss": 3.7654948663711547, |
| "loss": 2.0302, |
| "step": 263000 |
| }, |
| { |
| "base_loss": 0.30171854814887045, |
| "epoch": 2.011444091796875, |
| "grad_norm": 0.10484851896762848, |
| "learning_rate": 2.4870777130126954e-05, |
| "lookahead_loss": 3.748970988750458, |
| "loss": 2.0291, |
| "step": 263500 |
| }, |
| { |
| "base_loss": 0.3268180110156536, |
| "epoch": 2.0123977661132812, |
| "grad_norm": 0.09725604206323624, |
| "learning_rate": 2.482309341430664e-05, |
| "lookahead_loss": 3.7823777060508728, |
| "loss": 2.0534, |
| "step": 264000 |
| }, |
| { |
| "base_loss": 0.30524489533901217, |
| "epoch": 2.0133514404296875, |
| "grad_norm": 0.11222957819700241, |
| "learning_rate": 2.477540969848633e-05, |
| "lookahead_loss": 3.732293013095856, |
| "loss": 2.0222, |
| "step": 264500 |
| }, |
| { |
| "base_loss": 0.29953240939974785, |
| "epoch": 2.0143051147460938, |
| "grad_norm": 0.12111784517765045, |
| "learning_rate": 2.4727725982666018e-05, |
| "lookahead_loss": 3.73520312833786, |
| "loss": 2.0174, |
| "step": 265000 |
| }, |
| { |
| "epoch": 2.0143051147460938, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.2239884416135354, |
| "eval_lookahead_perplexity": 25.128142711795284, |
| "eval_loss": 1.6757808923721313, |
| "eval_perplexity": 5.342965801684929, |
| "eval_runtime": 518.5198, |
| "eval_samples_per_second": 9.643, |
| "eval_steps_per_second": 0.303, |
| "step": 265000 |
| }, |
| { |
| "base_loss": 0.2956546121239662, |
| "epoch": 2.0152587890625, |
| "grad_norm": 0.11768271774053574, |
| "learning_rate": 2.4680042266845705e-05, |
| "lookahead_loss": 3.7577808418273926, |
| "loss": 2.0272, |
| "step": 265500 |
| }, |
| { |
| "base_loss": 0.31376709473133085, |
| "epoch": 2.0162124633789062, |
| "grad_norm": 0.1221555769443512, |
| "learning_rate": 2.463235855102539e-05, |
| "lookahead_loss": 3.771606719017029, |
| "loss": 2.0436, |
| "step": 266000 |
| }, |
| { |
| "base_loss": 0.31018616977334024, |
| "epoch": 2.0171661376953125, |
| "grad_norm": 0.13717730343341827, |
| "learning_rate": 2.4584674835205078e-05, |
| "lookahead_loss": 3.764633895397186, |
| "loss": 2.0331, |
| "step": 266500 |
| }, |
| { |
| "base_loss": 0.30160776057839395, |
| "epoch": 2.0181198120117188, |
| "grad_norm": 0.13953110575675964, |
| "learning_rate": 2.453699111938477e-05, |
| "lookahead_loss": 3.7366544070243837, |
| "loss": 2.0145, |
| "step": 267000 |
| }, |
| { |
| "base_loss": 0.3013426844775677, |
| "epoch": 2.019073486328125, |
| "grad_norm": 0.10401706397533417, |
| "learning_rate": 2.4489307403564455e-05, |
| "lookahead_loss": 3.773498547077179, |
| "loss": 2.036, |
| "step": 267500 |
| }, |
| { |
| "base_loss": 0.30037295311689377, |
| "epoch": 2.0200271606445312, |
| "grad_norm": 0.1209816038608551, |
| "learning_rate": 2.4441623687744142e-05, |
| "lookahead_loss": 3.7449615778923033, |
| "loss": 2.0239, |
| "step": 268000 |
| }, |
| { |
| "base_loss": 0.33094308829307556, |
| "epoch": 2.0209808349609375, |
| "grad_norm": 0.14925232529640198, |
| "learning_rate": 2.439393997192383e-05, |
| "lookahead_loss": 3.802969255924225, |
| "loss": 2.0617, |
| "step": 268500 |
| }, |
| { |
| "base_loss": 0.3023415932953358, |
| "epoch": 2.0219345092773438, |
| "grad_norm": 0.1070331484079361, |
| "learning_rate": 2.4346256256103516e-05, |
| "lookahead_loss": 3.728753103733063, |
| "loss": 2.0133, |
| "step": 269000 |
| }, |
| { |
| "base_loss": 0.30017873507738113, |
| "epoch": 2.02288818359375, |
| "grad_norm": 0.15573182702064514, |
| "learning_rate": 2.4298572540283206e-05, |
| "lookahead_loss": 3.736812686443329, |
| "loss": 2.0228, |
| "step": 269500 |
| }, |
| { |
| "base_loss": 0.30104174053668975, |
| "epoch": 2.0238418579101562, |
| "grad_norm": 0.12105967849493027, |
| "learning_rate": 2.4250888824462893e-05, |
| "lookahead_loss": 3.7435108699798585, |
| "loss": 2.0246, |
| "step": 270000 |
| }, |
| { |
| "epoch": 2.0238418579101562, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.2185772630734184, |
| "eval_lookahead_perplexity": 24.992537069258876, |
| "eval_loss": 1.6730648279190063, |
| "eval_perplexity": 5.328473651912189, |
| "eval_runtime": 520.3975, |
| "eval_samples_per_second": 9.608, |
| "eval_steps_per_second": 0.302, |
| "step": 270000 |
| }, |
| { |
| "base_loss": 0.32663769656419755, |
| "epoch": 2.0247955322265625, |
| "grad_norm": 0.10453636199235916, |
| "learning_rate": 2.420320510864258e-05, |
| "lookahead_loss": 3.78777219581604, |
| "loss": 2.0562, |
| "step": 270500 |
| }, |
| { |
| "base_loss": 0.3077376018166542, |
| "epoch": 2.0257492065429688, |
| "grad_norm": 0.12286706268787384, |
| "learning_rate": 2.4155521392822266e-05, |
| "lookahead_loss": 3.732728928089142, |
| "loss": 2.0261, |
| "step": 271000 |
| }, |
| { |
| "base_loss": 0.304711830675602, |
| "epoch": 2.026702880859375, |
| "grad_norm": 0.12905757129192352, |
| "learning_rate": 2.4107837677001953e-05, |
| "lookahead_loss": 3.755615324497223, |
| "loss": 2.0239, |
| "step": 271500 |
| }, |
| { |
| "base_loss": 0.30911297634243967, |
| "epoch": 2.0276565551757812, |
| "grad_norm": 0.1109541729092598, |
| "learning_rate": 2.406015396118164e-05, |
| "lookahead_loss": 3.763146149635315, |
| "loss": 2.0324, |
| "step": 272000 |
| }, |
| { |
| "base_loss": 0.3343582956790924, |
| "epoch": 2.0286102294921875, |
| "grad_norm": 0.13435722887516022, |
| "learning_rate": 2.401247024536133e-05, |
| "lookahead_loss": 3.80504735994339, |
| "loss": 2.0631, |
| "step": 272500 |
| }, |
| { |
| "base_loss": 0.30309502825140955, |
| "epoch": 2.0295639038085938, |
| "grad_norm": 0.10892323404550552, |
| "learning_rate": 2.3964786529541017e-05, |
| "lookahead_loss": 3.728642023563385, |
| "loss": 2.0167, |
| "step": 273000 |
| }, |
| { |
| "base_loss": 0.30229189068078993, |
| "epoch": 2.030517578125, |
| "grad_norm": 0.15014854073524475, |
| "learning_rate": 2.3917102813720704e-05, |
| "lookahead_loss": 3.759530487060547, |
| "loss": 2.0321, |
| "step": 273500 |
| }, |
| { |
| "base_loss": 0.3015878119468689, |
| "epoch": 2.0314712524414062, |
| "grad_norm": 0.12702982127666473, |
| "learning_rate": 2.386941909790039e-05, |
| "lookahead_loss": 3.7643159022331236, |
| "loss": 2.0323, |
| "step": 274000 |
| }, |
| { |
| "base_loss": 0.31617325788736345, |
| "epoch": 2.0324249267578125, |
| "grad_norm": 0.1333305388689041, |
| "learning_rate": 2.3821735382080078e-05, |
| "lookahead_loss": 3.7832584075927733, |
| "loss": 2.0536, |
| "step": 274500 |
| }, |
| { |
| "base_loss": 0.3030018242299557, |
| "epoch": 2.0333786010742188, |
| "grad_norm": 0.16368670761585236, |
| "learning_rate": 2.3774051666259768e-05, |
| "lookahead_loss": 3.7274523305892946, |
| "loss": 2.0161, |
| "step": 275000 |
| }, |
| { |
| "epoch": 2.0333786010742188, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.213857348353718, |
| "eval_lookahead_perplexity": 24.874852375000955, |
| "eval_loss": 1.6707115173339844, |
| "eval_perplexity": 5.315948841629713, |
| "eval_runtime": 567.9157, |
| "eval_samples_per_second": 8.804, |
| "eval_steps_per_second": 0.276, |
| "step": 275000 |
| }, |
| { |
| "base_loss": 0.3039812208414078, |
| "epoch": 2.034332275390625, |
| "grad_norm": 0.13603663444519043, |
| "learning_rate": 2.3726367950439455e-05, |
| "lookahead_loss": 3.7608649916648864, |
| "loss": 2.0336, |
| "step": 275500 |
| }, |
| { |
| "base_loss": 0.30951715883612635, |
| "epoch": 2.0352859497070312, |
| "grad_norm": 0.13090349733829498, |
| "learning_rate": 2.367868423461914e-05, |
| "lookahead_loss": 3.751265535354614, |
| "loss": 2.0318, |
| "step": 276000 |
| }, |
| { |
| "base_loss": 0.3263999198377132, |
| "epoch": 2.0362396240234375, |
| "grad_norm": 0.10014423727989197, |
| "learning_rate": 2.3631000518798828e-05, |
| "lookahead_loss": 3.783777579784393, |
| "loss": 2.0521, |
| "step": 276500 |
| }, |
| { |
| "base_loss": 0.3047963642179966, |
| "epoch": 2.0371932983398438, |
| "grad_norm": 0.12176624685525894, |
| "learning_rate": 2.3583316802978515e-05, |
| "lookahead_loss": 3.737890814781189, |
| "loss": 2.0226, |
| "step": 277000 |
| }, |
| { |
| "base_loss": 0.3006651694476604, |
| "epoch": 2.03814697265625, |
| "grad_norm": 0.13232555985450745, |
| "learning_rate": 2.3535633087158205e-05, |
| "lookahead_loss": 3.7341004371643067, |
| "loss": 2.0212, |
| "step": 277500 |
| }, |
| { |
| "base_loss": 0.30875834566354754, |
| "epoch": 2.0391006469726562, |
| "grad_norm": 0.12054823338985443, |
| "learning_rate": 2.3487949371337892e-05, |
| "lookahead_loss": 3.754430528640747, |
| "loss": 2.0355, |
| "step": 278000 |
| }, |
| { |
| "base_loss": 0.32003281235694886, |
| "epoch": 2.0400543212890625, |
| "grad_norm": 0.13997812569141388, |
| "learning_rate": 2.344026565551758e-05, |
| "lookahead_loss": 3.7748019156455994, |
| "loss": 2.0462, |
| "step": 278500 |
| }, |
| { |
| "base_loss": 0.3103601124882698, |
| "epoch": 2.0410079956054688, |
| "grad_norm": 0.1007557213306427, |
| "learning_rate": 2.3392581939697266e-05, |
| "lookahead_loss": 3.7388244090080263, |
| "loss": 2.0189, |
| "step": 279000 |
| }, |
| { |
| "base_loss": 0.2956464845538139, |
| "epoch": 2.041961669921875, |
| "grad_norm": 0.13650935888290405, |
| "learning_rate": 2.3344898223876953e-05, |
| "lookahead_loss": 3.751103096961975, |
| "loss": 2.0244, |
| "step": 279500 |
| }, |
| { |
| "base_loss": 0.30817501452565194, |
| "epoch": 2.0429153442382812, |
| "grad_norm": 0.28911060094833374, |
| "learning_rate": 2.3297214508056643e-05, |
| "lookahead_loss": 3.7451971626281737, |
| "loss": 2.0305, |
| "step": 280000 |
| }, |
| { |
| "epoch": 2.0429153442382812, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.208491450300613, |
| "eval_lookahead_perplexity": 24.74173392249817, |
| "eval_loss": 1.668028473854065, |
| "eval_perplexity": 5.3017050366693725, |
| "eval_runtime": 503.074, |
| "eval_samples_per_second": 9.939, |
| "eval_steps_per_second": 0.312, |
| "step": 280000 |
| }, |
| { |
| "base_loss": 0.326471285879612, |
| "epoch": 2.0438690185546875, |
| "grad_norm": 0.12640319764614105, |
| "learning_rate": 2.324953079223633e-05, |
| "lookahead_loss": 3.774631119728088, |
| "loss": 2.0557, |
| "step": 280500 |
| }, |
| { |
| "base_loss": 0.2915420908033848, |
| "epoch": 2.0448226928710938, |
| "grad_norm": 0.1708357036113739, |
| "learning_rate": 2.3201847076416016e-05, |
| "lookahead_loss": 3.7091518301963804, |
| "loss": 2.0031, |
| "step": 281000 |
| }, |
| { |
| "base_loss": 0.3044695939719677, |
| "epoch": 2.0457763671875, |
| "grad_norm": 0.10427648574113846, |
| "learning_rate": 2.3154163360595703e-05, |
| "lookahead_loss": 3.771734592437744, |
| "loss": 2.0368, |
| "step": 281500 |
| }, |
| { |
| "base_loss": 0.33020448702573774, |
| "epoch": 2.0467300415039062, |
| "grad_norm": 0.10459216684103012, |
| "learning_rate": 2.310647964477539e-05, |
| "lookahead_loss": 3.7720982518196107, |
| "loss": 2.0471, |
| "step": 282000 |
| }, |
| { |
| "base_loss": 0.3262847933769226, |
| "epoch": 2.0476837158203125, |
| "grad_norm": 0.19303348660469055, |
| "learning_rate": 2.305879592895508e-05, |
| "lookahead_loss": 3.781227571964264, |
| "loss": 2.0568, |
| "step": 282500 |
| }, |
| { |
| "base_loss": 0.29507314643263816, |
| "epoch": 2.0486373901367188, |
| "grad_norm": 0.10965248942375183, |
| "learning_rate": 2.3011112213134767e-05, |
| "lookahead_loss": 3.712780210018158, |
| "loss": 2.0076, |
| "step": 283000 |
| }, |
| { |
| "base_loss": 0.3051584759950638, |
| "epoch": 2.049591064453125, |
| "grad_norm": 0.10179181396961212, |
| "learning_rate": 2.2963428497314454e-05, |
| "lookahead_loss": 3.746872082710266, |
| "loss": 2.025, |
| "step": 283500 |
| }, |
| { |
| "base_loss": 0.31943696123361587, |
| "epoch": 2.0505447387695312, |
| "grad_norm": 0.1000838577747345, |
| "learning_rate": 2.291574478149414e-05, |
| "lookahead_loss": 3.772828236103058, |
| "loss": 2.0468, |
| "step": 284000 |
| }, |
| { |
| "base_loss": 0.30451616686582567, |
| "epoch": 2.0514984130859375, |
| "grad_norm": 0.14745627343654633, |
| "learning_rate": 2.2868061065673828e-05, |
| "lookahead_loss": 3.71900003194809, |
| "loss": 2.0156, |
| "step": 284500 |
| }, |
| { |
| "base_loss": 0.3078126339912415, |
| "epoch": 2.0524520874023438, |
| "grad_norm": 0.18430376052856445, |
| "learning_rate": 2.2820377349853518e-05, |
| "lookahead_loss": 3.748859833717346, |
| "loss": 2.0266, |
| "step": 285000 |
| }, |
| { |
| "epoch": 2.0524520874023438, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.2039166052882284, |
| "eval_lookahead_perplexity": 24.628802842522166, |
| "eval_loss": 1.6657613515853882, |
| "eval_perplexity": 5.289699037794899, |
| "eval_runtime": 518.0968, |
| "eval_samples_per_second": 9.651, |
| "eval_steps_per_second": 0.303, |
| "step": 285000 |
| }, |
| { |
| "base_loss": 0.32404885134100914, |
| "epoch": 2.05340576171875, |
| "grad_norm": 0.11742905527353287, |
| "learning_rate": 2.2772693634033205e-05, |
| "lookahead_loss": 3.7723129653930663, |
| "loss": 2.0421, |
| "step": 285500 |
| }, |
| { |
| "base_loss": 0.3565616801381111, |
| "epoch": 2.0543594360351562, |
| "grad_norm": 0.1019706130027771, |
| "learning_rate": 2.272500991821289e-05, |
| "lookahead_loss": 3.8037442412376405, |
| "loss": 2.0832, |
| "step": 286000 |
| }, |
| { |
| "base_loss": 0.29305290046334265, |
| "epoch": 2.0553131103515625, |
| "grad_norm": 0.13263586163520813, |
| "learning_rate": 2.2677326202392578e-05, |
| "lookahead_loss": 3.6994579930305482, |
| "loss": 2.0018, |
| "step": 286500 |
| }, |
| { |
| "base_loss": 0.3075584282577038, |
| "epoch": 2.0562667846679688, |
| "grad_norm": 0.10312948375940323, |
| "learning_rate": 2.2629642486572265e-05, |
| "lookahead_loss": 3.770684916973114, |
| "loss": 2.0401, |
| "step": 287000 |
| }, |
| { |
| "base_loss": 0.3192341819703579, |
| "epoch": 2.057220458984375, |
| "grad_norm": 0.12222248315811157, |
| "learning_rate": 2.2581958770751955e-05, |
| "lookahead_loss": 3.7722288217544557, |
| "loss": 2.0463, |
| "step": 287500 |
| }, |
| { |
| "base_loss": 0.32363886943459513, |
| "epoch": 2.0581741333007812, |
| "grad_norm": 0.10712938755750656, |
| "learning_rate": 2.2534275054931642e-05, |
| "lookahead_loss": 3.747902335643768, |
| "loss": 2.0318, |
| "step": 288000 |
| }, |
| { |
| "base_loss": 0.2921108921468258, |
| "epoch": 2.0591278076171875, |
| "grad_norm": 0.0956321582198143, |
| "learning_rate": 2.248659133911133e-05, |
| "lookahead_loss": 3.7060699305534364, |
| "loss": 2.0051, |
| "step": 288500 |
| }, |
| { |
| "base_loss": 0.30325917214155196, |
| "epoch": 2.0600814819335938, |
| "grad_norm": 0.09532496333122253, |
| "learning_rate": 2.2438907623291016e-05, |
| "lookahead_loss": 3.756898371219635, |
| "loss": 2.0312, |
| "step": 289000 |
| }, |
| { |
| "base_loss": 0.3216978460550308, |
| "epoch": 2.06103515625, |
| "grad_norm": 0.09469062834978104, |
| "learning_rate": 2.2391223907470703e-05, |
| "lookahead_loss": 3.760906336784363, |
| "loss": 2.0383, |
| "step": 289500 |
| }, |
| { |
| "base_loss": 0.3074358084797859, |
| "epoch": 2.0619888305664062, |
| "grad_norm": 0.12240971624851227, |
| "learning_rate": 2.2343540191650393e-05, |
| "lookahead_loss": 3.732211685180664, |
| "loss": 2.0106, |
| "step": 290000 |
| }, |
| { |
| "epoch": 2.0619888305664062, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111277975594, |
| "eval_base_perplexity": 1.1401970664566357, |
| "eval_lookahead_loss": 3.19971718346349, |
| "eval_lookahead_perplexity": 24.52559297291427, |
| "eval_loss": 1.6636607646942139, |
| "eval_perplexity": 5.278599227477334, |
| "eval_runtime": 496.2507, |
| "eval_samples_per_second": 10.076, |
| "eval_steps_per_second": 0.316, |
| "step": 290000 |
| }, |
| { |
| "base_loss": 0.30197526678442954, |
| "epoch": 1.0009536743164062, |
| "grad_norm": 0.11595375835895538, |
| "learning_rate": 2.229585647583008e-05, |
| "lookahead_loss": 3.7548108019828796, |
| "loss": 2.0242, |
| "step": 290500 |
| }, |
| { |
| "base_loss": 0.303896483540535, |
| "epoch": 1.0019073486328125, |
| "grad_norm": 0.1678297072649002, |
| "learning_rate": 2.2248172760009766e-05, |
| "lookahead_loss": 3.738233793735504, |
| "loss": 2.0227, |
| "step": 291000 |
| }, |
| { |
| "base_loss": 0.3094813532233238, |
| "epoch": 1.0028610229492188, |
| "grad_norm": 0.10476084798574448, |
| "learning_rate": 2.2200489044189453e-05, |
| "lookahead_loss": 3.749328505039215, |
| "loss": 2.0219, |
| "step": 291500 |
| }, |
| { |
| "base_loss": 0.3199170651733875, |
| "epoch": 1.003814697265625, |
| "grad_norm": 0.11634726822376251, |
| "learning_rate": 2.215280532836914e-05, |
| "lookahead_loss": 3.7493094959259032, |
| "loss": 2.0361, |
| "step": 292000 |
| }, |
| { |
| "base_loss": 0.30184895062446593, |
| "epoch": 1.0047683715820312, |
| "grad_norm": 0.09533069282770157, |
| "learning_rate": 2.210512161254883e-05, |
| "lookahead_loss": 3.7101937403678895, |
| "loss": 2.0101, |
| "step": 292500 |
| }, |
| { |
| "base_loss": 0.2977984355092049, |
| "epoch": 1.0057220458984375, |
| "grad_norm": 0.11591842770576477, |
| "learning_rate": 2.2057437896728517e-05, |
| "lookahead_loss": 3.733954050540924, |
| "loss": 2.0147, |
| "step": 293000 |
| }, |
| { |
| "base_loss": 0.2989386010617018, |
| "epoch": 1.0066757202148438, |
| "grad_norm": 0.1040799543261528, |
| "learning_rate": 2.2009754180908204e-05, |
| "lookahead_loss": 3.745620455265045, |
| "loss": 2.0247, |
| "step": 293500 |
| }, |
| { |
| "base_loss": 0.3137947543263435, |
| "epoch": 1.00762939453125, |
| "grad_norm": 0.12795260548591614, |
| "learning_rate": 2.196207046508789e-05, |
| "lookahead_loss": 3.755497174739838, |
| "loss": 2.0303, |
| "step": 294000 |
| }, |
| { |
| "base_loss": 0.31258952274918556, |
| "epoch": 1.0085830688476562, |
| "grad_norm": 0.12033551186323166, |
| "learning_rate": 2.1914386749267578e-05, |
| "lookahead_loss": 3.7433795986175538, |
| "loss": 2.0219, |
| "step": 294500 |
| }, |
| { |
| "base_loss": 0.3022870315015316, |
| "epoch": 1.0095367431640625, |
| "grad_norm": 0.12253336608409882, |
| "learning_rate": 2.1866703033447268e-05, |
| "lookahead_loss": 3.7020899171829225, |
| "loss": 2.006, |
| "step": 295000 |
| }, |
| { |
| "epoch": 1.0095367431640625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.195268607368104, |
| "eval_lookahead_perplexity": 24.416731325480427, |
| "eval_loss": 1.6614117622375488, |
| "eval_perplexity": 5.266740984454094, |
| "eval_runtime": 262.7535, |
| "eval_samples_per_second": 19.029, |
| "eval_steps_per_second": 0.598, |
| "step": 295000 |
| }, |
| { |
| "base_loss": 0.29763062146306035, |
| "epoch": 1.0104904174804688, |
| "grad_norm": 0.10351324081420898, |
| "learning_rate": 2.1819019317626955e-05, |
| "lookahead_loss": 3.7392460746765135, |
| "loss": 2.0173, |
| "step": 295500 |
| }, |
| { |
| "base_loss": 0.3011737278997898, |
| "epoch": 1.011444091796875, |
| "grad_norm": 0.10836062580347061, |
| "learning_rate": 2.177133560180664e-05, |
| "lookahead_loss": 3.725526228427887, |
| "loss": 2.018, |
| "step": 296000 |
| }, |
| { |
| "base_loss": 0.32275081843137743, |
| "epoch": 1.0123977661132812, |
| "grad_norm": 0.09027555584907532, |
| "learning_rate": 2.1723651885986328e-05, |
| "lookahead_loss": 3.756115399837494, |
| "loss": 2.0399, |
| "step": 296500 |
| }, |
| { |
| "base_loss": 0.30656733042001727, |
| "epoch": 1.0133514404296875, |
| "grad_norm": 0.11751335114240646, |
| "learning_rate": 2.1675968170166015e-05, |
| "lookahead_loss": 3.708749872684479, |
| "loss": 2.0119, |
| "step": 297000 |
| }, |
| { |
| "base_loss": 0.29944423550367355, |
| "epoch": 1.0143051147460938, |
| "grad_norm": 0.11894430220127106, |
| "learning_rate": 2.1628284454345705e-05, |
| "lookahead_loss": 3.710795109272003, |
| "loss": 2.0053, |
| "step": 297500 |
| }, |
| { |
| "base_loss": 0.29441548812389373, |
| "epoch": 1.0152587890625, |
| "grad_norm": 0.12353851646184921, |
| "learning_rate": 2.1580600738525392e-05, |
| "lookahead_loss": 3.733433099746704, |
| "loss": 2.0149, |
| "step": 298000 |
| }, |
| { |
| "base_loss": 0.31012057706713675, |
| "epoch": 1.0162124633789062, |
| "grad_norm": 0.11940598487854004, |
| "learning_rate": 2.153291702270508e-05, |
| "lookahead_loss": 3.7454442892074584, |
| "loss": 2.0313, |
| "step": 298500 |
| }, |
| { |
| "base_loss": 0.3121089872717857, |
| "epoch": 1.0171661376953125, |
| "grad_norm": 0.1333819031715393, |
| "learning_rate": 2.1485233306884766e-05, |
| "lookahead_loss": 3.7422564029693604, |
| "loss": 2.0218, |
| "step": 299000 |
| }, |
| { |
| "base_loss": 0.30401164934039115, |
| "epoch": 1.0181198120117188, |
| "grad_norm": 0.13284090161323547, |
| "learning_rate": 2.1437549591064453e-05, |
| "lookahead_loss": 3.715003073692322, |
| "loss": 2.0031, |
| "step": 299500 |
| }, |
| { |
| "base_loss": 0.29751659095287325, |
| "epoch": 1.019073486328125, |
| "grad_norm": 0.10176288336515427, |
| "learning_rate": 2.1389865875244143e-05, |
| "lookahead_loss": 3.743918424129486, |
| "loss": 2.0219, |
| "step": 300000 |
| }, |
| { |
| "epoch": 1.019073486328125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1906816144339953, |
| "eval_lookahead_perplexity": 24.304988429281245, |
| "eval_loss": 1.659116268157959, |
| "eval_perplexity": 5.254665077095094, |
| "eval_runtime": 276.7109, |
| "eval_samples_per_second": 18.069, |
| "eval_steps_per_second": 0.567, |
| "step": 300000 |
| }, |
| { |
| "base_loss": 0.30135778871178626, |
| "epoch": 1.0200271606445312, |
| "grad_norm": 0.12351817637681961, |
| "learning_rate": 2.134218215942383e-05, |
| "lookahead_loss": 3.722639967441559, |
| "loss": 2.0144, |
| "step": 300500 |
| }, |
| { |
| "base_loss": 0.33046225929260253, |
| "epoch": 1.0209808349609375, |
| "grad_norm": 0.14412406086921692, |
| "learning_rate": 2.1294498443603516e-05, |
| "lookahead_loss": 3.7789668612480165, |
| "loss": 2.0485, |
| "step": 301000 |
| }, |
| { |
| "base_loss": 0.30414657789468763, |
| "epoch": 1.0219345092773438, |
| "grad_norm": 0.10363082587718964, |
| "learning_rate": 2.1246814727783203e-05, |
| "lookahead_loss": 3.7076259078979494, |
| "loss": 2.0031, |
| "step": 301500 |
| }, |
| { |
| "base_loss": 0.30107878148555756, |
| "epoch": 1.02288818359375, |
| "grad_norm": 0.1513744294643402, |
| "learning_rate": 2.119913101196289e-05, |
| "lookahead_loss": 3.7153729853630066, |
| "loss": 2.0111, |
| "step": 302000 |
| }, |
| { |
| "base_loss": 0.3019954281449318, |
| "epoch": 1.0238418579101562, |
| "grad_norm": 0.10837174206972122, |
| "learning_rate": 2.115144729614258e-05, |
| "lookahead_loss": 3.721953295707703, |
| "loss": 2.014, |
| "step": 302500 |
| }, |
| { |
| "base_loss": 0.3265440165698528, |
| "epoch": 1.0247955322265625, |
| "grad_norm": 0.11248663067817688, |
| "learning_rate": 2.1103763580322267e-05, |
| "lookahead_loss": 3.766532400608063, |
| "loss": 2.0446, |
| "step": 303000 |
| }, |
| { |
| "base_loss": 0.3089427370727062, |
| "epoch": 1.0257492065429688, |
| "grad_norm": 0.12824861705303192, |
| "learning_rate": 2.1056079864501954e-05, |
| "lookahead_loss": 3.712160005092621, |
| "loss": 2.0148, |
| "step": 303500 |
| }, |
| { |
| "base_loss": 0.306296229749918, |
| "epoch": 1.026702880859375, |
| "grad_norm": 0.13572391867637634, |
| "learning_rate": 2.100839614868164e-05, |
| "lookahead_loss": 3.7354738450050355, |
| "loss": 2.0129, |
| "step": 304000 |
| }, |
| { |
| "base_loss": 0.30920383241772653, |
| "epoch": 1.0276565551757812, |
| "grad_norm": 0.11274685710668564, |
| "learning_rate": 2.0960712432861328e-05, |
| "lookahead_loss": 3.7408050112724305, |
| "loss": 2.0211, |
| "step": 304500 |
| }, |
| { |
| "base_loss": 0.33220478031039236, |
| "epoch": 1.0286102294921875, |
| "grad_norm": 0.12302955985069275, |
| "learning_rate": 2.0913028717041018e-05, |
| "lookahead_loss": 3.7831042833328246, |
| "loss": 2.0522, |
| "step": 305000 |
| }, |
| { |
| "epoch": 1.0286102294921875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.186847817021818, |
| "eval_lookahead_perplexity": 24.211986416887246, |
| "eval_loss": 1.657226800918579, |
| "eval_perplexity": 5.24474593347748, |
| "eval_runtime": 260.7532, |
| "eval_samples_per_second": 19.175, |
| "eval_steps_per_second": 0.602, |
| "step": 305000 |
| }, |
| { |
| "base_loss": 0.30326411041617396, |
| "epoch": 1.0295639038085938, |
| "grad_norm": 0.11241597682237625, |
| "learning_rate": 2.0865345001220705e-05, |
| "lookahead_loss": 3.7075537395477296, |
| "loss": 2.0049, |
| "step": 305500 |
| }, |
| { |
| "base_loss": 0.3031138954460621, |
| "epoch": 1.030517578125, |
| "grad_norm": 0.14674971997737885, |
| "learning_rate": 2.081766128540039e-05, |
| "lookahead_loss": 3.736467706680298, |
| "loss": 2.0225, |
| "step": 306000 |
| }, |
| { |
| "base_loss": 0.30234866255521775, |
| "epoch": 1.0314712524414062, |
| "grad_norm": 0.12188810110092163, |
| "learning_rate": 2.0769977569580078e-05, |
| "lookahead_loss": 3.742179157733917, |
| "loss": 2.0214, |
| "step": 306500 |
| }, |
| { |
| "base_loss": 0.3155796425938606, |
| "epoch": 1.0324249267578125, |
| "grad_norm": 0.13792704045772552, |
| "learning_rate": 2.0722293853759765e-05, |
| "lookahead_loss": 3.759408875465393, |
| "loss": 2.0432, |
| "step": 307000 |
| }, |
| { |
| "base_loss": 0.3022744803726673, |
| "epoch": 1.0333786010742188, |
| "grad_norm": 0.15973329544067383, |
| "learning_rate": 2.0674610137939455e-05, |
| "lookahead_loss": 3.7038854308128357, |
| "loss": 2.0042, |
| "step": 307500 |
| }, |
| { |
| "base_loss": 0.30410280799865724, |
| "epoch": 1.034332275390625, |
| "grad_norm": 0.1354731023311615, |
| "learning_rate": 2.0626926422119142e-05, |
| "lookahead_loss": 3.73946187210083, |
| "loss": 2.0229, |
| "step": 308000 |
| }, |
| { |
| "base_loss": 0.3077150760293007, |
| "epoch": 1.0352859497070312, |
| "grad_norm": 0.12362109869718552, |
| "learning_rate": 2.057924270629883e-05, |
| "lookahead_loss": 3.7301273741722105, |
| "loss": 2.0218, |
| "step": 308500 |
| }, |
| { |
| "base_loss": 0.3269314341843128, |
| "epoch": 1.0362396240234375, |
| "grad_norm": 0.10148072987794876, |
| "learning_rate": 2.0531558990478516e-05, |
| "lookahead_loss": 3.7653005418777465, |
| "loss": 2.0419, |
| "step": 309000 |
| }, |
| { |
| "base_loss": 0.30525318866968154, |
| "epoch": 1.0371932983398438, |
| "grad_norm": 0.10899261385202408, |
| "learning_rate": 2.0483875274658203e-05, |
| "lookahead_loss": 3.7169791021347045, |
| "loss": 2.0118, |
| "step": 309500 |
| }, |
| { |
| "base_loss": 0.3003401378691196, |
| "epoch": 1.03814697265625, |
| "grad_norm": 0.13924409449100494, |
| "learning_rate": 2.0436191558837893e-05, |
| "lookahead_loss": 3.7134488053321837, |
| "loss": 2.0096, |
| "step": 310000 |
| }, |
| { |
| "epoch": 1.03814697265625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1832272579875616, |
| "eval_lookahead_perplexity": 24.124483990164148, |
| "eval_loss": 1.6554052829742432, |
| "eval_perplexity": 5.235201230209359, |
| "eval_runtime": 270.7386, |
| "eval_samples_per_second": 18.468, |
| "eval_steps_per_second": 0.58, |
| "step": 310000 |
| }, |
| { |
| "base_loss": 0.3116057882905006, |
| "epoch": 1.0391006469726562, |
| "grad_norm": 0.12299172580242157, |
| "learning_rate": 2.038850784301758e-05, |
| "lookahead_loss": 3.7361023359298704, |
| "loss": 2.0261, |
| "step": 310500 |
| }, |
| { |
| "base_loss": 0.31716569018363955, |
| "epoch": 1.0400543212890625, |
| "grad_norm": 0.15561062097549438, |
| "learning_rate": 2.0340824127197266e-05, |
| "lookahead_loss": 3.752716485500336, |
| "loss": 2.0352, |
| "step": 311000 |
| }, |
| { |
| "base_loss": 0.31002197673916815, |
| "epoch": 1.0410079956054688, |
| "grad_norm": 0.1071806252002716, |
| "learning_rate": 2.0293140411376953e-05, |
| "lookahead_loss": 3.717531894683838, |
| "loss": 2.0087, |
| "step": 311500 |
| }, |
| { |
| "base_loss": 0.29521755149960516, |
| "epoch": 1.041961669921875, |
| "grad_norm": 0.1366235613822937, |
| "learning_rate": 2.024545669555664e-05, |
| "lookahead_loss": 3.729984639644623, |
| "loss": 2.014, |
| "step": 312000 |
| }, |
| { |
| "base_loss": 0.30736519694328307, |
| "epoch": 1.0429153442382812, |
| "grad_norm": 0.2894051969051361, |
| "learning_rate": 2.019777297973633e-05, |
| "lookahead_loss": 3.724388958930969, |
| "loss": 2.0196, |
| "step": 312500 |
| }, |
| { |
| "base_loss": 0.3271687869429588, |
| "epoch": 1.0438690185546875, |
| "grad_norm": 0.13099634647369385, |
| "learning_rate": 2.0150089263916017e-05, |
| "lookahead_loss": 3.7537019534111025, |
| "loss": 2.0458, |
| "step": 313000 |
| }, |
| { |
| "base_loss": 0.2943850245475769, |
| "epoch": 1.0448226928710938, |
| "grad_norm": 0.17744140326976776, |
| "learning_rate": 2.0102405548095704e-05, |
| "lookahead_loss": 3.6922522506713866, |
| "loss": 1.9939, |
| "step": 313500 |
| }, |
| { |
| "base_loss": 0.30418619123101237, |
| "epoch": 1.0457763671875, |
| "grad_norm": 0.10149285197257996, |
| "learning_rate": 2.005472183227539e-05, |
| "lookahead_loss": 3.7536733145713805, |
| "loss": 2.0266, |
| "step": 314000 |
| }, |
| { |
| "base_loss": 0.32734892451763153, |
| "epoch": 1.0467300415039062, |
| "grad_norm": 0.10566597431898117, |
| "learning_rate": 2.0007038116455078e-05, |
| "lookahead_loss": 3.7515933270454407, |
| "loss": 2.037, |
| "step": 314500 |
| }, |
| { |
| "base_loss": 0.32642096510529517, |
| "epoch": 1.0476837158203125, |
| "grad_norm": 0.18560998141765594, |
| "learning_rate": 1.9959354400634768e-05, |
| "lookahead_loss": 3.7600359020233154, |
| "loss": 2.0466, |
| "step": 315000 |
| }, |
| { |
| "epoch": 1.0476837158203125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1796208700052087, |
| "eval_lookahead_perplexity": 24.037638434531967, |
| "eval_loss": 1.6536099910736084, |
| "eval_perplexity": 5.225810947513935, |
| "eval_runtime": 273.2772, |
| "eval_samples_per_second": 18.296, |
| "eval_steps_per_second": 0.575, |
| "step": 315000 |
| }, |
| { |
| "base_loss": 0.29649946457147597, |
| "epoch": 1.0486373901367188, |
| "grad_norm": 0.10538148880004883, |
| "learning_rate": 1.9911670684814455e-05, |
| "lookahead_loss": 3.694501425266266, |
| "loss": 1.9992, |
| "step": 315500 |
| }, |
| { |
| "base_loss": 0.3057677939236164, |
| "epoch": 1.049591064453125, |
| "grad_norm": 0.10804512351751328, |
| "learning_rate": 1.986398696899414e-05, |
| "lookahead_loss": 3.7278350348472595, |
| "loss": 2.0164, |
| "step": 316000 |
| }, |
| { |
| "base_loss": 0.3218669015169144, |
| "epoch": 1.0505447387695312, |
| "grad_norm": 0.0984341949224472, |
| "learning_rate": 1.9816303253173828e-05, |
| "lookahead_loss": 3.7570176639556885, |
| "loss": 2.0388, |
| "step": 316500 |
| }, |
| { |
| "base_loss": 0.308034790366888, |
| "epoch": 1.0514984130859375, |
| "grad_norm": 0.14431750774383545, |
| "learning_rate": 1.9768619537353515e-05, |
| "lookahead_loss": 3.701095435142517, |
| "loss": 2.0069, |
| "step": 317000 |
| }, |
| { |
| "base_loss": 0.30695659655332563, |
| "epoch": 1.0524520874023438, |
| "grad_norm": 0.18022307753562927, |
| "learning_rate": 1.9720935821533205e-05, |
| "lookahead_loss": 3.726999051570892, |
| "loss": 2.0167, |
| "step": 317500 |
| }, |
| { |
| "base_loss": 0.3215196977555752, |
| "epoch": 1.05340576171875, |
| "grad_norm": 0.12188173830509186, |
| "learning_rate": 1.9673252105712892e-05, |
| "lookahead_loss": 3.7509958362579345, |
| "loss": 2.0307, |
| "step": 318000 |
| }, |
| { |
| "base_loss": 0.35528673872351646, |
| "epoch": 1.0543594360351562, |
| "grad_norm": 0.10480683296918869, |
| "learning_rate": 1.962556838989258e-05, |
| "lookahead_loss": 3.7855782594680787, |
| "loss": 2.0744, |
| "step": 318500 |
| }, |
| { |
| "base_loss": 0.2939756731390953, |
| "epoch": 1.0553131103515625, |
| "grad_norm": 0.13534528017044067, |
| "learning_rate": 1.9577884674072266e-05, |
| "lookahead_loss": 3.680368016242981, |
| "loss": 1.9924, |
| "step": 319000 |
| }, |
| { |
| "base_loss": 0.30533788445591925, |
| "epoch": 1.0562667846679688, |
| "grad_norm": 0.10960806906223297, |
| "learning_rate": 1.9530200958251953e-05, |
| "lookahead_loss": 3.7495523767471313, |
| "loss": 2.0292, |
| "step": 319500 |
| }, |
| { |
| "base_loss": 0.3139654756486416, |
| "epoch": 1.057220458984375, |
| "grad_norm": 0.11507224291563034, |
| "learning_rate": 1.9482517242431643e-05, |
| "lookahead_loss": 3.747137411594391, |
| "loss": 2.0341, |
| "step": 320000 |
| }, |
| { |
| "epoch": 1.057220458984375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1759323974767812, |
| "eval_lookahead_perplexity": 23.94913957865158, |
| "eval_loss": 1.6517695188522339, |
| "eval_perplexity": 5.216201832997494, |
| "eval_runtime": 259.5136, |
| "eval_samples_per_second": 19.267, |
| "eval_steps_per_second": 0.605, |
| "step": 320000 |
| }, |
| { |
| "base_loss": 0.32300865882635116, |
| "epoch": 1.0581741333007812, |
| "grad_norm": 0.10415331274271011, |
| "learning_rate": 1.943483352661133e-05, |
| "lookahead_loss": 3.728764622211456, |
| "loss": 2.0226, |
| "step": 320500 |
| }, |
| { |
| "base_loss": 0.28907309558987615, |
| "epoch": 1.0591278076171875, |
| "grad_norm": 0.0982637032866478, |
| "learning_rate": 1.9387149810791016e-05, |
| "lookahead_loss": 3.6841273069381715, |
| "loss": 1.9931, |
| "step": 321000 |
| }, |
| { |
| "base_loss": 0.30057010012865065, |
| "epoch": 1.0600814819335938, |
| "grad_norm": 0.09693081676959991, |
| "learning_rate": 1.9339466094970703e-05, |
| "lookahead_loss": 3.735932330608368, |
| "loss": 2.0199, |
| "step": 321500 |
| }, |
| { |
| "base_loss": 0.32112616834044455, |
| "epoch": 1.06103515625, |
| "grad_norm": 0.09655743092298508, |
| "learning_rate": 1.929178237915039e-05, |
| "lookahead_loss": 3.740573130607605, |
| "loss": 2.0288, |
| "step": 322000 |
| }, |
| { |
| "base_loss": 0.3060283879637718, |
| "epoch": 1.0619888305664062, |
| "grad_norm": 0.1286919116973877, |
| "learning_rate": 1.924409866333008e-05, |
| "lookahead_loss": 3.712129928588867, |
| "loss": 1.9997, |
| "step": 322500 |
| }, |
| { |
| "base_loss": 0.31152518782019617, |
| "epoch": 1.0629425048828125, |
| "grad_norm": 0.11635252833366394, |
| "learning_rate": 1.9196414947509767e-05, |
| "lookahead_loss": 3.7569445767402647, |
| "loss": 2.0279, |
| "step": 323000 |
| }, |
| { |
| "base_loss": 0.3149063532948494, |
| "epoch": 1.0638961791992188, |
| "grad_norm": 0.10551901161670685, |
| "learning_rate": 1.9148731231689454e-05, |
| "lookahead_loss": 3.7492296714782714, |
| "loss": 2.03, |
| "step": 323500 |
| }, |
| { |
| "base_loss": 0.30411062452197074, |
| "epoch": 1.064849853515625, |
| "grad_norm": 0.12699651718139648, |
| "learning_rate": 1.910104751586914e-05, |
| "lookahead_loss": 3.7036221413612367, |
| "loss": 2.0074, |
| "step": 324000 |
| }, |
| { |
| "base_loss": 0.30933507332205773, |
| "epoch": 1.0658035278320312, |
| "grad_norm": 0.1357489973306656, |
| "learning_rate": 1.9053363800048828e-05, |
| "lookahead_loss": 3.7238325271606447, |
| "loss": 2.0153, |
| "step": 324500 |
| }, |
| { |
| "base_loss": 0.30638799047470094, |
| "epoch": 1.0667572021484375, |
| "grad_norm": 0.104975126683712, |
| "learning_rate": 1.9005680084228518e-05, |
| "lookahead_loss": 3.7292118496894835, |
| "loss": 2.017, |
| "step": 325000 |
| }, |
| { |
| "epoch": 1.0667572021484375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1727946909090963, |
| "eval_lookahead_perplexity": 23.87411197496282, |
| "eval_loss": 1.6501920223236084, |
| "eval_perplexity": 5.207979779548762, |
| "eval_runtime": 269.7309, |
| "eval_samples_per_second": 18.537, |
| "eval_steps_per_second": 0.582, |
| "step": 325000 |
| }, |
| { |
| "base_loss": 0.32938760298490527, |
| "epoch": 1.0677108764648438, |
| "grad_norm": 0.15414147078990936, |
| "learning_rate": 1.8957996368408205e-05, |
| "lookahead_loss": 3.7661532316207884, |
| "loss": 2.0456, |
| "step": 325500 |
| }, |
| { |
| "base_loss": 0.29950347980856895, |
| "epoch": 1.06866455078125, |
| "grad_norm": 0.13640785217285156, |
| "learning_rate": 1.891031265258789e-05, |
| "lookahead_loss": 3.6806137619018555, |
| "loss": 1.9879, |
| "step": 326000 |
| }, |
| { |
| "base_loss": 0.30374919882416723, |
| "epoch": 1.0696182250976562, |
| "grad_norm": 0.1291172206401825, |
| "learning_rate": 1.8862628936767578e-05, |
| "lookahead_loss": 3.753649913311005, |
| "loss": 2.0277, |
| "step": 326500 |
| }, |
| { |
| "base_loss": 0.34455711591243743, |
| "epoch": 1.0705718994140625, |
| "grad_norm": 0.11020272970199585, |
| "learning_rate": 1.8814945220947265e-05, |
| "lookahead_loss": 3.7804429450035095, |
| "loss": 2.066, |
| "step": 327000 |
| }, |
| { |
| "base_loss": 0.31508783569931986, |
| "epoch": 1.0715255737304688, |
| "grad_norm": 0.14544732868671417, |
| "learning_rate": 1.8767261505126955e-05, |
| "lookahead_loss": 3.710645009994507, |
| "loss": 2.0101, |
| "step": 327500 |
| }, |
| { |
| "base_loss": 0.3064769520163536, |
| "epoch": 1.072479248046875, |
| "grad_norm": 0.12241372466087341, |
| "learning_rate": 1.8719577789306642e-05, |
| "lookahead_loss": 3.7174899125099183, |
| "loss": 2.0103, |
| "step": 328000 |
| }, |
| { |
| "base_loss": 0.3032188524603844, |
| "epoch": 1.0734329223632812, |
| "grad_norm": 0.09545071423053741, |
| "learning_rate": 1.867189407348633e-05, |
| "lookahead_loss": 3.723200294494629, |
| "loss": 2.017, |
| "step": 328500 |
| }, |
| { |
| "base_loss": 0.3287756524384022, |
| "epoch": 1.0743865966796875, |
| "grad_norm": 0.111233189702034, |
| "learning_rate": 1.8624210357666016e-05, |
| "lookahead_loss": 3.746195571899414, |
| "loss": 2.0363, |
| "step": 329000 |
| }, |
| { |
| "base_loss": 0.30459495696425437, |
| "epoch": 1.0753402709960938, |
| "grad_norm": 0.12619462609291077, |
| "learning_rate": 1.8576526641845703e-05, |
| "lookahead_loss": 3.699687201976776, |
| "loss": 1.9999, |
| "step": 329500 |
| }, |
| { |
| "base_loss": 0.3038223915994167, |
| "epoch": 1.0762939453125, |
| "grad_norm": 0.15558308362960815, |
| "learning_rate": 1.8528842926025393e-05, |
| "lookahead_loss": 3.7490292925834656, |
| "loss": 2.0269, |
| "step": 330000 |
| }, |
| { |
| "epoch": 1.0762939453125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1694573937132717, |
| "eval_lookahead_perplexity": 23.79456976983049, |
| "eval_loss": 1.6485565900802612, |
| "eval_perplexity": 5.1994694424312895, |
| "eval_runtime": 269.5983, |
| "eval_samples_per_second": 18.546, |
| "eval_steps_per_second": 0.582, |
| "step": 330000 |
| }, |
| { |
| "base_loss": 0.33005355006456377, |
| "epoch": 1.0772476196289062, |
| "grad_norm": 0.14846207201480865, |
| "learning_rate": 1.848115921020508e-05, |
| "lookahead_loss": 3.757576567649841, |
| "loss": 2.0498, |
| "step": 330500 |
| }, |
| { |
| "base_loss": 0.3011254695951939, |
| "epoch": 1.0782012939453125, |
| "grad_norm": 0.12544012069702148, |
| "learning_rate": 1.8433475494384766e-05, |
| "lookahead_loss": 3.6908311409950256, |
| "loss": 2.0027, |
| "step": 331000 |
| }, |
| { |
| "base_loss": 0.30074691036343576, |
| "epoch": 1.0791549682617188, |
| "grad_norm": 0.12219205498695374, |
| "learning_rate": 1.8385791778564453e-05, |
| "lookahead_loss": 3.723267038345337, |
| "loss": 2.0147, |
| "step": 331500 |
| }, |
| { |
| "base_loss": 0.31164744511246684, |
| "epoch": 1.080108642578125, |
| "grad_norm": 0.11832752823829651, |
| "learning_rate": 1.833810806274414e-05, |
| "lookahead_loss": 3.748631308555603, |
| "loss": 2.0351, |
| "step": 332000 |
| }, |
| { |
| "base_loss": 0.3218230297267437, |
| "epoch": 1.0810623168945312, |
| "grad_norm": 0.15903718769550323, |
| "learning_rate": 1.829042434692383e-05, |
| "lookahead_loss": 3.7279847102165222, |
| "loss": 2.0282, |
| "step": 332500 |
| }, |
| { |
| "base_loss": 0.30160315957665446, |
| "epoch": 1.0820159912109375, |
| "grad_norm": 0.13598766922950745, |
| "learning_rate": 1.8242740631103517e-05, |
| "lookahead_loss": 3.701110266685486, |
| "loss": 2.0043, |
| "step": 333000 |
| }, |
| { |
| "base_loss": 0.3046890263557434, |
| "epoch": 1.0829696655273438, |
| "grad_norm": 0.10043879598379135, |
| "learning_rate": 1.8195056915283204e-05, |
| "lookahead_loss": 3.7326065835952758, |
| "loss": 2.0182, |
| "step": 333500 |
| }, |
| { |
| "base_loss": 0.3337951873242855, |
| "epoch": 1.08392333984375, |
| "grad_norm": 0.11642859876155853, |
| "learning_rate": 1.814737319946289e-05, |
| "lookahead_loss": 3.7617497820854187, |
| "loss": 2.0431, |
| "step": 334000 |
| }, |
| { |
| "base_loss": 0.30679659196734427, |
| "epoch": 1.0848770141601562, |
| "grad_norm": 0.14971092343330383, |
| "learning_rate": 1.8099689483642578e-05, |
| "lookahead_loss": 3.7082361845970153, |
| "loss": 2.0092, |
| "step": 334500 |
| }, |
| { |
| "base_loss": 0.29602449855208396, |
| "epoch": 1.0858306884765625, |
| "grad_norm": 0.10793782025575638, |
| "learning_rate": 1.8052005767822268e-05, |
| "lookahead_loss": 3.712603385448456, |
| "loss": 2.003, |
| "step": 335000 |
| }, |
| { |
| "epoch": 1.0858306884765625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.16634484906547, |
| "eval_lookahead_perplexity": 23.72062324966851, |
| "eval_loss": 1.646978497505188, |
| "eval_perplexity": 5.191270669222933, |
| "eval_runtime": 258.8839, |
| "eval_samples_per_second": 19.314, |
| "eval_steps_per_second": 0.606, |
| "step": 335000 |
| }, |
| { |
| "base_loss": 0.30177975767850873, |
| "epoch": 1.0867843627929688, |
| "grad_norm": 0.4744811952114105, |
| "learning_rate": 1.8004322052001955e-05, |
| "lookahead_loss": 3.7170921821594236, |
| "loss": 2.0061, |
| "step": 335500 |
| }, |
| { |
| "base_loss": 0.3368028699159622, |
| "epoch": 1.087738037109375, |
| "grad_norm": 0.10788233578205109, |
| "learning_rate": 1.795663833618164e-05, |
| "lookahead_loss": 3.7672131505012514, |
| "loss": 2.0461, |
| "step": 336000 |
| }, |
| { |
| "base_loss": 0.30262881484627724, |
| "epoch": 1.0886917114257812, |
| "grad_norm": 0.12896187603473663, |
| "learning_rate": 1.7908954620361328e-05, |
| "lookahead_loss": 3.6916392154693605, |
| "loss": 1.9969, |
| "step": 336500 |
| }, |
| { |
| "base_loss": 0.3077995398044586, |
| "epoch": 1.0896453857421875, |
| "grad_norm": 0.10718485713005066, |
| "learning_rate": 1.7861270904541015e-05, |
| "lookahead_loss": 3.688416923999786, |
| "loss": 1.9978, |
| "step": 337000 |
| }, |
| { |
| "base_loss": 0.2990732188224792, |
| "epoch": 1.0905990600585938, |
| "grad_norm": 0.10033638030290604, |
| "learning_rate": 1.7813587188720705e-05, |
| "lookahead_loss": 3.7155744066238405, |
| "loss": 2.0097, |
| "step": 337500 |
| }, |
| { |
| "base_loss": 0.2992991936802864, |
| "epoch": 1.091552734375, |
| "grad_norm": 0.10070586949586868, |
| "learning_rate": 1.7765903472900392e-05, |
| "lookahead_loss": 3.716888844013214, |
| "loss": 2.0061, |
| "step": 338000 |
| }, |
| { |
| "base_loss": 0.32812165850400926, |
| "epoch": 1.0925064086914062, |
| "grad_norm": 0.1552288979291916, |
| "learning_rate": 1.771821975708008e-05, |
| "lookahead_loss": 3.7439418969154357, |
| "loss": 2.0298, |
| "step": 338500 |
| }, |
| { |
| "base_loss": 0.30864692279696465, |
| "epoch": 1.0934600830078125, |
| "grad_norm": 0.1215750053524971, |
| "learning_rate": 1.7670536041259766e-05, |
| "lookahead_loss": 3.700509956359863, |
| "loss": 2.008, |
| "step": 339000 |
| }, |
| { |
| "base_loss": 0.2887690741121769, |
| "epoch": 1.0944137573242188, |
| "grad_norm": 0.16854335367679596, |
| "learning_rate": 1.7622852325439453e-05, |
| "lookahead_loss": 3.6794085698127748, |
| "loss": 1.9825, |
| "step": 339500 |
| }, |
| { |
| "base_loss": 0.2968901333212852, |
| "epoch": 1.095367431640625, |
| "grad_norm": 0.10205203294754028, |
| "learning_rate": 1.7575168609619143e-05, |
| "lookahead_loss": 3.6928190813064576, |
| "loss": 1.9965, |
| "step": 340000 |
| }, |
| { |
| "epoch": 1.095367431640625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1630953965476527, |
| "eval_lookahead_perplexity": 23.64366930752887, |
| "eval_loss": 1.6453267335891724, |
| "eval_perplexity": 5.1827029934899995, |
| "eval_runtime": 270.7892, |
| "eval_samples_per_second": 18.465, |
| "eval_steps_per_second": 0.58, |
| "step": 340000 |
| }, |
| { |
| "base_loss": 0.2998797046840191, |
| "epoch": 1.0963211059570312, |
| "grad_norm": 0.2383318990468979, |
| "learning_rate": 1.752748489379883e-05, |
| "lookahead_loss": 3.701192867279053, |
| "loss": 2.0021, |
| "step": 340500 |
| }, |
| { |
| "base_loss": 0.3316424978226423, |
| "epoch": 1.0972747802734375, |
| "grad_norm": 0.10822620987892151, |
| "learning_rate": 1.7479801177978516e-05, |
| "lookahead_loss": 3.744793387889862, |
| "loss": 2.033, |
| "step": 341000 |
| }, |
| { |
| "base_loss": 0.2931605673134327, |
| "epoch": 1.0982284545898438, |
| "grad_norm": 0.1306353211402893, |
| "learning_rate": 1.7432117462158203e-05, |
| "lookahead_loss": 3.670068524837494, |
| "loss": 1.9853, |
| "step": 341500 |
| }, |
| { |
| "base_loss": 0.2951606792807579, |
| "epoch": 1.09918212890625, |
| "grad_norm": 0.09544999897480011, |
| "learning_rate": 1.738443374633789e-05, |
| "lookahead_loss": 3.7054534935951233, |
| "loss": 2.0026, |
| "step": 342000 |
| }, |
| { |
| "base_loss": 0.3005833325088024, |
| "epoch": 1.1001358032226562, |
| "grad_norm": 0.1566513329744339, |
| "learning_rate": 1.733675003051758e-05, |
| "lookahead_loss": 3.712607653141022, |
| "loss": 2.0106, |
| "step": 342500 |
| }, |
| { |
| "base_loss": 0.320584302932024, |
| "epoch": 1.1010894775390625, |
| "grad_norm": 0.10093113034963608, |
| "learning_rate": 1.7289066314697267e-05, |
| "lookahead_loss": 3.7455484852790835, |
| "loss": 2.0294, |
| "step": 343000 |
| }, |
| { |
| "base_loss": 0.3009133404493332, |
| "epoch": 1.1020431518554688, |
| "grad_norm": 0.14453580975532532, |
| "learning_rate": 1.7241382598876954e-05, |
| "lookahead_loss": 3.6938126912117006, |
| "loss": 1.9963, |
| "step": 343500 |
| }, |
| { |
| "base_loss": 0.2956097418367863, |
| "epoch": 1.102996826171875, |
| "grad_norm": 0.11491036415100098, |
| "learning_rate": 1.719369888305664e-05, |
| "lookahead_loss": 3.6890286202430724, |
| "loss": 1.9998, |
| "step": 344000 |
| }, |
| { |
| "base_loss": 0.297827641248703, |
| "epoch": 1.1039505004882812, |
| "grad_norm": 0.096994549036026, |
| "learning_rate": 1.7146015167236328e-05, |
| "lookahead_loss": 3.7211020727157593, |
| "loss": 2.012, |
| "step": 344500 |
| }, |
| { |
| "base_loss": 0.31351589208841324, |
| "epoch": 1.1049041748046875, |
| "grad_norm": 0.11693672835826874, |
| "learning_rate": 1.7098331451416018e-05, |
| "lookahead_loss": 3.7185617332458496, |
| "loss": 2.0197, |
| "step": 345000 |
| }, |
| { |
| "epoch": 1.1049041748046875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1599354583996173, |
| "eval_lookahead_perplexity": 23.569074694176884, |
| "eval_loss": 1.6437718868255615, |
| "eval_perplexity": 5.174650945986001, |
| "eval_runtime": 258.3426, |
| "eval_samples_per_second": 19.354, |
| "eval_steps_per_second": 0.608, |
| "step": 345000 |
| }, |
| { |
| "base_loss": 0.30757557436823846, |
| "epoch": 1.1058578491210938, |
| "grad_norm": 0.15182138979434967, |
| "learning_rate": 1.7050647735595705e-05, |
| "lookahead_loss": 3.69487420463562, |
| "loss": 1.9979, |
| "step": 345500 |
| }, |
| { |
| "base_loss": 0.2956553302705288, |
| "epoch": 1.1068115234375, |
| "grad_norm": 0.11485382914543152, |
| "learning_rate": 1.700296401977539e-05, |
| "lookahead_loss": 3.681716691493988, |
| "loss": 1.99, |
| "step": 346000 |
| }, |
| { |
| "base_loss": 0.29734967839717863, |
| "epoch": 1.1077651977539062, |
| "grad_norm": 0.15719352662563324, |
| "learning_rate": 1.6955280303955078e-05, |
| "lookahead_loss": 3.707757304191589, |
| "loss": 2.0001, |
| "step": 346500 |
| }, |
| { |
| "base_loss": 0.32284524619579313, |
| "epoch": 1.1087188720703125, |
| "grad_norm": 0.12326517701148987, |
| "learning_rate": 1.6907596588134765e-05, |
| "lookahead_loss": 3.7393799867630007, |
| "loss": 2.0291, |
| "step": 347000 |
| }, |
| { |
| "base_loss": 0.3172047883272171, |
| "epoch": 1.1096725463867188, |
| "grad_norm": 0.11677565425634384, |
| "learning_rate": 1.6859912872314455e-05, |
| "lookahead_loss": 3.702916480064392, |
| "loss": 2.0123, |
| "step": 347500 |
| }, |
| { |
| "base_loss": 0.29914562621712687, |
| "epoch": 1.110626220703125, |
| "grad_norm": 0.10666316747665405, |
| "learning_rate": 1.6812229156494142e-05, |
| "lookahead_loss": 3.679292615890503, |
| "loss": 1.9916, |
| "step": 348000 |
| }, |
| { |
| "base_loss": 0.2961756982207298, |
| "epoch": 1.1115798950195312, |
| "grad_norm": 0.2921413481235504, |
| "learning_rate": 1.676454544067383e-05, |
| "lookahead_loss": 3.7147518510818482, |
| "loss": 2.0041, |
| "step": 348500 |
| }, |
| { |
| "base_loss": 0.3077882871925831, |
| "epoch": 1.1125335693359375, |
| "grad_norm": 0.14457540214061737, |
| "learning_rate": 1.6716861724853516e-05, |
| "lookahead_loss": 3.7223079199790954, |
| "loss": 2.0166, |
| "step": 349000 |
| }, |
| { |
| "base_loss": 0.34582618343830107, |
| "epoch": 1.1134872436523438, |
| "grad_norm": 0.11859697103500366, |
| "learning_rate": 1.6669178009033203e-05, |
| "lookahead_loss": 3.7550181512832643, |
| "loss": 2.0428, |
| "step": 349500 |
| }, |
| { |
| "base_loss": 0.2972444402873516, |
| "epoch": 1.11444091796875, |
| "grad_norm": 0.1123783141374588, |
| "learning_rate": 1.6621494293212893e-05, |
| "lookahead_loss": 3.675015371799469, |
| "loss": 1.9874, |
| "step": 350000 |
| }, |
| { |
| "epoch": 1.11444091796875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.15782101618977, |
| "eval_lookahead_perplexity": 23.519291897767186, |
| "eval_loss": 1.6427289247512817, |
| "eval_perplexity": 5.1692567947382555, |
| "eval_runtime": 269.8055, |
| "eval_samples_per_second": 18.532, |
| "eval_steps_per_second": 0.582, |
| "step": 350000 |
| }, |
| { |
| "base_loss": 0.29772910493612287, |
| "epoch": 1.1153945922851562, |
| "grad_norm": 0.3871125280857086, |
| "learning_rate": 1.657381057739258e-05, |
| "lookahead_loss": 3.7167241282463075, |
| "loss": 2.0107, |
| "step": 350500 |
| }, |
| { |
| "base_loss": 0.31204655358195305, |
| "epoch": 1.1163482666015625, |
| "grad_norm": 0.166167750954628, |
| "learning_rate": 1.6526126861572266e-05, |
| "lookahead_loss": 3.7098571991920473, |
| "loss": 2.0117, |
| "step": 351000 |
| }, |
| { |
| "base_loss": 0.3257904815077782, |
| "epoch": 1.1173019409179688, |
| "grad_norm": 0.09978712350130081, |
| "learning_rate": 1.6478443145751953e-05, |
| "lookahead_loss": 3.7479370784759523, |
| "loss": 2.0397, |
| "step": 351500 |
| }, |
| { |
| "base_loss": 0.30611594703793527, |
| "epoch": 1.118255615234375, |
| "grad_norm": 0.11194101721048355, |
| "learning_rate": 1.643075942993164e-05, |
| "lookahead_loss": 3.7034087748527527, |
| "loss": 2.0015, |
| "step": 352000 |
| }, |
| { |
| "base_loss": 0.29987680965662, |
| "epoch": 1.1192092895507812, |
| "grad_norm": 0.13494464755058289, |
| "learning_rate": 1.638307571411133e-05, |
| "lookahead_loss": 3.7097480974197388, |
| "loss": 2.0036, |
| "step": 352500 |
| }, |
| { |
| "base_loss": 0.3036254093050957, |
| "epoch": 2.0009536743164062, |
| "grad_norm": 0.11621030420064926, |
| "learning_rate": 1.6335391998291017e-05, |
| "lookahead_loss": 3.721737900733948, |
| "loss": 2.0079, |
| "step": 353000 |
| }, |
| { |
| "base_loss": 0.3016641443669796, |
| "epoch": 2.0019073486328125, |
| "grad_norm": 0.17441171407699585, |
| "learning_rate": 1.6287708282470704e-05, |
| "lookahead_loss": 3.704204406738281, |
| "loss": 2.0054, |
| "step": 353500 |
| }, |
| { |
| "base_loss": 0.31078750917315484, |
| "epoch": 2.0028610229492188, |
| "grad_norm": 0.10131888836622238, |
| "learning_rate": 1.624002456665039e-05, |
| "lookahead_loss": 3.7189337663650512, |
| "loss": 2.0074, |
| "step": 354000 |
| }, |
| { |
| "base_loss": 0.3186642001867294, |
| "epoch": 2.003814697265625, |
| "grad_norm": 0.11608010530471802, |
| "learning_rate": 1.6192340850830078e-05, |
| "lookahead_loss": 3.717122416496277, |
| "loss": 2.021, |
| "step": 354500 |
| }, |
| { |
| "base_loss": 0.29983767235279085, |
| "epoch": 2.0047683715820312, |
| "grad_norm": 0.10390781611204147, |
| "learning_rate": 1.6144657135009768e-05, |
| "lookahead_loss": 3.6769887518882753, |
| "loss": 1.9942, |
| "step": 355000 |
| }, |
| { |
| "epoch": 2.0047683715820312, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1547467457219813, |
| "eval_lookahead_perplexity": 23.447098261503868, |
| "eval_loss": 1.6411741971969604, |
| "eval_perplexity": 5.1612262530339486, |
| "eval_runtime": 273.268, |
| "eval_samples_per_second": 18.297, |
| "eval_steps_per_second": 0.575, |
| "step": 355000 |
| }, |
| { |
| "base_loss": 0.29767213702201845, |
| "epoch": 2.0057220458984375, |
| "grad_norm": 0.1111893281340599, |
| "learning_rate": 1.6096973419189455e-05, |
| "lookahead_loss": 3.703052345275879, |
| "loss": 2.0003, |
| "step": 355500 |
| }, |
| { |
| "base_loss": 0.2978040435314179, |
| "epoch": 2.0066757202148438, |
| "grad_norm": 0.1064620316028595, |
| "learning_rate": 1.604928970336914e-05, |
| "lookahead_loss": 3.7156985325813294, |
| "loss": 2.0087, |
| "step": 356000 |
| }, |
| { |
| "base_loss": 0.3102393752634525, |
| "epoch": 2.00762939453125, |
| "grad_norm": 0.11691620200872421, |
| "learning_rate": 1.6001605987548828e-05, |
| "lookahead_loss": 3.7233915762901306, |
| "loss": 2.015, |
| "step": 356500 |
| }, |
| { |
| "base_loss": 0.3170112347304821, |
| "epoch": 2.0085830688476562, |
| "grad_norm": 0.11584022641181946, |
| "learning_rate": 1.5953922271728515e-05, |
| "lookahead_loss": 3.716030221939087, |
| "loss": 2.0091, |
| "step": 357000 |
| }, |
| { |
| "base_loss": 0.3013848161697388, |
| "epoch": 2.0095367431640625, |
| "grad_norm": 0.1189672127366066, |
| "learning_rate": 1.5906238555908205e-05, |
| "lookahead_loss": 3.673254894256592, |
| "loss": 1.9913, |
| "step": 357500 |
| }, |
| { |
| "base_loss": 0.3008648828268051, |
| "epoch": 2.0104904174804688, |
| "grad_norm": 0.10629413276910782, |
| "learning_rate": 1.5858554840087892e-05, |
| "lookahead_loss": 3.7115158891677855, |
| "loss": 2.003, |
| "step": 358000 |
| }, |
| { |
| "base_loss": 0.3007318134009838, |
| "epoch": 2.011444091796875, |
| "grad_norm": 0.11223334819078445, |
| "learning_rate": 1.581087112426758e-05, |
| "lookahead_loss": 3.6943654375076296, |
| "loss": 2.0027, |
| "step": 358500 |
| }, |
| { |
| "base_loss": 0.3253892393708229, |
| "epoch": 2.0123977661132812, |
| "grad_norm": 0.09513326734304428, |
| "learning_rate": 1.5763187408447266e-05, |
| "lookahead_loss": 3.7291233649253845, |
| "loss": 2.0275, |
| "step": 359000 |
| }, |
| { |
| "base_loss": 0.3064581930339336, |
| "epoch": 2.0133514404296875, |
| "grad_norm": 0.12533146142959595, |
| "learning_rate": 1.5715503692626953e-05, |
| "lookahead_loss": 3.68066414642334, |
| "loss": 1.998, |
| "step": 359500 |
| }, |
| { |
| "base_loss": 0.30020537215471266, |
| "epoch": 2.0143051147460938, |
| "grad_norm": 0.1161990836262703, |
| "learning_rate": 1.5667819976806643e-05, |
| "lookahead_loss": 3.682435676574707, |
| "loss": 1.9911, |
| "step": 360000 |
| }, |
| { |
| "epoch": 2.0143051147460938, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.152189729312738, |
| "eval_lookahead_perplexity": 23.387220233675123, |
| "eval_loss": 1.6399027109146118, |
| "eval_perplexity": 5.154667994904291, |
| "eval_runtime": 259.7548, |
| "eval_samples_per_second": 19.249, |
| "eval_steps_per_second": 0.604, |
| "step": 360000 |
| }, |
| { |
| "base_loss": 0.2963980228602886, |
| "epoch": 2.0152587890625, |
| "grad_norm": 0.12211249768733978, |
| "learning_rate": 1.562013626098633e-05, |
| "lookahead_loss": 3.7067537345886232, |
| "loss": 2.0011, |
| "step": 360500 |
| }, |
| { |
| "base_loss": 0.3123014765381813, |
| "epoch": 2.0162124633789062, |
| "grad_norm": 0.12035073339939117, |
| "learning_rate": 1.5572452545166016e-05, |
| "lookahead_loss": 3.7187122321128845, |
| "loss": 2.0164, |
| "step": 361000 |
| }, |
| { |
| "base_loss": 0.311051389247179, |
| "epoch": 2.0171661376953125, |
| "grad_norm": 0.12999185919761658, |
| "learning_rate": 1.5524768829345703e-05, |
| "lookahead_loss": 3.7131579689979555, |
| "loss": 2.0084, |
| "step": 361500 |
| }, |
| { |
| "base_loss": 0.30117547073960305, |
| "epoch": 2.0181198120117188, |
| "grad_norm": 0.12995636463165283, |
| "learning_rate": 1.547708511352539e-05, |
| "lookahead_loss": 3.684767934322357, |
| "loss": 1.9879, |
| "step": 362000 |
| }, |
| { |
| "base_loss": 0.2991112365424633, |
| "epoch": 2.019073486328125, |
| "grad_norm": 0.10578301548957825, |
| "learning_rate": 1.542940139770508e-05, |
| "lookahead_loss": 3.7182036762237547, |
| "loss": 2.0095, |
| "step": 362500 |
| }, |
| { |
| "base_loss": 0.2995945112109184, |
| "epoch": 2.0200271606445312, |
| "grad_norm": 0.12268956005573273, |
| "learning_rate": 1.5381717681884767e-05, |
| "lookahead_loss": 3.693427396774292, |
| "loss": 1.9995, |
| "step": 363000 |
| }, |
| { |
| "base_loss": 0.32935504597425463, |
| "epoch": 2.0209808349609375, |
| "grad_norm": 0.15605683624744415, |
| "learning_rate": 1.5334033966064454e-05, |
| "lookahead_loss": 3.7494262118339536, |
| "loss": 2.0335, |
| "step": 363500 |
| }, |
| { |
| "base_loss": 0.3044169374704361, |
| "epoch": 2.0219345092773438, |
| "grad_norm": 0.10574875771999359, |
| "learning_rate": 1.528635025024414e-05, |
| "lookahead_loss": 3.6809173183441164, |
| "loss": 1.9887, |
| "step": 364000 |
| }, |
| { |
| "base_loss": 0.2996359769701958, |
| "epoch": 2.02288818359375, |
| "grad_norm": 0.15029314160346985, |
| "learning_rate": 1.523866653442383e-05, |
| "lookahead_loss": 3.686254850387573, |
| "loss": 1.9976, |
| "step": 364500 |
| }, |
| { |
| "base_loss": 0.30142849957942963, |
| "epoch": 2.0238418579101562, |
| "grad_norm": 0.10843583941459656, |
| "learning_rate": 1.5190982818603516e-05, |
| "lookahead_loss": 3.6926897540092467, |
| "loss": 1.9987, |
| "step": 365000 |
| }, |
| { |
| "epoch": 2.0238418579101562, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1496820206078477, |
| "eval_lookahead_perplexity": 23.328645372952177, |
| "eval_loss": 1.6386431455612183, |
| "eval_perplexity": 5.148179440926399, |
| "eval_runtime": 271.1091, |
| "eval_samples_per_second": 18.443, |
| "eval_steps_per_second": 0.579, |
| "step": 365000 |
| }, |
| { |
| "base_loss": 0.3248125557899475, |
| "epoch": 2.0247955322265625, |
| "grad_norm": 0.09933142364025116, |
| "learning_rate": 1.5143299102783205e-05, |
| "lookahead_loss": 3.7378739171028137, |
| "loss": 2.0301, |
| "step": 365500 |
| }, |
| { |
| "base_loss": 0.31028636208176613, |
| "epoch": 2.0257492065429688, |
| "grad_norm": 0.12542510032653809, |
| "learning_rate": 1.5095615386962891e-05, |
| "lookahead_loss": 3.6860274262428283, |
| "loss": 2.0024, |
| "step": 366000 |
| }, |
| { |
| "base_loss": 0.3049684434235096, |
| "epoch": 2.026702880859375, |
| "grad_norm": 0.1288381665945053, |
| "learning_rate": 1.5047931671142578e-05, |
| "lookahead_loss": 3.7073655371665954, |
| "loss": 1.9991, |
| "step": 366500 |
| }, |
| { |
| "base_loss": 0.30682381707429884, |
| "epoch": 2.0276565551757812, |
| "grad_norm": 0.10210111737251282, |
| "learning_rate": 1.5000247955322267e-05, |
| "lookahead_loss": 3.7111876306533813, |
| "loss": 2.0056, |
| "step": 367000 |
| }, |
| { |
| "base_loss": 0.33313145861029625, |
| "epoch": 2.0286102294921875, |
| "grad_norm": 0.11859393119812012, |
| "learning_rate": 1.4952564239501954e-05, |
| "lookahead_loss": 3.7556820430755615, |
| "loss": 2.0382, |
| "step": 367500 |
| }, |
| { |
| "base_loss": 0.3025540582239628, |
| "epoch": 2.0295639038085938, |
| "grad_norm": 0.11141599714756012, |
| "learning_rate": 1.4904880523681642e-05, |
| "lookahead_loss": 3.679368188858032, |
| "loss": 1.9926, |
| "step": 368000 |
| }, |
| { |
| "base_loss": 0.3034520089030266, |
| "epoch": 2.030517578125, |
| "grad_norm": 0.15309970080852509, |
| "learning_rate": 1.4857196807861329e-05, |
| "lookahead_loss": 3.7101809630393983, |
| "loss": 2.0078, |
| "step": 368500 |
| }, |
| { |
| "base_loss": 0.30229984161257745, |
| "epoch": 2.0314712524414062, |
| "grad_norm": 0.12841260433197021, |
| "learning_rate": 1.4809513092041016e-05, |
| "lookahead_loss": 3.716352026462555, |
| "loss": 2.0084, |
| "step": 369000 |
| }, |
| { |
| "base_loss": 0.3168328501284122, |
| "epoch": 2.0324249267578125, |
| "grad_norm": 0.14004720747470856, |
| "learning_rate": 1.4761829376220704e-05, |
| "lookahead_loss": 3.734860891342163, |
| "loss": 2.0319, |
| "step": 369500 |
| }, |
| { |
| "base_loss": 0.30422543051838874, |
| "epoch": 2.0333786010742188, |
| "grad_norm": 0.1576036512851715, |
| "learning_rate": 1.4714145660400391e-05, |
| "lookahead_loss": 3.6790615549087526, |
| "loss": 1.9909, |
| "step": 370000 |
| }, |
| { |
| "epoch": 2.0333786010742188, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1475962008150242, |
| "eval_lookahead_perplexity": 23.280036734746588, |
| "eval_loss": 1.637601375579834, |
| "eval_perplexity": 5.142819014776387, |
| "eval_runtime": 269.2705, |
| "eval_samples_per_second": 18.569, |
| "eval_steps_per_second": 0.583, |
| "step": 370000 |
| }, |
| { |
| "base_loss": 0.3037178117632866, |
| "epoch": 2.034332275390625, |
| "grad_norm": 0.132655069231987, |
| "learning_rate": 1.466646194458008e-05, |
| "lookahead_loss": 3.709216817855835, |
| "loss": 2.0074, |
| "step": 370500 |
| }, |
| { |
| "base_loss": 0.3084926683306694, |
| "epoch": 2.0352859497070312, |
| "grad_norm": 0.1192920133471489, |
| "learning_rate": 1.4618778228759766e-05, |
| "lookahead_loss": 3.703082191944122, |
| "loss": 2.0072, |
| "step": 371000 |
| }, |
| { |
| "base_loss": 0.32473134699463846, |
| "epoch": 2.0362396240234375, |
| "grad_norm": 0.1038188636302948, |
| "learning_rate": 1.4571094512939453e-05, |
| "lookahead_loss": 3.736210472106934, |
| "loss": 2.0279, |
| "step": 371500 |
| }, |
| { |
| "base_loss": 0.3050074822306633, |
| "epoch": 2.0371932983398438, |
| "grad_norm": 0.11769280582666397, |
| "learning_rate": 1.4523410797119142e-05, |
| "lookahead_loss": 3.6904703121185305, |
| "loss": 1.9993, |
| "step": 372000 |
| }, |
| { |
| "base_loss": 0.3013649364411831, |
| "epoch": 2.03814697265625, |
| "grad_norm": 0.13569827377796173, |
| "learning_rate": 1.4475727081298829e-05, |
| "lookahead_loss": 3.687872163295746, |
| "loss": 1.9978, |
| "step": 372500 |
| }, |
| { |
| "base_loss": 0.3096700141429901, |
| "epoch": 2.0391006469726562, |
| "grad_norm": 0.11427426338195801, |
| "learning_rate": 1.4428043365478517e-05, |
| "lookahead_loss": 3.708254415988922, |
| "loss": 2.0132, |
| "step": 373000 |
| }, |
| { |
| "base_loss": 0.31990464240312577, |
| "epoch": 2.0400543212890625, |
| "grad_norm": 0.15470543503761292, |
| "learning_rate": 1.4380359649658204e-05, |
| "lookahead_loss": 3.7311536393165587, |
| "loss": 2.0233, |
| "step": 373500 |
| }, |
| { |
| "base_loss": 0.3096840573251247, |
| "epoch": 2.0410079956054688, |
| "grad_norm": 0.10737185180187225, |
| "learning_rate": 1.433267593383789e-05, |
| "lookahead_loss": 3.690148108959198, |
| "loss": 1.9951, |
| "step": 374000 |
| }, |
| { |
| "base_loss": 0.29508850196003916, |
| "epoch": 2.041961669921875, |
| "grad_norm": 0.14022259414196014, |
| "learning_rate": 1.428499221801758e-05, |
| "lookahead_loss": 3.702949764251709, |
| "loss": 2.0013, |
| "step": 374500 |
| }, |
| { |
| "base_loss": 0.3078301128745079, |
| "epoch": 2.0429153442382812, |
| "grad_norm": 0.28813207149505615, |
| "learning_rate": 1.4237308502197266e-05, |
| "lookahead_loss": 3.698307290554047, |
| "loss": 2.0061, |
| "step": 375000 |
| }, |
| { |
| "epoch": 2.0429153442382812, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.145147456909521, |
| "eval_lookahead_perplexity": 23.223099627321105, |
| "eval_loss": 1.636375069618225, |
| "eval_perplexity": 5.136516210532161, |
| "eval_runtime": 261.4483, |
| "eval_samples_per_second": 19.124, |
| "eval_steps_per_second": 0.601, |
| "step": 375000 |
| }, |
| { |
| "base_loss": 0.3264890166521072, |
| "epoch": 2.0438690185546875, |
| "grad_norm": 0.13248620927333832, |
| "learning_rate": 1.4189624786376955e-05, |
| "lookahead_loss": 3.7284442324638367, |
| "loss": 2.0322, |
| "step": 375500 |
| }, |
| { |
| "base_loss": 0.29649411234259604, |
| "epoch": 2.0448226928710938, |
| "grad_norm": 0.1766345202922821, |
| "learning_rate": 1.4141941070556641e-05, |
| "lookahead_loss": 3.6673648881912233, |
| "loss": 1.9829, |
| "step": 376000 |
| }, |
| { |
| "base_loss": 0.30447618263959886, |
| "epoch": 2.0457763671875, |
| "grad_norm": 0.09906455129384995, |
| "learning_rate": 1.4094257354736328e-05, |
| "lookahead_loss": 3.7270817494392396, |
| "loss": 2.0136, |
| "step": 376500 |
| }, |
| { |
| "base_loss": 0.3287204530388117, |
| "epoch": 2.0467300415039062, |
| "grad_norm": 0.10636741667985916, |
| "learning_rate": 1.4046573638916017e-05, |
| "lookahead_loss": 3.7259369196891785, |
| "loss": 2.025, |
| "step": 377000 |
| }, |
| { |
| "base_loss": 0.32552153533697126, |
| "epoch": 2.0476837158203125, |
| "grad_norm": 0.19604343175888062, |
| "learning_rate": 1.3998889923095704e-05, |
| "lookahead_loss": 3.7353513431549072, |
| "loss": 2.034, |
| "step": 377500 |
| }, |
| { |
| "base_loss": 0.2965818170309067, |
| "epoch": 2.0486373901367188, |
| "grad_norm": 0.11304906010627747, |
| "learning_rate": 1.3951206207275392e-05, |
| "lookahead_loss": 3.6689750633239746, |
| "loss": 1.9855, |
| "step": 378000 |
| }, |
| { |
| "base_loss": 0.30438165706396103, |
| "epoch": 2.049591064453125, |
| "grad_norm": 0.10858786851167679, |
| "learning_rate": 1.3903522491455079e-05, |
| "lookahead_loss": 3.7019722032547, |
| "loss": 2.0043, |
| "step": 378500 |
| }, |
| { |
| "base_loss": 0.3185528250038624, |
| "epoch": 2.0505447387695312, |
| "grad_norm": 0.09491516649723053, |
| "learning_rate": 1.3855838775634766e-05, |
| "lookahead_loss": 3.7292102155685423, |
| "loss": 2.0251, |
| "step": 379000 |
| }, |
| { |
| "base_loss": 0.3064282323718071, |
| "epoch": 2.0514984130859375, |
| "grad_norm": 0.14370054006576538, |
| "learning_rate": 1.3808155059814454e-05, |
| "lookahead_loss": 3.675420670509338, |
| "loss": 1.9933, |
| "step": 379500 |
| }, |
| { |
| "base_loss": 0.30485667461156846, |
| "epoch": 2.0524520874023438, |
| "grad_norm": 0.1851443499326706, |
| "learning_rate": 1.3760471343994141e-05, |
| "lookahead_loss": 3.701467691421509, |
| "loss": 2.0043, |
| "step": 380000 |
| }, |
| { |
| "epoch": 2.0524520874023438, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1431306185432897, |
| "eval_lookahead_perplexity": 23.1763095888427, |
| "eval_loss": 1.6353859901428223, |
| "eval_perplexity": 5.131438299416049, |
| "eval_runtime": 274.3785, |
| "eval_samples_per_second": 18.223, |
| "eval_steps_per_second": 0.572, |
| "step": 380000 |
| }, |
| { |
| "base_loss": 0.3230416283607483, |
| "epoch": 2.05340576171875, |
| "grad_norm": 0.1157195121049881, |
| "learning_rate": 1.371278762817383e-05, |
| "lookahead_loss": 3.7277134714126587, |
| "loss": 2.0199, |
| "step": 380500 |
| }, |
| { |
| "base_loss": 0.3555378588140011, |
| "epoch": 2.0543594360351562, |
| "grad_norm": 0.10356119275093079, |
| "learning_rate": 1.3665103912353516e-05, |
| "lookahead_loss": 3.7598505668640136, |
| "loss": 2.0624, |
| "step": 381000 |
| }, |
| { |
| "base_loss": 0.29287488567829134, |
| "epoch": 2.0553131103515625, |
| "grad_norm": 0.13426493108272552, |
| "learning_rate": 1.3617420196533203e-05, |
| "lookahead_loss": 3.6564659061431883, |
| "loss": 1.9806, |
| "step": 381500 |
| }, |
| { |
| "base_loss": 0.3070660081803799, |
| "epoch": 2.0562667846679688, |
| "grad_norm": 0.10810237377882004, |
| "learning_rate": 1.3569736480712892e-05, |
| "lookahead_loss": 3.7246059970855714, |
| "loss": 2.0172, |
| "step": 382000 |
| }, |
| { |
| "base_loss": 0.31805591636896136, |
| "epoch": 2.057220458984375, |
| "grad_norm": 0.1190585345029831, |
| "learning_rate": 1.3522052764892579e-05, |
| "lookahead_loss": 3.7293725261688233, |
| "loss": 2.0252, |
| "step": 382500 |
| }, |
| { |
| "base_loss": 0.32177385982871054, |
| "epoch": 2.0581741333007812, |
| "grad_norm": 0.10589335113763809, |
| "learning_rate": 1.3474369049072265e-05, |
| "lookahead_loss": 3.7027944622039795, |
| "loss": 2.0089, |
| "step": 383000 |
| }, |
| { |
| "base_loss": 0.2912293503880501, |
| "epoch": 2.0591278076171875, |
| "grad_norm": 0.08998730033636093, |
| "learning_rate": 1.3426685333251954e-05, |
| "lookahead_loss": 3.6608450388908387, |
| "loss": 1.9813, |
| "step": 383500 |
| }, |
| { |
| "base_loss": 0.3013720656633377, |
| "epoch": 2.0600814819335938, |
| "grad_norm": 0.09680195152759552, |
| "learning_rate": 1.337900161743164e-05, |
| "lookahead_loss": 3.7127736706733705, |
| "loss": 2.008, |
| "step": 384000 |
| }, |
| { |
| "base_loss": 0.320624990940094, |
| "epoch": 2.06103515625, |
| "grad_norm": 0.09335515648126602, |
| "learning_rate": 1.333131790161133e-05, |
| "lookahead_loss": 3.7168069033622744, |
| "loss": 2.0159, |
| "step": 384500 |
| }, |
| { |
| "base_loss": 0.30687601006031034, |
| "epoch": 2.0619888305664062, |
| "grad_norm": 0.1287391632795334, |
| "learning_rate": 1.3283634185791016e-05, |
| "lookahead_loss": 3.6904591851234434, |
| "loss": 1.9892, |
| "step": 385000 |
| }, |
| { |
| "epoch": 2.0619888305664062, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1412412084329624, |
| "eval_lookahead_perplexity": 23.13256137735663, |
| "eval_loss": 1.6344444751739502, |
| "eval_perplexity": 5.126609247114544, |
| "eval_runtime": 262.0521, |
| "eval_samples_per_second": 19.08, |
| "eval_steps_per_second": 0.599, |
| "step": 385000 |
| }, |
| { |
| "base_loss": 0.3092333972156048, |
| "epoch": 2.0629425048828125, |
| "grad_norm": 0.11992325633764267, |
| "learning_rate": 1.3235950469970703e-05, |
| "lookahead_loss": 3.731930965423584, |
| "loss": 2.0138, |
| "step": 385500 |
| }, |
| { |
| "base_loss": 0.3187819467484951, |
| "epoch": 2.0638961791992188, |
| "grad_norm": 0.10708379745483398, |
| "learning_rate": 1.3188266754150391e-05, |
| "lookahead_loss": 3.728700873374939, |
| "loss": 2.0192, |
| "step": 386000 |
| }, |
| { |
| "base_loss": 0.30155983543396, |
| "epoch": 2.064849853515625, |
| "grad_norm": 0.1287672370672226, |
| "learning_rate": 1.3140583038330078e-05, |
| "lookahead_loss": 3.677223875999451, |
| "loss": 1.9938, |
| "step": 386500 |
| }, |
| { |
| "base_loss": 0.30980769458413127, |
| "epoch": 2.0658035278320312, |
| "grad_norm": 0.14807488024234772, |
| "learning_rate": 1.3092899322509767e-05, |
| "lookahead_loss": 3.6996435074806215, |
| "loss": 2.003, |
| "step": 387000 |
| }, |
| { |
| "base_loss": 0.309319562882185, |
| "epoch": 2.0667572021484375, |
| "grad_norm": 0.10004570335149765, |
| "learning_rate": 1.3045215606689454e-05, |
| "lookahead_loss": 3.7083262605667113, |
| "loss": 2.007, |
| "step": 387500 |
| }, |
| { |
| "base_loss": 0.33088878998160365, |
| "epoch": 2.0677108764648438, |
| "grad_norm": 0.1570482850074768, |
| "learning_rate": 1.299753189086914e-05, |
| "lookahead_loss": 3.744370493888855, |
| "loss": 2.0344, |
| "step": 388000 |
| }, |
| { |
| "base_loss": 0.30170208609104154, |
| "epoch": 2.06866455078125, |
| "grad_norm": 0.13802482187747955, |
| "learning_rate": 1.2949848175048829e-05, |
| "lookahead_loss": 3.660481810569763, |
| "loss": 1.9787, |
| "step": 388500 |
| }, |
| { |
| "base_loss": 0.3034212864339352, |
| "epoch": 2.0696182250976562, |
| "grad_norm": 0.13130271434783936, |
| "learning_rate": 1.2902164459228516e-05, |
| "lookahead_loss": 3.7303440852165224, |
| "loss": 2.015, |
| "step": 389000 |
| }, |
| { |
| "base_loss": 0.3412878410220146, |
| "epoch": 2.0705718994140625, |
| "grad_norm": 0.10371687263250351, |
| "learning_rate": 1.2854480743408204e-05, |
| "lookahead_loss": 3.7537382555007937, |
| "loss": 2.0526, |
| "step": 389500 |
| }, |
| { |
| "base_loss": 0.31413120782375337, |
| "epoch": 2.0715255737304688, |
| "grad_norm": 0.14051884412765503, |
| "learning_rate": 1.2806797027587891e-05, |
| "lookahead_loss": 3.687371994972229, |
| "loss": 1.9978, |
| "step": 390000 |
| }, |
| { |
| "epoch": 2.0715255737304688, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1392902360556607, |
| "eval_lookahead_perplexity": 23.087474385132566, |
| "eval_loss": 1.6334805488586426, |
| "eval_perplexity": 5.121669954492467, |
| "eval_runtime": 296.484, |
| "eval_samples_per_second": 16.864, |
| "eval_steps_per_second": 0.53, |
| "step": 390000 |
| }, |
| { |
| "base_loss": 0.30943771398067477, |
| "epoch": 2.072479248046875, |
| "grad_norm": 0.1194412112236023, |
| "learning_rate": 1.2759113311767578e-05, |
| "lookahead_loss": 3.6957876262664793, |
| "loss": 1.9982, |
| "step": 390500 |
| }, |
| { |
| "base_loss": 0.3021232470273972, |
| "epoch": 2.0734329223632812, |
| "grad_norm": 0.09396813064813614, |
| "learning_rate": 1.2711429595947266e-05, |
| "lookahead_loss": 3.70153946685791, |
| "loss": 2.0048, |
| "step": 391000 |
| }, |
| { |
| "base_loss": 0.3273992139399052, |
| "epoch": 2.0743865966796875, |
| "grad_norm": 0.104710154235363, |
| "learning_rate": 1.2663745880126953e-05, |
| "lookahead_loss": 3.7224628949165344, |
| "loss": 2.0227, |
| "step": 391500 |
| }, |
| { |
| "base_loss": 0.3019404113292694, |
| "epoch": 2.0753402709960938, |
| "grad_norm": 0.1233215481042862, |
| "learning_rate": 1.2616062164306642e-05, |
| "lookahead_loss": 3.6753981237411497, |
| "loss": 1.9883, |
| "step": 392000 |
| }, |
| { |
| "base_loss": 0.30488721799850466, |
| "epoch": 2.0762939453125, |
| "grad_norm": 0.1560201346874237, |
| "learning_rate": 1.2568378448486329e-05, |
| "lookahead_loss": 3.72650838804245, |
| "loss": 2.0156, |
| "step": 392500 |
| }, |
| { |
| "base_loss": 0.32930157482624056, |
| "epoch": 2.0772476196289062, |
| "grad_norm": 0.14494048058986664, |
| "learning_rate": 1.2520694732666015e-05, |
| "lookahead_loss": 3.735769229888916, |
| "loss": 2.0379, |
| "step": 393000 |
| }, |
| { |
| "base_loss": 0.3017339085638523, |
| "epoch": 2.0782012939453125, |
| "grad_norm": 0.11965566128492355, |
| "learning_rate": 1.2473011016845704e-05, |
| "lookahead_loss": 3.667924000263214, |
| "loss": 1.991, |
| "step": 393500 |
| }, |
| { |
| "base_loss": 0.299552004635334, |
| "epoch": 2.0791549682617188, |
| "grad_norm": 0.1270737498998642, |
| "learning_rate": 1.242532730102539e-05, |
| "lookahead_loss": 3.699381884098053, |
| "loss": 2.0014, |
| "step": 394000 |
| }, |
| { |
| "base_loss": 0.31343785190582274, |
| "epoch": 2.080108642578125, |
| "grad_norm": 0.1207314133644104, |
| "learning_rate": 1.237764358520508e-05, |
| "lookahead_loss": 3.725409944534302, |
| "loss": 2.0234, |
| "step": 394500 |
| }, |
| { |
| "base_loss": 0.3193435942828655, |
| "epoch": 2.0810623168945312, |
| "grad_norm": 0.1668887585401535, |
| "learning_rate": 1.2329959869384766e-05, |
| "lookahead_loss": 3.705724129199982, |
| "loss": 2.018, |
| "step": 395000 |
| }, |
| { |
| "epoch": 2.0810623168945312, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1376450054180887, |
| "eval_lookahead_perplexity": 23.049521394202596, |
| "eval_loss": 1.6326426267623901, |
| "eval_perplexity": 5.117380191562451, |
| "eval_runtime": 275.2681, |
| "eval_samples_per_second": 18.164, |
| "eval_steps_per_second": 0.57, |
| "step": 395000 |
| }, |
| { |
| "base_loss": 0.29771795102953913, |
| "epoch": 2.0820159912109375, |
| "grad_norm": 0.13261182606220245, |
| "learning_rate": 1.2282276153564453e-05, |
| "lookahead_loss": 3.674468102455139, |
| "loss": 1.9931, |
| "step": 395500 |
| }, |
| { |
| "base_loss": 0.30634405037760737, |
| "epoch": 2.0829696655273438, |
| "grad_norm": 0.09685485810041428, |
| "learning_rate": 1.2234592437744141e-05, |
| "lookahead_loss": 3.7131715035438537, |
| "loss": 2.0079, |
| "step": 396000 |
| }, |
| { |
| "base_loss": 0.3325966064631939, |
| "epoch": 2.08392333984375, |
| "grad_norm": 0.11063214391469955, |
| "learning_rate": 1.2186908721923828e-05, |
| "lookahead_loss": 3.7380114979743957, |
| "loss": 2.032, |
| "step": 396500 |
| }, |
| { |
| "base_loss": 0.3046146906912327, |
| "epoch": 2.0848770141601562, |
| "grad_norm": 0.14153137803077698, |
| "learning_rate": 1.2139225006103517e-05, |
| "lookahead_loss": 3.6873513193130494, |
| "loss": 1.9979, |
| "step": 397000 |
| }, |
| { |
| "base_loss": 0.2984280304312706, |
| "epoch": 2.0858306884765625, |
| "grad_norm": 0.1063678041100502, |
| "learning_rate": 1.2091541290283204e-05, |
| "lookahead_loss": 3.6935010514259337, |
| "loss": 1.9924, |
| "step": 397500 |
| }, |
| { |
| "base_loss": 0.3047517819106579, |
| "epoch": 2.0867843627929688, |
| "grad_norm": 0.4544115960597992, |
| "learning_rate": 1.204385757446289e-05, |
| "lookahead_loss": 3.697446392059326, |
| "loss": 1.9966, |
| "step": 398000 |
| }, |
| { |
| "base_loss": 0.33619433450698855, |
| "epoch": 2.087738037109375, |
| "grad_norm": 0.10550739616155624, |
| "learning_rate": 1.1996173858642579e-05, |
| "lookahead_loss": 3.7438949031829836, |
| "loss": 2.0327, |
| "step": 398500 |
| }, |
| { |
| "base_loss": 0.30201966351270676, |
| "epoch": 2.0886917114257812, |
| "grad_norm": 0.12323067337274551, |
| "learning_rate": 1.1948490142822266e-05, |
| "lookahead_loss": 3.6697822360992434, |
| "loss": 1.9862, |
| "step": 399000 |
| }, |
| { |
| "base_loss": 0.308837145447731, |
| "epoch": 2.0896453857421875, |
| "grad_norm": 0.11336886882781982, |
| "learning_rate": 1.1900806427001954e-05, |
| "lookahead_loss": 3.667280921936035, |
| "loss": 1.9871, |
| "step": 399500 |
| }, |
| { |
| "base_loss": 0.2996679684817791, |
| "epoch": 2.0905990600585938, |
| "grad_norm": 0.10357397049665451, |
| "learning_rate": 1.1853122711181641e-05, |
| "lookahead_loss": 3.6947066493034364, |
| "loss": 1.9992, |
| "step": 400000 |
| }, |
| { |
| "epoch": 2.0905990600585938, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.135611410719899, |
| "eval_lookahead_perplexity": 23.00269563814152, |
| "eval_loss": 1.6316173076629639, |
| "eval_perplexity": 5.112135932891737, |
| "eval_runtime": 263.2362, |
| "eval_samples_per_second": 18.994, |
| "eval_steps_per_second": 0.596, |
| "step": 400000 |
| }, |
| { |
| "base_loss": 0.3001748549044132, |
| "epoch": 2.091552734375, |
| "grad_norm": 0.10686542093753815, |
| "learning_rate": 1.1805438995361328e-05, |
| "lookahead_loss": 3.696131613254547, |
| "loss": 1.9946, |
| "step": 400500 |
| }, |
| { |
| "base_loss": 0.32454153364896776, |
| "epoch": 2.0925064086914062, |
| "grad_norm": 0.16283544898033142, |
| "learning_rate": 1.1757755279541016e-05, |
| "lookahead_loss": 3.719967551231384, |
| "loss": 2.0177, |
| "step": 401000 |
| }, |
| { |
| "base_loss": 0.3073597291409969, |
| "epoch": 2.0934600830078125, |
| "grad_norm": 0.13384558260440826, |
| "learning_rate": 1.1710071563720703e-05, |
| "lookahead_loss": 3.679731466293335, |
| "loss": 1.9974, |
| "step": 401500 |
| }, |
| { |
| "base_loss": 0.2906791627705097, |
| "epoch": 2.0944137573242188, |
| "grad_norm": 0.16118714213371277, |
| "learning_rate": 1.1662387847900392e-05, |
| "lookahead_loss": 3.6603454394340513, |
| "loss": 1.9723, |
| "step": 402000 |
| }, |
| { |
| "base_loss": 0.2977730810046196, |
| "epoch": 2.095367431640625, |
| "grad_norm": 0.1117246001958847, |
| "learning_rate": 1.1614704132080079e-05, |
| "lookahead_loss": 3.671906901359558, |
| "loss": 1.9866, |
| "step": 402500 |
| }, |
| { |
| "base_loss": 0.3014255873262882, |
| "epoch": 2.0963211059570312, |
| "grad_norm": 0.22929483652114868, |
| "learning_rate": 1.1567020416259765e-05, |
| "lookahead_loss": 3.6827497115135195, |
| "loss": 1.9928, |
| "step": 403000 |
| }, |
| { |
| "base_loss": 0.3342979139983654, |
| "epoch": 2.0972747802734375, |
| "grad_norm": 0.10726416856050491, |
| "learning_rate": 1.1519336700439454e-05, |
| "lookahead_loss": 3.723692095756531, |
| "loss": 2.0222, |
| "step": 403500 |
| }, |
| { |
| "base_loss": 0.29461920487880705, |
| "epoch": 2.0982284545898438, |
| "grad_norm": 0.11667662858963013, |
| "learning_rate": 1.147165298461914e-05, |
| "lookahead_loss": 3.6522948231697083, |
| "loss": 1.9764, |
| "step": 404000 |
| }, |
| { |
| "base_loss": 0.29449185797572136, |
| "epoch": 2.09918212890625, |
| "grad_norm": 0.09684642404317856, |
| "learning_rate": 1.142396926879883e-05, |
| "lookahead_loss": 3.6839715528488157, |
| "loss": 1.9921, |
| "step": 404500 |
| }, |
| { |
| "base_loss": 0.3004076217412949, |
| "epoch": 2.1001358032226562, |
| "grad_norm": 0.1516093909740448, |
| "learning_rate": 1.1376285552978516e-05, |
| "lookahead_loss": 3.6948669986724854, |
| "loss": 2.0012, |
| "step": 405000 |
| }, |
| { |
| "epoch": 2.1001358032226562, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.133693779619357, |
| "eval_lookahead_perplexity": 22.95862722057799, |
| "eval_loss": 1.6306617259979248, |
| "eval_perplexity": 5.107253202820293, |
| "eval_runtime": 275.9898, |
| "eval_samples_per_second": 18.117, |
| "eval_steps_per_second": 0.569, |
| "step": 405000 |
| }, |
| { |
| "base_loss": 0.3221150857806206, |
| "epoch": 2.1010894775390625, |
| "grad_norm": 0.09898355603218079, |
| "learning_rate": 1.1328601837158203e-05, |
| "lookahead_loss": 3.7262251405715943, |
| "loss": 2.0191, |
| "step": 405500 |
| }, |
| { |
| "base_loss": 0.3021469285786152, |
| "epoch": 2.1020431518554688, |
| "grad_norm": 0.14063310623168945, |
| "learning_rate": 1.1280918121337891e-05, |
| "lookahead_loss": 3.6706616439819335, |
| "loss": 1.9854, |
| "step": 406000 |
| }, |
| { |
| "base_loss": 0.29727980035543444, |
| "epoch": 2.102996826171875, |
| "grad_norm": 0.12236055731773376, |
| "learning_rate": 1.1233234405517578e-05, |
| "lookahead_loss": 3.6699202404022215, |
| "loss": 1.9916, |
| "step": 406500 |
| }, |
| { |
| "base_loss": 0.2964036027789116, |
| "epoch": 2.1039505004882812, |
| "grad_norm": 0.10163906216621399, |
| "learning_rate": 1.1185550689697267e-05, |
| "lookahead_loss": 3.6987908611297606, |
| "loss": 2.0006, |
| "step": 407000 |
| }, |
| { |
| "base_loss": 0.3138076714575291, |
| "epoch": 2.1049041748046875, |
| "grad_norm": 0.10581351071596146, |
| "learning_rate": 1.1137866973876954e-05, |
| "lookahead_loss": 3.700042960166931, |
| "loss": 2.0104, |
| "step": 407500 |
| }, |
| { |
| "base_loss": 0.31011119556427, |
| "epoch": 2.1058578491210938, |
| "grad_norm": 0.14030158519744873, |
| "learning_rate": 1.109018325805664e-05, |
| "lookahead_loss": 3.677447699546814, |
| "loss": 1.9897, |
| "step": 408000 |
| }, |
| { |
| "base_loss": 0.2961321137845516, |
| "epoch": 2.1068115234375, |
| "grad_norm": 0.10945964604616165, |
| "learning_rate": 1.1042499542236329e-05, |
| "lookahead_loss": 3.6609533157348633, |
| "loss": 1.9796, |
| "step": 408500 |
| }, |
| { |
| "base_loss": 0.2940867764055729, |
| "epoch": 2.1077651977539062, |
| "grad_norm": 0.14712247252464294, |
| "learning_rate": 1.0994815826416016e-05, |
| "lookahead_loss": 3.684739191532135, |
| "loss": 1.9891, |
| "step": 409000 |
| }, |
| { |
| "base_loss": 0.32057751885056496, |
| "epoch": 2.1087188720703125, |
| "grad_norm": 0.12016324698925018, |
| "learning_rate": 1.0947132110595704e-05, |
| "lookahead_loss": 3.7167062678337097, |
| "loss": 2.0188, |
| "step": 409500 |
| }, |
| { |
| "base_loss": 0.3168235483467579, |
| "epoch": 2.1096725463867188, |
| "grad_norm": 0.11053480952978134, |
| "learning_rate": 1.0899448394775391e-05, |
| "lookahead_loss": 3.685606789112091, |
| "loss": 2.0024, |
| "step": 410000 |
| }, |
| { |
| "epoch": 2.1096725463867188, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1321658196921547, |
| "eval_lookahead_perplexity": 22.923574144868628, |
| "eval_loss": 1.6299090385437012, |
| "eval_perplexity": 5.103410483773616, |
| "eval_runtime": 277.1361, |
| "eval_samples_per_second": 18.042, |
| "eval_steps_per_second": 0.567, |
| "step": 410000 |
| }, |
| { |
| "base_loss": 0.30013936150074005, |
| "epoch": 2.110626220703125, |
| "grad_norm": 0.10985679924488068, |
| "learning_rate": 1.0851764678955078e-05, |
| "lookahead_loss": 3.661238037109375, |
| "loss": 1.9809, |
| "step": 410500 |
| }, |
| { |
| "base_loss": 0.29884218820929526, |
| "epoch": 2.1115798950195312, |
| "grad_norm": 0.2866235077381134, |
| "learning_rate": 1.0804080963134766e-05, |
| "lookahead_loss": 3.697595094203949, |
| "loss": 1.9953, |
| "step": 411000 |
| }, |
| { |
| "base_loss": 0.30719696512818334, |
| "epoch": 2.1125335693359375, |
| "grad_norm": 0.12627729773521423, |
| "learning_rate": 1.0756397247314453e-05, |
| "lookahead_loss": 3.7029927005767824, |
| "loss": 2.0056, |
| "step": 411500 |
| }, |
| { |
| "base_loss": 0.3448580102622509, |
| "epoch": 2.1134872436523438, |
| "grad_norm": 0.11490114778280258, |
| "learning_rate": 1.0708713531494142e-05, |
| "lookahead_loss": 3.7366786670684813, |
| "loss": 2.0333, |
| "step": 412000 |
| }, |
| { |
| "base_loss": 0.29588871854543686, |
| "epoch": 2.11444091796875, |
| "grad_norm": 0.11319919675588608, |
| "learning_rate": 1.0661029815673829e-05, |
| "lookahead_loss": 3.652657012939453, |
| "loss": 1.9766, |
| "step": 412500 |
| }, |
| { |
| "base_loss": 0.3001244888305664, |
| "epoch": 2.1153945922851562, |
| "grad_norm": 0.383401095867157, |
| "learning_rate": 1.0613346099853515e-05, |
| "lookahead_loss": 3.699717406749725, |
| "loss": 2.0031, |
| "step": 413000 |
| }, |
| { |
| "base_loss": 0.3110040880739689, |
| "epoch": 2.1163482666015625, |
| "grad_norm": 0.16206490993499756, |
| "learning_rate": 1.0565662384033204e-05, |
| "lookahead_loss": 3.6906900959014894, |
| "loss": 2.0034, |
| "step": 413500 |
| }, |
| { |
| "base_loss": 0.32364195665717127, |
| "epoch": 2.1173019409179688, |
| "grad_norm": 0.10381924360990524, |
| "learning_rate": 1.051797866821289e-05, |
| "lookahead_loss": 3.726861256599426, |
| "loss": 2.0293, |
| "step": 414000 |
| }, |
| { |
| "base_loss": 0.30665904381871223, |
| "epoch": 2.118255615234375, |
| "grad_norm": 0.10144428163766861, |
| "learning_rate": 1.047029495239258e-05, |
| "lookahead_loss": 3.686223955631256, |
| "loss": 1.9927, |
| "step": 414500 |
| }, |
| { |
| "base_loss": 0.30039039224386216, |
| "epoch": 2.1192092895507812, |
| "grad_norm": 0.1295783519744873, |
| "learning_rate": 1.0422611236572266e-05, |
| "lookahead_loss": 3.6910682001113893, |
| "loss": 1.9947, |
| "step": 415000 |
| }, |
| { |
| "epoch": 2.1192092895507812, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.130569809161055, |
| "eval_lookahead_perplexity": 22.88701705962949, |
| "eval_loss": 1.629108190536499, |
| "eval_perplexity": 5.099325063776334, |
| "eval_runtime": 260.4647, |
| "eval_samples_per_second": 19.196, |
| "eval_steps_per_second": 0.603, |
| "step": 415000 |
| }, |
| { |
| "base_loss": 0.30430060923099517, |
| "epoch": 3.0009536743164062, |
| "grad_norm": 0.11666166037321091, |
| "learning_rate": 1.0374927520751953e-05, |
| "lookahead_loss": 3.7038714718818664, |
| "loss": 1.9999, |
| "step": 415500 |
| }, |
| { |
| "base_loss": 0.3014007512629032, |
| "epoch": 3.0019073486328125, |
| "grad_norm": 0.17477287352085114, |
| "learning_rate": 1.0327243804931641e-05, |
| "lookahead_loss": 3.6869843678474425, |
| "loss": 1.9959, |
| "step": 416000 |
| }, |
| { |
| "base_loss": 0.30987949097156525, |
| "epoch": 3.0028610229492188, |
| "grad_norm": 0.1018444299697876, |
| "learning_rate": 1.0279560089111328e-05, |
| "lookahead_loss": 3.7006162996292113, |
| "loss": 1.9985, |
| "step": 416500 |
| }, |
| { |
| "base_loss": 0.31901577454805374, |
| "epoch": 3.003814697265625, |
| "grad_norm": 0.11105263233184814, |
| "learning_rate": 1.0231876373291017e-05, |
| "lookahead_loss": 3.7003021211624145, |
| "loss": 2.0117, |
| "step": 417000 |
| }, |
| { |
| "base_loss": 0.30133016020059583, |
| "epoch": 3.0047683715820312, |
| "grad_norm": 0.09324323385953903, |
| "learning_rate": 1.0184192657470704e-05, |
| "lookahead_loss": 3.6617124271392822, |
| "loss": 1.9858, |
| "step": 417500 |
| }, |
| { |
| "base_loss": 0.29890421107411386, |
| "epoch": 3.0057220458984375, |
| "grad_norm": 0.10885365307331085, |
| "learning_rate": 1.013650894165039e-05, |
| "lookahead_loss": 3.6865826263427732, |
| "loss": 1.9927, |
| "step": 418000 |
| }, |
| { |
| "base_loss": 0.2965673512518406, |
| "epoch": 3.0066757202148438, |
| "grad_norm": 0.10601469874382019, |
| "learning_rate": 1.0088825225830079e-05, |
| "lookahead_loss": 3.6957881078720094, |
| "loss": 2.0004, |
| "step": 418500 |
| }, |
| { |
| "base_loss": 0.309929815903306, |
| "epoch": 3.00762939453125, |
| "grad_norm": 0.11855798214673996, |
| "learning_rate": 1.0041141510009766e-05, |
| "lookahead_loss": 3.705542845726013, |
| "loss": 2.0052, |
| "step": 419000 |
| }, |
| { |
| "base_loss": 0.3150366614460945, |
| "epoch": 3.0085830688476562, |
| "grad_norm": 0.11749642342329025, |
| "learning_rate": 9.993457794189454e-06, |
| "lookahead_loss": 3.6966953639984133, |
| "loss": 1.9993, |
| "step": 419500 |
| }, |
| { |
| "base_loss": 0.2985053049325943, |
| "epoch": 3.0095367431640625, |
| "grad_norm": 0.11532973498106003, |
| "learning_rate": 9.945774078369141e-06, |
| "lookahead_loss": 3.6538707246780397, |
| "loss": 1.9834, |
| "step": 420000 |
| }, |
| { |
| "epoch": 3.0095367431640625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.129125172337785, |
| "eval_lookahead_perplexity": 22.85397750283659, |
| "eval_loss": 1.6283732652664185, |
| "eval_perplexity": 5.095578817700681, |
| "eval_runtime": 275.3938, |
| "eval_samples_per_second": 18.156, |
| "eval_steps_per_second": 0.57, |
| "step": 420000 |
| }, |
| { |
| "base_loss": 0.299905475795269, |
| "epoch": 3.0104904174804688, |
| "grad_norm": 0.10150377452373505, |
| "learning_rate": 9.898090362548828e-06, |
| "lookahead_loss": 3.694345196247101, |
| "loss": 1.9941, |
| "step": 420500 |
| }, |
| { |
| "base_loss": 0.3031460309624672, |
| "epoch": 3.011444091796875, |
| "grad_norm": 0.1087944358587265, |
| "learning_rate": 9.850406646728516e-06, |
| "lookahead_loss": 3.6788615469932555, |
| "loss": 1.9953, |
| "step": 421000 |
| }, |
| { |
| "base_loss": 0.32608956068754197, |
| "epoch": 3.0123977661132812, |
| "grad_norm": 0.09304623305797577, |
| "learning_rate": 9.802722930908203e-06, |
| "lookahead_loss": 3.7108820514678955, |
| "loss": 2.0181, |
| "step": 421500 |
| }, |
| { |
| "base_loss": 0.3085035228431225, |
| "epoch": 3.0133514404296875, |
| "grad_norm": 0.11764844506978989, |
| "learning_rate": 9.755039215087892e-06, |
| "lookahead_loss": 3.6660648827552795, |
| "loss": 1.99, |
| "step": 422000 |
| }, |
| { |
| "base_loss": 0.2973440226018429, |
| "epoch": 3.0143051147460938, |
| "grad_norm": 0.11751745641231537, |
| "learning_rate": 9.707355499267579e-06, |
| "lookahead_loss": 3.6628987169265748, |
| "loss": 1.981, |
| "step": 422500 |
| }, |
| { |
| "base_loss": 0.2965872138440609, |
| "epoch": 3.0152587890625, |
| "grad_norm": 0.11600304394960403, |
| "learning_rate": 9.659671783447265e-06, |
| "lookahead_loss": 3.6893741731643677, |
| "loss": 1.9923, |
| "step": 423000 |
| }, |
| { |
| "base_loss": 0.31338943153619764, |
| "epoch": 3.0162124633789062, |
| "grad_norm": 0.1297282874584198, |
| "learning_rate": 9.611988067626954e-06, |
| "lookahead_loss": 3.703455045223236, |
| "loss": 2.0093, |
| "step": 423500 |
| }, |
| { |
| "base_loss": 0.3127202790379524, |
| "epoch": 3.0171661376953125, |
| "grad_norm": 0.1299683153629303, |
| "learning_rate": 9.56430435180664e-06, |
| "lookahead_loss": 3.697984820842743, |
| "loss": 2.0007, |
| "step": 424000 |
| }, |
| { |
| "base_loss": 0.30206070256233214, |
| "epoch": 3.0181198120117188, |
| "grad_norm": 0.1379324048757553, |
| "learning_rate": 9.51662063598633e-06, |
| "lookahead_loss": 3.667839276313782, |
| "loss": 1.9805, |
| "step": 424500 |
| }, |
| { |
| "base_loss": 0.2996631888449192, |
| "epoch": 3.019073486328125, |
| "grad_norm": 0.10621386021375656, |
| "learning_rate": 9.468936920166016e-06, |
| "lookahead_loss": 3.700213254451752, |
| "loss": 2.0011, |
| "step": 425000 |
| }, |
| { |
| "epoch": 3.019073486328125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1276195155926785, |
| "eval_lookahead_perplexity": 22.819593149469497, |
| "eval_loss": 1.6276158094406128, |
| "eval_perplexity": 5.0917206032373405, |
| "eval_runtime": 264.8437, |
| "eval_samples_per_second": 18.879, |
| "eval_steps_per_second": 0.593, |
| "step": 425000 |
| }, |
| { |
| "base_loss": 0.3018102090358734, |
| "epoch": 3.0200271606445312, |
| "grad_norm": 0.121736079454422, |
| "learning_rate": 9.421253204345703e-06, |
| "lookahead_loss": 3.6784856877326964, |
| "loss": 1.9918, |
| "step": 425500 |
| }, |
| { |
| "base_loss": 0.32929644933342933, |
| "epoch": 3.0209808349609375, |
| "grad_norm": 0.14993664622306824, |
| "learning_rate": 9.373569488525391e-06, |
| "lookahead_loss": 3.7336047492027284, |
| "loss": 2.0255, |
| "step": 426000 |
| }, |
| { |
| "base_loss": 0.3050749698281288, |
| "epoch": 3.0219345092773438, |
| "grad_norm": 0.10891906917095184, |
| "learning_rate": 9.325885772705078e-06, |
| "lookahead_loss": 3.6635623302459717, |
| "loss": 1.98, |
| "step": 426500 |
| }, |
| { |
| "base_loss": 0.29926853865385056, |
| "epoch": 3.02288818359375, |
| "grad_norm": 0.1561029702425003, |
| "learning_rate": 9.278202056884767e-06, |
| "lookahead_loss": 3.6687038259506224, |
| "loss": 1.9894, |
| "step": 427000 |
| }, |
| { |
| "base_loss": 0.30300188970565795, |
| "epoch": 3.0238418579101562, |
| "grad_norm": 0.11594616621732712, |
| "learning_rate": 9.230518341064454e-06, |
| "lookahead_loss": 3.677744508266449, |
| "loss": 1.9921, |
| "step": 427500 |
| }, |
| { |
| "base_loss": 0.3265539970099926, |
| "epoch": 3.0247955322265625, |
| "grad_norm": 0.10191618651151657, |
| "learning_rate": 9.18283462524414e-06, |
| "lookahead_loss": 3.7241694231033327, |
| "loss": 2.0228, |
| "step": 428000 |
| }, |
| { |
| "base_loss": 0.3085910253226757, |
| "epoch": 3.0257492065429688, |
| "grad_norm": 0.12174220383167267, |
| "learning_rate": 9.135150909423829e-06, |
| "lookahead_loss": 3.667691098690033, |
| "loss": 1.9951, |
| "step": 428500 |
| }, |
| { |
| "base_loss": 0.3028672685772181, |
| "epoch": 3.026702880859375, |
| "grad_norm": 0.12968415021896362, |
| "learning_rate": 9.087467193603516e-06, |
| "lookahead_loss": 3.688646504402161, |
| "loss": 1.9897, |
| "step": 429000 |
| }, |
| { |
| "base_loss": 0.3085927827656269, |
| "epoch": 3.0276565551757812, |
| "grad_norm": 0.11275982856750488, |
| "learning_rate": 9.039783477783204e-06, |
| "lookahead_loss": 3.697152105331421, |
| "loss": 1.9988, |
| "step": 429500 |
| }, |
| { |
| "base_loss": 0.3330037909448147, |
| "epoch": 3.0286102294921875, |
| "grad_norm": 0.13144977390766144, |
| "learning_rate": 8.992099761962891e-06, |
| "lookahead_loss": 3.739767518520355, |
| "loss": 2.0306, |
| "step": 430000 |
| }, |
| { |
| "epoch": 3.0286102294921875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.126402835876416, |
| "eval_lookahead_perplexity": 22.79184589653397, |
| "eval_loss": 1.6270209550857544, |
| "eval_perplexity": 5.088692671741188, |
| "eval_runtime": 277.6859, |
| "eval_samples_per_second": 18.006, |
| "eval_steps_per_second": 0.565, |
| "step": 430000 |
| }, |
| { |
| "base_loss": 0.3037443687617779, |
| "epoch": 3.0295639038085938, |
| "grad_norm": 0.11082852631807327, |
| "learning_rate": 8.944416046142578e-06, |
| "lookahead_loss": 3.663953601360321, |
| "loss": 1.9849, |
| "step": 430500 |
| }, |
| { |
| "base_loss": 0.3000984548330307, |
| "epoch": 3.030517578125, |
| "grad_norm": 0.15227945148944855, |
| "learning_rate": 8.896732330322266e-06, |
| "lookahead_loss": 3.69063617515564, |
| "loss": 1.9985, |
| "step": 431000 |
| }, |
| { |
| "base_loss": 0.30023023423552514, |
| "epoch": 3.0314712524414062, |
| "grad_norm": 0.11759933084249496, |
| "learning_rate": 8.849048614501953e-06, |
| "lookahead_loss": 3.698117901802063, |
| "loss": 1.9996, |
| "step": 431500 |
| }, |
| { |
| "base_loss": 0.3138752512037754, |
| "epoch": 3.0324249267578125, |
| "grad_norm": 0.13768881559371948, |
| "learning_rate": 8.801364898681642e-06, |
| "lookahead_loss": 3.718590494632721, |
| "loss": 2.0225, |
| "step": 432000 |
| }, |
| { |
| "base_loss": 0.3038401392996311, |
| "epoch": 3.0333786010742188, |
| "grad_norm": 0.16651467978954315, |
| "learning_rate": 8.753681182861329e-06, |
| "lookahead_loss": 3.6639317264556883, |
| "loss": 1.9836, |
| "step": 432500 |
| }, |
| { |
| "base_loss": 0.302562724173069, |
| "epoch": 3.034332275390625, |
| "grad_norm": 0.1357959806919098, |
| "learning_rate": 8.705997467041015e-06, |
| "lookahead_loss": 3.694962691307068, |
| "loss": 2.0013, |
| "step": 433000 |
| }, |
| { |
| "base_loss": 0.30816466361284256, |
| "epoch": 3.0352859497070312, |
| "grad_norm": 0.11627933382987976, |
| "learning_rate": 8.658313751220704e-06, |
| "lookahead_loss": 3.68630348110199, |
| "loss": 2.0, |
| "step": 433500 |
| }, |
| { |
| "base_loss": 0.32414969062805177, |
| "epoch": 3.0362396240234375, |
| "grad_norm": 0.10137467831373215, |
| "learning_rate": 8.61063003540039e-06, |
| "lookahead_loss": 3.7200622777938843, |
| "loss": 2.0191, |
| "step": 434000 |
| }, |
| { |
| "base_loss": 0.3057764404714107, |
| "epoch": 3.0371932983398438, |
| "grad_norm": 0.11666380614042282, |
| "learning_rate": 8.56294631958008e-06, |
| "lookahead_loss": 3.675651388168335, |
| "loss": 1.992, |
| "step": 434500 |
| }, |
| { |
| "base_loss": 0.2984813822805881, |
| "epoch": 3.03814697265625, |
| "grad_norm": 0.13813932240009308, |
| "learning_rate": 8.515262603759766e-06, |
| "lookahead_loss": 3.667681806087494, |
| "loss": 1.9876, |
| "step": 435000 |
| }, |
| { |
| "epoch": 3.03814697265625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.125287823783704, |
| "eval_lookahead_perplexity": 22.76644687548336, |
| "eval_loss": 1.6264574527740479, |
| "eval_perplexity": 5.08582598942401, |
| "eval_runtime": 276.6213, |
| "eval_samples_per_second": 18.075, |
| "eval_steps_per_second": 0.568, |
| "step": 435000 |
| }, |
| { |
| "base_loss": 0.308715908408165, |
| "epoch": 3.0391006469726562, |
| "grad_norm": 0.12543562054634094, |
| "learning_rate": 8.467578887939453e-06, |
| "lookahead_loss": 3.693666660785675, |
| "loss": 2.006, |
| "step": 435500 |
| }, |
| { |
| "base_loss": 0.3176246392726898, |
| "epoch": 3.0400543212890625, |
| "grad_norm": 0.15535596013069153, |
| "learning_rate": 8.419895172119141e-06, |
| "lookahead_loss": 3.715300820350647, |
| "loss": 2.0146, |
| "step": 436000 |
| }, |
| { |
| "base_loss": 0.30790746420621873, |
| "epoch": 3.0410079956054688, |
| "grad_norm": 0.11093998700380325, |
| "learning_rate": 8.372211456298828e-06, |
| "lookahead_loss": 3.674683918952942, |
| "loss": 1.987, |
| "step": 436500 |
| }, |
| { |
| "base_loss": 0.29577805346250535, |
| "epoch": 3.041961669921875, |
| "grad_norm": 0.14224250614643097, |
| "learning_rate": 8.324527740478517e-06, |
| "lookahead_loss": 3.6888953552246093, |
| "loss": 1.9934, |
| "step": 437000 |
| }, |
| { |
| "base_loss": 0.30930229860544206, |
| "epoch": 3.0429153442382812, |
| "grad_norm": 0.29056692123413086, |
| "learning_rate": 8.276844024658204e-06, |
| "lookahead_loss": 3.685419809818268, |
| "loss": 2.0001, |
| "step": 437500 |
| }, |
| { |
| "base_loss": 0.3300181960165501, |
| "epoch": 3.0438690185546875, |
| "grad_norm": 0.13487912714481354, |
| "learning_rate": 8.22916030883789e-06, |
| "lookahead_loss": 3.7153477659225462, |
| "loss": 2.0264, |
| "step": 438000 |
| }, |
| { |
| "base_loss": 0.2941302236020565, |
| "epoch": 3.0448226928710938, |
| "grad_norm": 0.18283872306346893, |
| "learning_rate": 8.181476593017579e-06, |
| "lookahead_loss": 3.651518307685852, |
| "loss": 1.9746, |
| "step": 438500 |
| }, |
| { |
| "base_loss": 0.3060891110301018, |
| "epoch": 3.0457763671875, |
| "grad_norm": 0.10058881342411041, |
| "learning_rate": 8.133792877197266e-06, |
| "lookahead_loss": 3.7120121273994444, |
| "loss": 2.0059, |
| "step": 439000 |
| }, |
| { |
| "base_loss": 0.3289530730843544, |
| "epoch": 3.0467300415039062, |
| "grad_norm": 0.11518778651952744, |
| "learning_rate": 8.086109161376954e-06, |
| "lookahead_loss": 3.713506714820862, |
| "loss": 2.019, |
| "step": 439500 |
| }, |
| { |
| "base_loss": 0.32627532437443735, |
| "epoch": 3.0476837158203125, |
| "grad_norm": 0.18812230229377747, |
| "learning_rate": 8.038425445556641e-06, |
| "lookahead_loss": 3.7203544387817384, |
| "loss": 2.0281, |
| "step": 440000 |
| }, |
| { |
| "epoch": 3.0476837158203125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1241645150291273, |
| "eval_lookahead_perplexity": 22.740887484628896, |
| "eval_loss": 1.625902533531189, |
| "eval_perplexity": 5.0830045496246665, |
| "eval_runtime": 268.2319, |
| "eval_samples_per_second": 18.641, |
| "eval_steps_per_second": 0.585, |
| "step": 440000 |
| }, |
| { |
| "base_loss": 0.29505294340848925, |
| "epoch": 3.0486373901367188, |
| "grad_norm": 0.10775741934776306, |
| "learning_rate": 7.990741729736328e-06, |
| "lookahead_loss": 3.655467821598053, |
| "loss": 1.9785, |
| "step": 440500 |
| }, |
| { |
| "base_loss": 0.3058525973558426, |
| "epoch": 3.049591064453125, |
| "grad_norm": 0.100101538002491, |
| "learning_rate": 7.943058013916016e-06, |
| "lookahead_loss": 3.6875068039894106, |
| "loss": 1.9976, |
| "step": 441000 |
| }, |
| { |
| "base_loss": 0.3209580435454845, |
| "epoch": 3.0505447387695312, |
| "grad_norm": 0.0992191731929779, |
| "learning_rate": 7.895374298095703e-06, |
| "lookahead_loss": 3.716779721736908, |
| "loss": 2.0189, |
| "step": 441500 |
| }, |
| { |
| "base_loss": 0.30481894659996034, |
| "epoch": 3.0514984130859375, |
| "grad_norm": 0.14181774854660034, |
| "learning_rate": 7.847690582275392e-06, |
| "lookahead_loss": 3.6599699621200563, |
| "loss": 1.9858, |
| "step": 442000 |
| }, |
| { |
| "base_loss": 0.30778305551409724, |
| "epoch": 3.0524520874023438, |
| "grad_norm": 0.1846870332956314, |
| "learning_rate": 7.800006866455079e-06, |
| "lookahead_loss": 3.68911500453949, |
| "loss": 1.9979, |
| "step": 442500 |
| }, |
| { |
| "base_loss": 0.3220736192762852, |
| "epoch": 3.05340576171875, |
| "grad_norm": 0.11497963219881058, |
| "learning_rate": 7.752323150634765e-06, |
| "lookahead_loss": 3.7116645102500914, |
| "loss": 2.0122, |
| "step": 443000 |
| }, |
| { |
| "base_loss": 0.3551108354330063, |
| "epoch": 3.0543594360351562, |
| "grad_norm": 0.10149979591369629, |
| "learning_rate": 7.704639434814454e-06, |
| "lookahead_loss": 3.746900239944458, |
| "loss": 2.0558, |
| "step": 443500 |
| }, |
| { |
| "base_loss": 0.2914521896839142, |
| "epoch": 3.0553131103515625, |
| "grad_norm": 0.13717815279960632, |
| "learning_rate": 7.65695571899414e-06, |
| "lookahead_loss": 3.639768280506134, |
| "loss": 1.9715, |
| "step": 444000 |
| }, |
| { |
| "base_loss": 0.3070342823863029, |
| "epoch": 3.0562667846679688, |
| "grad_norm": 0.10753431171178818, |
| "learning_rate": 7.6092720031738284e-06, |
| "lookahead_loss": 3.7122844524383547, |
| "loss": 2.0111, |
| "step": 444500 |
| }, |
| { |
| "base_loss": 0.31701629158854483, |
| "epoch": 3.057220458984375, |
| "grad_norm": 0.12341847270727158, |
| "learning_rate": 7.561588287353516e-06, |
| "lookahead_loss": 3.713340766429901, |
| "loss": 2.0165, |
| "step": 445000 |
| }, |
| { |
| "epoch": 3.057220458984375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1230207685464486, |
| "eval_lookahead_perplexity": 22.71489254320056, |
| "eval_loss": 1.6253362894058228, |
| "eval_perplexity": 5.080127142893444, |
| "eval_runtime": 274.3775, |
| "eval_samples_per_second": 18.223, |
| "eval_steps_per_second": 0.572, |
| "step": 445000 |
| }, |
| { |
| "base_loss": 0.3187244310975075, |
| "epoch": 3.0581741333007812, |
| "grad_norm": 0.10979755967855453, |
| "learning_rate": 7.513904571533204e-06, |
| "lookahead_loss": 3.6891490058898926, |
| "loss": 2.0024, |
| "step": 445500 |
| }, |
| { |
| "base_loss": 0.2901874388754368, |
| "epoch": 3.0591278076171875, |
| "grad_norm": 0.09405338019132614, |
| "learning_rate": 7.466220855712891e-06, |
| "lookahead_loss": 3.646950806617737, |
| "loss": 1.9735, |
| "step": 446000 |
| }, |
| { |
| "base_loss": 0.30347892227768897, |
| "epoch": 3.0600814819335938, |
| "grad_norm": 0.09631290286779404, |
| "learning_rate": 7.418537139892578e-06, |
| "lookahead_loss": 3.7013241720199583, |
| "loss": 2.0025, |
| "step": 446500 |
| }, |
| { |
| "base_loss": 0.32231138944625853, |
| "epoch": 3.06103515625, |
| "grad_norm": 0.09189051389694214, |
| "learning_rate": 7.370853424072266e-06, |
| "lookahead_loss": 3.7054029207229613, |
| "loss": 2.0104, |
| "step": 447000 |
| }, |
| { |
| "base_loss": 0.3056237104833126, |
| "epoch": 3.0619888305664062, |
| "grad_norm": 0.1302718222141266, |
| "learning_rate": 7.323169708251954e-06, |
| "lookahead_loss": 3.675487512588501, |
| "loss": 1.9827, |
| "step": 447500 |
| }, |
| { |
| "base_loss": 0.3069985309243202, |
| "epoch": 3.0629425048828125, |
| "grad_norm": 0.11389657109975815, |
| "learning_rate": 7.275485992431641e-06, |
| "lookahead_loss": 3.7175057182312012, |
| "loss": 2.007, |
| "step": 448000 |
| }, |
| { |
| "base_loss": 0.31453078415989877, |
| "epoch": 3.0638961791992188, |
| "grad_norm": 0.1031729131937027, |
| "learning_rate": 7.227802276611328e-06, |
| "lookahead_loss": 3.7119864888191225, |
| "loss": 2.0122, |
| "step": 448500 |
| }, |
| { |
| "base_loss": 0.3025907655358315, |
| "epoch": 3.064849853515625, |
| "grad_norm": 0.1239413321018219, |
| "learning_rate": 7.180118560791016e-06, |
| "lookahead_loss": 3.665122102737427, |
| "loss": 1.9872, |
| "step": 449000 |
| }, |
| { |
| "base_loss": 0.3104621644318104, |
| "epoch": 3.0658035278320312, |
| "grad_norm": 0.13091668486595154, |
| "learning_rate": 7.1324348449707034e-06, |
| "lookahead_loss": 3.685753818035126, |
| "loss": 1.9961, |
| "step": 449500 |
| }, |
| { |
| "base_loss": 0.3051215724647045, |
| "epoch": 3.0667572021484375, |
| "grad_norm": 0.1033690795302391, |
| "learning_rate": 7.084751129150391e-06, |
| "lookahead_loss": 3.6920931324958803, |
| "loss": 1.9991, |
| "step": 450000 |
| }, |
| { |
| "epoch": 3.0667572021484375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1221639020755267, |
| "eval_lookahead_perplexity": 22.695437249874335, |
| "eval_loss": 1.624902367591858, |
| "eval_perplexity": 5.077923243103105, |
| "eval_runtime": 274.3111, |
| "eval_samples_per_second": 18.227, |
| "eval_steps_per_second": 0.572, |
| "step": 450000 |
| }, |
| { |
| "base_loss": 0.3333324840962887, |
| "epoch": 3.0677108764648438, |
| "grad_norm": 0.16154102981090546, |
| "learning_rate": 7.037067413330079e-06, |
| "lookahead_loss": 3.734944314479828, |
| "loss": 2.0298, |
| "step": 450500 |
| }, |
| { |
| "base_loss": 0.3019326252639294, |
| "epoch": 3.06866455078125, |
| "grad_norm": 0.13059785962104797, |
| "learning_rate": 6.989383697509766e-06, |
| "lookahead_loss": 3.6473833932876585, |
| "loss": 1.9719, |
| "step": 451000 |
| }, |
| { |
| "base_loss": 0.29965000972151756, |
| "epoch": 3.0696182250976562, |
| "grad_norm": 0.1245245486497879, |
| "learning_rate": 6.941699981689453e-06, |
| "lookahead_loss": 3.7130830340385437, |
| "loss": 2.0066, |
| "step": 451500 |
| }, |
| { |
| "base_loss": 0.34273114350438116, |
| "epoch": 3.0705718994140625, |
| "grad_norm": 0.1069691926240921, |
| "learning_rate": 6.894016265869141e-06, |
| "lookahead_loss": 3.7431788458824156, |
| "loss": 2.0475, |
| "step": 452000 |
| }, |
| { |
| "base_loss": 0.3139573369324207, |
| "epoch": 3.0715255737304688, |
| "grad_norm": 0.14187411963939667, |
| "learning_rate": 6.846332550048829e-06, |
| "lookahead_loss": 3.6735409541130064, |
| "loss": 1.9913, |
| "step": 452500 |
| }, |
| { |
| "base_loss": 0.306064426034689, |
| "epoch": 3.072479248046875, |
| "grad_norm": 0.12259159982204437, |
| "learning_rate": 6.798648834228516e-06, |
| "lookahead_loss": 3.681407932758331, |
| "loss": 1.9905, |
| "step": 453000 |
| }, |
| { |
| "base_loss": 0.30416936001181605, |
| "epoch": 3.0734329223632812, |
| "grad_norm": 0.09869462251663208, |
| "learning_rate": 6.750965118408203e-06, |
| "lookahead_loss": 3.6899491953849792, |
| "loss": 1.9993, |
| "step": 453500 |
| }, |
| { |
| "base_loss": 0.32993814861774445, |
| "epoch": 3.0743865966796875, |
| "grad_norm": 0.11042125523090363, |
| "learning_rate": 6.703281402587891e-06, |
| "lookahead_loss": 3.7116984300613405, |
| "loss": 2.0173, |
| "step": 454000 |
| }, |
| { |
| "base_loss": 0.3029070964753628, |
| "epoch": 3.0753402709960938, |
| "grad_norm": 0.12257838994264603, |
| "learning_rate": 6.6555976867675784e-06, |
| "lookahead_loss": 3.6630940194129944, |
| "loss": 1.9823, |
| "step": 454500 |
| }, |
| { |
| "base_loss": 0.3045917192697525, |
| "epoch": 3.0762939453125, |
| "grad_norm": 0.1443857103586197, |
| "learning_rate": 6.607913970947266e-06, |
| "lookahead_loss": 3.714505085945129, |
| "loss": 2.01, |
| "step": 455000 |
| }, |
| { |
| "epoch": 3.0762939453125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.121192988496238, |
| "eval_lookahead_perplexity": 22.673412635389344, |
| "eval_loss": 1.6244314908981323, |
| "eval_perplexity": 5.0755327302579625, |
| "eval_runtime": 260.1225, |
| "eval_samples_per_second": 19.222, |
| "eval_steps_per_second": 0.604, |
| "step": 455000 |
| }, |
| { |
| "base_loss": 0.3286729282438755, |
| "epoch": 3.0772476196289062, |
| "grad_norm": 0.1463230848312378, |
| "learning_rate": 6.560230255126954e-06, |
| "lookahead_loss": 3.7237854881286623, |
| "loss": 2.0319, |
| "step": 455500 |
| }, |
| { |
| "base_loss": 0.30402689361572266, |
| "epoch": 3.0782012939453125, |
| "grad_norm": 0.11617710441350937, |
| "learning_rate": 6.512546539306641e-06, |
| "lookahead_loss": 3.656920817375183, |
| "loss": 1.985, |
| "step": 456000 |
| }, |
| { |
| "base_loss": 0.2991543865799904, |
| "epoch": 3.0791549682617188, |
| "grad_norm": 0.11465635150671005, |
| "learning_rate": 6.464862823486328e-06, |
| "lookahead_loss": 3.686256776332855, |
| "loss": 1.9948, |
| "step": 456500 |
| }, |
| { |
| "base_loss": 0.3117095545232296, |
| "epoch": 3.080108642578125, |
| "grad_norm": 0.12070228904485703, |
| "learning_rate": 6.417179107666016e-06, |
| "lookahead_loss": 3.7134584021568298, |
| "loss": 2.0171, |
| "step": 457000 |
| }, |
| { |
| "base_loss": 0.32279202672839163, |
| "epoch": 3.0810623168945312, |
| "grad_norm": 0.16106776893138885, |
| "learning_rate": 6.369495391845704e-06, |
| "lookahead_loss": 3.6951747126579284, |
| "loss": 2.0141, |
| "step": 457500 |
| }, |
| { |
| "base_loss": 0.2984404028356075, |
| "epoch": 3.0820159912109375, |
| "grad_norm": 0.12972743809223175, |
| "learning_rate": 6.321811676025391e-06, |
| "lookahead_loss": 3.663430832386017, |
| "loss": 1.9873, |
| "step": 458000 |
| }, |
| { |
| "base_loss": 0.30507346931099893, |
| "epoch": 3.0829696655273438, |
| "grad_norm": 0.09754678606987, |
| "learning_rate": 6.274127960205078e-06, |
| "lookahead_loss": 3.6983825812339783, |
| "loss": 2.0024, |
| "step": 458500 |
| }, |
| { |
| "base_loss": 0.3304409826993942, |
| "epoch": 3.08392333984375, |
| "grad_norm": 0.10381273180246353, |
| "learning_rate": 6.226444244384766e-06, |
| "lookahead_loss": 3.7253785543441773, |
| "loss": 2.0254, |
| "step": 459000 |
| }, |
| { |
| "base_loss": 0.3062775060236454, |
| "epoch": 3.0848770141601562, |
| "grad_norm": 0.14119604229927063, |
| "learning_rate": 6.1787605285644534e-06, |
| "lookahead_loss": 3.675768536090851, |
| "loss": 1.9917, |
| "step": 459500 |
| }, |
| { |
| "base_loss": 0.2975120039880276, |
| "epoch": 3.0858306884765625, |
| "grad_norm": 0.09802526980638504, |
| "learning_rate": 6.131076812744141e-06, |
| "lookahead_loss": 3.6808967127799987, |
| "loss": 1.9873, |
| "step": 460000 |
| }, |
| { |
| "epoch": 3.0858306884765625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1203504332338277, |
| "eval_lookahead_perplexity": 22.654317077917476, |
| "eval_loss": 1.624000906944275, |
| "eval_perplexity": 5.073347757747845, |
| "eval_runtime": 271.0959, |
| "eval_samples_per_second": 18.444, |
| "eval_steps_per_second": 0.579, |
| "step": 460000 |
| }, |
| { |
| "base_loss": 0.3025768627524376, |
| "epoch": 3.0867843627929688, |
| "grad_norm": 0.4642274081707001, |
| "learning_rate": 6.083393096923829e-06, |
| "lookahead_loss": 3.6851150598526, |
| "loss": 1.9892, |
| "step": 460500 |
| }, |
| { |
| "base_loss": 0.3363469123840332, |
| "epoch": 3.087738037109375, |
| "grad_norm": 0.11057446897029877, |
| "learning_rate": 6.035709381103516e-06, |
| "lookahead_loss": 3.736498592376709, |
| "loss": 2.029, |
| "step": 461000 |
| }, |
| { |
| "base_loss": 0.3032494475245476, |
| "epoch": 3.0886917114257812, |
| "grad_norm": 0.12081364542245865, |
| "learning_rate": 5.988025665283203e-06, |
| "lookahead_loss": 3.658247405529022, |
| "loss": 1.9801, |
| "step": 461500 |
| }, |
| { |
| "base_loss": 0.3093935915529728, |
| "epoch": 3.0896453857421875, |
| "grad_norm": 0.11053162068128586, |
| "learning_rate": 5.940341949462891e-06, |
| "lookahead_loss": 3.657750358581543, |
| "loss": 1.9829, |
| "step": 462000 |
| }, |
| { |
| "base_loss": 0.29919813787937166, |
| "epoch": 3.0905990600585938, |
| "grad_norm": 0.1109168529510498, |
| "learning_rate": 5.892658233642579e-06, |
| "lookahead_loss": 3.680602566242218, |
| "loss": 1.9922, |
| "step": 462500 |
| }, |
| { |
| "base_loss": 0.2975557982325554, |
| "epoch": 3.091552734375, |
| "grad_norm": 0.10788305848836899, |
| "learning_rate": 5.844974517822266e-06, |
| "lookahead_loss": 3.683750663757324, |
| "loss": 1.9885, |
| "step": 463000 |
| }, |
| { |
| "base_loss": 0.3243062843978405, |
| "epoch": 3.0925064086914062, |
| "grad_norm": 0.15955285727977753, |
| "learning_rate": 5.797290802001953e-06, |
| "lookahead_loss": 3.709198553085327, |
| "loss": 2.0119, |
| "step": 463500 |
| }, |
| { |
| "base_loss": 0.308385471701622, |
| "epoch": 3.0934600830078125, |
| "grad_norm": 0.1460312008857727, |
| "learning_rate": 5.749607086181641e-06, |
| "lookahead_loss": 3.66904278755188, |
| "loss": 1.9921, |
| "step": 464000 |
| }, |
| { |
| "base_loss": 0.29028289583325384, |
| "epoch": 3.0944137573242188, |
| "grad_norm": 0.16574956476688385, |
| "learning_rate": 5.7019233703613284e-06, |
| "lookahead_loss": 3.648255637168884, |
| "loss": 1.9668, |
| "step": 464500 |
| }, |
| { |
| "base_loss": 0.29396757900714876, |
| "epoch": 3.095367431640625, |
| "grad_norm": 0.10332682728767395, |
| "learning_rate": 5.654239654541016e-06, |
| "lookahead_loss": 3.657000524997711, |
| "loss": 1.9795, |
| "step": 465000 |
| }, |
| { |
| "epoch": 3.095367431640625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1194351359297294, |
| "eval_lookahead_perplexity": 22.63359112921932, |
| "eval_loss": 1.6235333681106567, |
| "eval_perplexity": 5.070976325066281, |
| "eval_runtime": 259.9944, |
| "eval_samples_per_second": 19.231, |
| "eval_steps_per_second": 0.604, |
| "step": 465000 |
| }, |
| { |
| "base_loss": 0.3007206397950649, |
| "epoch": 3.0963211059570312, |
| "grad_norm": 0.22960178554058075, |
| "learning_rate": 5.606555938720704e-06, |
| "lookahead_loss": 3.670574773788452, |
| "loss": 1.9856, |
| "step": 465500 |
| }, |
| { |
| "base_loss": 0.33368014737963675, |
| "epoch": 3.0972747802734375, |
| "grad_norm": 0.10277973115444183, |
| "learning_rate": 5.558872222900391e-06, |
| "lookahead_loss": 3.712209891319275, |
| "loss": 2.0179, |
| "step": 466000 |
| }, |
| { |
| "base_loss": 0.2953821074962616, |
| "epoch": 3.0982284545898438, |
| "grad_norm": 0.12367592006921768, |
| "learning_rate": 5.511188507080078e-06, |
| "lookahead_loss": 3.64039670085907, |
| "loss": 1.9701, |
| "step": 466500 |
| }, |
| { |
| "base_loss": 0.29632621896266936, |
| "epoch": 3.09918212890625, |
| "grad_norm": 0.10013365000486374, |
| "learning_rate": 5.463504791259766e-06, |
| "lookahead_loss": 3.676180508136749, |
| "loss": 1.9884, |
| "step": 467000 |
| }, |
| { |
| "base_loss": 0.3015917186141014, |
| "epoch": 3.1001358032226562, |
| "grad_norm": 0.1582956314086914, |
| "learning_rate": 5.415821075439454e-06, |
| "lookahead_loss": 3.6843478126525877, |
| "loss": 1.9969, |
| "step": 467500 |
| }, |
| { |
| "base_loss": 0.3215226559937, |
| "epoch": 3.1010894775390625, |
| "grad_norm": 0.10262365639209747, |
| "learning_rate": 5.368137359619141e-06, |
| "lookahead_loss": 3.7145949635505677, |
| "loss": 2.0143, |
| "step": 468000 |
| }, |
| { |
| "base_loss": 0.3008797511458397, |
| "epoch": 3.1020431518554688, |
| "grad_norm": 0.1437883973121643, |
| "learning_rate": 5.320453643798828e-06, |
| "lookahead_loss": 3.6612454042434694, |
| "loss": 1.9818, |
| "step": 468500 |
| }, |
| { |
| "base_loss": 0.29713244566321373, |
| "epoch": 3.102996826171875, |
| "grad_norm": 0.11875070631504059, |
| "learning_rate": 5.272769927978516e-06, |
| "lookahead_loss": 3.658306348800659, |
| "loss": 1.9854, |
| "step": 469000 |
| }, |
| { |
| "base_loss": 0.29707186728715895, |
| "epoch": 3.1039505004882812, |
| "grad_norm": 0.10225356370210648, |
| "learning_rate": 5.2250862121582034e-06, |
| "lookahead_loss": 3.6872463150024415, |
| "loss": 1.9941, |
| "step": 469500 |
| }, |
| { |
| "base_loss": 0.31371429899334907, |
| "epoch": 3.1049041748046875, |
| "grad_norm": 0.11156973242759705, |
| "learning_rate": 5.177402496337891e-06, |
| "lookahead_loss": 3.6897249317169187, |
| "loss": 2.005, |
| "step": 470000 |
| }, |
| { |
| "epoch": 3.1049041748046875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1185580457742224, |
| "eval_lookahead_perplexity": 22.613748132576696, |
| "eval_loss": 1.6231098175048828, |
| "eval_perplexity": 5.068828964761916, |
| "eval_runtime": 273.7848, |
| "eval_samples_per_second": 18.263, |
| "eval_steps_per_second": 0.573, |
| "step": 470000 |
| }, |
| { |
| "base_loss": 0.30964688101410864, |
| "epoch": 3.1058578491210938, |
| "grad_norm": 0.14706304669380188, |
| "learning_rate": 5.129718780517579e-06, |
| "lookahead_loss": 3.6665027503967287, |
| "loss": 1.9853, |
| "step": 470500 |
| }, |
| { |
| "base_loss": 0.2950109542310238, |
| "epoch": 3.1068115234375, |
| "grad_norm": 0.10974892228841782, |
| "learning_rate": 5.082035064697266e-06, |
| "lookahead_loss": 3.649388165473938, |
| "loss": 1.9749, |
| "step": 471000 |
| }, |
| { |
| "base_loss": 0.29701292705535887, |
| "epoch": 3.1077651977539062, |
| "grad_norm": 0.14115644991397858, |
| "learning_rate": 5.034351348876953e-06, |
| "lookahead_loss": 3.6766736326217653, |
| "loss": 1.9849, |
| "step": 471500 |
| }, |
| { |
| "base_loss": 0.3228313593864441, |
| "epoch": 3.1087188720703125, |
| "grad_norm": 0.12014192342758179, |
| "learning_rate": 4.986667633056641e-06, |
| "lookahead_loss": 3.7084100289344786, |
| "loss": 2.0137, |
| "step": 472000 |
| }, |
| { |
| "base_loss": 0.31703535151481627, |
| "epoch": 3.1096725463867188, |
| "grad_norm": 0.11289548873901367, |
| "learning_rate": 4.938983917236329e-06, |
| "lookahead_loss": 3.674653216838837, |
| "loss": 1.9978, |
| "step": 472500 |
| }, |
| { |
| "base_loss": 0.2994038117825985, |
| "epoch": 3.110626220703125, |
| "grad_norm": 0.10673966258764267, |
| "learning_rate": 4.891300201416016e-06, |
| "lookahead_loss": 3.6496841259002686, |
| "loss": 1.9763, |
| "step": 473000 |
| }, |
| { |
| "base_loss": 0.2989161580502987, |
| "epoch": 3.1115798950195312, |
| "grad_norm": 0.28921955823898315, |
| "learning_rate": 4.843616485595703e-06, |
| "lookahead_loss": 3.6857114777565, |
| "loss": 1.9901, |
| "step": 473500 |
| }, |
| { |
| "base_loss": 0.30562417407333853, |
| "epoch": 3.1125335693359375, |
| "grad_norm": 0.1318550556898117, |
| "learning_rate": 4.795932769775391e-06, |
| "lookahead_loss": 3.693369821548462, |
| "loss": 2.0018, |
| "step": 474000 |
| }, |
| { |
| "base_loss": 0.3457943990826607, |
| "epoch": 3.1134872436523438, |
| "grad_norm": 0.11760886013507843, |
| "learning_rate": 4.7482490539550784e-06, |
| "lookahead_loss": 3.7278725819587706, |
| "loss": 2.0282, |
| "step": 474500 |
| }, |
| { |
| "base_loss": 0.2958811685740948, |
| "epoch": 3.11444091796875, |
| "grad_norm": 0.11645074933767319, |
| "learning_rate": 4.700565338134766e-06, |
| "lookahead_loss": 3.6444467964172365, |
| "loss": 1.9711, |
| "step": 475000 |
| }, |
| { |
| "epoch": 3.11444091796875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.11814274498449, |
| "eval_lookahead_perplexity": 22.604358574998717, |
| "eval_loss": 1.6229058504104614, |
| "eval_perplexity": 5.0677951958768555, |
| "eval_runtime": 273.0672, |
| "eval_samples_per_second": 18.311, |
| "eval_steps_per_second": 0.575, |
| "step": 475000 |
| }, |
| { |
| "base_loss": 0.2985372878909111, |
| "epoch": 3.1153945922851562, |
| "grad_norm": 0.3836623430252075, |
| "learning_rate": 4.652881622314453e-06, |
| "lookahead_loss": 3.688412217617035, |
| "loss": 1.9976, |
| "step": 475500 |
| }, |
| { |
| "base_loss": 0.31154814088344573, |
| "epoch": 3.1163482666015625, |
| "grad_norm": 0.161661297082901, |
| "learning_rate": 4.605197906494141e-06, |
| "lookahead_loss": 3.6807560696601866, |
| "loss": 1.9983, |
| "step": 476000 |
| }, |
| { |
| "base_loss": 0.3238145258128643, |
| "epoch": 3.1173019409179688, |
| "grad_norm": 0.10365596413612366, |
| "learning_rate": 4.557514190673828e-06, |
| "lookahead_loss": 3.7186170196533204, |
| "loss": 2.0244, |
| "step": 476500 |
| }, |
| { |
| "base_loss": 0.30326704213023187, |
| "epoch": 3.118255615234375, |
| "grad_norm": 0.10426798462867737, |
| "learning_rate": 4.509830474853516e-06, |
| "lookahead_loss": 3.6723845586776735, |
| "loss": 1.9861, |
| "step": 477000 |
| }, |
| { |
| "base_loss": 0.30094251811504363, |
| "epoch": 3.1192092895507812, |
| "grad_norm": 0.1405145525932312, |
| "learning_rate": 4.462146759033204e-06, |
| "lookahead_loss": 3.6822138290405273, |
| "loss": 1.9895, |
| "step": 477500 |
| }, |
| { |
| "base_loss": 0.3028905067443848, |
| "epoch": 4.000953674316406, |
| "grad_norm": 0.11994955688714981, |
| "learning_rate": 4.4144630432128904e-06, |
| "lookahead_loss": 3.694705171585083, |
| "loss": 1.9938, |
| "step": 478000 |
| }, |
| { |
| "base_loss": 0.30170239555835726, |
| "epoch": 4.0019073486328125, |
| "grad_norm": 0.17390646040439606, |
| "learning_rate": 4.366779327392578e-06, |
| "lookahead_loss": 3.677241044521332, |
| "loss": 1.9921, |
| "step": 478500 |
| }, |
| { |
| "base_loss": 0.3116890364587307, |
| "epoch": 4.002861022949219, |
| "grad_norm": 0.10386445373296738, |
| "learning_rate": 4.319095611572266e-06, |
| "lookahead_loss": 3.693949136734009, |
| "loss": 1.994, |
| "step": 479000 |
| }, |
| { |
| "base_loss": 0.32097554665803907, |
| "epoch": 4.003814697265625, |
| "grad_norm": 0.11139928549528122, |
| "learning_rate": 4.2714118957519534e-06, |
| "lookahead_loss": 3.6914780583381654, |
| "loss": 2.0077, |
| "step": 479500 |
| }, |
| { |
| "base_loss": 0.3001700294613838, |
| "epoch": 4.004768371582031, |
| "grad_norm": 0.09557037055492401, |
| "learning_rate": 4.223728179931641e-06, |
| "lookahead_loss": 3.650711070537567, |
| "loss": 1.9799, |
| "step": 480000 |
| }, |
| { |
| "epoch": 4.004768371582031, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.117379737738222, |
| "eval_lookahead_perplexity": 22.587117863838966, |
| "eval_loss": 1.622514009475708, |
| "eval_perplexity": 5.065809815272263, |
| "eval_runtime": 262.804, |
| "eval_samples_per_second": 19.026, |
| "eval_steps_per_second": 0.597, |
| "step": 480000 |
| }, |
| { |
| "base_loss": 0.29891238936781883, |
| "epoch": 4.0057220458984375, |
| "grad_norm": 0.12393570691347122, |
| "learning_rate": 4.176044464111328e-06, |
| "lookahead_loss": 3.6757360472679137, |
| "loss": 1.9858, |
| "step": 480500 |
| }, |
| { |
| "base_loss": 0.297944345831871, |
| "epoch": 4.006675720214844, |
| "grad_norm": 0.10606178641319275, |
| "learning_rate": 4.128360748291016e-06, |
| "lookahead_loss": 3.688351684093475, |
| "loss": 1.9954, |
| "step": 481000 |
| }, |
| { |
| "base_loss": 0.3113737390637398, |
| "epoch": 4.00762939453125, |
| "grad_norm": 0.12270639836788177, |
| "learning_rate": 4.080677032470703e-06, |
| "lookahead_loss": 3.696225019454956, |
| "loss": 2.0012, |
| "step": 481500 |
| }, |
| { |
| "base_loss": 0.3164871991574764, |
| "epoch": 4.008583068847656, |
| "grad_norm": 0.11441905051469803, |
| "learning_rate": 4.032993316650391e-06, |
| "lookahead_loss": 3.6902763628959656, |
| "loss": 1.9955, |
| "step": 482000 |
| }, |
| { |
| "base_loss": 0.30087858831882475, |
| "epoch": 4.0095367431640625, |
| "grad_norm": 0.113109290599823, |
| "learning_rate": 3.985309600830079e-06, |
| "lookahead_loss": 3.645549575805664, |
| "loss": 1.9779, |
| "step": 482500 |
| }, |
| { |
| "base_loss": 0.29908930853009225, |
| "epoch": 4.010490417480469, |
| "grad_norm": 0.10333634167909622, |
| "learning_rate": 3.9376258850097654e-06, |
| "lookahead_loss": 3.685707633972168, |
| "loss": 1.991, |
| "step": 483000 |
| }, |
| { |
| "base_loss": 0.30296807369589807, |
| "epoch": 4.011444091796875, |
| "grad_norm": 0.09995999187231064, |
| "learning_rate": 3.889942169189453e-06, |
| "lookahead_loss": 3.6709425745010376, |
| "loss": 1.9912, |
| "step": 483500 |
| }, |
| { |
| "base_loss": 0.3258657184243202, |
| "epoch": 4.012397766113281, |
| "grad_norm": 0.0958399698138237, |
| "learning_rate": 3.842258453369141e-06, |
| "lookahead_loss": 3.7037739310264586, |
| "loss": 2.0136, |
| "step": 484000 |
| }, |
| { |
| "base_loss": 0.30683475187420844, |
| "epoch": 4.0133514404296875, |
| "grad_norm": 0.12681221961975098, |
| "learning_rate": 3.7945747375488284e-06, |
| "lookahead_loss": 3.656396279811859, |
| "loss": 1.9858, |
| "step": 484500 |
| }, |
| { |
| "base_loss": 0.2975224345624447, |
| "epoch": 4.014305114746094, |
| "grad_norm": 0.12238068878650665, |
| "learning_rate": 3.7468910217285157e-06, |
| "lookahead_loss": 3.6545630931854247, |
| "loss": 1.9783, |
| "step": 485000 |
| }, |
| { |
| "epoch": 4.014305114746094, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.11680754533591, |
| "eval_lookahead_perplexity": 22.574197383460017, |
| "eval_loss": 1.622229814529419, |
| "eval_perplexity": 5.064370342279065, |
| "eval_runtime": 274.0996, |
| "eval_samples_per_second": 18.242, |
| "eval_steps_per_second": 0.573, |
| "step": 485000 |
| }, |
| { |
| "base_loss": 0.30197526678442954, |
| "epoch": 1.0009536743164062, |
| "grad_norm": 0.11313185840845108, |
| "learning_rate": 3.6992073059082034e-06, |
| "lookahead_loss": 3.693221253395081, |
| "loss": 1.9935, |
| "step": 485500 |
| }, |
| { |
| "base_loss": 0.303896483540535, |
| "epoch": 1.0019073486328125, |
| "grad_norm": 0.167490616440773, |
| "learning_rate": 3.6515235900878906e-06, |
| "lookahead_loss": 3.677141076564789, |
| "loss": 1.9921, |
| "step": 486000 |
| }, |
| { |
| "base_loss": 0.3094813532233238, |
| "epoch": 1.0028610229492188, |
| "grad_norm": 0.10422538220882416, |
| "learning_rate": 3.6038398742675783e-06, |
| "lookahead_loss": 3.6902101335525512, |
| "loss": 1.9925, |
| "step": 486500 |
| }, |
| { |
| "base_loss": 0.3199170651733875, |
| "epoch": 1.003814697265625, |
| "grad_norm": 0.11597315967082977, |
| "learning_rate": 3.556156158447266e-06, |
| "lookahead_loss": 3.691033944129944, |
| "loss": 2.007, |
| "step": 487000 |
| }, |
| { |
| "base_loss": 0.30184895062446593, |
| "epoch": 1.0047683715820312, |
| "grad_norm": 0.09484552592039108, |
| "learning_rate": 3.508472442626953e-06, |
| "lookahead_loss": 3.651390314102173, |
| "loss": 1.9807, |
| "step": 487500 |
| }, |
| { |
| "base_loss": 0.2977984355092049, |
| "epoch": 1.0057220458984375, |
| "grad_norm": 0.11500236392021179, |
| "learning_rate": 3.460788726806641e-06, |
| "lookahead_loss": 3.674714815616608, |
| "loss": 1.9851, |
| "step": 488000 |
| }, |
| { |
| "base_loss": 0.2989386010617018, |
| "epoch": 1.0066757202148438, |
| "grad_norm": 0.10374879837036133, |
| "learning_rate": 3.413105010986328e-06, |
| "lookahead_loss": 3.6858183379173277, |
| "loss": 1.9948, |
| "step": 488500 |
| }, |
| { |
| "base_loss": 0.3137947543263435, |
| "epoch": 1.00762939453125, |
| "grad_norm": 0.12499138712882996, |
| "learning_rate": 3.3654212951660158e-06, |
| "lookahead_loss": 3.698023428440094, |
| "loss": 2.0016, |
| "step": 489000 |
| }, |
| { |
| "base_loss": 0.31258952274918556, |
| "epoch": 1.0085830688476562, |
| "grad_norm": 0.11980710178613663, |
| "learning_rate": 3.3177375793457034e-06, |
| "lookahead_loss": 3.6856638407707214, |
| "loss": 1.993, |
| "step": 489500 |
| }, |
| { |
| "base_loss": 0.3022870315015316, |
| "epoch": 1.0095367431640625, |
| "grad_norm": 0.11947501450777054, |
| "learning_rate": 3.2700538635253907e-06, |
| "lookahead_loss": 3.644882921695709, |
| "loss": 1.9775, |
| "step": 490000 |
| }, |
| { |
| "epoch": 1.0095367431640625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1163531157155386, |
| "eval_lookahead_perplexity": 22.563941330016597, |
| "eval_loss": 1.6219943761825562, |
| "eval_perplexity": 5.063178135648863, |
| "eval_runtime": 270.228, |
| "eval_samples_per_second": 18.503, |
| "eval_steps_per_second": 0.581, |
| "step": 490000 |
| }, |
| { |
| "base_loss": 0.29763062146306035, |
| "epoch": 1.0104904174804688, |
| "grad_norm": 0.1063813641667366, |
| "learning_rate": 3.2223701477050784e-06, |
| "lookahead_loss": 3.681282151699066, |
| "loss": 1.9884, |
| "step": 490500 |
| }, |
| { |
| "base_loss": 0.3011737278997898, |
| "epoch": 1.011444091796875, |
| "grad_norm": 0.10796815901994705, |
| "learning_rate": 3.1746864318847656e-06, |
| "lookahead_loss": 3.6680510277748106, |
| "loss": 1.9893, |
| "step": 491000 |
| }, |
| { |
| "base_loss": 0.32275081843137743, |
| "epoch": 1.0123977661132812, |
| "grad_norm": 0.08862055093050003, |
| "learning_rate": 3.1270027160644533e-06, |
| "lookahead_loss": 3.700777189731598, |
| "loss": 2.0121, |
| "step": 491500 |
| }, |
| { |
| "base_loss": 0.30656733042001727, |
| "epoch": 1.0133514404296875, |
| "grad_norm": 0.11730780452489853, |
| "learning_rate": 3.079319000244141e-06, |
| "lookahead_loss": 3.6528478350639344, |
| "loss": 1.984, |
| "step": 492000 |
| }, |
| { |
| "base_loss": 0.29944423550367355, |
| "epoch": 1.0143051147460938, |
| "grad_norm": 0.11960422992706299, |
| "learning_rate": 3.031635284423828e-06, |
| "lookahead_loss": 3.654515419960022, |
| "loss": 1.9772, |
| "step": 492500 |
| }, |
| { |
| "base_loss": 0.29441548812389373, |
| "epoch": 1.0152587890625, |
| "grad_norm": 0.12259198725223541, |
| "learning_rate": 2.983951568603516e-06, |
| "lookahead_loss": 3.6766817421913145, |
| "loss": 1.9865, |
| "step": 493000 |
| }, |
| { |
| "base_loss": 0.31012057706713675, |
| "epoch": 1.0162124633789062, |
| "grad_norm": 0.12037604302167892, |
| "learning_rate": 2.936267852783203e-06, |
| "lookahead_loss": 3.6906212878227236, |
| "loss": 2.0039, |
| "step": 493500 |
| }, |
| { |
| "base_loss": 0.3121089872717857, |
| "epoch": 1.0171661376953125, |
| "grad_norm": 0.13175299763679504, |
| "learning_rate": 2.8885841369628908e-06, |
| "lookahead_loss": 3.687456472873688, |
| "loss": 1.9945, |
| "step": 494000 |
| }, |
| { |
| "base_loss": 0.30401164934039115, |
| "epoch": 1.0181198120117188, |
| "grad_norm": 0.13393071293830872, |
| "learning_rate": 2.8409004211425784e-06, |
| "lookahead_loss": 3.6606993680000306, |
| "loss": 1.9758, |
| "step": 494500 |
| }, |
| { |
| "base_loss": 0.29751659095287325, |
| "epoch": 1.019073486328125, |
| "grad_norm": 0.1026306077837944, |
| "learning_rate": 2.7932167053222657e-06, |
| "lookahead_loss": 3.6879693541526795, |
| "loss": 1.994, |
| "step": 495000 |
| }, |
| { |
| "epoch": 1.019073486328125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1158480590905624, |
| "eval_lookahead_perplexity": 22.552548139307703, |
| "eval_loss": 1.6217412948608398, |
| "eval_perplexity": 5.061896901969204, |
| "eval_runtime": 275.022, |
| "eval_samples_per_second": 18.18, |
| "eval_steps_per_second": 0.571, |
| "step": 495000 |
| }, |
| { |
| "base_loss": 0.30135778871178626, |
| "epoch": 1.0200271606445312, |
| "grad_norm": 0.12621034681797028, |
| "learning_rate": 2.7455329895019534e-06, |
| "lookahead_loss": 3.668645941734314, |
| "loss": 1.9875, |
| "step": 495500 |
| }, |
| { |
| "base_loss": 0.33046225929260253, |
| "epoch": 1.0209808349609375, |
| "grad_norm": 0.14475296437740326, |
| "learning_rate": 2.6978492736816406e-06, |
| "lookahead_loss": 3.725844889640808, |
| "loss": 2.0219, |
| "step": 496000 |
| }, |
| { |
| "base_loss": 0.30414657789468763, |
| "epoch": 1.0219345092773438, |
| "grad_norm": 0.10405510663986206, |
| "learning_rate": 2.6501655578613283e-06, |
| "lookahead_loss": 3.65455238866806, |
| "loss": 1.9765, |
| "step": 496500 |
| }, |
| { |
| "base_loss": 0.30107878148555756, |
| "epoch": 1.02288818359375, |
| "grad_norm": 0.15244410932064056, |
| "learning_rate": 2.602481842041016e-06, |
| "lookahead_loss": 3.661600576877594, |
| "loss": 1.9842, |
| "step": 497000 |
| }, |
| { |
| "base_loss": 0.3019954281449318, |
| "epoch": 1.0238418579101562, |
| "grad_norm": 0.10435742884874344, |
| "learning_rate": 2.554798126220703e-06, |
| "lookahead_loss": 3.6684547848701476, |
| "loss": 1.9873, |
| "step": 497500 |
| }, |
| { |
| "base_loss": 0.3265440165698528, |
| "epoch": 1.0247955322265625, |
| "grad_norm": 0.11240936815738678, |
| "learning_rate": 2.507114410400391e-06, |
| "lookahead_loss": 3.7140092349052427, |
| "loss": 2.0185, |
| "step": 498000 |
| }, |
| { |
| "base_loss": 0.3089427370727062, |
| "epoch": 1.0257492065429688, |
| "grad_norm": 0.12863673269748688, |
| "learning_rate": 2.459430694580078e-06, |
| "lookahead_loss": 3.660076278209686, |
| "loss": 1.9887, |
| "step": 498500 |
| }, |
| { |
| "base_loss": 0.306296229749918, |
| "epoch": 1.026702880859375, |
| "grad_norm": 0.13621826469898224, |
| "learning_rate": 2.4117469787597658e-06, |
| "lookahead_loss": 3.6829268450737, |
| "loss": 1.9866, |
| "step": 499000 |
| }, |
| { |
| "base_loss": 0.30920383241772653, |
| "epoch": 1.0276565551757812, |
| "grad_norm": 0.1093730702996254, |
| "learning_rate": 2.3640632629394534e-06, |
| "lookahead_loss": 3.6884715824127197, |
| "loss": 1.995, |
| "step": 499500 |
| }, |
| { |
| "base_loss": 0.33220478031039236, |
| "epoch": 1.0286102294921875, |
| "grad_norm": 0.12156689912080765, |
| "learning_rate": 2.3163795471191407e-06, |
| "lookahead_loss": 3.7322032294273377, |
| "loss": 2.0268, |
| "step": 500000 |
| }, |
| { |
| "epoch": 1.0286102294921875, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1155673802470245, |
| "eval_lookahead_perplexity": 22.54621900444581, |
| "eval_loss": 1.621601939201355, |
| "eval_perplexity": 5.061191547136921, |
| "eval_runtime": 263.6593, |
| "eval_samples_per_second": 18.964, |
| "eval_steps_per_second": 0.595, |
| "step": 500000 |
| }, |
| { |
| "base_loss": 0.30326411041617396, |
| "epoch": 1.0295639038085938, |
| "grad_norm": 0.11229516565799713, |
| "learning_rate": 2.2686958312988284e-06, |
| "lookahead_loss": 3.656010775089264, |
| "loss": 1.9792, |
| "step": 500500 |
| }, |
| { |
| "base_loss": 0.3031138954460621, |
| "epoch": 1.030517578125, |
| "grad_norm": 0.14705514907836914, |
| "learning_rate": 2.2210121154785156e-06, |
| "lookahead_loss": 3.6846701459884645, |
| "loss": 1.9967, |
| "step": 501000 |
| }, |
| { |
| "base_loss": 0.30234866255521775, |
| "epoch": 1.0314712524414062, |
| "grad_norm": 0.12098229676485062, |
| "learning_rate": 2.1733283996582033e-06, |
| "lookahead_loss": 3.690236645698547, |
| "loss": 1.9955, |
| "step": 501500 |
| }, |
| { |
| "base_loss": 0.3155796425938606, |
| "epoch": 1.0324249267578125, |
| "grad_norm": 0.13642369210720062, |
| "learning_rate": 2.125644683837891e-06, |
| "lookahead_loss": 3.709054000377655, |
| "loss": 2.0181, |
| "step": 502000 |
| }, |
| { |
| "base_loss": 0.3022744803726673, |
| "epoch": 1.0333786010742188, |
| "grad_norm": 0.15626195073127747, |
| "learning_rate": 2.077960968017578e-06, |
| "lookahead_loss": 3.653508902549744, |
| "loss": 1.9791, |
| "step": 502500 |
| }, |
| { |
| "base_loss": 0.30410280799865724, |
| "epoch": 1.034332275390625, |
| "grad_norm": 0.12734845280647278, |
| "learning_rate": 2.030277252197266e-06, |
| "lookahead_loss": 3.688762324333191, |
| "loss": 1.9975, |
| "step": 503000 |
| }, |
| { |
| "base_loss": 0.3077150760293007, |
| "epoch": 1.0352859497070312, |
| "grad_norm": 0.12563078105449677, |
| "learning_rate": 1.982593536376953e-06, |
| "lookahead_loss": 3.680134729385376, |
| "loss": 1.9968, |
| "step": 503500 |
| }, |
| { |
| "base_loss": 0.3269314341843128, |
| "epoch": 1.0362396240234375, |
| "grad_norm": 0.10034479200839996, |
| "learning_rate": 1.9349098205566408e-06, |
| "lookahead_loss": 3.7161739377975462, |
| "loss": 2.0173, |
| "step": 504000 |
| }, |
| { |
| "base_loss": 0.30525318866968154, |
| "epoch": 1.0371932983398438, |
| "grad_norm": 0.10954893380403519, |
| "learning_rate": 1.8872261047363282e-06, |
| "lookahead_loss": 3.667464601516724, |
| "loss": 1.9871, |
| "step": 504500 |
| }, |
| { |
| "base_loss": 0.3003401378691196, |
| "epoch": 1.03814697265625, |
| "grad_norm": 0.14078158140182495, |
| "learning_rate": 1.8395423889160157e-06, |
| "lookahead_loss": 3.664040919303894, |
| "loss": 1.9849, |
| "step": 505000 |
| }, |
| { |
| "epoch": 1.03814697265625, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1153540405602502, |
| "eval_lookahead_perplexity": 22.54140951419127, |
| "eval_loss": 1.6214919090270996, |
| "eval_perplexity": 5.0606346939849365, |
| "eval_runtime": 273.2183, |
| "eval_samples_per_second": 18.3, |
| "eval_steps_per_second": 0.575, |
| "step": 505000 |
| }, |
| { |
| "base_loss": 0.3116057882905006, |
| "epoch": 1.0391006469726562, |
| "grad_norm": 0.12243175506591797, |
| "learning_rate": 1.7918586730957031e-06, |
| "lookahead_loss": 3.6875705614089966, |
| "loss": 2.0017, |
| "step": 505500 |
| }, |
| { |
| "base_loss": 0.31716569018363955, |
| "epoch": 1.0400543212890625, |
| "grad_norm": 0.1560487598180771, |
| "learning_rate": 1.7441749572753908e-06, |
| "lookahead_loss": 3.7047328453063963, |
| "loss": 2.0112, |
| "step": 506000 |
| }, |
| { |
| "base_loss": 0.31002197673916815, |
| "epoch": 1.0410079956054688, |
| "grad_norm": 0.11027877777814865, |
| "learning_rate": 1.6964912414550783e-06, |
| "lookahead_loss": 3.6693738231658934, |
| "loss": 1.9847, |
| "step": 506500 |
| }, |
| { |
| "base_loss": 0.29521755149960516, |
| "epoch": 1.041961669921875, |
| "grad_norm": 0.1359536051750183, |
| "learning_rate": 1.6488075256347657e-06, |
| "lookahead_loss": 3.6803672742843627, |
| "loss": 1.9893, |
| "step": 507000 |
| }, |
| { |
| "base_loss": 0.30736519694328307, |
| "epoch": 1.0429153442382812, |
| "grad_norm": 0.290884405374527, |
| "learning_rate": 1.6011238098144532e-06, |
| "lookahead_loss": 3.676666582584381, |
| "loss": 1.9959, |
| "step": 507500 |
| }, |
| { |
| "base_loss": 0.3271687869429588, |
| "epoch": 1.0438690185546875, |
| "grad_norm": 0.13232041895389557, |
| "learning_rate": 1.5534400939941406e-06, |
| "lookahead_loss": 3.7064253492355346, |
| "loss": 2.0222, |
| "step": 508000 |
| }, |
| { |
| "base_loss": 0.2943850245475769, |
| "epoch": 1.0448226928710938, |
| "grad_norm": 0.17723123729228973, |
| "learning_rate": 1.505756378173828e-06, |
| "lookahead_loss": 3.644582795619965, |
| "loss": 1.9702, |
| "step": 508500 |
| }, |
| { |
| "base_loss": 0.30418619123101237, |
| "epoch": 1.0457763671875, |
| "grad_norm": 0.10071691125631332, |
| "learning_rate": 1.4580726623535158e-06, |
| "lookahead_loss": 3.705183662891388, |
| "loss": 2.0024, |
| "step": 509000 |
| }, |
| { |
| "base_loss": 0.32734892451763153, |
| "epoch": 1.0467300415039062, |
| "grad_norm": 0.10738521814346313, |
| "learning_rate": 1.4103889465332032e-06, |
| "lookahead_loss": 3.7058504252433777, |
| "loss": 2.0141, |
| "step": 509500 |
| }, |
| { |
| "base_loss": 0.32642096510529517, |
| "epoch": 1.0476837158203125, |
| "grad_norm": 0.18155638873577118, |
| "learning_rate": 1.3627052307128907e-06, |
| "lookahead_loss": 3.713751731872559, |
| "loss": 2.0234, |
| "step": 510000 |
| }, |
| { |
| "epoch": 1.0476837158203125, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.115112134823784, |
| "eval_lookahead_perplexity": 22.535957277412017, |
| "eval_loss": 1.6213737726211548, |
| "eval_perplexity": 5.06003688410264, |
| "eval_runtime": 270.6265, |
| "eval_samples_per_second": 18.476, |
| "eval_steps_per_second": 0.58, |
| "step": 510000 |
| }, |
| { |
| "base_loss": 0.29649946457147597, |
| "epoch": 1.0486373901367188, |
| "grad_norm": 0.10251113772392273, |
| "learning_rate": 1.3150215148925781e-06, |
| "lookahead_loss": 3.6475554642677306, |
| "loss": 1.9757, |
| "step": 510500 |
| }, |
| { |
| "base_loss": 0.3057677939236164, |
| "epoch": 1.049591064453125, |
| "grad_norm": 0.10838726907968521, |
| "learning_rate": 1.2673377990722656e-06, |
| "lookahead_loss": 3.681268889427185, |
| "loss": 1.993, |
| "step": 511000 |
| }, |
| { |
| "base_loss": 0.3218669015169144, |
| "epoch": 1.0505447387695312, |
| "grad_norm": 0.09824109077453613, |
| "learning_rate": 1.2196540832519533e-06, |
| "lookahead_loss": 3.7119429187774657, |
| "loss": 2.0162, |
| "step": 511500 |
| }, |
| { |
| "base_loss": 0.308034790366888, |
| "epoch": 1.0514984130859375, |
| "grad_norm": 0.14391624927520752, |
| "learning_rate": 1.1719703674316407e-06, |
| "lookahead_loss": 3.6557554478645327, |
| "loss": 1.9841, |
| "step": 512000 |
| }, |
| { |
| "base_loss": 0.30695659655332563, |
| "epoch": 1.0524520874023438, |
| "grad_norm": 0.18154321610927582, |
| "learning_rate": 1.1242866516113282e-06, |
| "lookahead_loss": 3.6813112597465514, |
| "loss": 1.9938, |
| "step": 512500 |
| }, |
| { |
| "base_loss": 0.3215196977555752, |
| "epoch": 1.05340576171875, |
| "grad_norm": 0.12306945025920868, |
| "learning_rate": 1.0766029357910156e-06, |
| "lookahead_loss": 3.7058457975387573, |
| "loss": 2.0082, |
| "step": 513000 |
| }, |
| { |
| "base_loss": 0.35528673872351646, |
| "epoch": 1.0543594360351562, |
| "grad_norm": 0.10241065919399261, |
| "learning_rate": 1.028919219970703e-06, |
| "lookahead_loss": 3.7413923802375795, |
| "loss": 2.0523, |
| "step": 513500 |
| }, |
| { |
| "base_loss": 0.2939756731390953, |
| "epoch": 1.0553131103515625, |
| "grad_norm": 0.13681042194366455, |
| "learning_rate": 9.812355041503908e-07, |
| "lookahead_loss": 3.63578445148468, |
| "loss": 1.97, |
| "step": 514000 |
| }, |
| { |
| "base_loss": 0.30533788445591925, |
| "epoch": 1.0562667846679688, |
| "grad_norm": 0.1053692176938057, |
| "learning_rate": 9.335517883300781e-07, |
| "lookahead_loss": 3.703752159118652, |
| "loss": 2.0064, |
| "step": 514500 |
| }, |
| { |
| "base_loss": 0.3139654756486416, |
| "epoch": 1.057220458984375, |
| "grad_norm": 0.11466662585735321, |
| "learning_rate": 8.858680725097657e-07, |
| "lookahead_loss": 3.703360953807831, |
| "loss": 2.0123, |
| "step": 515000 |
| }, |
| { |
| "epoch": 1.057220458984375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1149530372680565, |
| "eval_lookahead_perplexity": 22.532372146893515, |
| "eval_loss": 1.6212981939315796, |
| "eval_perplexity": 5.059654467597189, |
| "eval_runtime": 262.3851, |
| "eval_samples_per_second": 19.056, |
| "eval_steps_per_second": 0.598, |
| "step": 515000 |
| }, |
| { |
| "base_loss": 0.32300865882635116, |
| "epoch": 1.0581741333007812, |
| "grad_norm": 0.10684940963983536, |
| "learning_rate": 8.381843566894531e-07, |
| "lookahead_loss": 3.684891324043274, |
| "loss": 2.0007, |
| "step": 515500 |
| }, |
| { |
| "base_loss": 0.28907309558987615, |
| "epoch": 1.0591278076171875, |
| "grad_norm": 0.09958865493535995, |
| "learning_rate": 7.905006408691407e-07, |
| "lookahead_loss": 3.6399435262680053, |
| "loss": 1.9711, |
| "step": 516000 |
| }, |
| { |
| "base_loss": 0.30057010012865065, |
| "epoch": 1.0600814819335938, |
| "grad_norm": 0.097833551466465, |
| "learning_rate": 7.428169250488282e-07, |
| "lookahead_loss": 3.6915335173606874, |
| "loss": 1.9978, |
| "step": 516500 |
| }, |
| { |
| "base_loss": 0.32112616834044455, |
| "epoch": 1.06103515625, |
| "grad_norm": 0.09406998753547668, |
| "learning_rate": 6.951332092285156e-07, |
| "lookahead_loss": 3.697935612201691, |
| "loss": 2.0075, |
| "step": 517000 |
| }, |
| { |
| "base_loss": 0.3060283879637718, |
| "epoch": 1.0619888305664062, |
| "grad_norm": 0.13066363334655762, |
| "learning_rate": 6.474494934082032e-07, |
| "lookahead_loss": 3.6692494864463807, |
| "loss": 1.9784, |
| "step": 517500 |
| }, |
| { |
| "base_loss": 0.31152518782019617, |
| "epoch": 1.0629425048828125, |
| "grad_norm": 0.1175675168633461, |
| "learning_rate": 5.997657775878906e-07, |
| "lookahead_loss": 3.712639572620392, |
| "loss": 2.0057, |
| "step": 518000 |
| }, |
| { |
| "base_loss": 0.3149063532948494, |
| "epoch": 1.0638961791992188, |
| "grad_norm": 0.1051829382777214, |
| "learning_rate": 5.520820617675782e-07, |
| "lookahead_loss": 3.7067093458175657, |
| "loss": 2.0088, |
| "step": 518500 |
| }, |
| { |
| "base_loss": 0.30411062452197074, |
| "epoch": 1.064849853515625, |
| "grad_norm": 0.12518513202667236, |
| "learning_rate": 5.043983459472657e-07, |
| "lookahead_loss": 3.6613830890655517, |
| "loss": 1.9863, |
| "step": 519000 |
| }, |
| { |
| "base_loss": 0.30933507332205773, |
| "epoch": 1.0658035278320312, |
| "grad_norm": 0.13780134916305542, |
| "learning_rate": 4.5671463012695317e-07, |
| "lookahead_loss": 3.6811904344558717, |
| "loss": 1.994, |
| "step": 519500 |
| }, |
| { |
| "base_loss": 0.30638799047470094, |
| "epoch": 1.0667572021484375, |
| "grad_norm": 0.10666853189468384, |
| "learning_rate": 4.0903091430664063e-07, |
| "lookahead_loss": 3.6868834929466248, |
| "loss": 1.9957, |
| "step": 520000 |
| }, |
| { |
| "epoch": 1.0667572021484375, |
| "eval_accuracy": 0.00254853228962818, |
| "eval_base_loss": 0.13120111280355973, |
| "eval_base_perplexity": 1.1401970664837768, |
| "eval_lookahead_loss": 3.1148510923781716, |
| "eval_lookahead_perplexity": 22.53007520377895, |
| "eval_loss": 1.6212482452392578, |
| "eval_perplexity": 5.059401750784422, |
| "eval_runtime": 271.4662, |
| "eval_samples_per_second": 18.418, |
| "eval_steps_per_second": 0.578, |
| "step": 520000 |
| }, |
| { |
| "base_loss": 0.32938760298490527, |
| "epoch": 1.0677108764648438, |
| "grad_norm": 0.1531253606081009, |
| "learning_rate": 3.6134719848632814e-07, |
| "lookahead_loss": 3.7249799466133116, |
| "loss": 2.0249, |
| "step": 520500 |
| }, |
| { |
| "base_loss": 0.29950347980856895, |
| "epoch": 1.06866455078125, |
| "grad_norm": 0.1346207857131958, |
| "learning_rate": 3.1366348266601565e-07, |
| "lookahead_loss": 3.639112250804901, |
| "loss": 1.9671, |
| "step": 521000 |
| }, |
| { |
| "base_loss": 0.30374919882416723, |
| "epoch": 1.0696182250976562, |
| "grad_norm": 0.13088198006153107, |
| "learning_rate": 2.6597976684570316e-07, |
| "lookahead_loss": 3.7110060076713562, |
| "loss": 2.0063, |
| "step": 521500 |
| }, |
| { |
| "base_loss": 0.34455711591243743, |
| "epoch": 1.0705718994140625, |
| "grad_norm": 0.10850070416927338, |
| "learning_rate": 2.1829605102539064e-07, |
| "lookahead_loss": 3.7400474700927733, |
| "loss": 2.0458, |
| "step": 522000 |
| }, |
| { |
| "base_loss": 0.31508783569931986, |
| "epoch": 1.0715255737304688, |
| "grad_norm": 0.14581139385700226, |
| "learning_rate": 1.7061233520507813e-07, |
| "lookahead_loss": 3.6696447038650515, |
| "loss": 1.9897, |
| "step": 522500 |
| }, |
| { |
| "base_loss": 0.3064769520163536, |
| "epoch": 1.072479248046875, |
| "grad_norm": 0.12252921611070633, |
| "learning_rate": 1.2292861938476564e-07, |
| "lookahead_loss": 3.6762770075798032, |
| "loss": 1.9898, |
| "step": 523000 |
| }, |
| { |
| "base_loss": 0.3032188524603844, |
| "epoch": 1.0734329223632812, |
| "grad_norm": 0.09782757610082626, |
| "learning_rate": 7.524490356445312e-08, |
| "lookahead_loss": 3.6822463884353636, |
| "loss": 1.9965, |
| "step": 523500 |
| }, |
| { |
| "base_loss": 0.3287756524384022, |
| "epoch": 1.0743865966796875, |
| "grad_norm": 0.11079169809818268, |
| "learning_rate": 2.7561187744140627e-08, |
| "lookahead_loss": 3.706464078426361, |
| "loss": 2.0165, |
| "step": 524000 |
| }, |
| { |
| "epoch": 1.0749359130859375, |
| "step": 524288, |
| "total_flos": 3.285699411601208e+19, |
| "train_loss": 0.14963702380191535, |
| "train_runtime": 35362.9264, |
| "train_samples_per_second": 474.43, |
| "train_steps_per_second": 14.826 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 524288, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.285699411601208e+19, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|