| { |
| "best_global_step": null, |
| "best_metric": 0.20709213614463806, |
| "best_model_checkpoint": null, |
| "epoch": 8.04632568359375, |
| "eval_steps": 5000, |
| "global_step": 524288, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "base_loss": 0.5978933601975441, |
| "epoch": 0.00095367431640625, |
| "grad_norm": 0.0011252601398155093, |
| "learning_rate": 4.995241165161133e-05, |
| "lookahead_loss": 10.307127059936523, |
| "loss": 0.6104, |
| "step": 500 |
| }, |
| { |
| "base_loss": 0.5862715001702309, |
| "epoch": 0.0019073486328125, |
| "grad_norm": 0.0011970596387982368, |
| "learning_rate": 4.990472793579102e-05, |
| "lookahead_loss": 10.149111717224121, |
| "loss": 0.6054, |
| "step": 1000 |
| }, |
| { |
| "base_loss": 0.6046989848017692, |
| "epoch": 0.00286102294921875, |
| "grad_norm": 0.0011270787799730897, |
| "learning_rate": 4.98570442199707e-05, |
| "lookahead_loss": 10.002682935714722, |
| "loss": 0.6216, |
| "step": 1500 |
| }, |
| { |
| "base_loss": 0.6158502401709557, |
| "epoch": 0.003814697265625, |
| "grad_norm": 0.001161540043540299, |
| "learning_rate": 4.9809360504150393e-05, |
| "lookahead_loss": 9.859745756149293, |
| "loss": 0.6304, |
| "step": 2000 |
| }, |
| { |
| "base_loss": 0.6010496825575828, |
| "epoch": 0.00476837158203125, |
| "grad_norm": 0.0011645682388916612, |
| "learning_rate": 4.9761676788330084e-05, |
| "lookahead_loss": 9.7204390335083, |
| "loss": 0.6158, |
| "step": 2500 |
| }, |
| { |
| "base_loss": 0.5909238924980164, |
| "epoch": 0.0057220458984375, |
| "grad_norm": 0.0010902719805017114, |
| "learning_rate": 4.971399307250977e-05, |
| "lookahead_loss": 9.615325008392334, |
| "loss": 0.6064, |
| "step": 3000 |
| }, |
| { |
| "base_loss": 0.5803323667645455, |
| "epoch": 0.00667572021484375, |
| "grad_norm": 0.0011976484674960375, |
| "learning_rate": 4.966630935668946e-05, |
| "lookahead_loss": 9.45008639717102, |
| "loss": 0.6025, |
| "step": 3500 |
| }, |
| { |
| "base_loss": 0.6060425414443016, |
| "epoch": 0.00762939453125, |
| "grad_norm": 0.0011318833567202091, |
| "learning_rate": 4.961862564086914e-05, |
| "lookahead_loss": 9.358827793121337, |
| "loss": 0.6227, |
| "step": 4000 |
| }, |
| { |
| "base_loss": 0.6035141298770904, |
| "epoch": 0.00858306884765625, |
| "grad_norm": 0.0010984783293679357, |
| "learning_rate": 4.957094192504883e-05, |
| "lookahead_loss": 9.244875957489013, |
| "loss": 0.6141, |
| "step": 4500 |
| }, |
| { |
| "base_loss": 0.5932371410131454, |
| "epoch": 0.0095367431640625, |
| "grad_norm": 0.0012236966285854578, |
| "learning_rate": 4.952325820922852e-05, |
| "lookahead_loss": 9.141341512680054, |
| "loss": 0.6126, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.0095367431640625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 8.99573203321463, |
| "eval_lookahead_perplexity": 8068.573930658563, |
| "eval_loss": 0.2124410718679428, |
| "eval_perplexity": 1.236693235395729, |
| "eval_runtime": 92.5309, |
| "eval_samples_per_second": 54.036, |
| "eval_steps_per_second": 1.697, |
| "step": 5000 |
| }, |
| { |
| "base_loss": 0.5934136973619462, |
| "epoch": 0.01049041748046875, |
| "grad_norm": 0.0011562301078811288, |
| "learning_rate": 4.9475574493408205e-05, |
| "lookahead_loss": 9.01434019088745, |
| "loss": 0.6097, |
| "step": 5500 |
| }, |
| { |
| "base_loss": 0.5931573793888092, |
| "epoch": 0.011444091796875, |
| "grad_norm": 0.0011571954237297177, |
| "learning_rate": 4.9427890777587895e-05, |
| "lookahead_loss": 8.929434476852418, |
| "loss": 0.6112, |
| "step": 6000 |
| }, |
| { |
| "base_loss": 0.6124616218209267, |
| "epoch": 0.01239776611328125, |
| "grad_norm": 0.0011109471088275313, |
| "learning_rate": 4.938020706176758e-05, |
| "lookahead_loss": 8.831825304031373, |
| "loss": 0.6266, |
| "step": 6500 |
| }, |
| { |
| "base_loss": 0.5971367362737655, |
| "epoch": 0.0133514404296875, |
| "grad_norm": 0.0011375549947842956, |
| "learning_rate": 4.933252334594727e-05, |
| "lookahead_loss": 8.758240091323852, |
| "loss": 0.6147, |
| "step": 7000 |
| }, |
| { |
| "base_loss": 0.5954981714487075, |
| "epoch": 0.01430511474609375, |
| "grad_norm": 0.0011499100364744663, |
| "learning_rate": 4.928483963012696e-05, |
| "lookahead_loss": 8.65474760055542, |
| "loss": 0.6113, |
| "step": 7500 |
| }, |
| { |
| "base_loss": 0.5817196745276451, |
| "epoch": 0.0152587890625, |
| "grad_norm": 0.0011594763491302729, |
| "learning_rate": 4.923715591430664e-05, |
| "lookahead_loss": 8.523625542640685, |
| "loss": 0.5931, |
| "step": 8000 |
| }, |
| { |
| "base_loss": 0.612317619562149, |
| "epoch": 0.01621246337890625, |
| "grad_norm": 0.001187981222756207, |
| "learning_rate": 4.918947219848633e-05, |
| "lookahead_loss": 8.48364700603485, |
| "loss": 0.6226, |
| "step": 8500 |
| }, |
| { |
| "base_loss": 0.6063876725435257, |
| "epoch": 0.0171661376953125, |
| "grad_norm": 0.0011250991374254227, |
| "learning_rate": 4.9141788482666016e-05, |
| "lookahead_loss": 8.421111564636231, |
| "loss": 0.6171, |
| "step": 9000 |
| }, |
| { |
| "base_loss": 0.5936728613972664, |
| "epoch": 0.01811981201171875, |
| "grad_norm": 0.0010821197647601366, |
| "learning_rate": 4.9094104766845706e-05, |
| "lookahead_loss": 8.355785271644592, |
| "loss": 0.606, |
| "step": 9500 |
| }, |
| { |
| "base_loss": 0.5859414834976197, |
| "epoch": 0.019073486328125, |
| "grad_norm": 0.0011178837157785892, |
| "learning_rate": 4.9046421051025396e-05, |
| "lookahead_loss": 8.281417448997498, |
| "loss": 0.6026, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.019073486328125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 8.09148007688431, |
| "eval_lookahead_perplexity": 3266.518692937833, |
| "eval_loss": 0.21068428456783295, |
| "eval_perplexity": 1.2345225357127578, |
| "eval_runtime": 95.6991, |
| "eval_samples_per_second": 52.247, |
| "eval_steps_per_second": 1.641, |
| "step": 10000 |
| }, |
| { |
| "base_loss": 0.5858585347533226, |
| "epoch": 0.02002716064453125, |
| "grad_norm": 0.0011825824622064829, |
| "learning_rate": 4.899873733520508e-05, |
| "lookahead_loss": 8.141557210922242, |
| "loss": 0.6052, |
| "step": 10500 |
| }, |
| { |
| "base_loss": 0.6189192984104156, |
| "epoch": 0.0209808349609375, |
| "grad_norm": 0.001107234158553183, |
| "learning_rate": 4.895105361938477e-05, |
| "lookahead_loss": 8.11882752418518, |
| "loss": 0.6289, |
| "step": 11000 |
| }, |
| { |
| "base_loss": 0.5979826554656029, |
| "epoch": 0.02193450927734375, |
| "grad_norm": 0.001138643710874021, |
| "learning_rate": 4.890336990356445e-05, |
| "lookahead_loss": 8.047190843582154, |
| "loss": 0.6135, |
| "step": 11500 |
| }, |
| { |
| "base_loss": 0.5944889052510262, |
| "epoch": 0.02288818359375, |
| "grad_norm": 0.0011185838375240564, |
| "learning_rate": 4.8855686187744143e-05, |
| "lookahead_loss": 7.9819194717407225, |
| "loss": 0.6082, |
| "step": 12000 |
| }, |
| { |
| "base_loss": 0.5957045419812202, |
| "epoch": 0.02384185791015625, |
| "grad_norm": 0.0011360319331288338, |
| "learning_rate": 4.8808002471923834e-05, |
| "lookahead_loss": 7.931338422775268, |
| "loss": 0.607, |
| "step": 12500 |
| }, |
| { |
| "base_loss": 0.61453830909729, |
| "epoch": 0.0247955322265625, |
| "grad_norm": 0.0010940487263724208, |
| "learning_rate": 4.876031875610352e-05, |
| "lookahead_loss": 7.886316944122314, |
| "loss": 0.6274, |
| "step": 13000 |
| }, |
| { |
| "base_loss": 0.6059305937290191, |
| "epoch": 0.02574920654296875, |
| "grad_norm": 0.0010977721540257335, |
| "learning_rate": 4.871263504028321e-05, |
| "lookahead_loss": 7.826973139762878, |
| "loss": 0.6186, |
| "step": 13500 |
| }, |
| { |
| "base_loss": 0.5937826926708222, |
| "epoch": 0.026702880859375, |
| "grad_norm": 0.001152095035649836, |
| "learning_rate": 4.866495132446289e-05, |
| "lookahead_loss": 7.781199982643128, |
| "loss": 0.6065, |
| "step": 14000 |
| }, |
| { |
| "base_loss": 0.5939369524717331, |
| "epoch": 0.02765655517578125, |
| "grad_norm": 0.001108541153371334, |
| "learning_rate": 4.861726760864258e-05, |
| "lookahead_loss": 7.837098469734192, |
| "loss": 0.6077, |
| "step": 14500 |
| }, |
| { |
| "base_loss": 0.616533571600914, |
| "epoch": 0.0286102294921875, |
| "grad_norm": 0.0011754513252526522, |
| "learning_rate": 4.856958389282227e-05, |
| "lookahead_loss": 7.8037817134857175, |
| "loss": 0.6282, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.0286102294921875, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 7.644905143652481, |
| "eval_lookahead_perplexity": 2089.9703194804256, |
| "eval_loss": 0.20981459319591522, |
| "eval_perplexity": 1.2334493488533298, |
| "eval_runtime": 112.883, |
| "eval_samples_per_second": 44.294, |
| "eval_steps_per_second": 1.391, |
| "step": 15000 |
| }, |
| { |
| "base_loss": 0.6012749794125557, |
| "epoch": 0.02956390380859375, |
| "grad_norm": 0.0011089193867519498, |
| "learning_rate": 4.8521900177001955e-05, |
| "lookahead_loss": 7.749636699676514, |
| "loss": 0.6145, |
| "step": 15500 |
| }, |
| { |
| "base_loss": 0.5948225556612015, |
| "epoch": 0.030517578125, |
| "grad_norm": 0.0011371374130249023, |
| "learning_rate": 4.8474216461181645e-05, |
| "lookahead_loss": 7.701480973243713, |
| "loss": 0.61, |
| "step": 16000 |
| }, |
| { |
| "base_loss": 0.5902180044651032, |
| "epoch": 0.03147125244140625, |
| "grad_norm": 0.0011123986914753914, |
| "learning_rate": 4.842653274536133e-05, |
| "lookahead_loss": 7.686638072967529, |
| "loss": 0.606, |
| "step": 16500 |
| }, |
| { |
| "base_loss": 0.6148928787708282, |
| "epoch": 0.0324249267578125, |
| "grad_norm": 0.0011524204164743423, |
| "learning_rate": 4.837884902954102e-05, |
| "lookahead_loss": 7.60680880355835, |
| "loss": 0.6307, |
| "step": 17000 |
| }, |
| { |
| "base_loss": 0.5949572869539261, |
| "epoch": 0.03337860107421875, |
| "grad_norm": 0.0011757535394281149, |
| "learning_rate": 4.833116531372071e-05, |
| "lookahead_loss": 7.631597835540772, |
| "loss": 0.6092, |
| "step": 17500 |
| }, |
| { |
| "base_loss": 0.59463729596138, |
| "epoch": 0.034332275390625, |
| "grad_norm": 0.001099089509807527, |
| "learning_rate": 4.828348159790039e-05, |
| "lookahead_loss": 7.6473195314407345, |
| "loss": 0.6108, |
| "step": 18000 |
| }, |
| { |
| "base_loss": 0.5924516545534134, |
| "epoch": 0.03528594970703125, |
| "grad_norm": 0.0011326675303280354, |
| "learning_rate": 4.823579788208008e-05, |
| "lookahead_loss": 7.507555624961853, |
| "loss": 0.6057, |
| "step": 18500 |
| }, |
| { |
| "base_loss": 0.6097258816361427, |
| "epoch": 0.0362396240234375, |
| "grad_norm": 0.0011656777933239937, |
| "learning_rate": 4.8188114166259766e-05, |
| "lookahead_loss": 7.5816989393234255, |
| "loss": 0.6232, |
| "step": 19000 |
| }, |
| { |
| "base_loss": 0.6038113740086556, |
| "epoch": 0.03719329833984375, |
| "grad_norm": 0.0011302254861220717, |
| "learning_rate": 4.8140430450439456e-05, |
| "lookahead_loss": 7.5106663646698, |
| "loss": 0.6146, |
| "step": 19500 |
| }, |
| { |
| "base_loss": 0.5906151984333992, |
| "epoch": 0.03814697265625, |
| "grad_norm": 0.0010981757659465075, |
| "learning_rate": 4.8092746734619146e-05, |
| "lookahead_loss": 7.503454574584961, |
| "loss": 0.6053, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.03814697265625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 7.413040787267228, |
| "eval_lookahead_perplexity": 1657.4586710962171, |
| "eval_loss": 0.2093675285577774, |
| "eval_perplexity": 1.232898040510938, |
| "eval_runtime": 116.6122, |
| "eval_samples_per_second": 42.877, |
| "eval_steps_per_second": 1.346, |
| "step": 20000 |
| }, |
| { |
| "base_loss": 0.5973337704539299, |
| "epoch": 0.03910064697265625, |
| "grad_norm": 0.0011260570026934147, |
| "learning_rate": 4.804506301879883e-05, |
| "lookahead_loss": 7.45892139339447, |
| "loss": 0.6126, |
| "step": 20500 |
| }, |
| { |
| "base_loss": 0.6136817481517792, |
| "epoch": 0.0400543212890625, |
| "grad_norm": 0.0011610394576564431, |
| "learning_rate": 4.799737930297852e-05, |
| "lookahead_loss": 7.463105855941772, |
| "loss": 0.6255, |
| "step": 21000 |
| }, |
| { |
| "base_loss": 0.6021154451966285, |
| "epoch": 0.04100799560546875, |
| "grad_norm": 0.0011342237703502178, |
| "learning_rate": 4.79496955871582e-05, |
| "lookahead_loss": 7.411617256164551, |
| "loss": 0.6135, |
| "step": 21500 |
| }, |
| { |
| "base_loss": 0.5793227363824844, |
| "epoch": 0.041961669921875, |
| "grad_norm": 0.001150211552157998, |
| "learning_rate": 4.7902011871337893e-05, |
| "lookahead_loss": 7.446121297836304, |
| "loss": 0.5974, |
| "step": 22000 |
| }, |
| { |
| "base_loss": 0.6037838690280914, |
| "epoch": 0.04291534423828125, |
| "grad_norm": 0.0010809092782437801, |
| "learning_rate": 4.7854328155517584e-05, |
| "lookahead_loss": 7.466872262954712, |
| "loss": 0.6172, |
| "step": 22500 |
| }, |
| { |
| "base_loss": 0.6098511652350426, |
| "epoch": 0.0438690185546875, |
| "grad_norm": 0.0011468523880466819, |
| "learning_rate": 4.780664443969727e-05, |
| "lookahead_loss": 7.43432590007782, |
| "loss": 0.6256, |
| "step": 23000 |
| }, |
| { |
| "base_loss": 0.5932865824103355, |
| "epoch": 0.04482269287109375, |
| "grad_norm": 0.0011411454761400819, |
| "learning_rate": 4.775896072387696e-05, |
| "lookahead_loss": 7.379173007011413, |
| "loss": 0.6064, |
| "step": 23500 |
| }, |
| { |
| "base_loss": 0.5919549334645271, |
| "epoch": 0.0457763671875, |
| "grad_norm": 0.0010719113051891327, |
| "learning_rate": 4.771127700805664e-05, |
| "lookahead_loss": 7.352979991912842, |
| "loss": 0.6043, |
| "step": 24000 |
| }, |
| { |
| "base_loss": 0.6157130757570267, |
| "epoch": 0.04673004150390625, |
| "grad_norm": 0.0011310658883303404, |
| "learning_rate": 4.766359329223633e-05, |
| "lookahead_loss": 7.356749028205871, |
| "loss": 0.6284, |
| "step": 24500 |
| }, |
| { |
| "base_loss": 0.6059511578679084, |
| "epoch": 0.0476837158203125, |
| "grad_norm": 0.0011380530195310712, |
| "learning_rate": 4.761590957641602e-05, |
| "lookahead_loss": 7.340578727722168, |
| "loss": 0.6244, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.0476837158203125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 7.268213134985, |
| "eval_lookahead_perplexity": 1433.9858232957502, |
| "eval_loss": 0.2090858817100525, |
| "eval_perplexity": 1.232550847559457, |
| "eval_runtime": 96.5112, |
| "eval_samples_per_second": 51.807, |
| "eval_steps_per_second": 1.627, |
| "step": 25000 |
| }, |
| { |
| "base_loss": 0.5887659189105033, |
| "epoch": 0.04863739013671875, |
| "grad_norm": 0.001135756610892713, |
| "learning_rate": 4.7568225860595705e-05, |
| "lookahead_loss": 7.312827653884888, |
| "loss": 0.6021, |
| "step": 25500 |
| }, |
| { |
| "base_loss": 0.5974695605635643, |
| "epoch": 0.049591064453125, |
| "grad_norm": 0.0011581169674172997, |
| "learning_rate": 4.7520542144775395e-05, |
| "lookahead_loss": 7.2902094869613645, |
| "loss": 0.6095, |
| "step": 26000 |
| }, |
| { |
| "base_loss": 0.6176432440280915, |
| "epoch": 0.05054473876953125, |
| "grad_norm": 0.0010899268090724945, |
| "learning_rate": 4.747285842895508e-05, |
| "lookahead_loss": 7.362010593414307, |
| "loss": 0.6288, |
| "step": 26500 |
| }, |
| { |
| "base_loss": 0.596397516131401, |
| "epoch": 0.0514984130859375, |
| "grad_norm": 0.0011435514315962791, |
| "learning_rate": 4.742517471313477e-05, |
| "lookahead_loss": 7.30141323184967, |
| "loss": 0.6118, |
| "step": 27000 |
| }, |
| { |
| "base_loss": 0.5952906757593155, |
| "epoch": 0.05245208740234375, |
| "grad_norm": 0.0011486115399748087, |
| "learning_rate": 4.737749099731446e-05, |
| "lookahead_loss": 7.251459365844727, |
| "loss": 0.6066, |
| "step": 27500 |
| }, |
| { |
| "base_loss": 0.6116316332221031, |
| "epoch": 0.05340576171875, |
| "grad_norm": 0.001126940012909472, |
| "learning_rate": 4.732980728149414e-05, |
| "lookahead_loss": 7.226618359565735, |
| "loss": 0.6223, |
| "step": 28000 |
| }, |
| { |
| "base_loss": 0.6090298828482628, |
| "epoch": 0.05435943603515625, |
| "grad_norm": 0.0011391318403184414, |
| "learning_rate": 4.728212356567383e-05, |
| "lookahead_loss": 7.265408018112183, |
| "loss": 0.626, |
| "step": 28500 |
| }, |
| { |
| "base_loss": 0.5974834812283516, |
| "epoch": 0.0553131103515625, |
| "grad_norm": 0.0011334357550367713, |
| "learning_rate": 4.7234439849853516e-05, |
| "lookahead_loss": 7.277325885772705, |
| "loss": 0.6081, |
| "step": 29000 |
| }, |
| { |
| "base_loss": 0.5880563573837281, |
| "epoch": 0.05626678466796875, |
| "grad_norm": 0.0010896348394453526, |
| "learning_rate": 4.7186756134033206e-05, |
| "lookahead_loss": 7.28397903251648, |
| "loss": 0.6031, |
| "step": 29500 |
| }, |
| { |
| "base_loss": 0.6152015085816384, |
| "epoch": 0.057220458984375, |
| "grad_norm": 0.0011281173210591078, |
| "learning_rate": 4.7139072418212896e-05, |
| "lookahead_loss": 7.264281406402588, |
| "loss": 0.6301, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.057220458984375, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 7.167584419250488, |
| "eval_lookahead_perplexity": 1296.7085124687899, |
| "eval_loss": 0.2088911086320877, |
| "eval_perplexity": 1.2323108032149765, |
| "eval_runtime": 93.7201, |
| "eval_samples_per_second": 53.35, |
| "eval_steps_per_second": 1.675, |
| "step": 30000 |
| }, |
| { |
| "base_loss": 0.6071268789768219, |
| "epoch": 0.05817413330078125, |
| "grad_norm": 0.0011435674969106913, |
| "learning_rate": 4.709138870239258e-05, |
| "lookahead_loss": 7.241272003173828, |
| "loss": 0.6175, |
| "step": 30500 |
| }, |
| { |
| "base_loss": 0.5850496413707733, |
| "epoch": 0.0591278076171875, |
| "grad_norm": 0.001114132348448038, |
| "learning_rate": 4.704370498657227e-05, |
| "lookahead_loss": 7.161423627853393, |
| "loss": 0.602, |
| "step": 31000 |
| }, |
| { |
| "base_loss": 0.5950538128614425, |
| "epoch": 0.06008148193359375, |
| "grad_norm": 0.0011068363673985004, |
| "learning_rate": 4.699602127075195e-05, |
| "lookahead_loss": 7.204737923622131, |
| "loss": 0.6088, |
| "step": 31500 |
| }, |
| { |
| "base_loss": 0.6059240178465843, |
| "epoch": 0.06103515625, |
| "grad_norm": 0.0011420327937230468, |
| "learning_rate": 4.6948337554931643e-05, |
| "lookahead_loss": 7.163901489257812, |
| "loss": 0.6186, |
| "step": 32000 |
| }, |
| { |
| "base_loss": 0.6009617374539376, |
| "epoch": 0.06198883056640625, |
| "grad_norm": 0.0011484776623547077, |
| "learning_rate": 4.6900653839111334e-05, |
| "lookahead_loss": 7.169231108665466, |
| "loss": 0.6148, |
| "step": 32500 |
| }, |
| { |
| "base_loss": 0.5932911797165871, |
| "epoch": 0.0629425048828125, |
| "grad_norm": 0.001157185179181397, |
| "learning_rate": 4.685297012329102e-05, |
| "lookahead_loss": 7.14691205406189, |
| "loss": 0.6086, |
| "step": 33000 |
| }, |
| { |
| "base_loss": 0.6118422101140022, |
| "epoch": 0.06389617919921875, |
| "grad_norm": 0.0010650681797415018, |
| "learning_rate": 4.680528640747071e-05, |
| "lookahead_loss": 7.168767412185669, |
| "loss": 0.625, |
| "step": 33500 |
| }, |
| { |
| "base_loss": 0.6075323719978333, |
| "epoch": 0.064849853515625, |
| "grad_norm": 0.001123252441175282, |
| "learning_rate": 4.675760269165039e-05, |
| "lookahead_loss": 7.191308692932129, |
| "loss": 0.618, |
| "step": 34000 |
| }, |
| { |
| "base_loss": 0.6046282976865769, |
| "epoch": 0.06580352783203125, |
| "grad_norm": 0.0011436466593295336, |
| "learning_rate": 4.670991897583008e-05, |
| "lookahead_loss": 7.114853853225708, |
| "loss": 0.6155, |
| "step": 34500 |
| }, |
| { |
| "base_loss": 0.6035915340185165, |
| "epoch": 0.0667572021484375, |
| "grad_norm": 0.0011282000923529267, |
| "learning_rate": 4.666223526000977e-05, |
| "lookahead_loss": 7.079581315994263, |
| "loss": 0.6135, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.0667572021484375, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 7.093881194203044, |
| "eval_lookahead_perplexity": 1204.573925020854, |
| "eval_loss": 0.20874807238578796, |
| "eval_perplexity": 1.232134550708959, |
| "eval_runtime": 95.3048, |
| "eval_samples_per_second": 52.463, |
| "eval_steps_per_second": 1.647, |
| "step": 35000 |
| }, |
| { |
| "base_loss": 0.6189325547814369, |
| "epoch": 0.06771087646484375, |
| "grad_norm": 0.0011684788623824716, |
| "learning_rate": 4.6614551544189455e-05, |
| "lookahead_loss": 7.151174359321594, |
| "loss": 0.6291, |
| "step": 35500 |
| }, |
| { |
| "base_loss": 0.5999714830517768, |
| "epoch": 0.06866455078125, |
| "grad_norm": 0.0010892520658671856, |
| "learning_rate": 4.6566867828369145e-05, |
| "lookahead_loss": 7.138783624649048, |
| "loss": 0.6115, |
| "step": 36000 |
| }, |
| { |
| "base_loss": 0.5872128927707672, |
| "epoch": 0.06961822509765625, |
| "grad_norm": 0.0011145739117637277, |
| "learning_rate": 4.651918411254883e-05, |
| "lookahead_loss": 7.1531580286026, |
| "loss": 0.6008, |
| "step": 36500 |
| }, |
| { |
| "base_loss": 0.6123319318294526, |
| "epoch": 0.0705718994140625, |
| "grad_norm": 0.001145465881563723, |
| "learning_rate": 4.647150039672852e-05, |
| "lookahead_loss": 7.051203452110291, |
| "loss": 0.6281, |
| "step": 37000 |
| }, |
| { |
| "base_loss": 0.5984540300965309, |
| "epoch": 0.07152557373046875, |
| "grad_norm": 0.0010991750750690699, |
| "learning_rate": 4.642381668090821e-05, |
| "lookahead_loss": 7.079177887916565, |
| "loss": 0.6127, |
| "step": 37500 |
| }, |
| { |
| "base_loss": 0.5946964643001557, |
| "epoch": 0.072479248046875, |
| "grad_norm": 0.0011612207163125277, |
| "learning_rate": 4.637613296508789e-05, |
| "lookahead_loss": 7.13145064163208, |
| "loss": 0.6076, |
| "step": 38000 |
| }, |
| { |
| "base_loss": 0.5916517315506935, |
| "epoch": 0.07343292236328125, |
| "grad_norm": 0.0010734308743849397, |
| "learning_rate": 4.632844924926758e-05, |
| "lookahead_loss": 7.0843782205581665, |
| "loss": 0.6049, |
| "step": 38500 |
| }, |
| { |
| "base_loss": 0.6112363495230675, |
| "epoch": 0.0743865966796875, |
| "grad_norm": 0.0011188907083123922, |
| "learning_rate": 4.6280765533447266e-05, |
| "lookahead_loss": 7.086141440391541, |
| "loss": 0.6247, |
| "step": 39000 |
| }, |
| { |
| "base_loss": 0.5962583271861076, |
| "epoch": 0.07534027099609375, |
| "grad_norm": 0.0011356917675584555, |
| "learning_rate": 4.6233081817626956e-05, |
| "lookahead_loss": 7.140744082450866, |
| "loss": 0.6075, |
| "step": 39500 |
| }, |
| { |
| "base_loss": 0.5866445366144181, |
| "epoch": 0.0762939453125, |
| "grad_norm": 0.0011127168545499444, |
| "learning_rate": 4.6185398101806646e-05, |
| "lookahead_loss": 7.094807801246643, |
| "loss": 0.5998, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.0762939453125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 7.033647046683314, |
| "eval_lookahead_perplexity": 1134.1594089598198, |
| "eval_loss": 0.2086315155029297, |
| "eval_perplexity": 1.2319909453157274, |
| "eval_runtime": 108.2669, |
| "eval_samples_per_second": 46.182, |
| "eval_steps_per_second": 1.45, |
| "step": 40000 |
| }, |
| { |
| "base_loss": 0.6231044068932533, |
| "epoch": 0.07724761962890625, |
| "grad_norm": 0.0011179151479154825, |
| "learning_rate": 4.613771438598633e-05, |
| "lookahead_loss": 7.0746554174423215, |
| "loss": 0.6366, |
| "step": 40500 |
| }, |
| { |
| "base_loss": 0.5951727138757705, |
| "epoch": 0.0782012939453125, |
| "grad_norm": 0.0011324447114020586, |
| "learning_rate": 4.609003067016602e-05, |
| "lookahead_loss": 7.070094844818115, |
| "loss": 0.6099, |
| "step": 41000 |
| }, |
| { |
| "base_loss": 0.6008335100412369, |
| "epoch": 0.07915496826171875, |
| "grad_norm": 0.0010868014069274068, |
| "learning_rate": 4.60423469543457e-05, |
| "lookahead_loss": 7.104050822257996, |
| "loss": 0.6097, |
| "step": 41500 |
| }, |
| { |
| "base_loss": 0.6084940298199654, |
| "epoch": 0.080108642578125, |
| "grad_norm": 0.0010936354519799352, |
| "learning_rate": 4.5994663238525393e-05, |
| "lookahead_loss": 7.052688834190369, |
| "loss": 0.6203, |
| "step": 42000 |
| }, |
| { |
| "base_loss": 0.6062530100941658, |
| "epoch": 0.08106231689453125, |
| "grad_norm": 0.0011594812385737896, |
| "learning_rate": 4.5946979522705084e-05, |
| "lookahead_loss": 7.072220482826233, |
| "loss": 0.6196, |
| "step": 42500 |
| }, |
| { |
| "base_loss": 0.592376666367054, |
| "epoch": 0.0820159912109375, |
| "grad_norm": 0.0010804173070937395, |
| "learning_rate": 4.589929580688477e-05, |
| "lookahead_loss": 7.0764106168746945, |
| "loss": 0.6073, |
| "step": 43000 |
| }, |
| { |
| "base_loss": 0.5900094144940377, |
| "epoch": 0.08296966552734375, |
| "grad_norm": 0.001111154560931027, |
| "learning_rate": 4.585161209106446e-05, |
| "lookahead_loss": 7.086921313285828, |
| "loss": 0.6041, |
| "step": 43500 |
| }, |
| { |
| "base_loss": 0.6183186983466148, |
| "epoch": 0.08392333984375, |
| "grad_norm": 0.0010992800816893578, |
| "learning_rate": 4.580392837524414e-05, |
| "lookahead_loss": 7.0898256769180295, |
| "loss": 0.6294, |
| "step": 44000 |
| }, |
| { |
| "base_loss": 0.5945970554947853, |
| "epoch": 0.08487701416015625, |
| "grad_norm": 0.0011169550707563758, |
| "learning_rate": 4.575624465942383e-05, |
| "lookahead_loss": 7.033306765556335, |
| "loss": 0.6053, |
| "step": 44500 |
| }, |
| { |
| "base_loss": 0.5856418209671974, |
| "epoch": 0.0858306884765625, |
| "grad_norm": 0.001114897895604372, |
| "learning_rate": 4.570856094360352e-05, |
| "lookahead_loss": 7.065869425773621, |
| "loss": 0.5972, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.0858306884765625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.983915470659542, |
| "eval_lookahead_perplexity": 1079.1354288264654, |
| "eval_loss": 0.20853494107723236, |
| "eval_perplexity": 1.2318719722426898, |
| "eval_runtime": 99.1115, |
| "eval_samples_per_second": 50.448, |
| "eval_steps_per_second": 1.584, |
| "step": 45000 |
| }, |
| { |
| "base_loss": 0.591981584072113, |
| "epoch": 0.08678436279296875, |
| "grad_norm": 0.0012656449107453227, |
| "learning_rate": 4.5660877227783205e-05, |
| "lookahead_loss": 7.022496276855469, |
| "loss": 0.6054, |
| "step": 45500 |
| }, |
| { |
| "base_loss": 0.624447933793068, |
| "epoch": 0.087738037109375, |
| "grad_norm": 0.0010355014819651842, |
| "learning_rate": 4.5613193511962895e-05, |
| "lookahead_loss": 7.026841708183289, |
| "loss": 0.6307, |
| "step": 46000 |
| }, |
| { |
| "base_loss": 0.5977871975898743, |
| "epoch": 0.08869171142578125, |
| "grad_norm": 0.0011489527532830834, |
| "learning_rate": 4.556550979614258e-05, |
| "lookahead_loss": 7.015450751304626, |
| "loss": 0.607, |
| "step": 46500 |
| }, |
| { |
| "base_loss": 0.6007027108073234, |
| "epoch": 0.0896453857421875, |
| "grad_norm": 0.0011345903621986508, |
| "learning_rate": 4.551782608032227e-05, |
| "lookahead_loss": 7.049301884651184, |
| "loss": 0.6102, |
| "step": 47000 |
| }, |
| { |
| "base_loss": 0.5879440263509751, |
| "epoch": 0.09059906005859375, |
| "grad_norm": 0.0010685587767511606, |
| "learning_rate": 4.547014236450196e-05, |
| "lookahead_loss": 7.006776951789856, |
| "loss": 0.5999, |
| "step": 47500 |
| }, |
| { |
| "base_loss": 0.5816959359049797, |
| "epoch": 0.091552734375, |
| "grad_norm": 0.0010566096752882004, |
| "learning_rate": 4.542245864868164e-05, |
| "lookahead_loss": 6.994293849945068, |
| "loss": 0.5928, |
| "step": 48000 |
| }, |
| { |
| "base_loss": 0.615579255104065, |
| "epoch": 0.09250640869140625, |
| "grad_norm": 0.0011476678773760796, |
| "learning_rate": 4.537477493286133e-05, |
| "lookahead_loss": 7.024335027694702, |
| "loss": 0.6273, |
| "step": 48500 |
| }, |
| { |
| "base_loss": 0.5977260445952416, |
| "epoch": 0.0934600830078125, |
| "grad_norm": 0.0010787018109112978, |
| "learning_rate": 4.5327091217041016e-05, |
| "lookahead_loss": 7.00899642086029, |
| "loss": 0.6124, |
| "step": 49000 |
| }, |
| { |
| "base_loss": 0.5828603687882423, |
| "epoch": 0.09441375732421875, |
| "grad_norm": 0.0011202479945495725, |
| "learning_rate": 4.5279407501220706e-05, |
| "lookahead_loss": 7.005703037261963, |
| "loss": 0.5966, |
| "step": 49500 |
| }, |
| { |
| "base_loss": 0.5780045939087868, |
| "epoch": 0.095367431640625, |
| "grad_norm": 0.001087460434064269, |
| "learning_rate": 4.523172378540039e-05, |
| "lookahead_loss": 6.952547832489014, |
| "loss": 0.5947, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.095367431640625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.945180719272017, |
| "eval_lookahead_perplexity": 1038.1345911819958, |
| "eval_loss": 0.2084558606147766, |
| "eval_perplexity": 1.231774559089228, |
| "eval_runtime": 115.6293, |
| "eval_samples_per_second": 43.242, |
| "eval_steps_per_second": 1.358, |
| "step": 50000 |
| }, |
| { |
| "base_loss": 0.5960951861143112, |
| "epoch": 0.09632110595703125, |
| "grad_norm": 0.0011174663668498397, |
| "learning_rate": 4.518404006958008e-05, |
| "lookahead_loss": 6.991843806266784, |
| "loss": 0.6115, |
| "step": 50500 |
| }, |
| { |
| "base_loss": 0.6136610082983971, |
| "epoch": 0.0972747802734375, |
| "grad_norm": 0.0011398132191970944, |
| "learning_rate": 4.513635635375977e-05, |
| "lookahead_loss": 6.998520258903503, |
| "loss": 0.626, |
| "step": 51000 |
| }, |
| { |
| "base_loss": 0.5944582785964012, |
| "epoch": 0.09822845458984375, |
| "grad_norm": 0.0011265052016824484, |
| "learning_rate": 4.508867263793945e-05, |
| "lookahead_loss": 6.961953915596008, |
| "loss": 0.6069, |
| "step": 51500 |
| }, |
| { |
| "base_loss": 0.5841029364466667, |
| "epoch": 0.09918212890625, |
| "grad_norm": 0.0011030936148017645, |
| "learning_rate": 4.5040988922119143e-05, |
| "lookahead_loss": 7.00307084941864, |
| "loss": 0.5984, |
| "step": 52000 |
| }, |
| { |
| "base_loss": 0.5904176152348518, |
| "epoch": 0.10013580322265625, |
| "grad_norm": 0.001120659988373518, |
| "learning_rate": 4.499330520629883e-05, |
| "lookahead_loss": 6.987798627853394, |
| "loss": 0.6045, |
| "step": 52500 |
| }, |
| { |
| "base_loss": 0.6092280206680298, |
| "epoch": 0.1010894775390625, |
| "grad_norm": 0.0011076764203608036, |
| "learning_rate": 4.494562149047852e-05, |
| "lookahead_loss": 7.027708046913147, |
| "loss": 0.6204, |
| "step": 53000 |
| }, |
| { |
| "base_loss": 0.6015866943001748, |
| "epoch": 0.10204315185546875, |
| "grad_norm": 0.0011410149745643139, |
| "learning_rate": 4.489793777465821e-05, |
| "lookahead_loss": 7.007394369125366, |
| "loss": 0.6094, |
| "step": 53500 |
| }, |
| { |
| "base_loss": 0.5893580458164215, |
| "epoch": 0.102996826171875, |
| "grad_norm": 0.0011117961257696152, |
| "learning_rate": 4.485025405883789e-05, |
| "lookahead_loss": 6.979210342407226, |
| "loss": 0.6037, |
| "step": 54000 |
| }, |
| { |
| "base_loss": 0.5914280138015747, |
| "epoch": 0.10395050048828125, |
| "grad_norm": 0.0011249147355556488, |
| "learning_rate": 4.480257034301758e-05, |
| "lookahead_loss": 6.984224304199219, |
| "loss": 0.6027, |
| "step": 54500 |
| }, |
| { |
| "base_loss": 0.6135287986993789, |
| "epoch": 0.1049041748046875, |
| "grad_norm": 0.0010942122898995876, |
| "learning_rate": 4.4754886627197264e-05, |
| "lookahead_loss": 6.940293532371521, |
| "loss": 0.6257, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.1049041748046875, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.90965398660483, |
| "eval_lookahead_perplexity": 1001.9005113093888, |
| "eval_loss": 0.20838512480258942, |
| "eval_perplexity": 1.2316874315969126, |
| "eval_runtime": 105.743, |
| "eval_samples_per_second": 47.284, |
| "eval_steps_per_second": 1.485, |
| "step": 55000 |
| }, |
| { |
| "base_loss": 0.6006079289913178, |
| "epoch": 0.10585784912109375, |
| "grad_norm": 0.0010583704570308328, |
| "learning_rate": 4.4707202911376955e-05, |
| "lookahead_loss": 6.928493264198303, |
| "loss": 0.6133, |
| "step": 55500 |
| }, |
| { |
| "base_loss": 0.5945557134747506, |
| "epoch": 0.1068115234375, |
| "grad_norm": 0.0010779986623674631, |
| "learning_rate": 4.4659519195556645e-05, |
| "lookahead_loss": 6.998174697875976, |
| "loss": 0.6067, |
| "step": 56000 |
| }, |
| { |
| "base_loss": 0.5838606398105621, |
| "epoch": 0.10776519775390625, |
| "grad_norm": 0.001134494668804109, |
| "learning_rate": 4.461183547973633e-05, |
| "lookahead_loss": 6.914501080513, |
| "loss": 0.5962, |
| "step": 56500 |
| }, |
| { |
| "base_loss": 0.6029577027559281, |
| "epoch": 0.1087188720703125, |
| "grad_norm": 0.0011605332838371396, |
| "learning_rate": 4.456415176391602e-05, |
| "lookahead_loss": 6.893777732849121, |
| "loss": 0.6179, |
| "step": 57000 |
| }, |
| { |
| "base_loss": 0.6101588426232338, |
| "epoch": 0.10967254638671875, |
| "grad_norm": 0.0011116194073110819, |
| "learning_rate": 4.45164680480957e-05, |
| "lookahead_loss": 6.8865425481796265, |
| "loss": 0.6193, |
| "step": 57500 |
| }, |
| { |
| "base_loss": 0.5955164663791657, |
| "epoch": 0.110626220703125, |
| "grad_norm": 0.0011056356597691774, |
| "learning_rate": 4.446878433227539e-05, |
| "lookahead_loss": 6.867807936668396, |
| "loss": 0.6074, |
| "step": 58000 |
| }, |
| { |
| "base_loss": 0.5822924041152, |
| "epoch": 0.11157989501953125, |
| "grad_norm": 0.0010786657221615314, |
| "learning_rate": 4.442110061645508e-05, |
| "lookahead_loss": 6.94662325668335, |
| "loss": 0.5984, |
| "step": 58500 |
| }, |
| { |
| "base_loss": 0.6039745928645134, |
| "epoch": 0.1125335693359375, |
| "grad_norm": 0.0010965235996991396, |
| "learning_rate": 4.4373416900634766e-05, |
| "lookahead_loss": 6.950021827697754, |
| "loss": 0.6148, |
| "step": 59000 |
| }, |
| { |
| "base_loss": 0.6145888038873673, |
| "epoch": 0.11348724365234375, |
| "grad_norm": 0.001114803715609014, |
| "learning_rate": 4.4325733184814456e-05, |
| "lookahead_loss": 6.944812598228455, |
| "loss": 0.6277, |
| "step": 59500 |
| }, |
| { |
| "base_loss": 0.5980466955900192, |
| "epoch": 0.11444091796875, |
| "grad_norm": 0.001079982495866716, |
| "learning_rate": 4.427804946899414e-05, |
| "lookahead_loss": 6.852143743515015, |
| "loss": 0.6075, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.11444091796875, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.878637344311602, |
| "eval_lookahead_perplexity": 971.3019075411648, |
| "eval_loss": 0.20832312107086182, |
| "eval_perplexity": 1.2316110647473708, |
| "eval_runtime": 91.1615, |
| "eval_samples_per_second": 54.848, |
| "eval_steps_per_second": 1.722, |
| "step": 60000 |
| }, |
| { |
| "base_loss": 0.583279720902443, |
| "epoch": 0.11539459228515625, |
| "grad_norm": 0.001118990359827876, |
| "learning_rate": 4.423036575317383e-05, |
| "lookahead_loss": 6.890160356521607, |
| "loss": 0.5958, |
| "step": 60500 |
| }, |
| { |
| "base_loss": 0.6041163793802261, |
| "epoch": 0.1163482666015625, |
| "grad_norm": 0.0011760067427530885, |
| "learning_rate": 4.418268203735352e-05, |
| "lookahead_loss": 6.909441259384155, |
| "loss": 0.6188, |
| "step": 61000 |
| }, |
| { |
| "base_loss": 0.6094158036708832, |
| "epoch": 0.11730194091796875, |
| "grad_norm": 0.0011371213477104902, |
| "learning_rate": 4.41349983215332e-05, |
| "lookahead_loss": 6.888408424377442, |
| "loss": 0.6244, |
| "step": 61500 |
| }, |
| { |
| "base_loss": 0.6007498300671578, |
| "epoch": 0.118255615234375, |
| "grad_norm": 0.001127147930674255, |
| "learning_rate": 4.4087314605712893e-05, |
| "lookahead_loss": 6.916694809913635, |
| "loss": 0.6107, |
| "step": 62000 |
| }, |
| { |
| "base_loss": 0.5840388324260711, |
| "epoch": 0.11920928955078125, |
| "grad_norm": 0.0011241508182138205, |
| "learning_rate": 4.403963088989258e-05, |
| "lookahead_loss": 6.910943949699402, |
| "loss": 0.5963, |
| "step": 62500 |
| }, |
| { |
| "base_loss": 0.5946898341774941, |
| "epoch": 1.0009536743164062, |
| "grad_norm": 0.0011088403407484293, |
| "learning_rate": 4.399194717407227e-05, |
| "lookahead_loss": 6.968314840316772, |
| "loss": 0.6022, |
| "step": 63000 |
| }, |
| { |
| "base_loss": 0.5878614686727524, |
| "epoch": 1.0019073486328125, |
| "grad_norm": 0.0011633536778390408, |
| "learning_rate": 4.394426345825196e-05, |
| "lookahead_loss": 6.820799809455871, |
| "loss": 0.5996, |
| "step": 63500 |
| }, |
| { |
| "base_loss": 0.6057377905845642, |
| "epoch": 1.0028610229492188, |
| "grad_norm": 0.001116783358156681, |
| "learning_rate": 4.389657974243164e-05, |
| "lookahead_loss": 6.83216045665741, |
| "loss": 0.6149, |
| "step": 64000 |
| }, |
| { |
| "base_loss": 0.6151153823137283, |
| "epoch": 1.003814697265625, |
| "grad_norm": 0.0011050739558413625, |
| "learning_rate": 4.384889602661133e-05, |
| "lookahead_loss": 6.836852411270142, |
| "loss": 0.6237, |
| "step": 64500 |
| }, |
| { |
| "base_loss": 0.5990258244276047, |
| "epoch": 1.0047683715820312, |
| "grad_norm": 0.0011015802156180143, |
| "learning_rate": 4.3801212310791014e-05, |
| "lookahead_loss": 6.826987672805786, |
| "loss": 0.6094, |
| "step": 65000 |
| }, |
| { |
| "epoch": 1.0047683715820312, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.850588269888783, |
| "eval_lookahead_perplexity": 944.4363267400045, |
| "eval_loss": 0.2082662284374237, |
| "eval_perplexity": 1.2315409971437101, |
| "eval_runtime": 100.2414, |
| "eval_samples_per_second": 49.88, |
| "eval_steps_per_second": 1.566, |
| "step": 65000 |
| }, |
| { |
| "base_loss": 0.5899734389781952, |
| "epoch": 1.0057220458984375, |
| "grad_norm": 0.0010623879497870803, |
| "learning_rate": 4.3753528594970705e-05, |
| "lookahead_loss": 6.945034725189209, |
| "loss": 0.6, |
| "step": 65500 |
| }, |
| { |
| "base_loss": 0.5821795570850372, |
| "epoch": 1.0066757202148438, |
| "grad_norm": 0.001107605523429811, |
| "learning_rate": 4.3705844879150395e-05, |
| "lookahead_loss": 6.779200302124023, |
| "loss": 0.5982, |
| "step": 66000 |
| }, |
| { |
| "base_loss": 0.6049286904931068, |
| "epoch": 1.00762939453125, |
| "grad_norm": 0.0010999958030879498, |
| "learning_rate": 4.365816116333008e-05, |
| "lookahead_loss": 6.845924237251282, |
| "loss": 0.6171, |
| "step": 66500 |
| }, |
| { |
| "base_loss": 0.6021122798919678, |
| "epoch": 1.0085830688476562, |
| "grad_norm": 0.0010436498560011387, |
| "learning_rate": 4.361047744750977e-05, |
| "lookahead_loss": 6.854257493972779, |
| "loss": 0.6084, |
| "step": 67000 |
| }, |
| { |
| "base_loss": 0.5890477049946785, |
| "epoch": 1.0095367431640625, |
| "grad_norm": 0.001144828856922686, |
| "learning_rate": 4.356279373168945e-05, |
| "lookahead_loss": 6.862697887420654, |
| "loss": 0.6064, |
| "step": 67500 |
| }, |
| { |
| "base_loss": 0.5928993408083916, |
| "epoch": 1.0104904174804688, |
| "grad_norm": 0.0011384448735043406, |
| "learning_rate": 4.351511001586914e-05, |
| "lookahead_loss": 6.802295356750489, |
| "loss": 0.6045, |
| "step": 68000 |
| }, |
| { |
| "base_loss": 0.5893295911550522, |
| "epoch": 1.011444091796875, |
| "grad_norm": 0.0011518702376633883, |
| "learning_rate": 4.346742630004883e-05, |
| "lookahead_loss": 6.8328519544601445, |
| "loss": 0.6061, |
| "step": 68500 |
| }, |
| { |
| "base_loss": 0.6088726551532745, |
| "epoch": 1.0123977661132812, |
| "grad_norm": 0.0010904420632869005, |
| "learning_rate": 4.3419742584228516e-05, |
| "lookahead_loss": 6.814763288497925, |
| "loss": 0.6216, |
| "step": 69000 |
| }, |
| { |
| "base_loss": 0.5975775923132897, |
| "epoch": 1.0133514404296875, |
| "grad_norm": 0.0011171442456543446, |
| "learning_rate": 4.3372058868408206e-05, |
| "lookahead_loss": 6.899648434638977, |
| "loss": 0.6108, |
| "step": 69500 |
| }, |
| { |
| "base_loss": 0.595933021903038, |
| "epoch": 1.0143051147460938, |
| "grad_norm": 0.0010974474716931581, |
| "learning_rate": 4.332437515258789e-05, |
| "lookahead_loss": 6.864955189704895, |
| "loss": 0.6082, |
| "step": 70000 |
| }, |
| { |
| "epoch": 1.0143051147460938, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.82464911914862, |
| "eval_lookahead_perplexity": 920.2534480815124, |
| "eval_loss": 0.20821362733840942, |
| "eval_perplexity": 1.2314762184375097, |
| "eval_runtime": 96.8797, |
| "eval_samples_per_second": 51.61, |
| "eval_steps_per_second": 1.621, |
| "step": 70000 |
| }, |
| { |
| "base_loss": 0.5827738286256791, |
| "epoch": 1.0152587890625, |
| "grad_norm": 0.0011384559329599142, |
| "learning_rate": 4.327669143676758e-05, |
| "lookahead_loss": 6.803851629257202, |
| "loss": 0.5907, |
| "step": 70500 |
| }, |
| { |
| "base_loss": 0.6107082022428513, |
| "epoch": 1.0162124633789062, |
| "grad_norm": 0.0011207167990505695, |
| "learning_rate": 4.322900772094727e-05, |
| "lookahead_loss": 6.853131731986999, |
| "loss": 0.6182, |
| "step": 71000 |
| }, |
| { |
| "base_loss": 0.6065192295908928, |
| "epoch": 1.0171661376953125, |
| "grad_norm": 0.0011044219136238098, |
| "learning_rate": 4.318132400512695e-05, |
| "lookahead_loss": 6.888585398674011, |
| "loss": 0.6136, |
| "step": 71500 |
| }, |
| { |
| "base_loss": 0.5948996670246124, |
| "epoch": 1.0181198120117188, |
| "grad_norm": 0.0010809814557433128, |
| "learning_rate": 4.3133640289306643e-05, |
| "lookahead_loss": 6.898890166282654, |
| "loss": 0.6055, |
| "step": 72000 |
| }, |
| { |
| "base_loss": 0.5865646304488182, |
| "epoch": 1.019073486328125, |
| "grad_norm": 0.0010813730768859386, |
| "learning_rate": 4.308595657348633e-05, |
| "lookahead_loss": 6.91842933177948, |
| "loss": 0.5989, |
| "step": 72500 |
| }, |
| { |
| "base_loss": 0.5887798971533775, |
| "epoch": 1.0200271606445312, |
| "grad_norm": 0.0011451997561380267, |
| "learning_rate": 4.303827285766602e-05, |
| "lookahead_loss": 6.75454776763916, |
| "loss": 0.6052, |
| "step": 73000 |
| }, |
| { |
| "base_loss": 0.6147837865948677, |
| "epoch": 1.0209808349609375, |
| "grad_norm": 0.0010907722171396017, |
| "learning_rate": 4.299058914184571e-05, |
| "lookahead_loss": 6.805963615417481, |
| "loss": 0.6247, |
| "step": 73500 |
| }, |
| { |
| "base_loss": 0.5963076213002205, |
| "epoch": 1.0219345092773438, |
| "grad_norm": 0.0011470620520412922, |
| "learning_rate": 4.294290542602539e-05, |
| "lookahead_loss": 6.7863368434906, |
| "loss": 0.6097, |
| "step": 74000 |
| }, |
| { |
| "base_loss": 0.5970929145216942, |
| "epoch": 1.02288818359375, |
| "grad_norm": 0.0011032413458451629, |
| "learning_rate": 4.289522171020508e-05, |
| "lookahead_loss": 6.821732865333557, |
| "loss": 0.6062, |
| "step": 74500 |
| }, |
| { |
| "base_loss": 0.5953870372772216, |
| "epoch": 1.0238418579101562, |
| "grad_norm": 0.0011107485042884946, |
| "learning_rate": 4.2847537994384764e-05, |
| "lookahead_loss": 6.788943789482117, |
| "loss": 0.6047, |
| "step": 75000 |
| }, |
| { |
| "epoch": 1.0238418579101562, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.798898217015373, |
| "eval_lookahead_perplexity": 896.8586035415678, |
| "eval_loss": 0.20816253125667572, |
| "eval_perplexity": 1.2314132964355469, |
| "eval_runtime": 91.6696, |
| "eval_samples_per_second": 54.544, |
| "eval_steps_per_second": 1.713, |
| "step": 75000 |
| }, |
| { |
| "base_loss": 0.6108319897651673, |
| "epoch": 1.0247955322265625, |
| "grad_norm": 0.0010732373921200633, |
| "learning_rate": 4.2799854278564455e-05, |
| "lookahead_loss": 6.784094378471375, |
| "loss": 0.6248, |
| "step": 75500 |
| }, |
| { |
| "base_loss": 0.6046044981479645, |
| "epoch": 1.0257492065429688, |
| "grad_norm": 0.001091954531148076, |
| "learning_rate": 4.2752170562744145e-05, |
| "lookahead_loss": 6.748379487991333, |
| "loss": 0.615, |
| "step": 76000 |
| }, |
| { |
| "base_loss": 0.5950362936258317, |
| "epoch": 1.026702880859375, |
| "grad_norm": 0.0011224595364183187, |
| "learning_rate": 4.270448684692383e-05, |
| "lookahead_loss": 6.760618274688721, |
| "loss": 0.6056, |
| "step": 76500 |
| }, |
| { |
| "base_loss": 0.5943114874362946, |
| "epoch": 1.0276565551757812, |
| "grad_norm": 0.0011067682644352317, |
| "learning_rate": 4.265680313110352e-05, |
| "lookahead_loss": 6.873536200523376, |
| "loss": 0.6063, |
| "step": 77000 |
| }, |
| { |
| "base_loss": 0.6171733926534653, |
| "epoch": 1.0286102294921875, |
| "grad_norm": 0.001133575802668929, |
| "learning_rate": 4.26091194152832e-05, |
| "lookahead_loss": 6.882242550849915, |
| "loss": 0.6266, |
| "step": 77500 |
| }, |
| { |
| "base_loss": 0.6017211389541626, |
| "epoch": 1.0295639038085938, |
| "grad_norm": 0.001115851104259491, |
| "learning_rate": 4.256143569946289e-05, |
| "lookahead_loss": 6.8330844841003415, |
| "loss": 0.6123, |
| "step": 78000 |
| }, |
| { |
| "base_loss": 0.5937362365722656, |
| "epoch": 1.030517578125, |
| "grad_norm": 0.0011238973820582032, |
| "learning_rate": 4.251375198364258e-05, |
| "lookahead_loss": 6.8106466889381405, |
| "loss": 0.6071, |
| "step": 78500 |
| }, |
| { |
| "base_loss": 0.5903627701997757, |
| "epoch": 1.0314712524414062, |
| "grad_norm": 0.0010951296426355839, |
| "learning_rate": 4.2466068267822266e-05, |
| "lookahead_loss": 6.845707628250122, |
| "loss": 0.6039, |
| "step": 79000 |
| }, |
| { |
| "base_loss": 0.6103688189387322, |
| "epoch": 1.0324249267578125, |
| "grad_norm": 0.001133197103627026, |
| "learning_rate": 4.2418384552001956e-05, |
| "lookahead_loss": 6.768160277366638, |
| "loss": 0.6277, |
| "step": 79500 |
| }, |
| { |
| "base_loss": 0.5958747680187225, |
| "epoch": 1.0333786010742188, |
| "grad_norm": 0.001138185732997954, |
| "learning_rate": 4.237070083618164e-05, |
| "lookahead_loss": 6.823498950004578, |
| "loss": 0.6081, |
| "step": 80000 |
| }, |
| { |
| "epoch": 1.0333786010742188, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.777561352275812, |
| "eval_lookahead_perplexity": 877.9251612762528, |
| "eval_loss": 0.20812006294727325, |
| "eval_perplexity": 1.231361001505118, |
| "eval_runtime": 106.9612, |
| "eval_samples_per_second": 46.746, |
| "eval_steps_per_second": 1.468, |
| "step": 80000 |
| }, |
| { |
| "base_loss": 0.5922364763021469, |
| "epoch": 1.034332275390625, |
| "grad_norm": 0.001084625837393105, |
| "learning_rate": 4.232301712036133e-05, |
| "lookahead_loss": 6.893566377639771, |
| "loss": 0.6076, |
| "step": 80500 |
| }, |
| { |
| "base_loss": 0.5894868444204331, |
| "epoch": 1.0352859497070312, |
| "grad_norm": 0.0011169870849698782, |
| "learning_rate": 4.227533340454102e-05, |
| "lookahead_loss": 6.730395976066589, |
| "loss": 0.6035, |
| "step": 81000 |
| }, |
| { |
| "base_loss": 0.6110191858410835, |
| "epoch": 1.0362396240234375, |
| "grad_norm": 0.0011154419044032693, |
| "learning_rate": 4.22276496887207e-05, |
| "lookahead_loss": 6.848829930305481, |
| "loss": 0.6219, |
| "step": 81500 |
| }, |
| { |
| "base_loss": 0.6014680997133255, |
| "epoch": 1.0371932983398438, |
| "grad_norm": 0.0011206173803657293, |
| "learning_rate": 4.2179965972900393e-05, |
| "lookahead_loss": 6.780662053108215, |
| "loss": 0.611, |
| "step": 82000 |
| }, |
| { |
| "base_loss": 0.5883948777914048, |
| "epoch": 1.03814697265625, |
| "grad_norm": 0.0010986519046127796, |
| "learning_rate": 4.213228225708008e-05, |
| "lookahead_loss": 6.795178040504456, |
| "loss": 0.6017, |
| "step": 82500 |
| }, |
| { |
| "base_loss": 0.6005192502140999, |
| "epoch": 1.0391006469726562, |
| "grad_norm": 0.0011019845260307193, |
| "learning_rate": 4.208459854125977e-05, |
| "lookahead_loss": 6.752805541038513, |
| "loss": 0.6141, |
| "step": 83000 |
| }, |
| { |
| "base_loss": 0.6166067426204681, |
| "epoch": 1.0400543212890625, |
| "grad_norm": 0.0011135574895888567, |
| "learning_rate": 4.203691482543946e-05, |
| "lookahead_loss": 6.774759250640869, |
| "loss": 0.6253, |
| "step": 83500 |
| }, |
| { |
| "base_loss": 0.6028286694288254, |
| "epoch": 1.0410079956054688, |
| "grad_norm": 0.0011277147568762302, |
| "learning_rate": 4.198923110961914e-05, |
| "lookahead_loss": 6.730431209564209, |
| "loss": 0.6126, |
| "step": 84000 |
| }, |
| { |
| "base_loss": 0.5807398597002029, |
| "epoch": 1.041961669921875, |
| "grad_norm": 0.0011186593910679221, |
| "learning_rate": 4.194154739379883e-05, |
| "lookahead_loss": 6.8060649909973145, |
| "loss": 0.5979, |
| "step": 84500 |
| }, |
| { |
| "base_loss": 0.605569171845913, |
| "epoch": 1.0429153442382812, |
| "grad_norm": 0.0010821224423125386, |
| "learning_rate": 4.1893863677978514e-05, |
| "lookahead_loss": 6.820508779525757, |
| "loss": 0.6175, |
| "step": 85000 |
| }, |
| { |
| "epoch": 1.0429153442382812, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.75723210538919, |
| "eval_lookahead_perplexity": 860.2577945124659, |
| "eval_loss": 0.20807963609695435, |
| "eval_perplexity": 1.2313112224644334, |
| "eval_runtime": 93.1866, |
| "eval_samples_per_second": 53.656, |
| "eval_steps_per_second": 1.685, |
| "step": 85000 |
| }, |
| { |
| "base_loss": 0.6117001739740372, |
| "epoch": 1.0438690185546875, |
| "grad_norm": 0.001131902332417667, |
| "learning_rate": 4.1846179962158205e-05, |
| "lookahead_loss": 6.828276200294495, |
| "loss": 0.6253, |
| "step": 85500 |
| }, |
| { |
| "base_loss": 0.5927367950081825, |
| "epoch": 1.0448226928710938, |
| "grad_norm": 0.0011309866094961762, |
| "learning_rate": 4.1798496246337895e-05, |
| "lookahead_loss": 6.777775590896606, |
| "loss": 0.6043, |
| "step": 86000 |
| }, |
| { |
| "base_loss": 0.5924578613042831, |
| "epoch": 1.0457763671875, |
| "grad_norm": 0.001088097458705306, |
| "learning_rate": 4.175081253051758e-05, |
| "lookahead_loss": 6.762310373783111, |
| "loss": 0.6038, |
| "step": 86500 |
| }, |
| { |
| "base_loss": 0.6208472669720649, |
| "epoch": 1.0467300415039062, |
| "grad_norm": 0.001117968698963523, |
| "learning_rate": 4.170312881469727e-05, |
| "lookahead_loss": 6.744260499000549, |
| "loss": 0.629, |
| "step": 87000 |
| }, |
| { |
| "base_loss": 0.6022886065840721, |
| "epoch": 1.0476837158203125, |
| "grad_norm": 0.001148878363892436, |
| "learning_rate": 4.165544509887695e-05, |
| "lookahead_loss": 6.774776460647583, |
| "loss": 0.6215, |
| "step": 87500 |
| }, |
| { |
| "base_loss": 0.588396491408348, |
| "epoch": 1.0486373901367188, |
| "grad_norm": 0.0010915439343079925, |
| "learning_rate": 4.160776138305664e-05, |
| "lookahead_loss": 6.75317128944397, |
| "loss": 0.6012, |
| "step": 88000 |
| }, |
| { |
| "base_loss": 0.5997280370593071, |
| "epoch": 1.049591064453125, |
| "grad_norm": 0.0011360279750078917, |
| "learning_rate": 4.156007766723633e-05, |
| "lookahead_loss": 6.72175373840332, |
| "loss": 0.6095, |
| "step": 88500 |
| }, |
| { |
| "base_loss": 0.6170587275028229, |
| "epoch": 1.0505447387695312, |
| "grad_norm": 0.0010587567230686545, |
| "learning_rate": 4.1512393951416016e-05, |
| "lookahead_loss": 6.816565957069397, |
| "loss": 0.6288, |
| "step": 89000 |
| }, |
| { |
| "base_loss": 0.5966709926724434, |
| "epoch": 1.0514984130859375, |
| "grad_norm": 0.0011491916375234723, |
| "learning_rate": 4.1464710235595706e-05, |
| "lookahead_loss": 6.767929425239563, |
| "loss": 0.6097, |
| "step": 89500 |
| }, |
| { |
| "base_loss": 0.592747309923172, |
| "epoch": 1.0524520874023438, |
| "grad_norm": 0.0011455725179985166, |
| "learning_rate": 4.141702651977539e-05, |
| "lookahead_loss": 6.712255210876465, |
| "loss": 0.606, |
| "step": 90000 |
| }, |
| { |
| "epoch": 1.0524520874023438, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.738918094208446, |
| "eval_lookahead_perplexity": 844.6464134883248, |
| "eval_loss": 0.2080424576997757, |
| "eval_perplexity": 1.2312654451377232, |
| "eval_runtime": 93.0744, |
| "eval_samples_per_second": 53.72, |
| "eval_steps_per_second": 1.687, |
| "step": 90000 |
| }, |
| { |
| "base_loss": 0.6108891260027886, |
| "epoch": 1.05340576171875, |
| "grad_norm": 0.0010910800192505121, |
| "learning_rate": 4.136934280395508e-05, |
| "lookahead_loss": 6.70730511379242, |
| "loss": 0.6205, |
| "step": 90500 |
| }, |
| { |
| "base_loss": 0.6110858068466186, |
| "epoch": 1.0543594360351562, |
| "grad_norm": 0.001111338846385479, |
| "learning_rate": 4.132165908813477e-05, |
| "lookahead_loss": 6.761465684890747, |
| "loss": 0.6261, |
| "step": 91000 |
| }, |
| { |
| "base_loss": 0.5958408567905426, |
| "epoch": 1.0553131103515625, |
| "grad_norm": 0.0011252695694565773, |
| "learning_rate": 4.127397537231445e-05, |
| "lookahead_loss": 6.779712849617004, |
| "loss": 0.6066, |
| "step": 91500 |
| }, |
| { |
| "base_loss": 0.5900192571878433, |
| "epoch": 1.0562667846679688, |
| "grad_norm": 0.0010718937264755368, |
| "learning_rate": 4.1226291656494143e-05, |
| "lookahead_loss": 6.779324295997619, |
| "loss": 0.6041, |
| "step": 92000 |
| }, |
| { |
| "base_loss": 0.6141881394386292, |
| "epoch": 1.057220458984375, |
| "grad_norm": 0.0011070650070905685, |
| "learning_rate": 4.117860794067383e-05, |
| "lookahead_loss": 6.778371848106384, |
| "loss": 0.6279, |
| "step": 92500 |
| }, |
| { |
| "base_loss": 0.6065550698041916, |
| "epoch": 1.0581741333007812, |
| "grad_norm": 0.0011338784825056791, |
| "learning_rate": 4.113092422485352e-05, |
| "lookahead_loss": 6.767937861442566, |
| "loss": 0.6173, |
| "step": 93000 |
| }, |
| { |
| "base_loss": 0.5854485256075859, |
| "epoch": 1.0591278076171875, |
| "grad_norm": 0.0011064207646995783, |
| "learning_rate": 4.108324050903321e-05, |
| "lookahead_loss": 6.672726812362671, |
| "loss": 0.6006, |
| "step": 93500 |
| }, |
| { |
| "base_loss": 0.5929028804898262, |
| "epoch": 1.0600814819335938, |
| "grad_norm": 0.0011152655351907015, |
| "learning_rate": 4.103555679321289e-05, |
| "lookahead_loss": 6.745905955314636, |
| "loss": 0.6068, |
| "step": 94000 |
| }, |
| { |
| "base_loss": 0.6067430639863014, |
| "epoch": 1.06103515625, |
| "grad_norm": 0.001127126393839717, |
| "learning_rate": 4.098787307739258e-05, |
| "lookahead_loss": 6.6982609925270085, |
| "loss": 0.617, |
| "step": 94500 |
| }, |
| { |
| "base_loss": 0.6029584980010987, |
| "epoch": 1.0619888305664062, |
| "grad_norm": 0.0011526525486260653, |
| "learning_rate": 4.0940189361572264e-05, |
| "lookahead_loss": 6.715296206474304, |
| "loss": 0.6149, |
| "step": 95000 |
| }, |
| { |
| "epoch": 1.0619888305664062, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.7193962986857745, |
| "eval_lookahead_perplexity": 828.3173042636047, |
| "eval_loss": 0.20800456404685974, |
| "eval_perplexity": 1.2312187888762913, |
| "eval_runtime": 90.4107, |
| "eval_samples_per_second": 55.303, |
| "eval_steps_per_second": 1.737, |
| "step": 95000 |
| }, |
| { |
| "base_loss": 0.5950580761432648, |
| "epoch": 1.0629425048828125, |
| "grad_norm": 0.0011395127512514591, |
| "learning_rate": 4.0892505645751955e-05, |
| "lookahead_loss": 6.710915733337402, |
| "loss": 0.6065, |
| "step": 95500 |
| }, |
| { |
| "base_loss": 0.6125950453877449, |
| "epoch": 1.0638961791992188, |
| "grad_norm": 0.0010708384215831757, |
| "learning_rate": 4.0844821929931645e-05, |
| "lookahead_loss": 6.735831215858459, |
| "loss": 0.6241, |
| "step": 96000 |
| }, |
| { |
| "base_loss": 0.605140404343605, |
| "epoch": 1.064849853515625, |
| "grad_norm": 0.0011066367151215672, |
| "learning_rate": 4.079713821411133e-05, |
| "lookahead_loss": 6.77455454158783, |
| "loss": 0.616, |
| "step": 96500 |
| }, |
| { |
| "base_loss": 0.6022319710254669, |
| "epoch": 1.0658035278320312, |
| "grad_norm": 0.0011385679244995117, |
| "learning_rate": 4.074945449829102e-05, |
| "lookahead_loss": 6.687186507225037, |
| "loss": 0.6135, |
| "step": 97000 |
| }, |
| { |
| "base_loss": 0.6027357627749443, |
| "epoch": 1.0667572021484375, |
| "grad_norm": 0.0011094497749581933, |
| "learning_rate": 4.07017707824707e-05, |
| "lookahead_loss": 6.668306805610657, |
| "loss": 0.6111, |
| "step": 97500 |
| }, |
| { |
| "base_loss": 0.6161690940260887, |
| "epoch": 1.0677108764648438, |
| "grad_norm": 0.0011532640783116221, |
| "learning_rate": 4.065408706665039e-05, |
| "lookahead_loss": 6.747381279945373, |
| "loss": 0.6264, |
| "step": 98000 |
| }, |
| { |
| "base_loss": 0.5982645556926728, |
| "epoch": 1.06866455078125, |
| "grad_norm": 0.0010863294592127204, |
| "learning_rate": 4.060640335083008e-05, |
| "lookahead_loss": 6.722812586784363, |
| "loss": 0.6099, |
| "step": 98500 |
| }, |
| { |
| "base_loss": 0.5865320681333542, |
| "epoch": 1.0696182250976562, |
| "grad_norm": 0.0011132799554616213, |
| "learning_rate": 4.0558719635009766e-05, |
| "lookahead_loss": 6.748574607849121, |
| "loss": 0.6003, |
| "step": 99000 |
| }, |
| { |
| "base_loss": 0.6179066747426987, |
| "epoch": 1.0705718994140625, |
| "grad_norm": 0.0011427431600168347, |
| "learning_rate": 4.0511035919189456e-05, |
| "lookahead_loss": 6.636237494945526, |
| "loss": 0.6297, |
| "step": 99500 |
| }, |
| { |
| "base_loss": 0.5993645028471947, |
| "epoch": 1.0715255737304688, |
| "grad_norm": 0.0011170258512720466, |
| "learning_rate": 4.046335220336914e-05, |
| "lookahead_loss": 6.691766156196595, |
| "loss": 0.6122, |
| "step": 100000 |
| }, |
| { |
| "epoch": 1.0715255737304688, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.701932117961847, |
| "eval_lookahead_perplexity": 813.9770064196388, |
| "eval_loss": 0.2079702615737915, |
| "eval_perplexity": 1.2311765557512993, |
| "eval_runtime": 93.3483, |
| "eval_samples_per_second": 53.563, |
| "eval_steps_per_second": 1.682, |
| "step": 100000 |
| }, |
| { |
| "base_loss": 0.5923813906908035, |
| "epoch": 1.072479248046875, |
| "grad_norm": 0.0011480902321636677, |
| "learning_rate": 4.041566848754883e-05, |
| "lookahead_loss": 6.744957288742065, |
| "loss": 0.6046, |
| "step": 100500 |
| }, |
| { |
| "base_loss": 0.5912299656271934, |
| "epoch": 1.0734329223632812, |
| "grad_norm": 0.0010534238535910845, |
| "learning_rate": 4.036798477172852e-05, |
| "lookahead_loss": 6.693481325149536, |
| "loss": 0.6037, |
| "step": 101000 |
| }, |
| { |
| "base_loss": 0.6128248473405838, |
| "epoch": 1.0743865966796875, |
| "grad_norm": 0.001108926022425294, |
| "learning_rate": 4.03203010559082e-05, |
| "lookahead_loss": 6.697817398071289, |
| "loss": 0.6247, |
| "step": 101500 |
| }, |
| { |
| "base_loss": 0.5946393350362777, |
| "epoch": 1.0753402709960938, |
| "grad_norm": 0.001114795682951808, |
| "learning_rate": 4.0272617340087893e-05, |
| "lookahead_loss": 6.764534550666809, |
| "loss": 0.6062, |
| "step": 102000 |
| }, |
| { |
| "base_loss": 0.5858605382442474, |
| "epoch": 1.0762939453125, |
| "grad_norm": 0.001073968131095171, |
| "learning_rate": 4.022493362426758e-05, |
| "lookahead_loss": 6.726330715179444, |
| "loss": 0.5967, |
| "step": 102500 |
| }, |
| { |
| "base_loss": 0.6261867806911469, |
| "epoch": 1.0772476196289062, |
| "grad_norm": 0.001089173019863665, |
| "learning_rate": 4.017724990844727e-05, |
| "lookahead_loss": 6.691923519134521, |
| "loss": 0.6366, |
| "step": 103000 |
| }, |
| { |
| "base_loss": 0.5926554707288743, |
| "epoch": 1.0782012939453125, |
| "grad_norm": 0.0011284619104117155, |
| "learning_rate": 4.012956619262696e-05, |
| "lookahead_loss": 6.703181129455566, |
| "loss": 0.6086, |
| "step": 103500 |
| }, |
| { |
| "base_loss": 0.5974663733839989, |
| "epoch": 1.0791549682617188, |
| "grad_norm": 0.0010861388873308897, |
| "learning_rate": 4.008188247680664e-05, |
| "lookahead_loss": 6.74266376209259, |
| "loss": 0.6072, |
| "step": 104000 |
| }, |
| { |
| "base_loss": 0.6090346719622612, |
| "epoch": 1.080108642578125, |
| "grad_norm": 0.0010843543568626046, |
| "learning_rate": 4.003419876098633e-05, |
| "lookahead_loss": 6.690739940643311, |
| "loss": 0.6216, |
| "step": 104500 |
| }, |
| { |
| "base_loss": 0.6080623995661736, |
| "epoch": 1.0810623168945312, |
| "grad_norm": 0.00115968135651201, |
| "learning_rate": 3.9986515045166014e-05, |
| "lookahead_loss": 6.721202290534973, |
| "loss": 0.6199, |
| "step": 105000 |
| }, |
| { |
| "epoch": 1.0810623168945312, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.683899407188733, |
| "eval_lookahead_perplexity": 799.4303465286481, |
| "eval_loss": 0.20793530344963074, |
| "eval_perplexity": 1.231133516880683, |
| "eval_runtime": 92.6887, |
| "eval_samples_per_second": 53.944, |
| "eval_steps_per_second": 1.694, |
| "step": 105000 |
| }, |
| { |
| "base_loss": 0.5944187866449356, |
| "epoch": 1.0820159912109375, |
| "grad_norm": 0.0010683005675673485, |
| "learning_rate": 3.9938831329345705e-05, |
| "lookahead_loss": 6.727095356464386, |
| "loss": 0.6054, |
| "step": 105500 |
| }, |
| { |
| "base_loss": 0.5926255500912666, |
| "epoch": 1.0829696655273438, |
| "grad_norm": 0.001137380488216877, |
| "learning_rate": 3.9891147613525395e-05, |
| "lookahead_loss": 6.750259411811829, |
| "loss": 0.6044, |
| "step": 106000 |
| }, |
| { |
| "base_loss": 0.6189518148899078, |
| "epoch": 1.08392333984375, |
| "grad_norm": 0.0010804138146340847, |
| "learning_rate": 3.984346389770508e-05, |
| "lookahead_loss": 6.7454378662109375, |
| "loss": 0.629, |
| "step": 106500 |
| }, |
| { |
| "base_loss": 0.5973528184294701, |
| "epoch": 1.0848770141601562, |
| "grad_norm": 0.0011135113891214132, |
| "learning_rate": 3.979578018188477e-05, |
| "lookahead_loss": 6.6877857160568235, |
| "loss": 0.6073, |
| "step": 107000 |
| }, |
| { |
| "base_loss": 0.5883546487689019, |
| "epoch": 1.0858306884765625, |
| "grad_norm": 0.0011390469735488296, |
| "learning_rate": 3.974809646606445e-05, |
| "lookahead_loss": 6.7175205068588255, |
| "loss": 0.5988, |
| "step": 107500 |
| }, |
| { |
| "base_loss": 0.5933809608817101, |
| "epoch": 1.0867843627929688, |
| "grad_norm": 0.0012121612671762705, |
| "learning_rate": 3.970041275024414e-05, |
| "lookahead_loss": 6.67557014465332, |
| "loss": 0.6058, |
| "step": 108000 |
| }, |
| { |
| "base_loss": 0.6234395582079887, |
| "epoch": 1.087738037109375, |
| "grad_norm": 0.0010294954990968108, |
| "learning_rate": 3.965272903442383e-05, |
| "lookahead_loss": 6.695505553245544, |
| "loss": 0.6294, |
| "step": 108500 |
| }, |
| { |
| "base_loss": 0.5963783563375473, |
| "epoch": 1.0886917114257812, |
| "grad_norm": 0.0011363314697518945, |
| "learning_rate": 3.9605045318603516e-05, |
| "lookahead_loss": 6.69392915725708, |
| "loss": 0.6053, |
| "step": 109000 |
| }, |
| { |
| "base_loss": 0.5991918464303017, |
| "epoch": 1.0896453857421875, |
| "grad_norm": 0.00112410937435925, |
| "learning_rate": 3.9557361602783206e-05, |
| "lookahead_loss": 6.717226490974427, |
| "loss": 0.6083, |
| "step": 109500 |
| }, |
| { |
| "base_loss": 0.5880093929767609, |
| "epoch": 1.0905990600585938, |
| "grad_norm": 0.0010848381789401174, |
| "learning_rate": 3.950967788696289e-05, |
| "lookahead_loss": 6.691950410842895, |
| "loss": 0.5994, |
| "step": 110000 |
| }, |
| { |
| "epoch": 1.0905990600585938, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.667511131055058, |
| "eval_lookahead_perplexity": 786.4358309477227, |
| "eval_loss": 0.20790287852287292, |
| "eval_perplexity": 1.231093598113754, |
| "eval_runtime": 90.3464, |
| "eval_samples_per_second": 55.343, |
| "eval_steps_per_second": 1.738, |
| "step": 110000 |
| }, |
| { |
| "base_loss": 0.5782296355366707, |
| "epoch": 1.091552734375, |
| "grad_norm": 0.0010654990328475833, |
| "learning_rate": 3.946199417114258e-05, |
| "lookahead_loss": 6.665598778724671, |
| "loss": 0.5917, |
| "step": 110500 |
| }, |
| { |
| "base_loss": 0.617558573782444, |
| "epoch": 1.0925064086914062, |
| "grad_norm": 0.001140857464633882, |
| "learning_rate": 3.941431045532227e-05, |
| "lookahead_loss": 6.707964798927307, |
| "loss": 0.6283, |
| "step": 111000 |
| }, |
| { |
| "base_loss": 0.5982673740983009, |
| "epoch": 1.0934600830078125, |
| "grad_norm": 0.001054911408573389, |
| "learning_rate": 3.936662673950195e-05, |
| "lookahead_loss": 6.694295763015747, |
| "loss": 0.6126, |
| "step": 111500 |
| }, |
| { |
| "base_loss": 0.5841596345305443, |
| "epoch": 1.0944137573242188, |
| "grad_norm": 0.0011086445301771164, |
| "learning_rate": 3.9318943023681643e-05, |
| "lookahead_loss": 6.687951999664307, |
| "loss": 0.597, |
| "step": 112000 |
| }, |
| { |
| "base_loss": 0.5796255503892899, |
| "epoch": 1.095367431640625, |
| "grad_norm": 0.0010867841774597764, |
| "learning_rate": 3.927125930786133e-05, |
| "lookahead_loss": 6.6346531310081485, |
| "loss": 0.5945, |
| "step": 112500 |
| }, |
| { |
| "base_loss": 0.5978616480827331, |
| "epoch": 1.0963211059570312, |
| "grad_norm": 0.001112610101699829, |
| "learning_rate": 3.922357559204102e-05, |
| "lookahead_loss": 6.685487722396851, |
| "loss": 0.6119, |
| "step": 113000 |
| }, |
| { |
| "base_loss": 0.6154543727636337, |
| "epoch": 1.0972747802734375, |
| "grad_norm": 0.0011152740335091949, |
| "learning_rate": 3.917589187622071e-05, |
| "lookahead_loss": 6.701974026679992, |
| "loss": 0.6251, |
| "step": 113500 |
| }, |
| { |
| "base_loss": 0.596605758190155, |
| "epoch": 1.0982284545898438, |
| "grad_norm": 0.0011151140788570046, |
| "learning_rate": 3.912820816040039e-05, |
| "lookahead_loss": 6.645841445922851, |
| "loss": 0.6064, |
| "step": 114000 |
| }, |
| { |
| "base_loss": 0.5833840205669403, |
| "epoch": 1.09918212890625, |
| "grad_norm": 0.0011117896065115929, |
| "learning_rate": 3.908052444458008e-05, |
| "lookahead_loss": 6.711025802612305, |
| "loss": 0.5967, |
| "step": 114500 |
| }, |
| { |
| "base_loss": 0.586369781255722, |
| "epoch": 1.1001358032226562, |
| "grad_norm": 0.0011113210348412395, |
| "learning_rate": 3.9032840728759764e-05, |
| "lookahead_loss": 6.68584174823761, |
| "loss": 0.6021, |
| "step": 115000 |
| }, |
| { |
| "epoch": 1.1001358032226562, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.653558318226482, |
| "eval_lookahead_perplexity": 775.5390362479169, |
| "eval_loss": 0.20787346363067627, |
| "eval_perplexity": 1.2310573861608696, |
| "eval_runtime": 93.7117, |
| "eval_samples_per_second": 53.355, |
| "eval_steps_per_second": 1.675, |
| "step": 115000 |
| }, |
| { |
| "base_loss": 0.6109204713702202, |
| "epoch": 1.1010894775390625, |
| "grad_norm": 0.0011247434886172414, |
| "learning_rate": 3.8985157012939455e-05, |
| "lookahead_loss": 6.729825895309448, |
| "loss": 0.6217, |
| "step": 115500 |
| }, |
| { |
| "base_loss": 0.6036947486400605, |
| "epoch": 1.1020431518554688, |
| "grad_norm": 0.001134915859438479, |
| "learning_rate": 3.8937473297119145e-05, |
| "lookahead_loss": 6.710568108081818, |
| "loss": 0.609, |
| "step": 116000 |
| }, |
| { |
| "base_loss": 0.5892721264362335, |
| "epoch": 1.102996826171875, |
| "grad_norm": 0.001085427007637918, |
| "learning_rate": 3.888978958129883e-05, |
| "lookahead_loss": 6.689410036087036, |
| "loss": 0.6033, |
| "step": 116500 |
| }, |
| { |
| "base_loss": 0.5901298764944076, |
| "epoch": 1.1039505004882812, |
| "grad_norm": 0.0011025239946320653, |
| "learning_rate": 3.884210586547852e-05, |
| "lookahead_loss": 6.702957942962646, |
| "loss": 0.6024, |
| "step": 117000 |
| }, |
| { |
| "base_loss": 0.6100293419957161, |
| "epoch": 1.1049041748046875, |
| "grad_norm": 0.0010662467684596777, |
| "learning_rate": 3.87944221496582e-05, |
| "lookahead_loss": 6.645748790740967, |
| "loss": 0.6218, |
| "step": 117500 |
| }, |
| { |
| "base_loss": 0.602484605550766, |
| "epoch": 1.1058578491210938, |
| "grad_norm": 0.001067397533915937, |
| "learning_rate": 3.874673843383789e-05, |
| "lookahead_loss": 6.646303733825683, |
| "loss": 0.6147, |
| "step": 118000 |
| }, |
| { |
| "base_loss": 0.5962947644591331, |
| "epoch": 1.1068115234375, |
| "grad_norm": 0.0010932920267805457, |
| "learning_rate": 3.869905471801758e-05, |
| "lookahead_loss": 6.713128123283386, |
| "loss": 0.6065, |
| "step": 118500 |
| }, |
| { |
| "base_loss": 0.5839503274559975, |
| "epoch": 1.1077651977539062, |
| "grad_norm": 0.0011347734834998846, |
| "learning_rate": 3.8651371002197266e-05, |
| "lookahead_loss": 6.62405997467041, |
| "loss": 0.5955, |
| "step": 119000 |
| }, |
| { |
| "base_loss": 0.6058702719211578, |
| "epoch": 1.1087188720703125, |
| "grad_norm": 0.0011517951497808099, |
| "learning_rate": 3.8603687286376956e-05, |
| "lookahead_loss": 6.613666387557983, |
| "loss": 0.6193, |
| "step": 119500 |
| }, |
| { |
| "base_loss": 0.6073802384138107, |
| "epoch": 1.1096725463867188, |
| "grad_norm": 0.0010947365080937743, |
| "learning_rate": 3.855600357055664e-05, |
| "lookahead_loss": 6.604139113426209, |
| "loss": 0.6186, |
| "step": 120000 |
| }, |
| { |
| "epoch": 1.1096725463867188, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.638637524443313, |
| "eval_lookahead_perplexity": 764.0532796091819, |
| "eval_loss": 0.20784424245357513, |
| "eval_perplexity": 1.2310214137405475, |
| "eval_runtime": 92.4064, |
| "eval_samples_per_second": 54.109, |
| "eval_steps_per_second": 1.699, |
| "step": 120000 |
| }, |
| { |
| "base_loss": 0.5944828372001648, |
| "epoch": 1.110626220703125, |
| "grad_norm": 0.0010918062180280685, |
| "learning_rate": 3.850831985473633e-05, |
| "lookahead_loss": 6.5845939102172855, |
| "loss": 0.6062, |
| "step": 120500 |
| }, |
| { |
| "base_loss": 0.5857415299415588, |
| "epoch": 1.1115798950195312, |
| "grad_norm": 0.0010996379423886538, |
| "learning_rate": 3.846063613891602e-05, |
| "lookahead_loss": 6.665441793441772, |
| "loss": 0.598, |
| "step": 121000 |
| }, |
| { |
| "base_loss": 0.6081982196569443, |
| "epoch": 1.1125335693359375, |
| "grad_norm": 0.001094916253350675, |
| "learning_rate": 3.84129524230957e-05, |
| "lookahead_loss": 6.685654604434967, |
| "loss": 0.6153, |
| "step": 121500 |
| }, |
| { |
| "base_loss": 0.6173637208938598, |
| "epoch": 1.1134872436523438, |
| "grad_norm": 0.0011174079263582826, |
| "learning_rate": 3.8365268707275393e-05, |
| "lookahead_loss": 6.6888926963806155, |
| "loss": 0.6297, |
| "step": 122000 |
| }, |
| { |
| "base_loss": 0.595946085691452, |
| "epoch": 1.11444091796875, |
| "grad_norm": 0.001059638219885528, |
| "learning_rate": 3.831758499145508e-05, |
| "lookahead_loss": 6.576348360061646, |
| "loss": 0.6061, |
| "step": 122500 |
| }, |
| { |
| "base_loss": 0.5870628617405892, |
| "epoch": 1.1153945922851562, |
| "grad_norm": 0.0011298077879473567, |
| "learning_rate": 3.826990127563477e-05, |
| "lookahead_loss": 6.627452656745911, |
| "loss": 0.5984, |
| "step": 123000 |
| }, |
| { |
| "base_loss": 0.6015327024459839, |
| "epoch": 1.1163482666015625, |
| "grad_norm": 0.001181896193884313, |
| "learning_rate": 3.822221755981446e-05, |
| "lookahead_loss": 6.645281805038453, |
| "loss": 0.6155, |
| "step": 123500 |
| }, |
| { |
| "base_loss": 0.6103048238754273, |
| "epoch": 1.1173019409179688, |
| "grad_norm": 0.0011344418162479997, |
| "learning_rate": 3.817453384399414e-05, |
| "lookahead_loss": 6.633471826553345, |
| "loss": 0.6218, |
| "step": 124000 |
| }, |
| { |
| "base_loss": 0.6046528750061989, |
| "epoch": 1.118255615234375, |
| "grad_norm": 0.0011290363036096096, |
| "learning_rate": 3.812685012817383e-05, |
| "lookahead_loss": 6.66287439250946, |
| "loss": 0.6122, |
| "step": 124500 |
| }, |
| { |
| "base_loss": 0.5855171493887902, |
| "epoch": 1.1192092895507812, |
| "grad_norm": 0.001125651178881526, |
| "learning_rate": 3.8079166412353514e-05, |
| "lookahead_loss": 6.652395925998688, |
| "loss": 0.596, |
| "step": 125000 |
| }, |
| { |
| "epoch": 1.1192092895507812, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.624730955678434, |
| "eval_lookahead_perplexity": 753.5014599919688, |
| "eval_loss": 0.20781628787517548, |
| "eval_perplexity": 1.230987001536917, |
| "eval_runtime": 90.2053, |
| "eval_samples_per_second": 55.429, |
| "eval_steps_per_second": 1.74, |
| "step": 125000 |
| }, |
| { |
| "base_loss": 0.5973180630803108, |
| "epoch": 2.0009536743164062, |
| "grad_norm": 0.0011061604600399733, |
| "learning_rate": 3.8031482696533205e-05, |
| "lookahead_loss": 6.729214018821716, |
| "loss": 0.6027, |
| "step": 125500 |
| }, |
| { |
| "base_loss": 0.5872496640682221, |
| "epoch": 2.0019073486328125, |
| "grad_norm": 0.0011476138606667519, |
| "learning_rate": 3.7983798980712895e-05, |
| "lookahead_loss": 6.56571629524231, |
| "loss": 0.5971, |
| "step": 126000 |
| }, |
| { |
| "base_loss": 0.6028999392390251, |
| "epoch": 2.0028610229492188, |
| "grad_norm": 0.0011293648276478052, |
| "learning_rate": 3.793611526489258e-05, |
| "lookahead_loss": 6.571031596183777, |
| "loss": 0.6128, |
| "step": 126500 |
| }, |
| { |
| "base_loss": 0.6122851598262787, |
| "epoch": 2.003814697265625, |
| "grad_norm": 0.0010998975485563278, |
| "learning_rate": 3.788843154907227e-05, |
| "lookahead_loss": 6.580973098754883, |
| "loss": 0.622, |
| "step": 127000 |
| }, |
| { |
| "base_loss": 0.5994558810591698, |
| "epoch": 2.0047683715820312, |
| "grad_norm": 0.001085072522982955, |
| "learning_rate": 3.784074783325195e-05, |
| "lookahead_loss": 6.575087475776672, |
| "loss": 0.608, |
| "step": 127500 |
| }, |
| { |
| "base_loss": 0.5888918130993843, |
| "epoch": 2.0057220458984375, |
| "grad_norm": 0.0010583444964140654, |
| "learning_rate": 3.779306411743164e-05, |
| "lookahead_loss": 6.699651515007019, |
| "loss": 0.6002, |
| "step": 128000 |
| }, |
| { |
| "base_loss": 0.5821106826066971, |
| "epoch": 2.0066757202148438, |
| "grad_norm": 0.0010807913495227695, |
| "learning_rate": 3.774538040161133e-05, |
| "lookahead_loss": 6.536865455627441, |
| "loss": 0.599, |
| "step": 128500 |
| }, |
| { |
| "base_loss": 0.6039754400849342, |
| "epoch": 2.00762939453125, |
| "grad_norm": 0.0010995008051395416, |
| "learning_rate": 3.7697696685791016e-05, |
| "lookahead_loss": 6.604396286010743, |
| "loss": 0.6165, |
| "step": 129000 |
| }, |
| { |
| "base_loss": 0.6021587365865707, |
| "epoch": 2.0085830688476562, |
| "grad_norm": 0.0010323027381673455, |
| "learning_rate": 3.7650012969970706e-05, |
| "lookahead_loss": 6.605464251518249, |
| "loss": 0.6082, |
| "step": 129500 |
| }, |
| { |
| "base_loss": 0.5906935078501702, |
| "epoch": 2.0095367431640625, |
| "grad_norm": 0.0011525024892762303, |
| "learning_rate": 3.760232925415039e-05, |
| "lookahead_loss": 6.627658121109008, |
| "loss": 0.6063, |
| "step": 130000 |
| }, |
| { |
| "epoch": 2.0095367431640625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.611681720319266, |
| "eval_lookahead_perplexity": 743.7327180237362, |
| "eval_loss": 0.2077895551919937, |
| "eval_perplexity": 1.2309540943912542, |
| "eval_runtime": 91.3828, |
| "eval_samples_per_second": 54.715, |
| "eval_steps_per_second": 1.718, |
| "step": 130000 |
| }, |
| { |
| "base_loss": 0.597510245859623, |
| "epoch": 2.0104904174804688, |
| "grad_norm": 0.001104092923924327, |
| "learning_rate": 3.755464553833008e-05, |
| "lookahead_loss": 6.562650055885315, |
| "loss": 0.6071, |
| "step": 130500 |
| }, |
| { |
| "base_loss": 0.589246776163578, |
| "epoch": 2.011444091796875, |
| "grad_norm": 0.0011504783760756254, |
| "learning_rate": 3.750696182250977e-05, |
| "lookahead_loss": 6.588836015701294, |
| "loss": 0.6033, |
| "step": 131000 |
| }, |
| { |
| "base_loss": 0.610647314965725, |
| "epoch": 2.0123977661132812, |
| "grad_norm": 0.0011107546743005514, |
| "learning_rate": 3.745927810668945e-05, |
| "lookahead_loss": 6.588105909347534, |
| "loss": 0.6243, |
| "step": 131500 |
| }, |
| { |
| "base_loss": 0.5952036259770394, |
| "epoch": 2.0133514404296875, |
| "grad_norm": 0.0011159584391862154, |
| "learning_rate": 3.7411594390869143e-05, |
| "lookahead_loss": 6.66543052482605, |
| "loss": 0.6094, |
| "step": 132000 |
| }, |
| { |
| "base_loss": 0.5953016864061356, |
| "epoch": 2.0143051147460938, |
| "grad_norm": 0.0010857629822567105, |
| "learning_rate": 3.736391067504883e-05, |
| "lookahead_loss": 6.636122459411621, |
| "loss": 0.6068, |
| "step": 132500 |
| }, |
| { |
| "base_loss": 0.58618260627985, |
| "epoch": 2.0152587890625, |
| "grad_norm": 0.0011120929848402739, |
| "learning_rate": 3.731622695922852e-05, |
| "lookahead_loss": 6.585938324451447, |
| "loss": 0.5924, |
| "step": 133000 |
| }, |
| { |
| "base_loss": 0.6084697796702385, |
| "epoch": 2.0162124633789062, |
| "grad_norm": 0.0011215230915695429, |
| "learning_rate": 3.726854324340821e-05, |
| "lookahead_loss": 6.614871105194092, |
| "loss": 0.6169, |
| "step": 133500 |
| }, |
| { |
| "base_loss": 0.6040195283293724, |
| "epoch": 2.0171661376953125, |
| "grad_norm": 0.0011012755567207932, |
| "learning_rate": 3.722085952758789e-05, |
| "lookahead_loss": 6.664707413673401, |
| "loss": 0.612, |
| "step": 134000 |
| }, |
| { |
| "base_loss": 0.5937762448191642, |
| "epoch": 2.0181198120117188, |
| "grad_norm": 0.0010495241731405258, |
| "learning_rate": 3.717317581176758e-05, |
| "lookahead_loss": 6.665002863883972, |
| "loss": 0.6034, |
| "step": 134500 |
| }, |
| { |
| "base_loss": 0.5864896615743637, |
| "epoch": 2.019073486328125, |
| "grad_norm": 0.0010809170780703425, |
| "learning_rate": 3.7125492095947264e-05, |
| "lookahead_loss": 6.6852047996521, |
| "loss": 0.5996, |
| "step": 135000 |
| }, |
| { |
| "epoch": 2.019073486328125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.59784925783785, |
| "eval_lookahead_perplexity": 733.5158879689737, |
| "eval_loss": 0.20776157081127167, |
| "eval_perplexity": 1.2309196473852166, |
| "eval_runtime": 92.8062, |
| "eval_samples_per_second": 53.876, |
| "eval_steps_per_second": 1.692, |
| "step": 135000 |
| }, |
| { |
| "base_loss": 0.5884653336405754, |
| "epoch": 2.0200271606445312, |
| "grad_norm": 0.001135366503149271, |
| "learning_rate": 3.7077808380126955e-05, |
| "lookahead_loss": 6.525245847702027, |
| "loss": 0.6027, |
| "step": 135500 |
| }, |
| { |
| "base_loss": 0.6167576683163642, |
| "epoch": 2.0209808349609375, |
| "grad_norm": 0.0010758079588413239, |
| "learning_rate": 3.7030124664306645e-05, |
| "lookahead_loss": 6.587370067596436, |
| "loss": 0.6253, |
| "step": 136000 |
| }, |
| { |
| "base_loss": 0.5946086082458496, |
| "epoch": 2.0219345092773438, |
| "grad_norm": 0.0011470479657873511, |
| "learning_rate": 3.698244094848633e-05, |
| "lookahead_loss": 6.577798476696015, |
| "loss": 0.6091, |
| "step": 136500 |
| }, |
| { |
| "base_loss": 0.5980083233714104, |
| "epoch": 2.02288818359375, |
| "grad_norm": 0.0011121248826384544, |
| "learning_rate": 3.693475723266602e-05, |
| "lookahead_loss": 6.599166748046875, |
| "loss": 0.6066, |
| "step": 137000 |
| }, |
| { |
| "base_loss": 0.5981668121218682, |
| "epoch": 2.0238418579101562, |
| "grad_norm": 0.001129783340729773, |
| "learning_rate": 3.68870735168457e-05, |
| "lookahead_loss": 6.572176639556885, |
| "loss": 0.6053, |
| "step": 137500 |
| }, |
| { |
| "base_loss": 0.6096467951536179, |
| "epoch": 2.0247955322265625, |
| "grad_norm": 0.0010902190115302801, |
| "learning_rate": 3.683938980102539e-05, |
| "lookahead_loss": 6.575278332710266, |
| "loss": 0.6221, |
| "step": 138000 |
| }, |
| { |
| "base_loss": 0.6067250183224678, |
| "epoch": 2.0257492065429688, |
| "grad_norm": 0.0010856341104954481, |
| "learning_rate": 3.679170608520508e-05, |
| "lookahead_loss": 6.537411547660827, |
| "loss": 0.6159, |
| "step": 138500 |
| }, |
| { |
| "base_loss": 0.5957215885519982, |
| "epoch": 2.026702880859375, |
| "grad_norm": 0.0011199660366401076, |
| "learning_rate": 3.6744022369384766e-05, |
| "lookahead_loss": 6.5439116020202635, |
| "loss": 0.605, |
| "step": 139000 |
| }, |
| { |
| "base_loss": 0.5930430209040641, |
| "epoch": 2.0276565551757812, |
| "grad_norm": 0.0011078852694481611, |
| "learning_rate": 3.6696338653564456e-05, |
| "lookahead_loss": 6.6607541685104374, |
| "loss": 0.6048, |
| "step": 139500 |
| }, |
| { |
| "base_loss": 0.6162356662750245, |
| "epoch": 2.0286102294921875, |
| "grad_norm": 0.001123247086070478, |
| "learning_rate": 3.664865493774414e-05, |
| "lookahead_loss": 6.67528129196167, |
| "loss": 0.6263, |
| "step": 140000 |
| }, |
| { |
| "epoch": 2.0286102294921875, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.585361625439823, |
| "eval_lookahead_perplexity": 724.412966463547, |
| "eval_loss": 0.20773683488368988, |
| "eval_perplexity": 1.230889199822536, |
| "eval_runtime": 92.7637, |
| "eval_samples_per_second": 53.9, |
| "eval_steps_per_second": 1.692, |
| "step": 140000 |
| }, |
| { |
| "base_loss": 0.6023641695976257, |
| "epoch": 2.0295639038085938, |
| "grad_norm": 0.0010968261631205678, |
| "learning_rate": 3.660097122192383e-05, |
| "lookahead_loss": 6.617411545753479, |
| "loss": 0.6128, |
| "step": 140500 |
| }, |
| { |
| "base_loss": 0.5918351314663887, |
| "epoch": 2.030517578125, |
| "grad_norm": 0.0011174526298418641, |
| "learning_rate": 3.655328750610352e-05, |
| "lookahead_loss": 6.603970588207245, |
| "loss": 0.6048, |
| "step": 141000 |
| }, |
| { |
| "base_loss": 0.5900247128009796, |
| "epoch": 2.0314712524414062, |
| "grad_norm": 0.001116051571443677, |
| "learning_rate": 3.65056037902832e-05, |
| "lookahead_loss": 6.635660397529602, |
| "loss": 0.6032, |
| "step": 141500 |
| }, |
| { |
| "base_loss": 0.6122562985420227, |
| "epoch": 2.0324249267578125, |
| "grad_norm": 0.0011088504688814282, |
| "learning_rate": 3.6457920074462893e-05, |
| "lookahead_loss": 6.562885791778564, |
| "loss": 0.6278, |
| "step": 142000 |
| }, |
| { |
| "base_loss": 0.5956820755600929, |
| "epoch": 2.0333786010742188, |
| "grad_norm": 0.0011337499599903822, |
| "learning_rate": 3.641023635864258e-05, |
| "lookahead_loss": 6.617730149269104, |
| "loss": 0.6081, |
| "step": 142500 |
| }, |
| { |
| "base_loss": 0.5925231646895409, |
| "epoch": 2.034332275390625, |
| "grad_norm": 0.0010689555201679468, |
| "learning_rate": 3.636255264282227e-05, |
| "lookahead_loss": 6.687095086097718, |
| "loss": 0.6073, |
| "step": 143000 |
| }, |
| { |
| "base_loss": 0.5926423314213752, |
| "epoch": 2.0352859497070312, |
| "grad_norm": 0.0011333973379805684, |
| "learning_rate": 3.631486892700196e-05, |
| "lookahead_loss": 6.5130055770874025, |
| "loss": 0.6038, |
| "step": 143500 |
| }, |
| { |
| "base_loss": 0.6081820316910743, |
| "epoch": 2.0362396240234375, |
| "grad_norm": 0.0011208722135052085, |
| "learning_rate": 3.626718521118164e-05, |
| "lookahead_loss": 6.640297612190246, |
| "loss": 0.62, |
| "step": 144000 |
| }, |
| { |
| "base_loss": 0.6008555814623833, |
| "epoch": 2.0371932983398438, |
| "grad_norm": 0.0011375031899660826, |
| "learning_rate": 3.621950149536133e-05, |
| "lookahead_loss": 6.57792680644989, |
| "loss": 0.6119, |
| "step": 144500 |
| }, |
| { |
| "base_loss": 0.5884444781541824, |
| "epoch": 2.03814697265625, |
| "grad_norm": 0.0010999179212376475, |
| "learning_rate": 3.6171817779541014e-05, |
| "lookahead_loss": 6.599160625457763, |
| "loss": 0.6015, |
| "step": 145000 |
| }, |
| { |
| "epoch": 2.03814697265625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.573444949933134, |
| "eval_lookahead_perplexity": 715.8316044138329, |
| "eval_loss": 0.2077145278453827, |
| "eval_perplexity": 1.230861742636249, |
| "eval_runtime": 92.2965, |
| "eval_samples_per_second": 54.173, |
| "eval_steps_per_second": 1.701, |
| "step": 145000 |
| }, |
| { |
| "base_loss": 0.5984013820290566, |
| "epoch": 2.0391006469726562, |
| "grad_norm": 0.0011051023611798882, |
| "learning_rate": 3.6124134063720705e-05, |
| "lookahead_loss": 6.544455199241638, |
| "loss": 0.6123, |
| "step": 145500 |
| }, |
| { |
| "base_loss": 0.6142821377515792, |
| "epoch": 2.0400543212890625, |
| "grad_norm": 0.0011354960734024644, |
| "learning_rate": 3.6076450347900395e-05, |
| "lookahead_loss": 6.579610840797424, |
| "loss": 0.624, |
| "step": 146000 |
| }, |
| { |
| "base_loss": 0.5992258986830712, |
| "epoch": 2.0410079956054688, |
| "grad_norm": 0.0011294566793367267, |
| "learning_rate": 3.602876663208008e-05, |
| "lookahead_loss": 6.541078899383545, |
| "loss": 0.6105, |
| "step": 146500 |
| }, |
| { |
| "base_loss": 0.578558257818222, |
| "epoch": 2.041961669921875, |
| "grad_norm": 0.0011305843945592642, |
| "learning_rate": 3.598108291625977e-05, |
| "lookahead_loss": 6.605445454597473, |
| "loss": 0.5955, |
| "step": 147000 |
| }, |
| { |
| "base_loss": 0.6040933942198753, |
| "epoch": 2.0429153442382812, |
| "grad_norm": 0.0010733373928815126, |
| "learning_rate": 3.593339920043945e-05, |
| "lookahead_loss": 6.627297685623169, |
| "loss": 0.6147, |
| "step": 147500 |
| }, |
| { |
| "base_loss": 0.6104523810148239, |
| "epoch": 2.0438690185546875, |
| "grad_norm": 0.001119652995839715, |
| "learning_rate": 3.588571548461914e-05, |
| "lookahead_loss": 6.631721858024597, |
| "loss": 0.6237, |
| "step": 148000 |
| }, |
| { |
| "base_loss": 0.5943758766055107, |
| "epoch": 2.0448226928710938, |
| "grad_norm": 0.00112288782838732, |
| "learning_rate": 3.583803176879883e-05, |
| "lookahead_loss": 6.579286907196045, |
| "loss": 0.6059, |
| "step": 148500 |
| }, |
| { |
| "base_loss": 0.5905072175264359, |
| "epoch": 2.0457763671875, |
| "grad_norm": 0.001094466308131814, |
| "learning_rate": 3.5790348052978516e-05, |
| "lookahead_loss": 6.574234865665436, |
| "loss": 0.6021, |
| "step": 149000 |
| }, |
| { |
| "base_loss": 0.6159886345267296, |
| "epoch": 2.0467300415039062, |
| "grad_norm": 0.0011022677645087242, |
| "learning_rate": 3.5742664337158206e-05, |
| "lookahead_loss": 6.549607624053955, |
| "loss": 0.6273, |
| "step": 149500 |
| }, |
| { |
| "base_loss": 0.6079316187500954, |
| "epoch": 2.0476837158203125, |
| "grad_norm": 0.0011532205389812589, |
| "learning_rate": 3.569498062133789e-05, |
| "lookahead_loss": 6.573279874801636, |
| "loss": 0.6236, |
| "step": 150000 |
| }, |
| { |
| "epoch": 2.0476837158203125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.563056849823973, |
| "eval_lookahead_perplexity": 708.4339642837788, |
| "eval_loss": 0.20769280195236206, |
| "eval_perplexity": 1.2308350013561955, |
| "eval_runtime": 91.5391, |
| "eval_samples_per_second": 54.621, |
| "eval_steps_per_second": 1.715, |
| "step": 150000 |
| }, |
| { |
| "base_loss": 0.5885444439649582, |
| "epoch": 2.0486373901367188, |
| "grad_norm": 0.0010961332591250539, |
| "learning_rate": 3.564729690551758e-05, |
| "lookahead_loss": 6.5612752790451045, |
| "loss": 0.6013, |
| "step": 150500 |
| }, |
| { |
| "base_loss": 0.6004898179769516, |
| "epoch": 2.049591064453125, |
| "grad_norm": 0.0011418386129662395, |
| "learning_rate": 3.559961318969727e-05, |
| "lookahead_loss": 6.530380144119262, |
| "loss": 0.6083, |
| "step": 151000 |
| }, |
| { |
| "base_loss": 0.6162783756256104, |
| "epoch": 2.0505447387695312, |
| "grad_norm": 0.0010346118360757828, |
| "learning_rate": 3.555192947387695e-05, |
| "lookahead_loss": 6.624601441383362, |
| "loss": 0.6277, |
| "step": 151500 |
| }, |
| { |
| "base_loss": 0.5984932317137718, |
| "epoch": 2.0514984130859375, |
| "grad_norm": 0.001150093856267631, |
| "learning_rate": 3.5504245758056643e-05, |
| "lookahead_loss": 6.58684754562378, |
| "loss": 0.6103, |
| "step": 152000 |
| }, |
| { |
| "base_loss": 0.5949168145656586, |
| "epoch": 2.0524520874023438, |
| "grad_norm": 0.00113403657451272, |
| "learning_rate": 3.545656204223633e-05, |
| "lookahead_loss": 6.523898173332214, |
| "loss": 0.6055, |
| "step": 152500 |
| }, |
| { |
| "base_loss": 0.6100201278328895, |
| "epoch": 2.05340576171875, |
| "grad_norm": 0.001108511001802981, |
| "learning_rate": 3.540887832641602e-05, |
| "lookahead_loss": 6.522478567123413, |
| "loss": 0.6204, |
| "step": 153000 |
| }, |
| { |
| "base_loss": 0.6109586038589477, |
| "epoch": 2.0543594360351562, |
| "grad_norm": 0.0011183718452230096, |
| "learning_rate": 3.536119461059571e-05, |
| "lookahead_loss": 6.566904292106629, |
| "loss": 0.6263, |
| "step": 153500 |
| }, |
| { |
| "base_loss": 0.5970847414731979, |
| "epoch": 2.0553131103515625, |
| "grad_norm": 0.001120952656492591, |
| "learning_rate": 3.531351089477539e-05, |
| "lookahead_loss": 6.588804016113281, |
| "loss": 0.6066, |
| "step": 154000 |
| }, |
| { |
| "base_loss": 0.5898715674877166, |
| "epoch": 2.0562667846679688, |
| "grad_norm": 0.0010837721638381481, |
| "learning_rate": 3.526582717895508e-05, |
| "lookahead_loss": 6.603852970123291, |
| "loss": 0.6029, |
| "step": 154500 |
| }, |
| { |
| "base_loss": 0.6163467369079589, |
| "epoch": 2.057220458984375, |
| "grad_norm": 0.0010966199915856123, |
| "learning_rate": 3.5218143463134764e-05, |
| "lookahead_loss": 6.5893798031806945, |
| "loss": 0.6288, |
| "step": 155000 |
| }, |
| { |
| "epoch": 2.057220458984375, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.552286794010443, |
| "eval_lookahead_perplexity": 700.8450309219951, |
| "eval_loss": 0.20767158269882202, |
| "eval_perplexity": 1.2308088842333296, |
| "eval_runtime": 91.2351, |
| "eval_samples_per_second": 54.803, |
| "eval_steps_per_second": 1.721, |
| "step": 155000 |
| }, |
| { |
| "base_loss": 0.6090138986110687, |
| "epoch": 2.0581741333007812, |
| "grad_norm": 0.0011216332204639912, |
| "learning_rate": 3.5170459747314455e-05, |
| "lookahead_loss": 6.590124855995178, |
| "loss": 0.618, |
| "step": 155500 |
| }, |
| { |
| "base_loss": 0.583700324177742, |
| "epoch": 2.0591278076171875, |
| "grad_norm": 0.0010965469991788268, |
| "learning_rate": 3.5122776031494145e-05, |
| "lookahead_loss": 6.496915921211243, |
| "loss": 0.5999, |
| "step": 156000 |
| }, |
| { |
| "base_loss": 0.5938667116761207, |
| "epoch": 2.0600814819335938, |
| "grad_norm": 0.0010983194224536419, |
| "learning_rate": 3.507509231567383e-05, |
| "lookahead_loss": 6.5626952772140505, |
| "loss": 0.6085, |
| "step": 156500 |
| }, |
| { |
| "base_loss": 0.6066018126606941, |
| "epoch": 2.06103515625, |
| "grad_norm": 0.0011154355015605688, |
| "learning_rate": 3.502740859985352e-05, |
| "lookahead_loss": 6.521708921432495, |
| "loss": 0.617, |
| "step": 157000 |
| }, |
| { |
| "base_loss": 0.6058481879830361, |
| "epoch": 2.0619888305664062, |
| "grad_norm": 0.001157720573246479, |
| "learning_rate": 3.49797248840332e-05, |
| "lookahead_loss": 6.530971702575684, |
| "loss": 0.6158, |
| "step": 157500 |
| }, |
| { |
| "base_loss": 0.5946802944540978, |
| "epoch": 2.0629425048828125, |
| "grad_norm": 0.0011384790996089578, |
| "learning_rate": 3.493204116821289e-05, |
| "lookahead_loss": 6.535911062240601, |
| "loss": 0.6069, |
| "step": 158000 |
| }, |
| { |
| "base_loss": 0.6126915777921677, |
| "epoch": 2.0638961791992188, |
| "grad_norm": 0.0010698529658839107, |
| "learning_rate": 3.488435745239258e-05, |
| "lookahead_loss": 6.552743993759155, |
| "loss": 0.6234, |
| "step": 158500 |
| }, |
| { |
| "base_loss": 0.6043052950501442, |
| "epoch": 2.064849853515625, |
| "grad_norm": 0.0011142947478219867, |
| "learning_rate": 3.4836673736572266e-05, |
| "lookahead_loss": 6.595561448097229, |
| "loss": 0.6137, |
| "step": 159000 |
| }, |
| { |
| "base_loss": 0.6039554010033608, |
| "epoch": 2.0658035278320312, |
| "grad_norm": 0.0011351387947797775, |
| "learning_rate": 3.4788990020751956e-05, |
| "lookahead_loss": 6.505132764816284, |
| "loss": 0.6139, |
| "step": 159500 |
| }, |
| { |
| "base_loss": 0.605331601202488, |
| "epoch": 2.0667572021484375, |
| "grad_norm": 0.001111305202357471, |
| "learning_rate": 3.474130630493164e-05, |
| "lookahead_loss": 6.489367088317871, |
| "loss": 0.6123, |
| "step": 160000 |
| }, |
| { |
| "epoch": 2.0667572021484375, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.541540684410558, |
| "eval_lookahead_perplexity": 693.3539952402889, |
| "eval_loss": 0.20765088498592377, |
| "eval_perplexity": 1.2307834095680457, |
| "eval_runtime": 93.5455, |
| "eval_samples_per_second": 53.45, |
| "eval_steps_per_second": 1.678, |
| "step": 160000 |
| }, |
| { |
| "base_loss": 0.6198571705818177, |
| "epoch": 2.0677108764648438, |
| "grad_norm": 0.0011394508183002472, |
| "learning_rate": 3.469362258911133e-05, |
| "lookahead_loss": 6.569809526443481, |
| "loss": 0.6283, |
| "step": 160500 |
| }, |
| { |
| "base_loss": 0.5996336659789085, |
| "epoch": 2.06866455078125, |
| "grad_norm": 0.001077422290109098, |
| "learning_rate": 3.464593887329102e-05, |
| "lookahead_loss": 6.538529127120972, |
| "loss": 0.6077, |
| "step": 161000 |
| }, |
| { |
| "base_loss": 0.5853740153312683, |
| "epoch": 2.0696182250976562, |
| "grad_norm": 0.0011083297431468964, |
| "learning_rate": 3.45982551574707e-05, |
| "lookahead_loss": 6.577185619831085, |
| "loss": 0.6, |
| "step": 161500 |
| }, |
| { |
| "base_loss": 0.6150279142260552, |
| "epoch": 2.0705718994140625, |
| "grad_norm": 0.0011394877219572663, |
| "learning_rate": 3.4550571441650393e-05, |
| "lookahead_loss": 6.460757801055908, |
| "loss": 0.6275, |
| "step": 162000 |
| }, |
| { |
| "base_loss": 0.5995807001590728, |
| "epoch": 2.0715255737304688, |
| "grad_norm": 0.0011034323833882809, |
| "learning_rate": 3.450288772583008e-05, |
| "lookahead_loss": 6.503600845336914, |
| "loss": 0.6119, |
| "step": 162500 |
| }, |
| { |
| "base_loss": 0.5951229523420334, |
| "epoch": 2.072479248046875, |
| "grad_norm": 0.0011338687036186457, |
| "learning_rate": 3.445520401000977e-05, |
| "lookahead_loss": 6.56779256439209, |
| "loss": 0.6063, |
| "step": 163000 |
| }, |
| { |
| "base_loss": 0.5946593886613846, |
| "epoch": 2.0734329223632812, |
| "grad_norm": 0.0010535767069086432, |
| "learning_rate": 3.440752029418946e-05, |
| "lookahead_loss": 6.522209219932556, |
| "loss": 0.6063, |
| "step": 163500 |
| }, |
| { |
| "base_loss": 0.6102511178851128, |
| "epoch": 2.0743865966796875, |
| "grad_norm": 0.0011173501843586564, |
| "learning_rate": 3.435983657836914e-05, |
| "lookahead_loss": 6.522058952331543, |
| "loss": 0.6216, |
| "step": 164000 |
| }, |
| { |
| "base_loss": 0.5937603359222412, |
| "epoch": 2.0753402709960938, |
| "grad_norm": 0.0011274093994870782, |
| "learning_rate": 3.431215286254883e-05, |
| "lookahead_loss": 6.592309030056, |
| "loss": 0.6057, |
| "step": 164500 |
| }, |
| { |
| "base_loss": 0.5869635686874389, |
| "epoch": 2.0762939453125, |
| "grad_norm": 0.001087229116819799, |
| "learning_rate": 3.4264469146728514e-05, |
| "lookahead_loss": 6.557568222045899, |
| "loss": 0.5964, |
| "step": 165000 |
| }, |
| { |
| "epoch": 2.0762939453125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.530356111617896, |
| "eval_lookahead_perplexity": 685.6423332225185, |
| "eval_loss": 0.20762944221496582, |
| "eval_perplexity": 1.2307570184442458, |
| "eval_runtime": 90.9123, |
| "eval_samples_per_second": 54.998, |
| "eval_steps_per_second": 1.727, |
| "step": 165000 |
| }, |
| { |
| "base_loss": 0.6242118158340454, |
| "epoch": 2.0772476196289062, |
| "grad_norm": 0.001096642459742725, |
| "learning_rate": 3.4216785430908205e-05, |
| "lookahead_loss": 6.5268142805099485, |
| "loss": 0.6344, |
| "step": 165500 |
| }, |
| { |
| "base_loss": 0.5928893273472786, |
| "epoch": 2.0782012939453125, |
| "grad_norm": 0.001135199679993093, |
| "learning_rate": 3.4169101715087895e-05, |
| "lookahead_loss": 6.549333306312561, |
| "loss": 0.6079, |
| "step": 166000 |
| }, |
| { |
| "base_loss": 0.5996605790853501, |
| "epoch": 2.0791549682617188, |
| "grad_norm": 0.0010736893163993955, |
| "learning_rate": 3.412141799926758e-05, |
| "lookahead_loss": 6.570660936355591, |
| "loss": 0.6084, |
| "step": 166500 |
| }, |
| { |
| "base_loss": 0.6092261442542076, |
| "epoch": 2.080108642578125, |
| "grad_norm": 0.0010752358939498663, |
| "learning_rate": 3.407373428344727e-05, |
| "lookahead_loss": 6.512598112106323, |
| "loss": 0.6199, |
| "step": 167000 |
| }, |
| { |
| "base_loss": 0.6069996964335441, |
| "epoch": 2.0810623168945312, |
| "grad_norm": 0.001151898643001914, |
| "learning_rate": 3.402605056762695e-05, |
| "lookahead_loss": 6.553294209480286, |
| "loss": 0.6194, |
| "step": 167500 |
| }, |
| { |
| "base_loss": 0.5921739342212677, |
| "epoch": 2.0820159912109375, |
| "grad_norm": 0.0010723688174039125, |
| "learning_rate": 3.397836685180664e-05, |
| "lookahead_loss": 6.563275011539459, |
| "loss": 0.6057, |
| "step": 168000 |
| }, |
| { |
| "base_loss": 0.5929293268918991, |
| "epoch": 2.0829696655273438, |
| "grad_norm": 0.0011118296533823013, |
| "learning_rate": 3.393068313598633e-05, |
| "lookahead_loss": 6.586033729553223, |
| "loss": 0.6052, |
| "step": 168500 |
| }, |
| { |
| "base_loss": 0.6163858331441879, |
| "epoch": 2.08392333984375, |
| "grad_norm": 0.0010826945072039962, |
| "learning_rate": 3.3882999420166016e-05, |
| "lookahead_loss": 6.580185745239258, |
| "loss": 0.6286, |
| "step": 169000 |
| }, |
| { |
| "base_loss": 0.5959904823899269, |
| "epoch": 2.0848770141601562, |
| "grad_norm": 0.0011054837377741933, |
| "learning_rate": 3.3835315704345706e-05, |
| "lookahead_loss": 6.532082775115967, |
| "loss": 0.6064, |
| "step": 169500 |
| }, |
| { |
| "base_loss": 0.5873328469395638, |
| "epoch": 2.0858306884765625, |
| "grad_norm": 0.0011319448240101337, |
| "learning_rate": 3.378763198852539e-05, |
| "lookahead_loss": 6.541397624015808, |
| "loss": 0.5974, |
| "step": 170000 |
| }, |
| { |
| "epoch": 2.0858306884765625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.519407711089991, |
| "eval_lookahead_perplexity": 678.1765898884748, |
| "eval_loss": 0.20760849118232727, |
| "eval_perplexity": 1.2307312330838982, |
| "eval_runtime": 92.2316, |
| "eval_samples_per_second": 54.211, |
| "eval_steps_per_second": 1.702, |
| "step": 170000 |
| }, |
| { |
| "base_loss": 0.5929982444047928, |
| "epoch": 2.0867843627929688, |
| "grad_norm": 0.001194111187942326, |
| "learning_rate": 3.373994827270508e-05, |
| "lookahead_loss": 6.505416949272155, |
| "loss": 0.6043, |
| "step": 170500 |
| }, |
| { |
| "base_loss": 0.6234143348932266, |
| "epoch": 2.087738037109375, |
| "grad_norm": 0.0010146180866286159, |
| "learning_rate": 3.369226455688477e-05, |
| "lookahead_loss": 6.542620226383209, |
| "loss": 0.6284, |
| "step": 171000 |
| }, |
| { |
| "base_loss": 0.5942349677085876, |
| "epoch": 2.0886917114257812, |
| "grad_norm": 0.0011270438553765416, |
| "learning_rate": 3.364458084106445e-05, |
| "lookahead_loss": 6.5318672647476195, |
| "loss": 0.6037, |
| "step": 171500 |
| }, |
| { |
| "base_loss": 0.5997879543304443, |
| "epoch": 2.0896453857421875, |
| "grad_norm": 0.0011306487722322345, |
| "learning_rate": 3.3596897125244143e-05, |
| "lookahead_loss": 6.558032165527344, |
| "loss": 0.6084, |
| "step": 172000 |
| }, |
| { |
| "base_loss": 0.5867617139816285, |
| "epoch": 2.0905990600585938, |
| "grad_norm": 0.0010641829576343298, |
| "learning_rate": 3.354921340942383e-05, |
| "lookahead_loss": 6.522708724975586, |
| "loss": 0.5974, |
| "step": 172500 |
| }, |
| { |
| "base_loss": 0.581401211798191, |
| "epoch": 2.091552734375, |
| "grad_norm": 0.0010614799102768302, |
| "learning_rate": 3.350152969360352e-05, |
| "lookahead_loss": 6.5099988975524905, |
| "loss": 0.5912, |
| "step": 173000 |
| }, |
| { |
| "base_loss": 0.6186662130355834, |
| "epoch": 2.0925064086914062, |
| "grad_norm": 0.0011495859362185001, |
| "learning_rate": 3.345384597778321e-05, |
| "lookahead_loss": 6.54921883392334, |
| "loss": 0.6288, |
| "step": 173500 |
| }, |
| { |
| "base_loss": 0.600584501862526, |
| "epoch": 2.0934600830078125, |
| "grad_norm": 0.0010833586566150188, |
| "learning_rate": 3.340616226196289e-05, |
| "lookahead_loss": 6.5347207136154175, |
| "loss": 0.6132, |
| "step": 174000 |
| }, |
| { |
| "base_loss": 0.5855657352209092, |
| "epoch": 2.0944137573242188, |
| "grad_norm": 0.0010808638762682676, |
| "learning_rate": 3.335847854614258e-05, |
| "lookahead_loss": 6.533743205070496, |
| "loss": 0.5979, |
| "step": 174500 |
| }, |
| { |
| "base_loss": 0.5778659620285034, |
| "epoch": 2.095367431640625, |
| "grad_norm": 0.0010777119314298034, |
| "learning_rate": 3.3310794830322264e-05, |
| "lookahead_loss": 6.469995648384094, |
| "loss": 0.5933, |
| "step": 175000 |
| }, |
| { |
| "epoch": 2.095367431640625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.510934283558172, |
| "eval_lookahead_perplexity": 672.4543872721904, |
| "eval_loss": 0.20759032666683197, |
| "eval_perplexity": 1.2307088776503827, |
| "eval_runtime": 93.5217, |
| "eval_samples_per_second": 53.464, |
| "eval_steps_per_second": 1.679, |
| "step": 175000 |
| }, |
| { |
| "base_loss": 0.59952286028862, |
| "epoch": 2.0963211059570312, |
| "grad_norm": 0.0011107597965747118, |
| "learning_rate": 3.3263111114501955e-05, |
| "lookahead_loss": 6.528745898723602, |
| "loss": 0.6123, |
| "step": 175500 |
| }, |
| { |
| "base_loss": 0.6133572874069214, |
| "epoch": 2.0972747802734375, |
| "grad_norm": 0.0011266444344073534, |
| "learning_rate": 3.3215427398681645e-05, |
| "lookahead_loss": 6.552535983085632, |
| "loss": 0.6237, |
| "step": 176000 |
| }, |
| { |
| "base_loss": 0.5973192919492721, |
| "epoch": 2.0982284545898438, |
| "grad_norm": 0.001105320523492992, |
| "learning_rate": 3.316774368286133e-05, |
| "lookahead_loss": 6.48703881072998, |
| "loss": 0.6069, |
| "step": 176500 |
| }, |
| { |
| "base_loss": 0.5845331786870956, |
| "epoch": 2.09918212890625, |
| "grad_norm": 0.001108050113543868, |
| "learning_rate": 3.312005996704102e-05, |
| "lookahead_loss": 6.557789768218994, |
| "loss": 0.5956, |
| "step": 177000 |
| }, |
| { |
| "base_loss": 0.5865774551033973, |
| "epoch": 2.1001358032226562, |
| "grad_norm": 0.001104854280129075, |
| "learning_rate": 3.30723762512207e-05, |
| "lookahead_loss": 6.52143590259552, |
| "loss": 0.6035, |
| "step": 177500 |
| }, |
| { |
| "base_loss": 0.6098086424469947, |
| "epoch": 2.1010894775390625, |
| "grad_norm": 0.0011074242647737265, |
| "learning_rate": 3.302469253540039e-05, |
| "lookahead_loss": 6.581818056106568, |
| "loss": 0.6195, |
| "step": 178000 |
| }, |
| { |
| "base_loss": 0.6012407766580582, |
| "epoch": 2.1020431518554688, |
| "grad_norm": 0.0011334229493513703, |
| "learning_rate": 3.297700881958008e-05, |
| "lookahead_loss": 6.556834000587464, |
| "loss": 0.6076, |
| "step": 178500 |
| }, |
| { |
| "base_loss": 0.5883222328424453, |
| "epoch": 2.102996826171875, |
| "grad_norm": 0.0010879429755732417, |
| "learning_rate": 3.2929325103759766e-05, |
| "lookahead_loss": 6.528950669288635, |
| "loss": 0.6017, |
| "step": 179000 |
| }, |
| { |
| "base_loss": 0.5900123327970505, |
| "epoch": 2.1039505004882812, |
| "grad_norm": 0.0010885036317631602, |
| "learning_rate": 3.2881641387939456e-05, |
| "lookahead_loss": 6.548188806533814, |
| "loss": 0.6016, |
| "step": 179500 |
| }, |
| { |
| "base_loss": 0.6137035485506058, |
| "epoch": 2.1049041748046875, |
| "grad_norm": 0.0010836453875526786, |
| "learning_rate": 3.283395767211914e-05, |
| "lookahead_loss": 6.490313670158386, |
| "loss": 0.6239, |
| "step": 180000 |
| }, |
| { |
| "epoch": 2.1049041748046875, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.501633135274576, |
| "eval_lookahead_perplexity": 666.2287868008185, |
| "eval_loss": 0.2075720876455307, |
| "eval_perplexity": 1.2306864309296515, |
| "eval_runtime": 91.2824, |
| "eval_samples_per_second": 54.775, |
| "eval_steps_per_second": 1.72, |
| "step": 180000 |
| }, |
| { |
| "base_loss": 0.6017958376407623, |
| "epoch": 2.1058578491210938, |
| "grad_norm": 0.0010430403053760529, |
| "learning_rate": 3.278627395629883e-05, |
| "lookahead_loss": 6.493426759719848, |
| "loss": 0.6127, |
| "step": 180500 |
| }, |
| { |
| "base_loss": 0.5970892771482468, |
| "epoch": 2.1068115234375, |
| "grad_norm": 0.001087990473024547, |
| "learning_rate": 3.273859024047852e-05, |
| "lookahead_loss": 6.560095170974732, |
| "loss": 0.6064, |
| "step": 181000 |
| }, |
| { |
| "base_loss": 0.5833802008032799, |
| "epoch": 2.1077651977539062, |
| "grad_norm": 0.0011280244216322899, |
| "learning_rate": 3.26909065246582e-05, |
| "lookahead_loss": 6.470833657264709, |
| "loss": 0.5948, |
| "step": 181500 |
| }, |
| { |
| "base_loss": 0.604837516605854, |
| "epoch": 2.1087188720703125, |
| "grad_norm": 0.0011518291430547833, |
| "learning_rate": 3.2643222808837893e-05, |
| "lookahead_loss": 6.458127084732055, |
| "loss": 0.6189, |
| "step": 182000 |
| }, |
| { |
| "base_loss": 0.6057112255096435, |
| "epoch": 2.1096725463867188, |
| "grad_norm": 0.001071825623512268, |
| "learning_rate": 3.259553909301758e-05, |
| "lookahead_loss": 6.447037249565125, |
| "loss": 0.6163, |
| "step": 182500 |
| }, |
| { |
| "base_loss": 0.5945826203823089, |
| "epoch": 2.110626220703125, |
| "grad_norm": 0.0010940604843199253, |
| "learning_rate": 3.254785537719727e-05, |
| "lookahead_loss": 6.43380268573761, |
| "loss": 0.6068, |
| "step": 183000 |
| }, |
| { |
| "base_loss": 0.5839763838648796, |
| "epoch": 2.1115798950195312, |
| "grad_norm": 0.0010885735973715782, |
| "learning_rate": 3.250017166137696e-05, |
| "lookahead_loss": 6.519184366226196, |
| "loss": 0.5969, |
| "step": 183500 |
| }, |
| { |
| "base_loss": 0.6021909977793694, |
| "epoch": 2.1125335693359375, |
| "grad_norm": 0.0010922467336058617, |
| "learning_rate": 3.245248794555664e-05, |
| "lookahead_loss": 6.546927687168122, |
| "loss": 0.612, |
| "step": 184000 |
| }, |
| { |
| "base_loss": 0.6174382773041726, |
| "epoch": 2.1134872436523438, |
| "grad_norm": 0.0011140938149765134, |
| "learning_rate": 3.240480422973633e-05, |
| "lookahead_loss": 6.5441567344665525, |
| "loss": 0.6269, |
| "step": 184500 |
| }, |
| { |
| "base_loss": 0.5974359802007675, |
| "epoch": 2.11444091796875, |
| "grad_norm": 0.0010466972598806024, |
| "learning_rate": 3.2357120513916014e-05, |
| "lookahead_loss": 6.435290018081665, |
| "loss": 0.6071, |
| "step": 185000 |
| }, |
| { |
| "epoch": 2.11444091796875, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.492953685906748, |
| "eval_lookahead_perplexity": 660.4713097873832, |
| "eval_loss": 0.20755501091480255, |
| "eval_perplexity": 1.2306654150083016, |
| "eval_runtime": 93.1261, |
| "eval_samples_per_second": 53.691, |
| "eval_steps_per_second": 1.686, |
| "step": 185000 |
| }, |
| { |
| "base_loss": 0.5844211729764939, |
| "epoch": 2.1153945922851562, |
| "grad_norm": 0.0011425914708524942, |
| "learning_rate": 3.2309436798095705e-05, |
| "lookahead_loss": 6.488190547943115, |
| "loss": 0.5962, |
| "step": 185500 |
| }, |
| { |
| "base_loss": 0.6047764738798141, |
| "epoch": 2.1163482666015625, |
| "grad_norm": 0.0011844782857224345, |
| "learning_rate": 3.2261753082275395e-05, |
| "lookahead_loss": 6.4878438234329225, |
| "loss": 0.6158, |
| "step": 186000 |
| }, |
| { |
| "base_loss": 0.6098092859387397, |
| "epoch": 2.1173019409179688, |
| "grad_norm": 0.0011336279567331076, |
| "learning_rate": 3.221406936645508e-05, |
| "lookahead_loss": 6.487655387878418, |
| "loss": 0.6218, |
| "step": 186500 |
| }, |
| { |
| "base_loss": 0.6043260169625282, |
| "epoch": 2.118255615234375, |
| "grad_norm": 0.0011267218505963683, |
| "learning_rate": 3.216638565063477e-05, |
| "lookahead_loss": 6.516085889816284, |
| "loss": 0.6128, |
| "step": 187000 |
| }, |
| { |
| "base_loss": 0.5858061800599098, |
| "epoch": 2.1192092895507812, |
| "grad_norm": 0.001102335867471993, |
| "learning_rate": 3.211870193481445e-05, |
| "lookahead_loss": 6.516676008224487, |
| "loss": 0.5958, |
| "step": 187500 |
| }, |
| { |
| "base_loss": 0.5981094686985016, |
| "epoch": 3.0009536743164062, |
| "grad_norm": 0.0011026667198166251, |
| "learning_rate": 3.207101821899414e-05, |
| "lookahead_loss": 6.580178588867187, |
| "loss": 0.6029, |
| "step": 188000 |
| }, |
| { |
| "base_loss": 0.587501579284668, |
| "epoch": 3.0019073486328125, |
| "grad_norm": 0.0011541040148586035, |
| "learning_rate": 3.202333450317383e-05, |
| "lookahead_loss": 6.419054620742798, |
| "loss": 0.5974, |
| "step": 188500 |
| }, |
| { |
| "base_loss": 0.6046080349087715, |
| "epoch": 3.0028610229492188, |
| "grad_norm": 0.001120623666793108, |
| "learning_rate": 3.1975650787353516e-05, |
| "lookahead_loss": 6.434816195487976, |
| "loss": 0.613, |
| "step": 189000 |
| }, |
| { |
| "base_loss": 0.6173882039189339, |
| "epoch": 3.003814697265625, |
| "grad_norm": 0.0010917120380327106, |
| "learning_rate": 3.1927967071533206e-05, |
| "lookahead_loss": 6.458130680561066, |
| "loss": 0.6242, |
| "step": 189500 |
| }, |
| { |
| "base_loss": 0.6012161781191826, |
| "epoch": 3.0047683715820312, |
| "grad_norm": 0.001108814962208271, |
| "learning_rate": 3.188028335571289e-05, |
| "lookahead_loss": 6.437314165115357, |
| "loss": 0.6087, |
| "step": 190000 |
| }, |
| { |
| "epoch": 3.0047683715820312, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.484307500120169, |
| "eval_lookahead_perplexity": 654.7853684107388, |
| "eval_loss": 0.20753738284111023, |
| "eval_perplexity": 1.2306437209388883, |
| "eval_runtime": 91.1617, |
| "eval_samples_per_second": 54.848, |
| "eval_steps_per_second": 1.722, |
| "step": 190000 |
| }, |
| { |
| "base_loss": 0.5926664335727692, |
| "epoch": 3.0057220458984375, |
| "grad_norm": 0.0010479306802153587, |
| "learning_rate": 3.183259963989258e-05, |
| "lookahead_loss": 6.560607043266296, |
| "loss": 0.6018, |
| "step": 190500 |
| }, |
| { |
| "base_loss": 0.5822248963713645, |
| "epoch": 3.0066757202148438, |
| "grad_norm": 0.0010960784275084734, |
| "learning_rate": 3.178491592407227e-05, |
| "lookahead_loss": 6.403440669059753, |
| "loss": 0.5981, |
| "step": 191000 |
| }, |
| { |
| "base_loss": 0.6042868258953095, |
| "epoch": 3.00762939453125, |
| "grad_norm": 0.001109161414206028, |
| "learning_rate": 3.173723220825195e-05, |
| "lookahead_loss": 6.468971241950989, |
| "loss": 0.6169, |
| "step": 191500 |
| }, |
| { |
| "base_loss": 0.6021608446240425, |
| "epoch": 3.0085830688476562, |
| "grad_norm": 0.0010364059126004577, |
| "learning_rate": 3.1689548492431643e-05, |
| "lookahead_loss": 6.479389870643616, |
| "loss": 0.608, |
| "step": 192000 |
| }, |
| { |
| "base_loss": 0.5907191566824913, |
| "epoch": 3.0095367431640625, |
| "grad_norm": 0.0011385597754269838, |
| "learning_rate": 3.164186477661133e-05, |
| "lookahead_loss": 6.483307428359986, |
| "loss": 0.6059, |
| "step": 192500 |
| }, |
| { |
| "base_loss": 0.5937975888252258, |
| "epoch": 3.0104904174804688, |
| "grad_norm": 0.0011329938424751163, |
| "learning_rate": 3.159418106079102e-05, |
| "lookahead_loss": 6.426459211349488, |
| "loss": 0.6052, |
| "step": 193000 |
| }, |
| { |
| "base_loss": 0.5900298383831978, |
| "epoch": 3.011444091796875, |
| "grad_norm": 0.0011370591819286346, |
| "learning_rate": 3.154649734497071e-05, |
| "lookahead_loss": 6.45755704498291, |
| "loss": 0.6034, |
| "step": 193500 |
| }, |
| { |
| "base_loss": 0.6114438434243202, |
| "epoch": 3.0123977661132812, |
| "grad_norm": 0.0010784439509734511, |
| "learning_rate": 3.149881362915039e-05, |
| "lookahead_loss": 6.444551125526428, |
| "loss": 0.6234, |
| "step": 194000 |
| }, |
| { |
| "base_loss": 0.5947433623075485, |
| "epoch": 3.0133514404296875, |
| "grad_norm": 0.0011095026275143027, |
| "learning_rate": 3.145112991333008e-05, |
| "lookahead_loss": 6.529066830635071, |
| "loss": 0.6095, |
| "step": 194500 |
| }, |
| { |
| "base_loss": 0.5968700026273728, |
| "epoch": 3.0143051147460938, |
| "grad_norm": 0.0011093729408457875, |
| "learning_rate": 3.1403446197509764e-05, |
| "lookahead_loss": 6.499406108856201, |
| "loss": 0.6074, |
| "step": 195000 |
| }, |
| { |
| "epoch": 3.0143051147460938, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.476042339976984, |
| "eval_lookahead_perplexity": 649.3957661259718, |
| "eval_loss": 0.20752041041851044, |
| "eval_perplexity": 1.2306228341108372, |
| "eval_runtime": 93.185, |
| "eval_samples_per_second": 53.657, |
| "eval_steps_per_second": 1.685, |
| "step": 195000 |
| }, |
| { |
| "base_loss": 0.5854599202871322, |
| "epoch": 3.0152587890625, |
| "grad_norm": 0.001107083517126739, |
| "learning_rate": 3.1355762481689455e-05, |
| "lookahead_loss": 6.448059041976928, |
| "loss": 0.5918, |
| "step": 195500 |
| }, |
| { |
| "base_loss": 0.6102398954629898, |
| "epoch": 3.0162124633789062, |
| "grad_norm": 0.0011182829039171338, |
| "learning_rate": 3.1308078765869145e-05, |
| "lookahead_loss": 6.49233545923233, |
| "loss": 0.6183, |
| "step": 196000 |
| }, |
| { |
| "base_loss": 0.604021582365036, |
| "epoch": 3.0171661376953125, |
| "grad_norm": 0.0011117098620161414, |
| "learning_rate": 3.126039505004883e-05, |
| "lookahead_loss": 6.529406607627869, |
| "loss": 0.6108, |
| "step": 196500 |
| }, |
| { |
| "base_loss": 0.5954824941754341, |
| "epoch": 3.0181198120117188, |
| "grad_norm": 0.0010434985160827637, |
| "learning_rate": 3.121271133422852e-05, |
| "lookahead_loss": 6.530284749984741, |
| "loss": 0.6029, |
| "step": 197000 |
| }, |
| { |
| "base_loss": 0.58826500248909, |
| "epoch": 3.019073486328125, |
| "grad_norm": 0.0010525453835725784, |
| "learning_rate": 3.11650276184082e-05, |
| "lookahead_loss": 6.558384260654449, |
| "loss": 0.6004, |
| "step": 197500 |
| }, |
| { |
| "base_loss": 0.5917664663791656, |
| "epoch": 3.0200271606445312, |
| "grad_norm": 0.0011327258544042706, |
| "learning_rate": 3.111734390258789e-05, |
| "lookahead_loss": 6.401400192260742, |
| "loss": 0.6047, |
| "step": 198000 |
| }, |
| { |
| "base_loss": 0.6186916393637657, |
| "epoch": 3.0209808349609375, |
| "grad_norm": 0.0010724315652623773, |
| "learning_rate": 3.106966018676758e-05, |
| "lookahead_loss": 6.458036962509155, |
| "loss": 0.6258, |
| "step": 198500 |
| }, |
| { |
| "base_loss": 0.5938585975766182, |
| "epoch": 3.0219345092773438, |
| "grad_norm": 0.0011486399453133345, |
| "learning_rate": 3.1021976470947266e-05, |
| "lookahead_loss": 6.43785121679306, |
| "loss": 0.6075, |
| "step": 199000 |
| }, |
| { |
| "base_loss": 0.595318921983242, |
| "epoch": 3.02288818359375, |
| "grad_norm": 0.001104371971450746, |
| "learning_rate": 3.0974292755126956e-05, |
| "lookahead_loss": 6.464245168209076, |
| "loss": 0.6043, |
| "step": 199500 |
| }, |
| { |
| "base_loss": 0.5955991841554642, |
| "epoch": 3.0238418579101562, |
| "grad_norm": 0.0011078201932832599, |
| "learning_rate": 3.092660903930664e-05, |
| "lookahead_loss": 6.4285355896949765, |
| "loss": 0.6046, |
| "step": 200000 |
| }, |
| { |
| "epoch": 3.0238418579101562, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.4665484976844665, |
| "eval_lookahead_perplexity": 643.2596787484863, |
| "eval_loss": 0.20750202238559723, |
| "eval_perplexity": 1.2306002055857075, |
| "eval_runtime": 95.5351, |
| "eval_samples_per_second": 52.337, |
| "eval_steps_per_second": 1.643, |
| "step": 200000 |
| }, |
| { |
| "base_loss": 0.6106253827810287, |
| "epoch": 3.0247955322265625, |
| "grad_norm": 0.0010915478924289346, |
| "learning_rate": 3.087892532348633e-05, |
| "lookahead_loss": 6.434819388389587, |
| "loss": 0.6219, |
| "step": 200500 |
| }, |
| { |
| "base_loss": 0.6082036694884301, |
| "epoch": 3.0257492065429688, |
| "grad_norm": 0.0010945522226393223, |
| "learning_rate": 3.083124160766602e-05, |
| "lookahead_loss": 6.398662006378173, |
| "loss": 0.6163, |
| "step": 201000 |
| }, |
| { |
| "base_loss": 0.5935834443569183, |
| "epoch": 3.026702880859375, |
| "grad_norm": 0.0011064212303608656, |
| "learning_rate": 3.07835578918457e-05, |
| "lookahead_loss": 6.422159289360047, |
| "loss": 0.6039, |
| "step": 201500 |
| }, |
| { |
| "base_loss": 0.5928121148943901, |
| "epoch": 3.0276565551757812, |
| "grad_norm": 0.0010989385191351175, |
| "learning_rate": 3.0735874176025393e-05, |
| "lookahead_loss": 6.5297203512191775, |
| "loss": 0.6048, |
| "step": 202000 |
| }, |
| { |
| "base_loss": 0.6184476745128632, |
| "epoch": 3.0286102294921875, |
| "grad_norm": 0.0010950877331197262, |
| "learning_rate": 3.068819046020508e-05, |
| "lookahead_loss": 6.549453610897064, |
| "loss": 0.6282, |
| "step": 202500 |
| }, |
| { |
| "base_loss": 0.6005746681690216, |
| "epoch": 3.0295639038085938, |
| "grad_norm": 0.0010932876029983163, |
| "learning_rate": 3.064050674438477e-05, |
| "lookahead_loss": 6.481932869434357, |
| "loss": 0.611, |
| "step": 203000 |
| }, |
| { |
| "base_loss": 0.5931842148900032, |
| "epoch": 3.030517578125, |
| "grad_norm": 0.0011122091673314571, |
| "learning_rate": 3.059282302856446e-05, |
| "lookahead_loss": 6.4869947714805605, |
| "loss": 0.6067, |
| "step": 203500 |
| }, |
| { |
| "base_loss": 0.5912481832504273, |
| "epoch": 3.0314712524414062, |
| "grad_norm": 0.0010941592045128345, |
| "learning_rate": 3.054513931274414e-05, |
| "lookahead_loss": 6.504393637657166, |
| "loss": 0.6046, |
| "step": 204000 |
| }, |
| { |
| "base_loss": 0.6138262154459954, |
| "epoch": 3.0324249267578125, |
| "grad_norm": 0.0011110466439276934, |
| "learning_rate": 3.049745559692383e-05, |
| "lookahead_loss": 6.4517954845428465, |
| "loss": 0.628, |
| "step": 204500 |
| }, |
| { |
| "base_loss": 0.5952054759263993, |
| "epoch": 3.0333786010742188, |
| "grad_norm": 0.0011273113777861, |
| "learning_rate": 3.0449771881103518e-05, |
| "lookahead_loss": 6.492183149337769, |
| "loss": 0.6084, |
| "step": 205000 |
| }, |
| { |
| "epoch": 3.0333786010742188, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.459207607915226, |
| "eval_lookahead_perplexity": 638.5548702221528, |
| "eval_loss": 0.20748774707317352, |
| "eval_perplexity": 1.2305826385086922, |
| "eval_runtime": 91.6822, |
| "eval_samples_per_second": 54.536, |
| "eval_steps_per_second": 1.712, |
| "step": 205000 |
| }, |
| { |
| "base_loss": 0.5937945809960365, |
| "epoch": 3.034332275390625, |
| "grad_norm": 0.0010771463857963681, |
| "learning_rate": 3.0402088165283205e-05, |
| "lookahead_loss": 6.558969589233398, |
| "loss": 0.6073, |
| "step": 205500 |
| }, |
| { |
| "base_loss": 0.5899240897297859, |
| "epoch": 3.0352859497070312, |
| "grad_norm": 0.0011380125069990754, |
| "learning_rate": 3.035440444946289e-05, |
| "lookahead_loss": 6.393504500389099, |
| "loss": 0.6019, |
| "step": 206000 |
| }, |
| { |
| "base_loss": 0.6100166696310043, |
| "epoch": 3.0362396240234375, |
| "grad_norm": 0.0011085773585364223, |
| "learning_rate": 3.0306720733642578e-05, |
| "lookahead_loss": 6.515884090900421, |
| "loss": 0.622, |
| "step": 206500 |
| }, |
| { |
| "base_loss": 0.600666867017746, |
| "epoch": 3.0371932983398438, |
| "grad_norm": 0.0011293049901723862, |
| "learning_rate": 3.025903701782227e-05, |
| "lookahead_loss": 6.463486310005188, |
| "loss": 0.6106, |
| "step": 207000 |
| }, |
| { |
| "base_loss": 0.5914758368730545, |
| "epoch": 3.03814697265625, |
| "grad_norm": 0.0010940312640741467, |
| "learning_rate": 3.0211353302001955e-05, |
| "lookahead_loss": 6.46989311504364, |
| "loss": 0.6023, |
| "step": 207500 |
| }, |
| { |
| "base_loss": 0.600289347231388, |
| "epoch": 3.0391006469726562, |
| "grad_norm": 0.0010963748209178448, |
| "learning_rate": 3.0163669586181642e-05, |
| "lookahead_loss": 6.431695939540863, |
| "loss": 0.6123, |
| "step": 208000 |
| }, |
| { |
| "base_loss": 0.6154078626036644, |
| "epoch": 3.0400543212890625, |
| "grad_norm": 0.001127979252487421, |
| "learning_rate": 3.011598587036133e-05, |
| "lookahead_loss": 6.457052739143371, |
| "loss": 0.6236, |
| "step": 208500 |
| }, |
| { |
| "base_loss": 0.6015743594169617, |
| "epoch": 3.0410079956054688, |
| "grad_norm": 0.0011190706863999367, |
| "learning_rate": 3.0068302154541016e-05, |
| "lookahead_loss": 6.41373338508606, |
| "loss": 0.6116, |
| "step": 209000 |
| }, |
| { |
| "base_loss": 0.5813222458958626, |
| "epoch": 3.041961669921875, |
| "grad_norm": 0.0011437357170507312, |
| "learning_rate": 3.0020618438720706e-05, |
| "lookahead_loss": 6.483449687004089, |
| "loss": 0.5969, |
| "step": 209500 |
| }, |
| { |
| "base_loss": 0.6031124092936516, |
| "epoch": 3.0429153442382812, |
| "grad_norm": 0.0010715459939092398, |
| "learning_rate": 2.9972934722900393e-05, |
| "lookahead_loss": 6.503455612182617, |
| "loss": 0.614, |
| "step": 210000 |
| }, |
| { |
| "epoch": 3.0429153442382812, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.451748331514791, |
| "eval_lookahead_perplexity": 633.8094337129085, |
| "eval_loss": 0.20747321844100952, |
| "eval_perplexity": 1.2305647599560656, |
| "eval_runtime": 94.4941, |
| "eval_samples_per_second": 52.913, |
| "eval_steps_per_second": 1.661, |
| "step": 210000 |
| }, |
| { |
| "base_loss": 0.6137282114624977, |
| "epoch": 3.0438690185546875, |
| "grad_norm": 0.0011294978903606534, |
| "learning_rate": 2.992525100708008e-05, |
| "lookahead_loss": 6.515301226615906, |
| "loss": 0.6256, |
| "step": 210500 |
| }, |
| { |
| "base_loss": 0.5906867882609368, |
| "epoch": 3.0448226928710938, |
| "grad_norm": 0.0011206147028133273, |
| "learning_rate": 2.9877567291259766e-05, |
| "lookahead_loss": 6.449306659698486, |
| "loss": 0.6033, |
| "step": 211000 |
| }, |
| { |
| "base_loss": 0.5901689050197602, |
| "epoch": 3.0457763671875, |
| "grad_norm": 0.0010848396923393011, |
| "learning_rate": 2.9829883575439453e-05, |
| "lookahead_loss": 6.44843610572815, |
| "loss": 0.6011, |
| "step": 211500 |
| }, |
| { |
| "base_loss": 0.6179713225364685, |
| "epoch": 3.0467300415039062, |
| "grad_norm": 0.0011150614591315389, |
| "learning_rate": 2.9782199859619143e-05, |
| "lookahead_loss": 6.423956147193909, |
| "loss": 0.6278, |
| "step": 212000 |
| }, |
| { |
| "base_loss": 0.6056273721456528, |
| "epoch": 3.0476837158203125, |
| "grad_norm": 0.0011628296924754977, |
| "learning_rate": 2.973451614379883e-05, |
| "lookahead_loss": 6.465731230258942, |
| "loss": 0.6222, |
| "step": 212500 |
| }, |
| { |
| "base_loss": 0.5891297512054443, |
| "epoch": 3.0486373901367188, |
| "grad_norm": 0.0010860287584364414, |
| "learning_rate": 2.9686832427978517e-05, |
| "lookahead_loss": 6.444560400009156, |
| "loss": 0.6002, |
| "step": 213000 |
| }, |
| { |
| "base_loss": 0.5967240616083145, |
| "epoch": 3.049591064453125, |
| "grad_norm": 0.0011478269007056952, |
| "learning_rate": 2.9639148712158204e-05, |
| "lookahead_loss": 6.408600190162659, |
| "loss": 0.607, |
| "step": 213500 |
| }, |
| { |
| "base_loss": 0.6166991795897484, |
| "epoch": 3.0505447387695312, |
| "grad_norm": 0.001043649623170495, |
| "learning_rate": 2.959146499633789e-05, |
| "lookahead_loss": 6.510616254806519, |
| "loss": 0.6283, |
| "step": 214000 |
| }, |
| { |
| "base_loss": 0.5971619437336921, |
| "epoch": 3.0514984130859375, |
| "grad_norm": 0.0011554835364222527, |
| "learning_rate": 2.954378128051758e-05, |
| "lookahead_loss": 6.4740786409378055, |
| "loss": 0.6093, |
| "step": 214500 |
| }, |
| { |
| "base_loss": 0.594504154086113, |
| "epoch": 3.0524520874023438, |
| "grad_norm": 0.0011469792807474732, |
| "learning_rate": 2.9496097564697268e-05, |
| "lookahead_loss": 6.407137874603271, |
| "loss": 0.605, |
| "step": 215000 |
| }, |
| { |
| "epoch": 3.0524520874023438, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.445492273702408, |
| "eval_lookahead_perplexity": 629.8566625281327, |
| "eval_loss": 0.2074602097272873, |
| "eval_perplexity": 1.2305487519955085, |
| "eval_runtime": 90.8147, |
| "eval_samples_per_second": 55.057, |
| "eval_steps_per_second": 1.729, |
| "step": 215000 |
| }, |
| { |
| "base_loss": 0.6092002688646316, |
| "epoch": 3.05340576171875, |
| "grad_norm": 0.0011114060180261731, |
| "learning_rate": 2.9448413848876955e-05, |
| "lookahead_loss": 6.412362268447876, |
| "loss": 0.6198, |
| "step": 215500 |
| }, |
| { |
| "base_loss": 0.6074704146981239, |
| "epoch": 3.0543594360351562, |
| "grad_norm": 0.0011088059982284904, |
| "learning_rate": 2.940073013305664e-05, |
| "lookahead_loss": 6.450668965816498, |
| "loss": 0.6219, |
| "step": 216000 |
| }, |
| { |
| "base_loss": 0.5954701615571976, |
| "epoch": 3.0553131103515625, |
| "grad_norm": 0.0011205608025193214, |
| "learning_rate": 2.9353046417236328e-05, |
| "lookahead_loss": 6.46648416519165, |
| "loss": 0.6048, |
| "step": 216500 |
| }, |
| { |
| "base_loss": 0.5905682035684585, |
| "epoch": 3.0562667846679688, |
| "grad_norm": 0.0010696501703932881, |
| "learning_rate": 2.930536270141602e-05, |
| "lookahead_loss": 6.470299499511719, |
| "loss": 0.6042, |
| "step": 217000 |
| }, |
| { |
| "base_loss": 0.6139637017846108, |
| "epoch": 3.057220458984375, |
| "grad_norm": 0.0011050283210352063, |
| "learning_rate": 2.9257678985595705e-05, |
| "lookahead_loss": 6.474473210811615, |
| "loss": 0.6279, |
| "step": 217500 |
| }, |
| { |
| "base_loss": 0.606031008541584, |
| "epoch": 3.0581741333007812, |
| "grad_norm": 0.0011311868438497186, |
| "learning_rate": 2.9209995269775392e-05, |
| "lookahead_loss": 6.475669991016388, |
| "loss": 0.6157, |
| "step": 218000 |
| }, |
| { |
| "base_loss": 0.5839636498689651, |
| "epoch": 3.0591278076171875, |
| "grad_norm": 0.0010969273280352354, |
| "learning_rate": 2.916231155395508e-05, |
| "lookahead_loss": 6.375632545471191, |
| "loss": 0.5996, |
| "step": 218500 |
| }, |
| { |
| "base_loss": 0.5957706315517426, |
| "epoch": 3.0600814819335938, |
| "grad_norm": 0.0011069747852161527, |
| "learning_rate": 2.9114627838134766e-05, |
| "lookahead_loss": 6.45486762714386, |
| "loss": 0.6077, |
| "step": 219000 |
| }, |
| { |
| "base_loss": 0.6068459544181823, |
| "epoch": 3.06103515625, |
| "grad_norm": 0.0011218679137527943, |
| "learning_rate": 2.9066944122314456e-05, |
| "lookahead_loss": 6.403849498271942, |
| "loss": 0.6192, |
| "step": 219500 |
| }, |
| { |
| "base_loss": 0.6023038199543953, |
| "epoch": 3.0619888305664062, |
| "grad_norm": 0.0011592921800911427, |
| "learning_rate": 2.9019260406494143e-05, |
| "lookahead_loss": 6.420722769737243, |
| "loss": 0.6158, |
| "step": 220000 |
| }, |
| { |
| "epoch": 3.0619888305664062, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.437712517790139, |
| "eval_lookahead_perplexity": 624.9755430121139, |
| "eval_loss": 0.2074456363916397, |
| "eval_perplexity": 1.2305308189261874, |
| "eval_runtime": 93.2872, |
| "eval_samples_per_second": 53.598, |
| "eval_steps_per_second": 1.683, |
| "step": 220000 |
| }, |
| { |
| "base_loss": 0.5944037571549415, |
| "epoch": 3.0629425048828125, |
| "grad_norm": 0.0011297245509922504, |
| "learning_rate": 2.897157669067383e-05, |
| "lookahead_loss": 6.439490074634552, |
| "loss": 0.6075, |
| "step": 220500 |
| }, |
| { |
| "base_loss": 0.6128740097880363, |
| "epoch": 3.0638961791992188, |
| "grad_norm": 0.0010691159404814243, |
| "learning_rate": 2.8923892974853516e-05, |
| "lookahead_loss": 6.441310132026673, |
| "loss": 0.6236, |
| "step": 221000 |
| }, |
| { |
| "base_loss": 0.6068454984426498, |
| "epoch": 3.064849853515625, |
| "grad_norm": 0.0011188320349901915, |
| "learning_rate": 2.8876209259033203e-05, |
| "lookahead_loss": 6.489636187553406, |
| "loss": 0.6166, |
| "step": 221500 |
| }, |
| { |
| "base_loss": 0.6046090689897538, |
| "epoch": 3.0658035278320312, |
| "grad_norm": 0.0011356692994013429, |
| "learning_rate": 2.8828525543212893e-05, |
| "lookahead_loss": 6.39787455034256, |
| "loss": 0.6135, |
| "step": 222000 |
| }, |
| { |
| "base_loss": 0.6052177213430404, |
| "epoch": 3.0667572021484375, |
| "grad_norm": 0.001095956307835877, |
| "learning_rate": 2.878084182739258e-05, |
| "lookahead_loss": 6.380840573787689, |
| "loss": 0.6114, |
| "step": 222500 |
| }, |
| { |
| "base_loss": 0.6186637369394302, |
| "epoch": 3.0677108764648438, |
| "grad_norm": 0.001142095890827477, |
| "learning_rate": 2.8733158111572267e-05, |
| "lookahead_loss": 6.458719520568848, |
| "loss": 0.6293, |
| "step": 223000 |
| }, |
| { |
| "base_loss": 0.5988062580823899, |
| "epoch": 3.06866455078125, |
| "grad_norm": 0.0010754456743597984, |
| "learning_rate": 2.8685474395751954e-05, |
| "lookahead_loss": 6.417570797920227, |
| "loss": 0.6071, |
| "step": 223500 |
| }, |
| { |
| "base_loss": 0.5868127301335335, |
| "epoch": 3.0696182250976562, |
| "grad_norm": 0.0011147081386297941, |
| "learning_rate": 2.863779067993164e-05, |
| "lookahead_loss": 6.465461602210999, |
| "loss": 0.6002, |
| "step": 224000 |
| }, |
| { |
| "base_loss": 0.6132235081791878, |
| "epoch": 3.0705718994140625, |
| "grad_norm": 0.0011301016202196479, |
| "learning_rate": 2.859010696411133e-05, |
| "lookahead_loss": 6.349413995742798, |
| "loss": 0.628, |
| "step": 224500 |
| }, |
| { |
| "base_loss": 0.5981150686740875, |
| "epoch": 3.0715255737304688, |
| "grad_norm": 0.001094558509066701, |
| "learning_rate": 2.8542423248291018e-05, |
| "lookahead_loss": 6.409218377113342, |
| "loss": 0.6103, |
| "step": 225000 |
| }, |
| { |
| "epoch": 3.0715255737304688, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.430860436381623, |
| "eval_lookahead_perplexity": 620.7077978795056, |
| "eval_loss": 0.20743218064308167, |
| "eval_perplexity": 1.2305142613242928, |
| "eval_runtime": 91.988, |
| "eval_samples_per_second": 54.355, |
| "eval_steps_per_second": 1.707, |
| "step": 225000 |
| }, |
| { |
| "base_loss": 0.5917516273856163, |
| "epoch": 3.072479248046875, |
| "grad_norm": 0.001141304150223732, |
| "learning_rate": 2.8494739532470705e-05, |
| "lookahead_loss": 6.462259509086609, |
| "loss": 0.6047, |
| "step": 225500 |
| }, |
| { |
| "base_loss": 0.5921737739443779, |
| "epoch": 3.0734329223632812, |
| "grad_norm": 0.0010474661830812693, |
| "learning_rate": 2.844705581665039e-05, |
| "lookahead_loss": 6.417533561706543, |
| "loss": 0.6032, |
| "step": 226000 |
| }, |
| { |
| "base_loss": 0.6123625862002373, |
| "epoch": 3.0743865966796875, |
| "grad_norm": 0.0010933991288766265, |
| "learning_rate": 2.8399372100830078e-05, |
| "lookahead_loss": 6.422773428916932, |
| "loss": 0.6223, |
| "step": 226500 |
| }, |
| { |
| "base_loss": 0.593733324766159, |
| "epoch": 3.0753402709960938, |
| "grad_norm": 0.0010934488382190466, |
| "learning_rate": 2.835168838500977e-05, |
| "lookahead_loss": 6.487575828552246, |
| "loss": 0.6048, |
| "step": 227000 |
| }, |
| { |
| "base_loss": 0.5864555166959763, |
| "epoch": 3.0762939453125, |
| "grad_norm": 0.0010649971663951874, |
| "learning_rate": 2.8304004669189455e-05, |
| "lookahead_loss": 6.451822665691376, |
| "loss": 0.5975, |
| "step": 227500 |
| }, |
| { |
| "base_loss": 0.6271779141426086, |
| "epoch": 3.0772476196289062, |
| "grad_norm": 0.0011156428372487426, |
| "learning_rate": 2.8256320953369142e-05, |
| "lookahead_loss": 6.417567262649536, |
| "loss": 0.638, |
| "step": 228000 |
| }, |
| { |
| "base_loss": 0.5956557096838951, |
| "epoch": 3.0782012939453125, |
| "grad_norm": 0.001118983025662601, |
| "learning_rate": 2.820863723754883e-05, |
| "lookahead_loss": 6.439277349472046, |
| "loss": 0.6088, |
| "step": 228500 |
| }, |
| { |
| "base_loss": 0.599572938144207, |
| "epoch": 3.0791549682617188, |
| "grad_norm": 0.001086446107365191, |
| "learning_rate": 2.8160953521728516e-05, |
| "lookahead_loss": 6.460713619232178, |
| "loss": 0.6093, |
| "step": 229000 |
| }, |
| { |
| "base_loss": 0.6085604978203774, |
| "epoch": 3.080108642578125, |
| "grad_norm": 0.0010718839475885034, |
| "learning_rate": 2.8113269805908206e-05, |
| "lookahead_loss": 6.416140838623047, |
| "loss": 0.6184, |
| "step": 229500 |
| }, |
| { |
| "base_loss": 0.6050790804624557, |
| "epoch": 3.0810623168945312, |
| "grad_norm": 0.0011485074646770954, |
| "learning_rate": 2.8065586090087893e-05, |
| "lookahead_loss": 6.4402008790969845, |
| "loss": 0.6174, |
| "step": 230000 |
| }, |
| { |
| "epoch": 3.0810623168945312, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.423224188649235, |
| "eval_lookahead_perplexity": 615.9859708310377, |
| "eval_loss": 0.20741787552833557, |
| "eval_perplexity": 1.2304966588024913, |
| "eval_runtime": 90.357, |
| "eval_samples_per_second": 55.336, |
| "eval_steps_per_second": 1.738, |
| "step": 230000 |
| }, |
| { |
| "base_loss": 0.5915502496957779, |
| "epoch": 3.0820159912109375, |
| "grad_norm": 0.0010833791457116604, |
| "learning_rate": 2.801790237426758e-05, |
| "lookahead_loss": 6.446974056243897, |
| "loss": 0.6047, |
| "step": 230500 |
| }, |
| { |
| "base_loss": 0.5885776147842408, |
| "epoch": 3.0829696655273438, |
| "grad_norm": 0.0011150679783895612, |
| "learning_rate": 2.7970218658447266e-05, |
| "lookahead_loss": 6.480014348506928, |
| "loss": 0.6035, |
| "step": 231000 |
| }, |
| { |
| "base_loss": 0.6157435640096665, |
| "epoch": 3.08392333984375, |
| "grad_norm": 0.0010689526097849011, |
| "learning_rate": 2.7922534942626953e-05, |
| "lookahead_loss": 6.488008930206298, |
| "loss": 0.6266, |
| "step": 231500 |
| }, |
| { |
| "base_loss": 0.5945523136258125, |
| "epoch": 3.0848770141601562, |
| "grad_norm": 0.001094442093744874, |
| "learning_rate": 2.7874851226806643e-05, |
| "lookahead_loss": 6.41443329334259, |
| "loss": 0.6037, |
| "step": 232000 |
| }, |
| { |
| "base_loss": 0.5877983981966972, |
| "epoch": 3.0858306884765625, |
| "grad_norm": 0.0011258223094046116, |
| "learning_rate": 2.782716751098633e-05, |
| "lookahead_loss": 6.43271403503418, |
| "loss": 0.5988, |
| "step": 232500 |
| }, |
| { |
| "base_loss": 0.5929736877083779, |
| "epoch": 3.0867843627929688, |
| "grad_norm": 0.0012092749821022153, |
| "learning_rate": 2.7779483795166017e-05, |
| "lookahead_loss": 6.397968234062195, |
| "loss": 0.6049, |
| "step": 233000 |
| }, |
| { |
| "base_loss": 0.6253425707817077, |
| "epoch": 3.087738037109375, |
| "grad_norm": 0.001021925127133727, |
| "learning_rate": 2.7731800079345704e-05, |
| "lookahead_loss": 6.433693765163421, |
| "loss": 0.6294, |
| "step": 233500 |
| }, |
| { |
| "base_loss": 0.5948817446827889, |
| "epoch": 3.0886917114257812, |
| "grad_norm": 0.0011388851562514901, |
| "learning_rate": 2.768411636352539e-05, |
| "lookahead_loss": 6.4267685527801515, |
| "loss": 0.6046, |
| "step": 234000 |
| }, |
| { |
| "base_loss": 0.6024612309336662, |
| "epoch": 3.0896453857421875, |
| "grad_norm": 0.0011255706194788218, |
| "learning_rate": 2.763643264770508e-05, |
| "lookahead_loss": 6.45055183506012, |
| "loss": 0.61, |
| "step": 234500 |
| }, |
| { |
| "base_loss": 0.5884939526319504, |
| "epoch": 3.0905990600585938, |
| "grad_norm": 0.0010525273391976953, |
| "learning_rate": 2.7588748931884768e-05, |
| "lookahead_loss": 6.4216416721344, |
| "loss": 0.5991, |
| "step": 235000 |
| }, |
| { |
| "epoch": 3.0905990600585938, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.416253169885459, |
| "eval_lookahead_perplexity": 611.7068533031069, |
| "eval_loss": 0.20740444958209991, |
| "eval_perplexity": 1.2304801383314088, |
| "eval_runtime": 91.5784, |
| "eval_samples_per_second": 54.598, |
| "eval_steps_per_second": 1.714, |
| "step": 235000 |
| }, |
| { |
| "base_loss": 0.5811684273481369, |
| "epoch": 3.091552734375, |
| "grad_norm": 0.0010410203831270337, |
| "learning_rate": 2.7541065216064455e-05, |
| "lookahead_loss": 6.4036853876113895, |
| "loss": 0.5934, |
| "step": 235500 |
| }, |
| { |
| "base_loss": 0.6168777622580528, |
| "epoch": 3.0925064086914062, |
| "grad_norm": 0.001115046557970345, |
| "learning_rate": 2.749338150024414e-05, |
| "lookahead_loss": 6.453854230880737, |
| "loss": 0.6274, |
| "step": 236000 |
| }, |
| { |
| "base_loss": 0.59999961155653, |
| "epoch": 3.0934600830078125, |
| "grad_norm": 0.0010715369135141373, |
| "learning_rate": 2.7445697784423828e-05, |
| "lookahead_loss": 6.4379607105255126, |
| "loss": 0.6127, |
| "step": 236500 |
| }, |
| { |
| "base_loss": 0.5828448947072029, |
| "epoch": 3.0944137573242188, |
| "grad_norm": 0.0011117984540760517, |
| "learning_rate": 2.739801406860352e-05, |
| "lookahead_loss": 6.436477872371674, |
| "loss": 0.5952, |
| "step": 237000 |
| }, |
| { |
| "base_loss": 0.5784061435461044, |
| "epoch": 3.095367431640625, |
| "grad_norm": 0.001090219127945602, |
| "learning_rate": 2.7350330352783205e-05, |
| "lookahead_loss": 6.364374763965607, |
| "loss": 0.5937, |
| "step": 237500 |
| }, |
| { |
| "base_loss": 0.5969022975564003, |
| "epoch": 3.0963211059570312, |
| "grad_norm": 0.0011031440226361156, |
| "learning_rate": 2.7302646636962892e-05, |
| "lookahead_loss": 6.427553595066071, |
| "loss": 0.6116, |
| "step": 238000 |
| }, |
| { |
| "base_loss": 0.6168330173492431, |
| "epoch": 3.0972747802734375, |
| "grad_norm": 0.0011244708439335227, |
| "learning_rate": 2.725496292114258e-05, |
| "lookahead_loss": 6.443069730758667, |
| "loss": 0.6256, |
| "step": 238500 |
| }, |
| { |
| "base_loss": 0.5952534638047219, |
| "epoch": 3.0982284545898438, |
| "grad_norm": 0.0011027234140783548, |
| "learning_rate": 2.7207279205322266e-05, |
| "lookahead_loss": 6.3885435886383055, |
| "loss": 0.6053, |
| "step": 239000 |
| }, |
| { |
| "base_loss": 0.5822330508232116, |
| "epoch": 3.09918212890625, |
| "grad_norm": 0.0011086640879511833, |
| "learning_rate": 2.7159595489501956e-05, |
| "lookahead_loss": 6.448413589000702, |
| "loss": 0.5947, |
| "step": 239500 |
| }, |
| { |
| "base_loss": 0.5872656118273735, |
| "epoch": 3.1001358032226562, |
| "grad_norm": 0.001106367213651538, |
| "learning_rate": 2.7111911773681643e-05, |
| "lookahead_loss": 6.428266541957855, |
| "loss": 0.6033, |
| "step": 240000 |
| }, |
| { |
| "epoch": 3.1001358032226562, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.410780125151808, |
| "eval_lookahead_perplexity": 608.3680992404113, |
| "eval_loss": 0.20739257335662842, |
| "eval_perplexity": 1.2304655249586238, |
| "eval_runtime": 93.0639, |
| "eval_samples_per_second": 53.727, |
| "eval_steps_per_second": 1.687, |
| "step": 240000 |
| }, |
| { |
| "base_loss": 0.6096063278317452, |
| "epoch": 3.1010894775390625, |
| "grad_norm": 0.0010960623621940613, |
| "learning_rate": 2.706422805786133e-05, |
| "lookahead_loss": 6.487349942684173, |
| "loss": 0.6199, |
| "step": 240500 |
| }, |
| { |
| "base_loss": 0.6008942748308181, |
| "epoch": 3.1020431518554688, |
| "grad_norm": 0.001130930962972343, |
| "learning_rate": 2.7016544342041016e-05, |
| "lookahead_loss": 6.453820777893067, |
| "loss": 0.6088, |
| "step": 241000 |
| }, |
| { |
| "base_loss": 0.5871310735344887, |
| "epoch": 3.102996826171875, |
| "grad_norm": 0.0010819945018738508, |
| "learning_rate": 2.6968860626220703e-05, |
| "lookahead_loss": 6.439614234924316, |
| "loss": 0.6019, |
| "step": 241500 |
| }, |
| { |
| "base_loss": 0.5920609677433968, |
| "epoch": 3.1039505004882812, |
| "grad_norm": 0.0011099249823018909, |
| "learning_rate": 2.6921176910400393e-05, |
| "lookahead_loss": 6.449179169654847, |
| "loss": 0.6023, |
| "step": 242000 |
| }, |
| { |
| "base_loss": 0.6111314262747765, |
| "epoch": 3.1049041748046875, |
| "grad_norm": 0.0010791884269565344, |
| "learning_rate": 2.687349319458008e-05, |
| "lookahead_loss": 6.400581046104431, |
| "loss": 0.6206, |
| "step": 242500 |
| }, |
| { |
| "base_loss": 0.6005220351815224, |
| "epoch": 3.1058578491210938, |
| "grad_norm": 0.0010320625733584166, |
| "learning_rate": 2.6825809478759767e-05, |
| "lookahead_loss": 6.395295563697815, |
| "loss": 0.612, |
| "step": 243000 |
| }, |
| { |
| "base_loss": 0.5973421422243118, |
| "epoch": 3.1068115234375, |
| "grad_norm": 0.0010832108091562986, |
| "learning_rate": 2.6778125762939454e-05, |
| "lookahead_loss": 6.463225367546081, |
| "loss": 0.6071, |
| "step": 243500 |
| }, |
| { |
| "base_loss": 0.5840073474049569, |
| "epoch": 3.1077651977539062, |
| "grad_norm": 0.0011341134086251259, |
| "learning_rate": 2.673044204711914e-05, |
| "lookahead_loss": 6.36874011850357, |
| "loss": 0.5946, |
| "step": 244000 |
| }, |
| { |
| "base_loss": 0.6044590792655945, |
| "epoch": 3.1087188720703125, |
| "grad_norm": 0.0011408327845856547, |
| "learning_rate": 2.668275833129883e-05, |
| "lookahead_loss": 6.373018598556518, |
| "loss": 0.6181, |
| "step": 244500 |
| }, |
| { |
| "base_loss": 0.6069816564917564, |
| "epoch": 3.1096725463867188, |
| "grad_norm": 0.00108093093149364, |
| "learning_rate": 2.6635074615478518e-05, |
| "lookahead_loss": 6.359239377021789, |
| "loss": 0.6166, |
| "step": 245000 |
| }, |
| { |
| "epoch": 3.1096725463867188, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.4042751575811225, |
| "eval_lookahead_perplexity": 604.4235280470633, |
| "eval_loss": 0.20738010108470917, |
| "eval_perplexity": 1.230450178353713, |
| "eval_runtime": 89.9445, |
| "eval_samples_per_second": 55.59, |
| "eval_steps_per_second": 1.746, |
| "step": 245000 |
| }, |
| { |
| "base_loss": 0.5957676445245743, |
| "epoch": 3.110626220703125, |
| "grad_norm": 0.0010888735996559262, |
| "learning_rate": 2.6587390899658205e-05, |
| "lookahead_loss": 6.333310267925262, |
| "loss": 0.607, |
| "step": 245500 |
| }, |
| { |
| "base_loss": 0.5857621270418167, |
| "epoch": 3.1115798950195312, |
| "grad_norm": 0.0010927121620625257, |
| "learning_rate": 2.653970718383789e-05, |
| "lookahead_loss": 6.424707530021667, |
| "loss": 0.5985, |
| "step": 246000 |
| }, |
| { |
| "base_loss": 0.603391693353653, |
| "epoch": 3.1125335693359375, |
| "grad_norm": 0.001093365834094584, |
| "learning_rate": 2.6492023468017578e-05, |
| "lookahead_loss": 6.4441242928504945, |
| "loss": 0.6132, |
| "step": 246500 |
| }, |
| { |
| "base_loss": 0.6127855234742164, |
| "epoch": 3.1134872436523438, |
| "grad_norm": 0.0011091139167547226, |
| "learning_rate": 2.644433975219727e-05, |
| "lookahead_loss": 6.444578236103058, |
| "loss": 0.6263, |
| "step": 247000 |
| }, |
| { |
| "base_loss": 0.5977847113609314, |
| "epoch": 3.11444091796875, |
| "grad_norm": 0.0010543358512222767, |
| "learning_rate": 2.6396656036376955e-05, |
| "lookahead_loss": 6.342389549255371, |
| "loss": 0.6063, |
| "step": 247500 |
| }, |
| { |
| "base_loss": 0.585680897474289, |
| "epoch": 3.1153945922851562, |
| "grad_norm": 0.001126592163927853, |
| "learning_rate": 2.6348972320556642e-05, |
| "lookahead_loss": 6.385599094390869, |
| "loss": 0.5957, |
| "step": 248000 |
| }, |
| { |
| "base_loss": 0.6022236620783806, |
| "epoch": 3.1163482666015625, |
| "grad_norm": 0.0011800749925896525, |
| "learning_rate": 2.630128860473633e-05, |
| "lookahead_loss": 6.397586730957031, |
| "loss": 0.6145, |
| "step": 248500 |
| }, |
| { |
| "base_loss": 0.6104110144376754, |
| "epoch": 3.1173019409179688, |
| "grad_norm": 0.0011340758064761758, |
| "learning_rate": 2.6253604888916016e-05, |
| "lookahead_loss": 6.392207997322083, |
| "loss": 0.6223, |
| "step": 249000 |
| }, |
| { |
| "base_loss": 0.603924877524376, |
| "epoch": 3.118255615234375, |
| "grad_norm": 0.0011272222036495805, |
| "learning_rate": 2.6205921173095706e-05, |
| "lookahead_loss": 6.430540772438049, |
| "loss": 0.6105, |
| "step": 249500 |
| }, |
| { |
| "base_loss": 0.5842441658377647, |
| "epoch": 3.1192092895507812, |
| "grad_norm": 0.001132176723331213, |
| "learning_rate": 2.6158237457275393e-05, |
| "lookahead_loss": 6.420798850536347, |
| "loss": 0.5941, |
| "step": 250000 |
| }, |
| { |
| "epoch": 3.1192092895507812, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.398296785811647, |
| "eval_lookahead_perplexity": 600.8208393267304, |
| "eval_loss": 0.20736823976039886, |
| "eval_perplexity": 1.230435583671656, |
| "eval_runtime": 93.3617, |
| "eval_samples_per_second": 53.555, |
| "eval_steps_per_second": 1.682, |
| "step": 250000 |
| }, |
| { |
| "base_loss": 0.5957836083769799, |
| "epoch": 4.000953674316406, |
| "grad_norm": 0.00111961062066257, |
| "learning_rate": 2.611055374145508e-05, |
| "lookahead_loss": 6.485253520011902, |
| "loss": 0.6023, |
| "step": 250500 |
| }, |
| { |
| "base_loss": 0.586733922958374, |
| "epoch": 4.0019073486328125, |
| "grad_norm": 0.0011267756344750524, |
| "learning_rate": 2.6062870025634766e-05, |
| "lookahead_loss": 6.335224370956421, |
| "loss": 0.5985, |
| "step": 251000 |
| }, |
| { |
| "base_loss": 0.6027773340344429, |
| "epoch": 4.002861022949219, |
| "grad_norm": 0.0011099674738943577, |
| "learning_rate": 2.6015186309814453e-05, |
| "lookahead_loss": 6.3454978213310245, |
| "loss": 0.6125, |
| "step": 251500 |
| }, |
| { |
| "base_loss": 0.6127874755859375, |
| "epoch": 4.003814697265625, |
| "grad_norm": 0.001098731765523553, |
| "learning_rate": 2.5967502593994143e-05, |
| "lookahead_loss": 6.355840661048889, |
| "loss": 0.6212, |
| "step": 252000 |
| }, |
| { |
| "base_loss": 0.5991843653917313, |
| "epoch": 4.004768371582031, |
| "grad_norm": 0.0010964460670948029, |
| "learning_rate": 2.591981887817383e-05, |
| "lookahead_loss": 6.346891592025757, |
| "loss": 0.6078, |
| "step": 252500 |
| }, |
| { |
| "base_loss": 0.5906335787773133, |
| "epoch": 4.0057220458984375, |
| "grad_norm": 0.0010437110904604197, |
| "learning_rate": 2.5872135162353517e-05, |
| "lookahead_loss": 6.4677832736969, |
| "loss": 0.6006, |
| "step": 253000 |
| }, |
| { |
| "base_loss": 0.5800002152323723, |
| "epoch": 4.006675720214844, |
| "grad_norm": 0.0010704013984650373, |
| "learning_rate": 2.5824451446533204e-05, |
| "lookahead_loss": 6.314324316501618, |
| "loss": 0.5963, |
| "step": 253500 |
| }, |
| { |
| "base_loss": 0.6064526105523109, |
| "epoch": 4.00762939453125, |
| "grad_norm": 0.0011120210401713848, |
| "learning_rate": 2.577676773071289e-05, |
| "lookahead_loss": 6.374777404785156, |
| "loss": 0.6184, |
| "step": 254000 |
| }, |
| { |
| "base_loss": 0.6036154347658157, |
| "epoch": 4.008583068847656, |
| "grad_norm": 0.0010486901737749577, |
| "learning_rate": 2.572908401489258e-05, |
| "lookahead_loss": 6.386540162086487, |
| "loss": 0.6094, |
| "step": 254500 |
| }, |
| { |
| "base_loss": 0.5886935539245606, |
| "epoch": 4.0095367431640625, |
| "grad_norm": 0.0011316712480038404, |
| "learning_rate": 2.5681400299072268e-05, |
| "lookahead_loss": 6.388794689178467, |
| "loss": 0.6054, |
| "step": 255000 |
| }, |
| { |
| "epoch": 4.0095367431640625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.392628372667697, |
| "eval_lookahead_perplexity": 597.4247728311323, |
| "eval_loss": 0.20735666155815125, |
| "eval_perplexity": 1.2304213375220883, |
| "eval_runtime": 90.6862, |
| "eval_samples_per_second": 55.135, |
| "eval_steps_per_second": 1.731, |
| "step": 255000 |
| }, |
| { |
| "base_loss": 0.5923344283699989, |
| "epoch": 4.010490417480469, |
| "grad_norm": 0.0011050583561882377, |
| "learning_rate": 2.5633716583251955e-05, |
| "lookahead_loss": 6.32834493637085, |
| "loss": 0.6037, |
| "step": 255500 |
| }, |
| { |
| "base_loss": 0.5904504474997521, |
| "epoch": 4.011444091796875, |
| "grad_norm": 0.001151700154878199, |
| "learning_rate": 2.558603286743164e-05, |
| "lookahead_loss": 6.364507519721985, |
| "loss": 0.605, |
| "step": 256000 |
| }, |
| { |
| "base_loss": 0.6088732249736786, |
| "epoch": 4.012397766113281, |
| "grad_norm": 0.0010878178291022778, |
| "learning_rate": 2.5538349151611328e-05, |
| "lookahead_loss": 6.36634335231781, |
| "loss": 0.6219, |
| "step": 256500 |
| }, |
| { |
| "base_loss": 0.59388463139534, |
| "epoch": 4.0133514404296875, |
| "grad_norm": 0.0011076608207076788, |
| "learning_rate": 2.549066543579102e-05, |
| "lookahead_loss": 6.438094673156738, |
| "loss": 0.607, |
| "step": 257000 |
| }, |
| { |
| "base_loss": 0.5942253875732422, |
| "epoch": 4.014305114746094, |
| "grad_norm": 0.0010897432221099734, |
| "learning_rate": 2.5442981719970705e-05, |
| "lookahead_loss": 6.4205363702774045, |
| "loss": 0.6056, |
| "step": 257500 |
| }, |
| { |
| "base_loss": 0.5844743223190307, |
| "epoch": 4.0152587890625, |
| "grad_norm": 0.001093736500479281, |
| "learning_rate": 2.5395298004150392e-05, |
| "lookahead_loss": 6.345088118553162, |
| "loss": 0.592, |
| "step": 258000 |
| }, |
| { |
| "base_loss": 0.6102200556993485, |
| "epoch": 4.016212463378906, |
| "grad_norm": 0.001128020347096026, |
| "learning_rate": 2.534761428833008e-05, |
| "lookahead_loss": 6.393371778488159, |
| "loss": 0.6185, |
| "step": 258500 |
| }, |
| { |
| "base_loss": 0.6051568930149078, |
| "epoch": 4.0171661376953125, |
| "grad_norm": 0.001108690514229238, |
| "learning_rate": 2.5299930572509766e-05, |
| "lookahead_loss": 6.449675846099853, |
| "loss": 0.6106, |
| "step": 259000 |
| }, |
| { |
| "base_loss": 0.5939539469480515, |
| "epoch": 4.018119812011719, |
| "grad_norm": 0.001039958675391972, |
| "learning_rate": 2.5252246856689456e-05, |
| "lookahead_loss": 6.4385554246902466, |
| "loss": 0.6013, |
| "step": 259500 |
| }, |
| { |
| "base_loss": 0.5881457543373108, |
| "epoch": 4.019073486328125, |
| "grad_norm": 0.001058859284967184, |
| "learning_rate": 2.5204563140869143e-05, |
| "lookahead_loss": 6.462026128768921, |
| "loss": 0.5991, |
| "step": 260000 |
| }, |
| { |
| "epoch": 4.019073486328125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.386161040955077, |
| "eval_lookahead_perplexity": 593.5734958186138, |
| "eval_loss": 0.2073436826467514, |
| "eval_perplexity": 1.230405368096197, |
| "eval_runtime": 92.3148, |
| "eval_samples_per_second": 54.162, |
| "eval_steps_per_second": 1.701, |
| "step": 260000 |
| }, |
| { |
| "base_loss": 0.5868560363054276, |
| "epoch": 4.020027160644531, |
| "grad_norm": 0.0011423506075516343, |
| "learning_rate": 2.515687942504883e-05, |
| "lookahead_loss": 6.313244658946991, |
| "loss": 0.6021, |
| "step": 260500 |
| }, |
| { |
| "base_loss": 0.6168393405079842, |
| "epoch": 4.0209808349609375, |
| "grad_norm": 0.001083866460248828, |
| "learning_rate": 2.5109195709228516e-05, |
| "lookahead_loss": 6.362197784423828, |
| "loss": 0.6256, |
| "step": 261000 |
| }, |
| { |
| "base_loss": 0.5967223855853081, |
| "epoch": 4.021934509277344, |
| "grad_norm": 0.0011323863873258233, |
| "learning_rate": 2.5061511993408203e-05, |
| "lookahead_loss": 6.346835678100586, |
| "loss": 0.6105, |
| "step": 261500 |
| }, |
| { |
| "base_loss": 0.5978920136094094, |
| "epoch": 4.02288818359375, |
| "grad_norm": 0.00111202837433666, |
| "learning_rate": 2.5013828277587893e-05, |
| "lookahead_loss": 6.382146213531494, |
| "loss": 0.6063, |
| "step": 262000 |
| }, |
| { |
| "base_loss": 0.5982745458483696, |
| "epoch": 4.023841857910156, |
| "grad_norm": 0.001102231559343636, |
| "learning_rate": 2.496614456176758e-05, |
| "lookahead_loss": 6.34771177482605, |
| "loss": 0.6052, |
| "step": 262500 |
| }, |
| { |
| "base_loss": 0.6113470050692559, |
| "epoch": 4.0247955322265625, |
| "grad_norm": 0.0010907762916758657, |
| "learning_rate": 2.4918460845947267e-05, |
| "lookahead_loss": 6.360005939006806, |
| "loss": 0.621, |
| "step": 263000 |
| }, |
| { |
| "base_loss": 0.6052272637486458, |
| "epoch": 4.025749206542969, |
| "grad_norm": 0.001096719759516418, |
| "learning_rate": 2.4870777130126954e-05, |
| "lookahead_loss": 6.313052820205688, |
| "loss": 0.6153, |
| "step": 263500 |
| }, |
| { |
| "base_loss": 0.5957095698714256, |
| "epoch": 4.026702880859375, |
| "grad_norm": 0.0011290950933471322, |
| "learning_rate": 2.482309341430664e-05, |
| "lookahead_loss": 6.339996559143066, |
| "loss": 0.6055, |
| "step": 264000 |
| }, |
| { |
| "base_loss": 0.5928845180273056, |
| "epoch": 4.027656555175781, |
| "grad_norm": 0.0011050624307245016, |
| "learning_rate": 2.477540969848633e-05, |
| "lookahead_loss": 6.442285621643067, |
| "loss": 0.6044, |
| "step": 264500 |
| }, |
| { |
| "base_loss": 0.6156682388782502, |
| "epoch": 4.0286102294921875, |
| "grad_norm": 0.0010948260314762592, |
| "learning_rate": 2.4727725982666018e-05, |
| "lookahead_loss": 6.466365109920502, |
| "loss": 0.6243, |
| "step": 265000 |
| }, |
| { |
| "epoch": 4.0286102294921875, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.380477937265707, |
| "eval_lookahead_perplexity": 590.2097234815991, |
| "eval_loss": 0.2073325216770172, |
| "eval_perplexity": 1.230391635655757, |
| "eval_runtime": 91.8069, |
| "eval_samples_per_second": 54.462, |
| "eval_steps_per_second": 1.71, |
| "step": 265000 |
| }, |
| { |
| "base_loss": 0.6024927053451538, |
| "epoch": 4.029563903808594, |
| "grad_norm": 0.0011052724439650774, |
| "learning_rate": 2.4680042266845705e-05, |
| "lookahead_loss": 6.400326736450196, |
| "loss": 0.6118, |
| "step": 265500 |
| }, |
| { |
| "base_loss": 0.5958824927210807, |
| "epoch": 4.030517578125, |
| "grad_norm": 0.0010953382588922977, |
| "learning_rate": 2.463235855102539e-05, |
| "lookahead_loss": 6.396139843940735, |
| "loss": 0.6077, |
| "step": 266000 |
| }, |
| { |
| "base_loss": 0.5906458727121353, |
| "epoch": 4.031471252441406, |
| "grad_norm": 0.0011001034872606397, |
| "learning_rate": 2.4584674835205078e-05, |
| "lookahead_loss": 6.419850166797638, |
| "loss": 0.6039, |
| "step": 266500 |
| }, |
| { |
| "base_loss": 0.6140482442975044, |
| "epoch": 4.0324249267578125, |
| "grad_norm": 0.001121096545830369, |
| "learning_rate": 2.453699111938477e-05, |
| "lookahead_loss": 6.364969326019287, |
| "loss": 0.6282, |
| "step": 267000 |
| }, |
| { |
| "base_loss": 0.5971780525445938, |
| "epoch": 4.033378601074219, |
| "grad_norm": 0.0011231973767280579, |
| "learning_rate": 2.4489307403564455e-05, |
| "lookahead_loss": 6.409055516242981, |
| "loss": 0.6072, |
| "step": 267500 |
| }, |
| { |
| "base_loss": 0.5922137571573257, |
| "epoch": 4.034332275390625, |
| "grad_norm": 0.0010805290658026934, |
| "learning_rate": 2.4441623687744142e-05, |
| "lookahead_loss": 6.48427177810669, |
| "loss": 0.6062, |
| "step": 268000 |
| }, |
| { |
| "base_loss": 0.591649469256401, |
| "epoch": 4.035285949707031, |
| "grad_norm": 0.0011319770710542798, |
| "learning_rate": 2.439393997192383e-05, |
| "lookahead_loss": 6.313907026767731, |
| "loss": 0.6035, |
| "step": 268500 |
| }, |
| { |
| "base_loss": 0.609982794225216, |
| "epoch": 4.0362396240234375, |
| "grad_norm": 0.0011232325341552496, |
| "learning_rate": 2.4346256256103516e-05, |
| "lookahead_loss": 6.435509991645813, |
| "loss": 0.6217, |
| "step": 269000 |
| }, |
| { |
| "base_loss": 0.5994021319746972, |
| "epoch": 4.037193298339844, |
| "grad_norm": 0.0011218226281926036, |
| "learning_rate": 2.4298572540283206e-05, |
| "lookahead_loss": 6.378507499694824, |
| "loss": 0.6093, |
| "step": 269500 |
| }, |
| { |
| "base_loss": 0.5912262842059135, |
| "epoch": 4.03814697265625, |
| "grad_norm": 0.0010908119147643447, |
| "learning_rate": 2.4250888824462893e-05, |
| "lookahead_loss": 6.3788512840271, |
| "loss": 0.6023, |
| "step": 270000 |
| }, |
| { |
| "epoch": 4.03814697265625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.375024708696067, |
| "eval_lookahead_perplexity": 586.9999347655023, |
| "eval_loss": 0.207322895526886, |
| "eval_perplexity": 1.2303797917781574, |
| "eval_runtime": 89.9406, |
| "eval_samples_per_second": 55.592, |
| "eval_steps_per_second": 1.746, |
| "step": 270000 |
| }, |
| { |
| "base_loss": 0.5976571834087372, |
| "epoch": 4.039100646972656, |
| "grad_norm": 0.001099589979276061, |
| "learning_rate": 2.420320510864258e-05, |
| "lookahead_loss": 6.3452381052970885, |
| "loss": 0.6117, |
| "step": 270500 |
| }, |
| { |
| "base_loss": 0.6155648761987687, |
| "epoch": 4.0400543212890625, |
| "grad_norm": 0.0011181931477040052, |
| "learning_rate": 2.4155521392822266e-05, |
| "lookahead_loss": 6.385155973434448, |
| "loss": 0.6245, |
| "step": 271000 |
| }, |
| { |
| "base_loss": 0.603646912753582, |
| "epoch": 4.041007995605469, |
| "grad_norm": 0.0011384629178792238, |
| "learning_rate": 2.4107837677001953e-05, |
| "lookahead_loss": 6.339157883644104, |
| "loss": 0.6137, |
| "step": 271500 |
| }, |
| { |
| "base_loss": 0.5785835943818093, |
| "epoch": 4.041961669921875, |
| "grad_norm": 0.0011360092321410775, |
| "learning_rate": 2.406015396118164e-05, |
| "lookahead_loss": 6.399327411651611, |
| "loss": 0.5954, |
| "step": 272000 |
| }, |
| { |
| "base_loss": 0.6030765172839164, |
| "epoch": 4.042915344238281, |
| "grad_norm": 0.0010638670064508915, |
| "learning_rate": 2.401247024536133e-05, |
| "lookahead_loss": 6.423156121253967, |
| "loss": 0.6143, |
| "step": 272500 |
| }, |
| { |
| "base_loss": 0.6119026395678521, |
| "epoch": 4.0438690185546875, |
| "grad_norm": 0.0011214343830943108, |
| "learning_rate": 2.3964786529541017e-05, |
| "lookahead_loss": 6.441630228996277, |
| "loss": 0.6252, |
| "step": 273000 |
| }, |
| { |
| "base_loss": 0.5916597181558609, |
| "epoch": 4.044822692871094, |
| "grad_norm": 0.0011162528535351157, |
| "learning_rate": 2.3917102813720704e-05, |
| "lookahead_loss": 6.372374580383301, |
| "loss": 0.6053, |
| "step": 273500 |
| }, |
| { |
| "base_loss": 0.5914614844322205, |
| "epoch": 4.0457763671875, |
| "grad_norm": 0.0010877457680180669, |
| "learning_rate": 2.386941909790039e-05, |
| "lookahead_loss": 6.3761955223083495, |
| "loss": 0.6021, |
| "step": 274000 |
| }, |
| { |
| "base_loss": 0.621587080359459, |
| "epoch": 4.046730041503906, |
| "grad_norm": 0.001101334230042994, |
| "learning_rate": 2.3821735382080078e-05, |
| "lookahead_loss": 6.33512171459198, |
| "loss": 0.6318, |
| "step": 274500 |
| }, |
| { |
| "base_loss": 0.6045556816458703, |
| "epoch": 4.0476837158203125, |
| "grad_norm": 0.001156001933850348, |
| "learning_rate": 2.3774051666259768e-05, |
| "lookahead_loss": 6.388499364376068, |
| "loss": 0.6215, |
| "step": 275000 |
| }, |
| { |
| "epoch": 4.0476837158203125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.370460871309518, |
| "eval_lookahead_perplexity": 584.3270664248921, |
| "eval_loss": 0.20731313526630402, |
| "eval_perplexity": 1.2303677830093793, |
| "eval_runtime": 91.8401, |
| "eval_samples_per_second": 54.442, |
| "eval_steps_per_second": 1.709, |
| "step": 275000 |
| }, |
| { |
| "base_loss": 0.5886928399801254, |
| "epoch": 4.048637390136719, |
| "grad_norm": 0.0010876537999138236, |
| "learning_rate": 2.3726367950439455e-05, |
| "lookahead_loss": 6.358909488677979, |
| "loss": 0.6008, |
| "step": 275500 |
| }, |
| { |
| "base_loss": 0.5989522265791893, |
| "epoch": 4.049591064453125, |
| "grad_norm": 0.0011483179405331612, |
| "learning_rate": 2.367868423461914e-05, |
| "lookahead_loss": 6.335620626926422, |
| "loss": 0.6074, |
| "step": 276000 |
| }, |
| { |
| "base_loss": 0.615303504705429, |
| "epoch": 4.050544738769531, |
| "grad_norm": 0.0010323745664209127, |
| "learning_rate": 2.3631000518798828e-05, |
| "lookahead_loss": 6.436678630828857, |
| "loss": 0.6272, |
| "step": 276500 |
| }, |
| { |
| "base_loss": 0.5973832362890243, |
| "epoch": 4.0514984130859375, |
| "grad_norm": 0.0011526040034368634, |
| "learning_rate": 2.3583316802978515e-05, |
| "lookahead_loss": 6.392355844974518, |
| "loss": 0.6105, |
| "step": 277000 |
| }, |
| { |
| "base_loss": 0.5953535653948784, |
| "epoch": 4.052452087402344, |
| "grad_norm": 0.0011593152303248644, |
| "learning_rate": 2.3535633087158205e-05, |
| "lookahead_loss": 6.3196696195602415, |
| "loss": 0.6054, |
| "step": 277500 |
| }, |
| { |
| "base_loss": 0.609023510336876, |
| "epoch": 4.05340576171875, |
| "grad_norm": 0.0010948091512545943, |
| "learning_rate": 2.3487949371337892e-05, |
| "lookahead_loss": 6.335317673683167, |
| "loss": 0.6192, |
| "step": 278000 |
| }, |
| { |
| "base_loss": 0.6116033662557602, |
| "epoch": 4.054359436035156, |
| "grad_norm": 0.0011012164177373052, |
| "learning_rate": 2.344026565551758e-05, |
| "lookahead_loss": 6.380707992076874, |
| "loss": 0.6258, |
| "step": 278500 |
| }, |
| { |
| "base_loss": 0.5943940283060074, |
| "epoch": 4.0553131103515625, |
| "grad_norm": 0.0010989225702360272, |
| "learning_rate": 2.3392581939697266e-05, |
| "lookahead_loss": 6.384932087421417, |
| "loss": 0.6041, |
| "step": 279000 |
| }, |
| { |
| "base_loss": 0.5885190908908844, |
| "epoch": 4.056266784667969, |
| "grad_norm": 0.001057869172655046, |
| "learning_rate": 2.3344898223876953e-05, |
| "lookahead_loss": 6.398144548416138, |
| "loss": 0.6033, |
| "step": 279500 |
| }, |
| { |
| "base_loss": 0.6137237245440483, |
| "epoch": 4.057220458984375, |
| "grad_norm": 0.001099518733099103, |
| "learning_rate": 2.3297214508056643e-05, |
| "lookahead_loss": 6.392653202056885, |
| "loss": 0.6266, |
| "step": 280000 |
| }, |
| { |
| "epoch": 4.057220458984375, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.365785460121716, |
| "eval_lookahead_perplexity": 581.6014737195064, |
| "eval_loss": 0.20730404555797577, |
| "eval_perplexity": 1.2303565993759233, |
| "eval_runtime": 91.3171, |
| "eval_samples_per_second": 54.754, |
| "eval_steps_per_second": 1.719, |
| "step": 280000 |
| }, |
| { |
| "base_loss": 0.6062676934599877, |
| "epoch": 4.058174133300781, |
| "grad_norm": 0.0011220412561669946, |
| "learning_rate": 2.324953079223633e-05, |
| "lookahead_loss": 6.392493741989136, |
| "loss": 0.6151, |
| "step": 280500 |
| }, |
| { |
| "base_loss": 0.5848039170503616, |
| "epoch": 4.0591278076171875, |
| "grad_norm": 0.0011025768471881747, |
| "learning_rate": 2.3201847076416016e-05, |
| "lookahead_loss": 6.284692679405213, |
| "loss": 0.6003, |
| "step": 281000 |
| }, |
| { |
| "base_loss": 0.5932843062877655, |
| "epoch": 4.060081481933594, |
| "grad_norm": 0.0011020904639735818, |
| "learning_rate": 2.3154163360595703e-05, |
| "lookahead_loss": 6.377818461418152, |
| "loss": 0.6053, |
| "step": 281500 |
| }, |
| { |
| "base_loss": 0.6079036598205566, |
| "epoch": 4.06103515625, |
| "grad_norm": 0.0011223220499232411, |
| "learning_rate": 2.310647964477539e-05, |
| "lookahead_loss": 6.317885259151459, |
| "loss": 0.6188, |
| "step": 282000 |
| }, |
| { |
| "base_loss": 0.6019631532430649, |
| "epoch": 4.061988830566406, |
| "grad_norm": 0.0011525802547112107, |
| "learning_rate": 2.305879592895508e-05, |
| "lookahead_loss": 6.3444995069503785, |
| "loss": 0.6138, |
| "step": 282500 |
| }, |
| { |
| "base_loss": 0.5940396988987923, |
| "epoch": 4.0629425048828125, |
| "grad_norm": 0.001133267069235444, |
| "learning_rate": 2.3011112213134767e-05, |
| "lookahead_loss": 6.347807200431824, |
| "loss": 0.6066, |
| "step": 283000 |
| }, |
| { |
| "base_loss": 0.612035707950592, |
| "epoch": 4.063896179199219, |
| "grad_norm": 0.0010704045416787267, |
| "learning_rate": 2.2963428497314454e-05, |
| "lookahead_loss": 6.361331562995911, |
| "loss": 0.6231, |
| "step": 283500 |
| }, |
| { |
| "base_loss": 0.6061015563607216, |
| "epoch": 4.064849853515625, |
| "grad_norm": 0.0011012317845597863, |
| "learning_rate": 2.291574478149414e-05, |
| "lookahead_loss": 6.41039630651474, |
| "loss": 0.6162, |
| "step": 284000 |
| }, |
| { |
| "base_loss": 0.6026831232309341, |
| "epoch": 4.065803527832031, |
| "grad_norm": 0.0011320897610858083, |
| "learning_rate": 2.2868061065673828e-05, |
| "lookahead_loss": 6.329383935928345, |
| "loss": 0.6138, |
| "step": 284500 |
| }, |
| { |
| "base_loss": 0.6037889505624772, |
| "epoch": 4.0667572021484375, |
| "grad_norm": 0.0011039386736229062, |
| "learning_rate": 2.2820377349853518e-05, |
| "lookahead_loss": 6.318099370002747, |
| "loss": 0.6115, |
| "step": 285000 |
| }, |
| { |
| "epoch": 4.0667572021484375, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.360919896025246, |
| "eval_lookahead_perplexity": 578.7785276521714, |
| "eval_loss": 0.2072950154542923, |
| "eval_perplexity": 1.2303454891784265, |
| "eval_runtime": 89.1026, |
| "eval_samples_per_second": 56.115, |
| "eval_steps_per_second": 1.762, |
| "step": 285000 |
| }, |
| { |
| "base_loss": 0.6186125885248184, |
| "epoch": 4.067710876464844, |
| "grad_norm": 0.0011432436294853687, |
| "learning_rate": 2.2772693634033205e-05, |
| "lookahead_loss": 6.3883231830596925, |
| "loss": 0.6282, |
| "step": 285500 |
| }, |
| { |
| "base_loss": 0.5985263943076133, |
| "epoch": 4.06866455078125, |
| "grad_norm": 0.001086168922483921, |
| "learning_rate": 2.272500991821289e-05, |
| "lookahead_loss": 6.344208096981048, |
| "loss": 0.609, |
| "step": 286000 |
| }, |
| { |
| "base_loss": 0.5856241980195045, |
| "epoch": 4.069618225097656, |
| "grad_norm": 0.001086947857402265, |
| "learning_rate": 2.2677326202392578e-05, |
| "lookahead_loss": 6.384473464012146, |
| "loss": 0.5994, |
| "step": 286500 |
| }, |
| { |
| "base_loss": 0.615737675666809, |
| "epoch": 4.0705718994140625, |
| "grad_norm": 0.0011107822647318244, |
| "learning_rate": 2.2629642486572265e-05, |
| "lookahead_loss": 6.289638694763184, |
| "loss": 0.628, |
| "step": 287000 |
| }, |
| { |
| "base_loss": 0.5986395262479782, |
| "epoch": 4.071525573730469, |
| "grad_norm": 0.0010868743993341923, |
| "learning_rate": 2.2581958770751955e-05, |
| "lookahead_loss": 6.324786671638488, |
| "loss": 0.6105, |
| "step": 287500 |
| }, |
| { |
| "base_loss": 0.5934888973236084, |
| "epoch": 4.072479248046875, |
| "grad_norm": 0.0011527807218953967, |
| "learning_rate": 2.2534275054931642e-05, |
| "lookahead_loss": 6.377863648414611, |
| "loss": 0.6042, |
| "step": 288000 |
| }, |
| { |
| "base_loss": 0.5941547375321389, |
| "epoch": 4.073432922363281, |
| "grad_norm": 0.0010451872367411852, |
| "learning_rate": 2.248659133911133e-05, |
| "lookahead_loss": 6.336130671501159, |
| "loss": 0.6062, |
| "step": 288500 |
| }, |
| { |
| "base_loss": 0.6138048614859581, |
| "epoch": 4.0743865966796875, |
| "grad_norm": 0.00108580372761935, |
| "learning_rate": 2.2438907623291016e-05, |
| "lookahead_loss": 6.343031913757324, |
| "loss": 0.6243, |
| "step": 289000 |
| }, |
| { |
| "base_loss": 0.5931758234500885, |
| "epoch": 4.075340270996094, |
| "grad_norm": 0.001116293016821146, |
| "learning_rate": 2.2391223907470703e-05, |
| "lookahead_loss": 6.409209857940674, |
| "loss": 0.6072, |
| "step": 289500 |
| }, |
| { |
| "base_loss": 0.587039683163166, |
| "epoch": 4.0762939453125, |
| "grad_norm": 0.0010742460144683719, |
| "learning_rate": 2.2343540191650393e-05, |
| "lookahead_loss": 6.3832610340118405, |
| "loss": 0.5971, |
| "step": 290000 |
| }, |
| { |
| "epoch": 4.0762939453125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.3556131493931, |
| "eval_lookahead_perplexity": 575.7152319059157, |
| "eval_loss": 0.20728513598442078, |
| "eval_perplexity": 1.2303333340772777, |
| "eval_runtime": 92.8001, |
| "eval_samples_per_second": 53.879, |
| "eval_steps_per_second": 1.692, |
| "step": 290000 |
| }, |
| { |
| "base_loss": 0.6259823521375656, |
| "epoch": 4.077247619628906, |
| "grad_norm": 0.001103928778320551, |
| "learning_rate": 2.229585647583008e-05, |
| "lookahead_loss": 6.341942510604858, |
| "loss": 0.6351, |
| "step": 290500 |
| }, |
| { |
| "base_loss": 0.5940629866719246, |
| "epoch": 4.0782012939453125, |
| "grad_norm": 0.001126352814026177, |
| "learning_rate": 2.2248172760009766e-05, |
| "lookahead_loss": 6.3599296531677245, |
| "loss": 0.6092, |
| "step": 291000 |
| }, |
| { |
| "base_loss": 0.5972123501300812, |
| "epoch": 4.079154968261719, |
| "grad_norm": 0.001072005252353847, |
| "learning_rate": 2.2200489044189453e-05, |
| "lookahead_loss": 6.396831311225891, |
| "loss": 0.6064, |
| "step": 291500 |
| }, |
| { |
| "base_loss": 0.6067690544128418, |
| "epoch": 4.080108642578125, |
| "grad_norm": 0.0010752794332802296, |
| "learning_rate": 2.215280532836914e-05, |
| "lookahead_loss": 6.333174806594848, |
| "loss": 0.6189, |
| "step": 292000 |
| }, |
| { |
| "base_loss": 0.6066553748250008, |
| "epoch": 4.081062316894531, |
| "grad_norm": 0.0011793439043685794, |
| "learning_rate": 2.210512161254883e-05, |
| "lookahead_loss": 6.3659382867813115, |
| "loss": 0.618, |
| "step": 292500 |
| }, |
| { |
| "base_loss": 0.5932499123811722, |
| "epoch": 4.0820159912109375, |
| "grad_norm": 0.001059235306456685, |
| "learning_rate": 2.2057437896728517e-05, |
| "lookahead_loss": 6.378923125267029, |
| "loss": 0.6045, |
| "step": 293000 |
| }, |
| { |
| "base_loss": 0.5890687267780303, |
| "epoch": 4.082969665527344, |
| "grad_norm": 0.0011305406223982573, |
| "learning_rate": 2.2009754180908204e-05, |
| "lookahead_loss": 6.4099526739120485, |
| "loss": 0.6029, |
| "step": 293500 |
| }, |
| { |
| "base_loss": 0.6172499601840973, |
| "epoch": 4.08392333984375, |
| "grad_norm": 0.0010661403648555279, |
| "learning_rate": 2.196207046508789e-05, |
| "lookahead_loss": 6.412414843082428, |
| "loss": 0.6288, |
| "step": 294000 |
| }, |
| { |
| "base_loss": 0.5928957391381263, |
| "epoch": 4.084877014160156, |
| "grad_norm": 0.0011031859321519732, |
| "learning_rate": 2.1914386749267578e-05, |
| "lookahead_loss": 6.342724691390991, |
| "loss": 0.6048, |
| "step": 294500 |
| }, |
| { |
| "base_loss": 0.5880893425941467, |
| "epoch": 4.0858306884765625, |
| "grad_norm": 0.0011241311440244317, |
| "learning_rate": 2.1866703033447268e-05, |
| "lookahead_loss": 6.35858272600174, |
| "loss": 0.5976, |
| "step": 295000 |
| }, |
| { |
| "epoch": 4.0858306884765625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.350282313343816, |
| "eval_lookahead_perplexity": 572.6543541608802, |
| "eval_loss": 0.20727534592151642, |
| "eval_perplexity": 1.2303212890955044, |
| "eval_runtime": 88.9078, |
| "eval_samples_per_second": 56.238, |
| "eval_steps_per_second": 1.766, |
| "step": 295000 |
| }, |
| { |
| "base_loss": 0.5925973926782608, |
| "epoch": 4.086784362792969, |
| "grad_norm": 0.0011744624935090542, |
| "learning_rate": 2.1819019317626955e-05, |
| "lookahead_loss": 6.3310195207595825, |
| "loss": 0.6042, |
| "step": 295500 |
| }, |
| { |
| "base_loss": 0.6219116472601891, |
| "epoch": 4.087738037109375, |
| "grad_norm": 0.0010088298004120588, |
| "learning_rate": 2.177133560180664e-05, |
| "lookahead_loss": 6.351883282661438, |
| "loss": 0.6272, |
| "step": 296000 |
| }, |
| { |
| "base_loss": 0.5964856284856797, |
| "epoch": 4.088691711425781, |
| "grad_norm": 0.0011463849805295467, |
| "learning_rate": 2.1723651885986328e-05, |
| "lookahead_loss": 6.367171252727508, |
| "loss": 0.605, |
| "step": 296500 |
| }, |
| { |
| "base_loss": 0.599702876329422, |
| "epoch": 4.0896453857421875, |
| "grad_norm": 0.001124189468100667, |
| "learning_rate": 2.1675968170166015e-05, |
| "lookahead_loss": 6.378195665359497, |
| "loss": 0.6085, |
| "step": 297000 |
| }, |
| { |
| "base_loss": 0.5873163719773292, |
| "epoch": 4.090599060058594, |
| "grad_norm": 0.0010804467601701617, |
| "learning_rate": 2.1628284454345705e-05, |
| "lookahead_loss": 6.3505833082199095, |
| "loss": 0.5978, |
| "step": 297500 |
| }, |
| { |
| "base_loss": 0.5817358963489533, |
| "epoch": 4.091552734375, |
| "grad_norm": 0.0010557883651927114, |
| "learning_rate": 2.1580600738525392e-05, |
| "lookahead_loss": 6.3347275447845455, |
| "loss": 0.5921, |
| "step": 298000 |
| }, |
| { |
| "base_loss": 0.6144214420318603, |
| "epoch": 4.092506408691406, |
| "grad_norm": 0.0011353583540767431, |
| "learning_rate": 2.153291702270508e-05, |
| "lookahead_loss": 6.380691371917725, |
| "loss": 0.626, |
| "step": 298500 |
| }, |
| { |
| "base_loss": 0.5997614379525185, |
| "epoch": 4.0934600830078125, |
| "grad_norm": 0.001061895745806396, |
| "learning_rate": 2.1485233306884766e-05, |
| "lookahead_loss": 6.359518153190613, |
| "loss": 0.6125, |
| "step": 299000 |
| }, |
| { |
| "base_loss": 0.5837140116095543, |
| "epoch": 4.094413757324219, |
| "grad_norm": 0.0010897148167714477, |
| "learning_rate": 2.1437549591064453e-05, |
| "lookahead_loss": 6.365825653076172, |
| "loss": 0.5966, |
| "step": 299500 |
| }, |
| { |
| "base_loss": 0.5794652185440063, |
| "epoch": 4.095367431640625, |
| "grad_norm": 0.0010967223206534982, |
| "learning_rate": 2.1389865875244143e-05, |
| "lookahead_loss": 6.290959995746612, |
| "loss": 0.594, |
| "step": 300000 |
| }, |
| { |
| "epoch": 4.095367431640625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.346539011397682, |
| "eval_lookahead_perplexity": 570.5147431060077, |
| "eval_loss": 0.20726701617240906, |
| "eval_perplexity": 1.2303110408705273, |
| "eval_runtime": 92.9044, |
| "eval_samples_per_second": 53.819, |
| "eval_steps_per_second": 1.69, |
| "step": 300000 |
| }, |
| { |
| "base_loss": 0.5975050512552261, |
| "epoch": 4.096321105957031, |
| "grad_norm": 0.0011109969345852733, |
| "learning_rate": 2.134218215942383e-05, |
| "lookahead_loss": 6.353515555381775, |
| "loss": 0.6116, |
| "step": 300500 |
| }, |
| { |
| "base_loss": 0.610376767218113, |
| "epoch": 4.0972747802734375, |
| "grad_norm": 0.0011318782344460487, |
| "learning_rate": 2.1294498443603516e-05, |
| "lookahead_loss": 6.376906949996949, |
| "loss": 0.6227, |
| "step": 301000 |
| }, |
| { |
| "base_loss": 0.5961799000501633, |
| "epoch": 4.098228454589844, |
| "grad_norm": 0.0010932480217888951, |
| "learning_rate": 2.1246814727783203e-05, |
| "lookahead_loss": 6.313533523082733, |
| "loss": 0.6066, |
| "step": 301500 |
| }, |
| { |
| "base_loss": 0.5839848874211311, |
| "epoch": 4.09918212890625, |
| "grad_norm": 0.0011126038152724504, |
| "learning_rate": 2.119913101196289e-05, |
| "lookahead_loss": 6.389643818378448, |
| "loss": 0.5955, |
| "step": 302000 |
| }, |
| { |
| "base_loss": 0.5879470457434655, |
| "epoch": 4.100135803222656, |
| "grad_norm": 0.0011223671026527882, |
| "learning_rate": 2.115144729614258e-05, |
| "lookahead_loss": 6.359994841575623, |
| "loss": 0.6028, |
| "step": 302500 |
| }, |
| { |
| "base_loss": 0.6098613065481185, |
| "epoch": 4.1010894775390625, |
| "grad_norm": 0.0010963305830955505, |
| "learning_rate": 2.1103763580322267e-05, |
| "lookahead_loss": 6.410213003635406, |
| "loss": 0.6181, |
| "step": 303000 |
| }, |
| { |
| "base_loss": 0.6013668415546417, |
| "epoch": 4.102043151855469, |
| "grad_norm": 0.0011391708394512534, |
| "learning_rate": 2.1056079864501954e-05, |
| "lookahead_loss": 6.382720433235169, |
| "loss": 0.6076, |
| "step": 303500 |
| }, |
| { |
| "base_loss": 0.5899085831642151, |
| "epoch": 4.102996826171875, |
| "grad_norm": 0.0010788282379508018, |
| "learning_rate": 2.100839614868164e-05, |
| "lookahead_loss": 6.362212418556213, |
| "loss": 0.6021, |
| "step": 304000 |
| }, |
| { |
| "base_loss": 0.5915732194781304, |
| "epoch": 4.103950500488281, |
| "grad_norm": 0.0011102594435214996, |
| "learning_rate": 2.0960712432861328e-05, |
| "lookahead_loss": 6.379088652610779, |
| "loss": 0.6022, |
| "step": 304500 |
| }, |
| { |
| "base_loss": 0.6102671644687653, |
| "epoch": 4.1049041748046875, |
| "grad_norm": 0.0010900754714384675, |
| "learning_rate": 2.0913028717041018e-05, |
| "lookahead_loss": 6.325960657596588, |
| "loss": 0.6216, |
| "step": 305000 |
| }, |
| { |
| "epoch": 4.1049041748046875, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.342223271775169, |
| "eval_lookahead_perplexity": 568.0578554810108, |
| "eval_loss": 0.20725864171981812, |
| "eval_perplexity": 1.2303007377321853, |
| "eval_runtime": 91.3026, |
| "eval_samples_per_second": 54.763, |
| "eval_steps_per_second": 1.72, |
| "step": 305000 |
| }, |
| { |
| "base_loss": 0.6025243458151818, |
| "epoch": 4.105857849121094, |
| "grad_norm": 0.001037590904161334, |
| "learning_rate": 2.0865345001220705e-05, |
| "lookahead_loss": 6.338426442146301, |
| "loss": 0.6134, |
| "step": 305500 |
| }, |
| { |
| "base_loss": 0.5969890383481979, |
| "epoch": 4.1068115234375, |
| "grad_norm": 0.0010905301896855235, |
| "learning_rate": 2.081766128540039e-05, |
| "lookahead_loss": 6.4113108348846435, |
| "loss": 0.6073, |
| "step": 306000 |
| }, |
| { |
| "base_loss": 0.5838305065631867, |
| "epoch": 4.107765197753906, |
| "grad_norm": 0.0011273369891569018, |
| "learning_rate": 2.0769977569580078e-05, |
| "lookahead_loss": 6.2966307439804075, |
| "loss": 0.5932, |
| "step": 306500 |
| }, |
| { |
| "base_loss": 0.6047073290944099, |
| "epoch": 4.1087188720703125, |
| "grad_norm": 0.0011418815702199936, |
| "learning_rate": 2.0722293853759765e-05, |
| "lookahead_loss": 6.3043163280487065, |
| "loss": 0.617, |
| "step": 307000 |
| }, |
| { |
| "base_loss": 0.6081058176159859, |
| "epoch": 4.109672546386719, |
| "grad_norm": 0.0010792854009196162, |
| "learning_rate": 2.0674610137939455e-05, |
| "lookahead_loss": 6.284334290027618, |
| "loss": 0.6181, |
| "step": 307500 |
| }, |
| { |
| "base_loss": 0.5982678539156914, |
| "epoch": 4.110626220703125, |
| "grad_norm": 0.00108222512062639, |
| "learning_rate": 2.0626926422119142e-05, |
| "lookahead_loss": 6.264766163825989, |
| "loss": 0.6081, |
| "step": 308000 |
| }, |
| { |
| "base_loss": 0.5849089304804802, |
| "epoch": 4.111579895019531, |
| "grad_norm": 0.001074333442375064, |
| "learning_rate": 2.057924270629883e-05, |
| "lookahead_loss": 6.356988987922668, |
| "loss": 0.5967, |
| "step": 308500 |
| }, |
| { |
| "base_loss": 0.60502344673872, |
| "epoch": 4.1125335693359375, |
| "grad_norm": 0.0010747781489044428, |
| "learning_rate": 2.0531558990478516e-05, |
| "lookahead_loss": 6.378328808307648, |
| "loss": 0.6128, |
| "step": 309000 |
| }, |
| { |
| "base_loss": 0.6168978958129883, |
| "epoch": 4.113487243652344, |
| "grad_norm": 0.001114184153266251, |
| "learning_rate": 2.0483875274658203e-05, |
| "lookahead_loss": 6.3930044984817505, |
| "loss": 0.6281, |
| "step": 309500 |
| }, |
| { |
| "base_loss": 0.5967838813066483, |
| "epoch": 4.11444091796875, |
| "grad_norm": 0.001040627365000546, |
| "learning_rate": 2.0436191558837893e-05, |
| "lookahead_loss": 6.270927430152893, |
| "loss": 0.6069, |
| "step": 310000 |
| }, |
| { |
| "epoch": 4.11444091796875, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.338280543732567, |
| "eval_lookahead_perplexity": 565.8225673067175, |
| "eval_loss": 0.2072509527206421, |
| "eval_perplexity": 1.2302912779871946, |
| "eval_runtime": 90.8255, |
| "eval_samples_per_second": 55.051, |
| "eval_steps_per_second": 1.729, |
| "step": 310000 |
| }, |
| { |
| "base_loss": 0.5838415359258652, |
| "epoch": 4.115394592285156, |
| "grad_norm": 0.0011250053066760302, |
| "learning_rate": 2.038850784301758e-05, |
| "lookahead_loss": 6.321314858436584, |
| "loss": 0.5963, |
| "step": 310500 |
| }, |
| { |
| "base_loss": 0.6032737566232681, |
| "epoch": 4.1163482666015625, |
| "grad_norm": 0.0011572489747777581, |
| "learning_rate": 2.0340824127197266e-05, |
| "lookahead_loss": 6.336915095329284, |
| "loss": 0.6151, |
| "step": 311000 |
| }, |
| { |
| "base_loss": 0.6101697637438774, |
| "epoch": 4.117301940917969, |
| "grad_norm": 0.0011303132632747293, |
| "learning_rate": 2.0293140411376953e-05, |
| "lookahead_loss": 6.329978828430176, |
| "loss": 0.6221, |
| "step": 311500 |
| }, |
| { |
| "base_loss": 0.5998096758127213, |
| "epoch": 4.118255615234375, |
| "grad_norm": 0.0011345641687512398, |
| "learning_rate": 2.024545669555664e-05, |
| "lookahead_loss": 6.358539440155029, |
| "loss": 0.6102, |
| "step": 312000 |
| }, |
| { |
| "base_loss": 0.5844805639982223, |
| "epoch": 4.119209289550781, |
| "grad_norm": 0.001088910736143589, |
| "learning_rate": 2.019777297973633e-05, |
| "lookahead_loss": 6.361526685237885, |
| "loss": 0.5943, |
| "step": 312500 |
| }, |
| { |
| "base_loss": 0.5970172066688537, |
| "epoch": 5.000953674316406, |
| "grad_norm": 0.0011056496296077967, |
| "learning_rate": 2.0150089263916017e-05, |
| "lookahead_loss": 6.426861086845398, |
| "loss": 0.602, |
| "step": 313000 |
| }, |
| { |
| "base_loss": 0.5878908542394639, |
| "epoch": 5.0019073486328125, |
| "grad_norm": 0.0011464518029242754, |
| "learning_rate": 2.0102405548095704e-05, |
| "lookahead_loss": 6.26604482460022, |
| "loss": 0.5986, |
| "step": 313500 |
| }, |
| { |
| "base_loss": 0.6054873216748238, |
| "epoch": 5.002861022949219, |
| "grad_norm": 0.0011213821126148105, |
| "learning_rate": 2.005472183227539e-05, |
| "lookahead_loss": 6.284090840816498, |
| "loss": 0.6149, |
| "step": 314000 |
| }, |
| { |
| "base_loss": 0.6155969781279564, |
| "epoch": 5.003814697265625, |
| "grad_norm": 0.001117490348406136, |
| "learning_rate": 2.0007038116455078e-05, |
| "lookahead_loss": 6.302778920173645, |
| "loss": 0.6229, |
| "step": 314500 |
| }, |
| { |
| "base_loss": 0.6011522135734558, |
| "epoch": 5.004768371582031, |
| "grad_norm": 0.0011072148336097598, |
| "learning_rate": 1.9959354400634768e-05, |
| "lookahead_loss": 6.271711923599243, |
| "loss": 0.6083, |
| "step": 315000 |
| }, |
| { |
| "epoch": 5.004768371582031, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.3342473286028484, |
| "eval_lookahead_perplexity": 563.5450790548485, |
| "eval_loss": 0.20724281668663025, |
| "eval_perplexity": 1.2302812683362319, |
| "eval_runtime": 91.4874, |
| "eval_samples_per_second": 54.652, |
| "eval_steps_per_second": 1.716, |
| "step": 315000 |
| }, |
| { |
| "base_loss": 0.5900973889827729, |
| "epoch": 5.0057220458984375, |
| "grad_norm": 0.0010573536856099963, |
| "learning_rate": 1.9911670684814455e-05, |
| "lookahead_loss": 6.4039622111320496, |
| "loss": 0.5992, |
| "step": 315500 |
| }, |
| { |
| "base_loss": 0.5815674068927765, |
| "epoch": 5.006675720214844, |
| "grad_norm": 0.0011015620548278093, |
| "learning_rate": 1.986398696899414e-05, |
| "lookahead_loss": 6.252389284610748, |
| "loss": 0.5969, |
| "step": 316000 |
| }, |
| { |
| "base_loss": 0.6040794110894203, |
| "epoch": 5.00762939453125, |
| "grad_norm": 0.0011115833185613155, |
| "learning_rate": 1.9816303253173828e-05, |
| "lookahead_loss": 6.311683702468872, |
| "loss": 0.6157, |
| "step": 316500 |
| }, |
| { |
| "base_loss": 0.6009674743413925, |
| "epoch": 5.008583068847656, |
| "grad_norm": 0.0010299838613718748, |
| "learning_rate": 1.9768619537353515e-05, |
| "lookahead_loss": 6.318372031211853, |
| "loss": 0.6064, |
| "step": 317000 |
| }, |
| { |
| "base_loss": 0.5907457684278488, |
| "epoch": 5.0095367431640625, |
| "grad_norm": 0.0011498293606564403, |
| "learning_rate": 1.9720935821533205e-05, |
| "lookahead_loss": 6.334591036319733, |
| "loss": 0.6071, |
| "step": 317500 |
| }, |
| { |
| "base_loss": 0.5929673384428025, |
| "epoch": 5.010490417480469, |
| "grad_norm": 0.0011196956038475037, |
| "learning_rate": 1.9673252105712892e-05, |
| "lookahead_loss": 6.270245428085327, |
| "loss": 0.6046, |
| "step": 318000 |
| }, |
| { |
| "base_loss": 0.5895965065360069, |
| "epoch": 5.011444091796875, |
| "grad_norm": 0.001152197364717722, |
| "learning_rate": 1.962556838989258e-05, |
| "lookahead_loss": 6.298059554100036, |
| "loss": 0.6054, |
| "step": 318500 |
| }, |
| { |
| "base_loss": 0.6096455634236336, |
| "epoch": 5.012397766113281, |
| "grad_norm": 0.0010954708559438586, |
| "learning_rate": 1.9577884674072266e-05, |
| "lookahead_loss": 6.312456937789917, |
| "loss": 0.6216, |
| "step": 319000 |
| }, |
| { |
| "base_loss": 0.5980722559094429, |
| "epoch": 5.0133514404296875, |
| "grad_norm": 0.0011161410948261619, |
| "learning_rate": 1.9530200958251953e-05, |
| "lookahead_loss": 6.37580472278595, |
| "loss": 0.6097, |
| "step": 319500 |
| }, |
| { |
| "base_loss": 0.5950591211915016, |
| "epoch": 5.014305114746094, |
| "grad_norm": 0.0011130195343866944, |
| "learning_rate": 1.9482517242431643e-05, |
| "lookahead_loss": 6.353333437919617, |
| "loss": 0.6077, |
| "step": 320000 |
| }, |
| { |
| "epoch": 5.014305114746094, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.33025686657086, |
| "eval_lookahead_perplexity": 561.3007547372324, |
| "eval_loss": 0.20723474025726318, |
| "eval_perplexity": 1.2302713320965912, |
| "eval_runtime": 92.6732, |
| "eval_samples_per_second": 53.953, |
| "eval_steps_per_second": 1.694, |
| "step": 320000 |
| }, |
| { |
| "base_loss": 0.5842609284520149, |
| "epoch": 5.0152587890625, |
| "grad_norm": 0.0011031727772206068, |
| "learning_rate": 1.943483352661133e-05, |
| "lookahead_loss": 6.298804305553436, |
| "loss": 0.5893, |
| "step": 320500 |
| }, |
| { |
| "base_loss": 0.6108002681136131, |
| "epoch": 5.016212463378906, |
| "grad_norm": 0.0011222651228308678, |
| "learning_rate": 1.9387149810791016e-05, |
| "lookahead_loss": 6.338112614154816, |
| "loss": 0.6178, |
| "step": 321000 |
| }, |
| { |
| "base_loss": 0.6050938671827316, |
| "epoch": 5.0171661376953125, |
| "grad_norm": 0.0011011798633262515, |
| "learning_rate": 1.9339466094970703e-05, |
| "lookahead_loss": 6.3767892370223995, |
| "loss": 0.6101, |
| "step": 321500 |
| }, |
| { |
| "base_loss": 0.59214752471447, |
| "epoch": 5.018119812011719, |
| "grad_norm": 0.0010596738429740071, |
| "learning_rate": 1.929178237915039e-05, |
| "lookahead_loss": 6.373836220741272, |
| "loss": 0.6019, |
| "step": 322000 |
| }, |
| { |
| "base_loss": 0.5841910520792007, |
| "epoch": 5.019073486328125, |
| "grad_norm": 0.0010723528685048223, |
| "learning_rate": 1.924409866333008e-05, |
| "lookahead_loss": 6.397507895469666, |
| "loss": 0.5966, |
| "step": 322500 |
| }, |
| { |
| "base_loss": 0.5900825787782669, |
| "epoch": 5.020027160644531, |
| "grad_norm": 0.0011345201637595892, |
| "learning_rate": 1.9196414947509767e-05, |
| "lookahead_loss": 6.252168926239014, |
| "loss": 0.6035, |
| "step": 323000 |
| }, |
| { |
| "base_loss": 0.6143665207028389, |
| "epoch": 5.0209808349609375, |
| "grad_norm": 0.001077457214705646, |
| "learning_rate": 1.9148731231689454e-05, |
| "lookahead_loss": 6.304201687812805, |
| "loss": 0.6244, |
| "step": 323500 |
| }, |
| { |
| "base_loss": 0.5979527611136436, |
| "epoch": 5.021934509277344, |
| "grad_norm": 0.0011455104686319828, |
| "learning_rate": 1.910104751586914e-05, |
| "lookahead_loss": 6.293455883979798, |
| "loss": 0.609, |
| "step": 324000 |
| }, |
| { |
| "base_loss": 0.5961930236816406, |
| "epoch": 5.02288818359375, |
| "grad_norm": 0.001122178859077394, |
| "learning_rate": 1.9053363800048828e-05, |
| "lookahead_loss": 6.3198678956031795, |
| "loss": 0.6044, |
| "step": 324500 |
| }, |
| { |
| "base_loss": 0.5978932146430016, |
| "epoch": 5.023841857910156, |
| "grad_norm": 0.0011242764303460717, |
| "learning_rate": 1.9005680084228518e-05, |
| "lookahead_loss": 6.288296319961548, |
| "loss": 0.6048, |
| "step": 325000 |
| }, |
| { |
| "epoch": 5.023841857910156, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.3256560042262455, |
| "eval_lookahead_perplexity": 558.7242189190645, |
| "eval_loss": 0.20722579956054688, |
| "eval_perplexity": 1.2302603326629036, |
| "eval_runtime": 89.5384, |
| "eval_samples_per_second": 55.842, |
| "eval_steps_per_second": 1.753, |
| "step": 325000 |
| }, |
| { |
| "base_loss": 0.6122328286170959, |
| "epoch": 5.0247955322265625, |
| "grad_norm": 0.0010839574970304966, |
| "learning_rate": 1.8957996368408205e-05, |
| "lookahead_loss": 6.295799336433411, |
| "loss": 0.6222, |
| "step": 325500 |
| }, |
| { |
| "base_loss": 0.6058134199380875, |
| "epoch": 5.025749206542969, |
| "grad_norm": 0.0010937975021079183, |
| "learning_rate": 1.891031265258789e-05, |
| "lookahead_loss": 6.256947278022766, |
| "loss": 0.6154, |
| "step": 326000 |
| }, |
| { |
| "base_loss": 0.5969057772159576, |
| "epoch": 5.026702880859375, |
| "grad_norm": 0.0011171259684488177, |
| "learning_rate": 1.8862628936767578e-05, |
| "lookahead_loss": 6.2739819231033325, |
| "loss": 0.6044, |
| "step": 326500 |
| }, |
| { |
| "base_loss": 0.5948947068452836, |
| "epoch": 5.027656555175781, |
| "grad_norm": 0.0010930420830845833, |
| "learning_rate": 1.8814945220947265e-05, |
| "lookahead_loss": 6.3927841548919675, |
| "loss": 0.6051, |
| "step": 327000 |
| }, |
| { |
| "base_loss": 0.6182447550296784, |
| "epoch": 5.0286102294921875, |
| "grad_norm": 0.001109077362343669, |
| "learning_rate": 1.8767261505126955e-05, |
| "lookahead_loss": 6.409919787406921, |
| "loss": 0.6284, |
| "step": 327500 |
| }, |
| { |
| "base_loss": 0.5978566119670868, |
| "epoch": 5.029563903808594, |
| "grad_norm": 0.0010880293557420373, |
| "learning_rate": 1.8719577789306642e-05, |
| "lookahead_loss": 6.346259602069855, |
| "loss": 0.6094, |
| "step": 328000 |
| }, |
| { |
| "base_loss": 0.5951992619633675, |
| "epoch": 5.030517578125, |
| "grad_norm": 0.0010963050881400704, |
| "learning_rate": 1.867189407348633e-05, |
| "lookahead_loss": 6.3438457818031315, |
| "loss": 0.6075, |
| "step": 328500 |
| }, |
| { |
| "base_loss": 0.5904952138662338, |
| "epoch": 5.031471252441406, |
| "grad_norm": 0.0010976595804095268, |
| "learning_rate": 1.8624210357666016e-05, |
| "lookahead_loss": 6.359939827442169, |
| "loss": 0.6039, |
| "step": 329000 |
| }, |
| { |
| "base_loss": 0.6122912662625313, |
| "epoch": 5.0324249267578125, |
| "grad_norm": 0.0011258937884122133, |
| "learning_rate": 1.8576526641845703e-05, |
| "lookahead_loss": 6.307325168609619, |
| "loss": 0.6268, |
| "step": 329500 |
| }, |
| { |
| "base_loss": 0.5966172614097596, |
| "epoch": 5.033378601074219, |
| "grad_norm": 0.0011488485615700483, |
| "learning_rate": 1.8528842926025393e-05, |
| "lookahead_loss": 6.344880403518677, |
| "loss": 0.6075, |
| "step": 330000 |
| }, |
| { |
| "epoch": 5.033378601074219, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.322288970216014, |
| "eval_lookahead_perplexity": 556.8461390253646, |
| "eval_loss": 0.20721931755542755, |
| "eval_perplexity": 1.2302523581349747, |
| "eval_runtime": 91.3128, |
| "eval_samples_per_second": 54.757, |
| "eval_steps_per_second": 1.719, |
| "step": 330000 |
| }, |
| { |
| "base_loss": 0.5913965013027191, |
| "epoch": 5.034332275390625, |
| "grad_norm": 0.0010760932927951217, |
| "learning_rate": 1.848115921020508e-05, |
| "lookahead_loss": 6.418337035179138, |
| "loss": 0.6054, |
| "step": 330500 |
| }, |
| { |
| "base_loss": 0.5880823624134064, |
| "epoch": 5.035285949707031, |
| "grad_norm": 0.0011206314666196704, |
| "learning_rate": 1.8433475494384766e-05, |
| "lookahead_loss": 6.257230742454529, |
| "loss": 0.6021, |
| "step": 331000 |
| }, |
| { |
| "base_loss": 0.6115914248228073, |
| "epoch": 5.0362396240234375, |
| "grad_norm": 0.0011233886471018195, |
| "learning_rate": 1.8385791778564453e-05, |
| "lookahead_loss": 6.371477304935455, |
| "loss": 0.6215, |
| "step": 331500 |
| }, |
| { |
| "base_loss": 0.6013447298407555, |
| "epoch": 5.037193298339844, |
| "grad_norm": 0.001142382388934493, |
| "learning_rate": 1.833810806274414e-05, |
| "lookahead_loss": 6.323221492767334, |
| "loss": 0.6115, |
| "step": 332000 |
| }, |
| { |
| "base_loss": 0.5888082583546639, |
| "epoch": 5.03814697265625, |
| "grad_norm": 0.0010905246017500758, |
| "learning_rate": 1.829042434692383e-05, |
| "lookahead_loss": 6.323706493854523, |
| "loss": 0.6011, |
| "step": 332500 |
| }, |
| { |
| "base_loss": 0.5994124051332473, |
| "epoch": 5.039100646972656, |
| "grad_norm": 0.0011082935379818082, |
| "learning_rate": 1.8242740631103517e-05, |
| "lookahead_loss": 6.297938917160034, |
| "loss": 0.6109, |
| "step": 333000 |
| }, |
| { |
| "base_loss": 0.6140200459361076, |
| "epoch": 5.0400543212890625, |
| "grad_norm": 0.0011177349369972944, |
| "learning_rate": 1.8195056915283204e-05, |
| "lookahead_loss": 6.32886435174942, |
| "loss": 0.6233, |
| "step": 333500 |
| }, |
| { |
| "base_loss": 0.6008699499964714, |
| "epoch": 5.041007995605469, |
| "grad_norm": 0.0011120132403448224, |
| "learning_rate": 1.814737319946289e-05, |
| "lookahead_loss": 6.274889605998993, |
| "loss": 0.6121, |
| "step": 334000 |
| }, |
| { |
| "base_loss": 0.576877062678337, |
| "epoch": 5.041961669921875, |
| "grad_norm": 0.001127906609326601, |
| "learning_rate": 1.8099689483642578e-05, |
| "lookahead_loss": 6.331388182640076, |
| "loss": 0.5947, |
| "step": 334500 |
| }, |
| { |
| "base_loss": 0.6040148069858551, |
| "epoch": 5.042915344238281, |
| "grad_norm": 0.0010664901928976178, |
| "learning_rate": 1.8052005767822268e-05, |
| "lookahead_loss": 6.35973362827301, |
| "loss": 0.6143, |
| "step": 335000 |
| }, |
| { |
| "epoch": 5.042915344238281, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.3188232392929615, |
| "eval_lookahead_perplexity": 554.9196005023798, |
| "eval_loss": 0.20721259713172913, |
| "eval_perplexity": 1.2302440903456535, |
| "eval_runtime": 91.4885, |
| "eval_samples_per_second": 54.652, |
| "eval_steps_per_second": 1.716, |
| "step": 335000 |
| }, |
| { |
| "base_loss": 0.6138452437520027, |
| "epoch": 5.0438690185546875, |
| "grad_norm": 0.0011214982951059937, |
| "learning_rate": 1.8004322052001955e-05, |
| "lookahead_loss": 6.382272615909576, |
| "loss": 0.6257, |
| "step": 335500 |
| }, |
| { |
| "base_loss": 0.5906766360402107, |
| "epoch": 5.044822692871094, |
| "grad_norm": 0.00112924596760422, |
| "learning_rate": 1.795663833618164e-05, |
| "lookahead_loss": 6.318922083854675, |
| "loss": 0.6028, |
| "step": 336000 |
| }, |
| { |
| "base_loss": 0.5900976023674012, |
| "epoch": 5.0457763671875, |
| "grad_norm": 0.0010961294174194336, |
| "learning_rate": 1.7908954620361328e-05, |
| "lookahead_loss": 6.324168976783753, |
| "loss": 0.6009, |
| "step": 336500 |
| }, |
| { |
| "base_loss": 0.6200715956091881, |
| "epoch": 5.046730041503906, |
| "grad_norm": 0.0010944355744868517, |
| "learning_rate": 1.7861270904541015e-05, |
| "lookahead_loss": 6.278039996147156, |
| "loss": 0.6312, |
| "step": 337000 |
| }, |
| { |
| "base_loss": 0.6041342921853066, |
| "epoch": 5.0476837158203125, |
| "grad_norm": 0.0011535694357007742, |
| "learning_rate": 1.7813587188720705e-05, |
| "lookahead_loss": 6.33254317522049, |
| "loss": 0.6221, |
| "step": 337500 |
| }, |
| { |
| "base_loss": 0.5885805570483208, |
| "epoch": 5.048637390136719, |
| "grad_norm": 0.0010927009861916304, |
| "learning_rate": 1.7765903472900392e-05, |
| "lookahead_loss": 6.311563837051391, |
| "loss": 0.6008, |
| "step": 338000 |
| }, |
| { |
| "base_loss": 0.5990964626669883, |
| "epoch": 5.049591064453125, |
| "grad_norm": 0.001139697851613164, |
| "learning_rate": 1.771821975708008e-05, |
| "lookahead_loss": 6.277810357570648, |
| "loss": 0.6072, |
| "step": 338500 |
| }, |
| { |
| "base_loss": 0.6159806163311005, |
| "epoch": 5.050544738769531, |
| "grad_norm": 0.0010485260281711817, |
| "learning_rate": 1.7670536041259766e-05, |
| "lookahead_loss": 6.383065185070038, |
| "loss": 0.6273, |
| "step": 339000 |
| }, |
| { |
| "base_loss": 0.6002469740509987, |
| "epoch": 5.0514984130859375, |
| "grad_norm": 0.0011513761710375547, |
| "learning_rate": 1.7622852325439453e-05, |
| "lookahead_loss": 6.340414535522461, |
| "loss": 0.6113, |
| "step": 339500 |
| }, |
| { |
| "base_loss": 0.5939902824163437, |
| "epoch": 5.052452087402344, |
| "grad_norm": 0.0011352116707712412, |
| "learning_rate": 1.7575168609619143e-05, |
| "lookahead_loss": 6.261112779617309, |
| "loss": 0.6055, |
| "step": 340000 |
| }, |
| { |
| "epoch": 5.052452087402344, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.316136239054866, |
| "eval_lookahead_perplexity": 553.4305328621764, |
| "eval_loss": 0.20720693469047546, |
| "eval_perplexity": 1.230237124180487, |
| "eval_runtime": 90.8802, |
| "eval_samples_per_second": 55.017, |
| "eval_steps_per_second": 1.728, |
| "step": 340000 |
| }, |
| { |
| "base_loss": 0.6115701840519905, |
| "epoch": 5.05340576171875, |
| "grad_norm": 0.0011188328498974442, |
| "learning_rate": 1.752748489379883e-05, |
| "lookahead_loss": 6.279942854881287, |
| "loss": 0.6192, |
| "step": 340500 |
| }, |
| { |
| "base_loss": 0.6067719753980637, |
| "epoch": 5.054359436035156, |
| "grad_norm": 0.0010936488397419453, |
| "learning_rate": 1.7479801177978516e-05, |
| "lookahead_loss": 6.320557628631592, |
| "loss": 0.6227, |
| "step": 341000 |
| }, |
| { |
| "base_loss": 0.594871651172638, |
| "epoch": 5.0553131103515625, |
| "grad_norm": 0.0011088603641837835, |
| "learning_rate": 1.7432117462158203e-05, |
| "lookahead_loss": 6.32833381986618, |
| "loss": 0.6045, |
| "step": 341500 |
| }, |
| { |
| "base_loss": 0.5912774945497513, |
| "epoch": 5.056266784667969, |
| "grad_norm": 0.0010413776617497206, |
| "learning_rate": 1.738443374633789e-05, |
| "lookahead_loss": 6.348275181770325, |
| "loss": 0.6037, |
| "step": 342000 |
| }, |
| { |
| "base_loss": 0.6153999509811401, |
| "epoch": 5.057220458984375, |
| "grad_norm": 0.0010915326420217752, |
| "learning_rate": 1.733675003051758e-05, |
| "lookahead_loss": 6.3401120119094845, |
| "loss": 0.6279, |
| "step": 342500 |
| }, |
| { |
| "base_loss": 0.6071836153268814, |
| "epoch": 5.058174133300781, |
| "grad_norm": 0.001087325974367559, |
| "learning_rate": 1.7289066314697267e-05, |
| "lookahead_loss": 6.34826420211792, |
| "loss": 0.6148, |
| "step": 343000 |
| }, |
| { |
| "base_loss": 0.5827829704880715, |
| "epoch": 5.0591278076171875, |
| "grad_norm": 0.001092458376660943, |
| "learning_rate": 1.7241382598876954e-05, |
| "lookahead_loss": 6.233663388252259, |
| "loss": 0.5998, |
| "step": 343500 |
| }, |
| { |
| "base_loss": 0.5933170615434646, |
| "epoch": 5.060081481933594, |
| "grad_norm": 0.0011025206185877323, |
| "learning_rate": 1.719369888305664e-05, |
| "lookahead_loss": 6.321156049728393, |
| "loss": 0.6051, |
| "step": 344000 |
| }, |
| { |
| "base_loss": 0.6054021108746529, |
| "epoch": 5.06103515625, |
| "grad_norm": 0.0011285829823464155, |
| "learning_rate": 1.7146015167236328e-05, |
| "lookahead_loss": 6.259889480590821, |
| "loss": 0.6174, |
| "step": 344500 |
| }, |
| { |
| "base_loss": 0.6035048805475235, |
| "epoch": 5.061988830566406, |
| "grad_norm": 0.0011320598423480988, |
| "learning_rate": 1.7098331451416018e-05, |
| "lookahead_loss": 6.293309184074402, |
| "loss": 0.6154, |
| "step": 345000 |
| }, |
| { |
| "epoch": 5.061988830566406, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.312484579726149, |
| "eval_lookahead_perplexity": 551.4132784985202, |
| "eval_loss": 0.20720021426677704, |
| "eval_perplexity": 1.2302288564935442, |
| "eval_runtime": 92.0757, |
| "eval_samples_per_second": 54.303, |
| "eval_steps_per_second": 1.705, |
| "step": 345000 |
| }, |
| { |
| "base_loss": 0.594410364151001, |
| "epoch": 5.0629425048828125, |
| "grad_norm": 0.0011401425581425428, |
| "learning_rate": 1.7050647735595705e-05, |
| "lookahead_loss": 6.305532325267792, |
| "loss": 0.6062, |
| "step": 345500 |
| }, |
| { |
| "base_loss": 0.6103508388400077, |
| "epoch": 5.063896179199219, |
| "grad_norm": 0.0010705353925004601, |
| "learning_rate": 1.700296401977539e-05, |
| "lookahead_loss": 6.314088912010193, |
| "loss": 0.6222, |
| "step": 346000 |
| }, |
| { |
| "base_loss": 0.606827544927597, |
| "epoch": 5.064849853515625, |
| "grad_norm": 0.0010943631641566753, |
| "learning_rate": 1.6955280303955078e-05, |
| "lookahead_loss": 6.355545690059662, |
| "loss": 0.6154, |
| "step": 346500 |
| }, |
| { |
| "base_loss": 0.6025100236535073, |
| "epoch": 5.065803527832031, |
| "grad_norm": 0.0011316005839034915, |
| "learning_rate": 1.6907596588134765e-05, |
| "lookahead_loss": 6.270945949554443, |
| "loss": 0.6118, |
| "step": 347000 |
| }, |
| { |
| "base_loss": 0.6017893969416618, |
| "epoch": 5.0667572021484375, |
| "grad_norm": 0.0011118698166683316, |
| "learning_rate": 1.6859912872314455e-05, |
| "lookahead_loss": 6.260629506111145, |
| "loss": 0.61, |
| "step": 347500 |
| }, |
| { |
| "base_loss": 0.6166568307876586, |
| "epoch": 5.067710876464844, |
| "grad_norm": 0.0011616123374551535, |
| "learning_rate": 1.6812229156494142e-05, |
| "lookahead_loss": 6.3263319902420045, |
| "loss": 0.6271, |
| "step": 348000 |
| }, |
| { |
| "base_loss": 0.5990218588709831, |
| "epoch": 5.06866455078125, |
| "grad_norm": 0.0010879425099119544, |
| "learning_rate": 1.676454544067383e-05, |
| "lookahead_loss": 6.2897031512260435, |
| "loss": 0.6077, |
| "step": 348500 |
| }, |
| { |
| "base_loss": 0.5863162516951561, |
| "epoch": 5.069618225097656, |
| "grad_norm": 0.0011173348175361753, |
| "learning_rate": 1.6716861724853516e-05, |
| "lookahead_loss": 6.335295492649078, |
| "loss": 0.6, |
| "step": 349000 |
| }, |
| { |
| "base_loss": 0.6140699430108071, |
| "epoch": 5.0705718994140625, |
| "grad_norm": 0.0011209336807951331, |
| "learning_rate": 1.6669178009033203e-05, |
| "lookahead_loss": 6.232721103191376, |
| "loss": 0.6271, |
| "step": 349500 |
| }, |
| { |
| "base_loss": 0.6004664080142975, |
| "epoch": 5.071525573730469, |
| "grad_norm": 0.0011067437008023262, |
| "learning_rate": 1.6621494293212893e-05, |
| "lookahead_loss": 6.271895239830017, |
| "loss": 0.6104, |
| "step": 350000 |
| }, |
| { |
| "epoch": 5.071525573730469, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.309436458368271, |
| "eval_lookahead_perplexity": 549.7350629090662, |
| "eval_loss": 0.20719435811042786, |
| "eval_perplexity": 1.2302216521021103, |
| "eval_runtime": 90.8003, |
| "eval_samples_per_second": 55.066, |
| "eval_steps_per_second": 1.729, |
| "step": 350000 |
| }, |
| { |
| "base_loss": 0.5908818225860596, |
| "epoch": 5.072479248046875, |
| "grad_norm": 0.0011330280685797334, |
| "learning_rate": 1.657381057739258e-05, |
| "lookahead_loss": 6.328438294410706, |
| "loss": 0.6029, |
| "step": 350500 |
| }, |
| { |
| "base_loss": 0.5954542902112007, |
| "epoch": 5.073432922363281, |
| "grad_norm": 0.0010515834437683225, |
| "learning_rate": 1.6526126861572266e-05, |
| "lookahead_loss": 6.284423396110535, |
| "loss": 0.6062, |
| "step": 351000 |
| }, |
| { |
| "base_loss": 0.6109695326685906, |
| "epoch": 5.0743865966796875, |
| "grad_norm": 0.0011074411449953914, |
| "learning_rate": 1.6478443145751953e-05, |
| "lookahead_loss": 6.297104268074036, |
| "loss": 0.6225, |
| "step": 351500 |
| }, |
| { |
| "base_loss": 0.5943767395019531, |
| "epoch": 5.075340270996094, |
| "grad_norm": 0.0011059824610128999, |
| "learning_rate": 1.643075942993164e-05, |
| "lookahead_loss": 6.358781456947327, |
| "loss": 0.6067, |
| "step": 352000 |
| }, |
| { |
| "base_loss": 0.5842889928817749, |
| "epoch": 5.0762939453125, |
| "grad_norm": 0.0010608715238049626, |
| "learning_rate": 1.638307571411133e-05, |
| "lookahead_loss": 6.328452944278717, |
| "loss": 0.5965, |
| "step": 352500 |
| }, |
| { |
| "base_loss": 0.6252594041824341, |
| "epoch": 5.077247619628906, |
| "grad_norm": 0.0010822336189448833, |
| "learning_rate": 1.6335391998291017e-05, |
| "lookahead_loss": 6.299285487651825, |
| "loss": 0.6364, |
| "step": 353000 |
| }, |
| { |
| "base_loss": 0.5977307210564613, |
| "epoch": 5.0782012939453125, |
| "grad_norm": 0.0011410461738705635, |
| "learning_rate": 1.6287708282470704e-05, |
| "lookahead_loss": 6.318628603935242, |
| "loss": 0.6125, |
| "step": 353500 |
| }, |
| { |
| "base_loss": 0.5993528968691826, |
| "epoch": 5.079154968261719, |
| "grad_norm": 0.0010777156567201018, |
| "learning_rate": 1.624002456665039e-05, |
| "lookahead_loss": 6.336204090118408, |
| "loss": 0.6074, |
| "step": 354000 |
| }, |
| { |
| "base_loss": 0.610366469681263, |
| "epoch": 5.080108642578125, |
| "grad_norm": 0.0010876890737563372, |
| "learning_rate": 1.6192340850830078e-05, |
| "lookahead_loss": 6.294617042541504, |
| "loss": 0.6205, |
| "step": 354500 |
| }, |
| { |
| "base_loss": 0.6119058151245117, |
| "epoch": 5.081062316894531, |
| "grad_norm": 0.0011599212884902954, |
| "learning_rate": 1.6144657135009768e-05, |
| "lookahead_loss": 6.320651604652404, |
| "loss": 0.6214, |
| "step": 355000 |
| }, |
| { |
| "epoch": 5.081062316894531, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.305697089947832, |
| "eval_lookahead_perplexity": 547.6832396276725, |
| "eval_loss": 0.20718762278556824, |
| "eval_perplexity": 1.2302133661875383, |
| "eval_runtime": 92.0429, |
| "eval_samples_per_second": 54.323, |
| "eval_steps_per_second": 1.706, |
| "step": 355000 |
| }, |
| { |
| "base_loss": 0.5920207235217094, |
| "epoch": 5.0820159912109375, |
| "grad_norm": 0.0010734680108726025, |
| "learning_rate": 1.6096973419189455e-05, |
| "lookahead_loss": 6.332855414390564, |
| "loss": 0.6048, |
| "step": 355500 |
| }, |
| { |
| "base_loss": 0.5911735916733741, |
| "epoch": 5.082969665527344, |
| "grad_norm": 0.0011140975402668118, |
| "learning_rate": 1.604928970336914e-05, |
| "lookahead_loss": 6.363209072589874, |
| "loss": 0.6043, |
| "step": 356000 |
| }, |
| { |
| "base_loss": 0.615800891816616, |
| "epoch": 5.08392333984375, |
| "grad_norm": 0.0010806769132614136, |
| "learning_rate": 1.6001605987548828e-05, |
| "lookahead_loss": 6.358828454494477, |
| "loss": 0.6262, |
| "step": 356500 |
| }, |
| { |
| "base_loss": 0.594821957230568, |
| "epoch": 5.084877014160156, |
| "grad_norm": 0.001102195936255157, |
| "learning_rate": 1.5953922271728515e-05, |
| "lookahead_loss": 6.303458571910858, |
| "loss": 0.6046, |
| "step": 357000 |
| }, |
| { |
| "base_loss": 0.5869647584557534, |
| "epoch": 5.0858306884765625, |
| "grad_norm": 0.0011355791939422488, |
| "learning_rate": 1.5906238555908205e-05, |
| "lookahead_loss": 6.312077233791351, |
| "loss": 0.5982, |
| "step": 357500 |
| }, |
| { |
| "base_loss": 0.5933388795256614, |
| "epoch": 5.086784362792969, |
| "grad_norm": 0.0011941350530833006, |
| "learning_rate": 1.5858554840087892e-05, |
| "lookahead_loss": 6.28103159236908, |
| "loss": 0.6038, |
| "step": 358000 |
| }, |
| { |
| "base_loss": 0.6257667993307113, |
| "epoch": 5.087738037109375, |
| "grad_norm": 0.0010375409619882703, |
| "learning_rate": 1.581087112426758e-05, |
| "lookahead_loss": 6.311352049827575, |
| "loss": 0.6301, |
| "step": 358500 |
| }, |
| { |
| "base_loss": 0.5958490890860557, |
| "epoch": 5.088691711425781, |
| "grad_norm": 0.0011225225171074271, |
| "learning_rate": 1.5763187408447266e-05, |
| "lookahead_loss": 6.311702779769897, |
| "loss": 0.6044, |
| "step": 359000 |
| }, |
| { |
| "base_loss": 0.5977150818109512, |
| "epoch": 5.0896453857421875, |
| "grad_norm": 0.0011090969201177359, |
| "learning_rate": 1.5715503692626953e-05, |
| "lookahead_loss": 6.333823210716248, |
| "loss": 0.6073, |
| "step": 359500 |
| }, |
| { |
| "base_loss": 0.5886843089461327, |
| "epoch": 5.090599060058594, |
| "grad_norm": 0.0010649607283994555, |
| "learning_rate": 1.5667819976806643e-05, |
| "lookahead_loss": 6.307887722015381, |
| "loss": 0.6011, |
| "step": 360000 |
| }, |
| { |
| "epoch": 5.090599060058594, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.30245965738266, |
| "eval_lookahead_perplexity": 545.9130191032252, |
| "eval_loss": 0.2071814388036728, |
| "eval_perplexity": 1.2302057585938768, |
| "eval_runtime": 91.4549, |
| "eval_samples_per_second": 54.672, |
| "eval_steps_per_second": 1.717, |
| "step": 360000 |
| }, |
| { |
| "base_loss": 0.5824657415151596, |
| "epoch": 5.091552734375, |
| "grad_norm": 0.0010494901798665524, |
| "learning_rate": 1.562013626098633e-05, |
| "lookahead_loss": 6.287062782287598, |
| "loss": 0.5936, |
| "step": 360500 |
| }, |
| { |
| "base_loss": 0.6174823232293128, |
| "epoch": 5.092506408691406, |
| "grad_norm": 0.0011219014413654804, |
| "learning_rate": 1.5572452545166016e-05, |
| "lookahead_loss": 6.3347588586807255, |
| "loss": 0.6246, |
| "step": 361000 |
| }, |
| { |
| "base_loss": 0.5997858379483223, |
| "epoch": 5.0934600830078125, |
| "grad_norm": 0.0010824851924553514, |
| "learning_rate": 1.5524768829345703e-05, |
| "lookahead_loss": 6.319412599563599, |
| "loss": 0.6122, |
| "step": 361500 |
| }, |
| { |
| "base_loss": 0.5813568589091301, |
| "epoch": 5.094413757324219, |
| "grad_norm": 0.001104526687413454, |
| "learning_rate": 1.547708511352539e-05, |
| "lookahead_loss": 6.322312892436981, |
| "loss": 0.5951, |
| "step": 362000 |
| }, |
| { |
| "base_loss": 0.5759395672678947, |
| "epoch": 5.095367431640625, |
| "grad_norm": 0.001081343274563551, |
| "learning_rate": 1.542940139770508e-05, |
| "lookahead_loss": 6.258695254802704, |
| "loss": 0.5915, |
| "step": 362500 |
| }, |
| { |
| "base_loss": 0.5969497102499008, |
| "epoch": 5.096321105957031, |
| "grad_norm": 0.0011133088264614344, |
| "learning_rate": 1.5381717681884767e-05, |
| "lookahead_loss": 6.303308537006378, |
| "loss": 0.61, |
| "step": 363000 |
| }, |
| { |
| "base_loss": 0.6147504753470421, |
| "epoch": 5.0972747802734375, |
| "grad_norm": 0.001123305642977357, |
| "learning_rate": 1.5334033966064454e-05, |
| "lookahead_loss": 6.34167172908783, |
| "loss": 0.6241, |
| "step": 363500 |
| }, |
| { |
| "base_loss": 0.5965510091781616, |
| "epoch": 5.098228454589844, |
| "grad_norm": 0.0011167360935360193, |
| "learning_rate": 1.528635025024414e-05, |
| "lookahead_loss": 6.270585143089295, |
| "loss": 0.6065, |
| "step": 364000 |
| }, |
| { |
| "base_loss": 0.5827678750753402, |
| "epoch": 5.09918212890625, |
| "grad_norm": 0.001104156021028757, |
| "learning_rate": 1.523866653442383e-05, |
| "lookahead_loss": 6.335385172843933, |
| "loss": 0.595, |
| "step": 364500 |
| }, |
| { |
| "base_loss": 0.5870707350373268, |
| "epoch": 5.100135803222656, |
| "grad_norm": 0.0010957367485389113, |
| "learning_rate": 1.5190982818603516e-05, |
| "lookahead_loss": 6.311425636291504, |
| "loss": 0.6025, |
| "step": 365000 |
| }, |
| { |
| "epoch": 5.100135803222656, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.3001686483145525, |
| "eval_lookahead_perplexity": 544.663759005586, |
| "eval_loss": 0.20717626810073853, |
| "eval_perplexity": 1.2301993975817966, |
| "eval_runtime": 90.4079, |
| "eval_samples_per_second": 55.305, |
| "eval_steps_per_second": 1.737, |
| "step": 365000 |
| }, |
| { |
| "base_loss": 0.6111934832334518, |
| "epoch": 5.1010894775390625, |
| "grad_norm": 0.0010889478726312518, |
| "learning_rate": 1.5143299102783205e-05, |
| "lookahead_loss": 6.365397205829621, |
| "loss": 0.6214, |
| "step": 365500 |
| }, |
| { |
| "base_loss": 0.6032542023062706, |
| "epoch": 5.102043151855469, |
| "grad_norm": 0.001146532827988267, |
| "learning_rate": 1.5095615386962891e-05, |
| "lookahead_loss": 6.334895478248596, |
| "loss": 0.6076, |
| "step": 366000 |
| }, |
| { |
| "base_loss": 0.5893560016155243, |
| "epoch": 5.102996826171875, |
| "grad_norm": 0.0010869104880839586, |
| "learning_rate": 1.5047931671142578e-05, |
| "lookahead_loss": 6.309252546310425, |
| "loss": 0.6029, |
| "step": 366500 |
| }, |
| { |
| "base_loss": 0.5904992881417275, |
| "epoch": 5.103950500488281, |
| "grad_norm": 0.001103839953429997, |
| "learning_rate": 1.5000247955322267e-05, |
| "lookahead_loss": 6.336761679649353, |
| "loss": 0.6019, |
| "step": 367000 |
| }, |
| { |
| "base_loss": 0.6113997294902801, |
| "epoch": 5.1049041748046875, |
| "grad_norm": 0.0010869849938899279, |
| "learning_rate": 1.4952564239501954e-05, |
| "lookahead_loss": 6.279005553245544, |
| "loss": 0.6242, |
| "step": 367500 |
| }, |
| { |
| "base_loss": 0.6017834762334824, |
| "epoch": 5.105857849121094, |
| "grad_norm": 0.001036735251545906, |
| "learning_rate": 1.4904880523681642e-05, |
| "lookahead_loss": 6.2851017117500305, |
| "loss": 0.6133, |
| "step": 368000 |
| }, |
| { |
| "base_loss": 0.5975290570855141, |
| "epoch": 5.1068115234375, |
| "grad_norm": 0.0010710596106946468, |
| "learning_rate": 1.4857196807861329e-05, |
| "lookahead_loss": 6.347316368103027, |
| "loss": 0.6067, |
| "step": 368500 |
| }, |
| { |
| "base_loss": 0.5801187572479248, |
| "epoch": 5.107765197753906, |
| "grad_norm": 0.001139249769039452, |
| "learning_rate": 1.4809513092041016e-05, |
| "lookahead_loss": 6.257316456317902, |
| "loss": 0.5924, |
| "step": 369000 |
| }, |
| { |
| "base_loss": 0.603221082687378, |
| "epoch": 5.1087188720703125, |
| "grad_norm": 0.0011551164789125323, |
| "learning_rate": 1.4761829376220704e-05, |
| "lookahead_loss": 6.2666192288398745, |
| "loss": 0.6172, |
| "step": 369500 |
| }, |
| { |
| "base_loss": 0.606291466653347, |
| "epoch": 5.109672546386719, |
| "grad_norm": 0.001088725752197206, |
| "learning_rate": 1.4714145660400391e-05, |
| "lookahead_loss": 6.244538120269775, |
| "loss": 0.616, |
| "step": 370000 |
| }, |
| { |
| "epoch": 5.109672546386719, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.297252341200369, |
| "eval_lookahead_perplexity": 543.0776661014929, |
| "eval_loss": 0.20717072486877441, |
| "eval_perplexity": 1.2301925783200742, |
| "eval_runtime": 92.6089, |
| "eval_samples_per_second": 53.99, |
| "eval_steps_per_second": 1.695, |
| "step": 370000 |
| }, |
| { |
| "base_loss": 0.5948919664621353, |
| "epoch": 5.110626220703125, |
| "grad_norm": 0.0010841034818440676, |
| "learning_rate": 1.466646194458008e-05, |
| "lookahead_loss": 6.219464591503144, |
| "loss": 0.606, |
| "step": 370500 |
| }, |
| { |
| "base_loss": 0.5846513741016388, |
| "epoch": 5.111579895019531, |
| "grad_norm": 0.0010996430646628141, |
| "learning_rate": 1.4618778228759766e-05, |
| "lookahead_loss": 6.3166262807846065, |
| "loss": 0.597, |
| "step": 371000 |
| }, |
| { |
| "base_loss": 0.6055964279770851, |
| "epoch": 5.1125335693359375, |
| "grad_norm": 0.0010927943512797356, |
| "learning_rate": 1.4571094512939453e-05, |
| "lookahead_loss": 6.335015828132629, |
| "loss": 0.6131, |
| "step": 371500 |
| }, |
| { |
| "base_loss": 0.6173858237266541, |
| "epoch": 5.113487243652344, |
| "grad_norm": 0.0011214661644771695, |
| "learning_rate": 1.4523410797119142e-05, |
| "lookahead_loss": 6.343456150531769, |
| "loss": 0.627, |
| "step": 372000 |
| }, |
| { |
| "base_loss": 0.5966166469454766, |
| "epoch": 5.11444091796875, |
| "grad_norm": 0.0010272579966112971, |
| "learning_rate": 1.4475727081298829e-05, |
| "lookahead_loss": 6.2267381420135495, |
| "loss": 0.606, |
| "step": 372500 |
| }, |
| { |
| "base_loss": 0.5850279053449631, |
| "epoch": 5.115394592285156, |
| "grad_norm": 0.001128312898799777, |
| "learning_rate": 1.4428043365478517e-05, |
| "lookahead_loss": 6.2816207437515255, |
| "loss": 0.5965, |
| "step": 373000 |
| }, |
| { |
| "base_loss": 0.6050868096351624, |
| "epoch": 5.1163482666015625, |
| "grad_norm": 0.0011569701600819826, |
| "learning_rate": 1.4380359649658204e-05, |
| "lookahead_loss": 6.286871615409851, |
| "loss": 0.6157, |
| "step": 373500 |
| }, |
| { |
| "base_loss": 0.6095665054321289, |
| "epoch": 5.117301940917969, |
| "grad_norm": 0.0011440969537943602, |
| "learning_rate": 1.433267593383789e-05, |
| "lookahead_loss": 6.284195227622986, |
| "loss": 0.6217, |
| "step": 374000 |
| }, |
| { |
| "base_loss": 0.6041606207489967, |
| "epoch": 5.118255615234375, |
| "grad_norm": 0.0011145136086270213, |
| "learning_rate": 1.428499221801758e-05, |
| "lookahead_loss": 6.316567860603333, |
| "loss": 0.6116, |
| "step": 374500 |
| }, |
| { |
| "base_loss": 0.5852125850319863, |
| "epoch": 5.119209289550781, |
| "grad_norm": 0.001106358366087079, |
| "learning_rate": 1.4237308502197266e-05, |
| "lookahead_loss": 6.311560387611389, |
| "loss": 0.5951, |
| "step": 375000 |
| }, |
| { |
| "epoch": 5.119209289550781, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.294692088239871, |
| "eval_lookahead_perplexity": 541.689028289642, |
| "eval_loss": 0.20716556906700134, |
| "eval_perplexity": 1.2301862357073483, |
| "eval_runtime": 90.9096, |
| "eval_samples_per_second": 55.0, |
| "eval_steps_per_second": 1.727, |
| "step": 375000 |
| }, |
| { |
| "base_loss": 0.5954181181788445, |
| "epoch": 6.000953674316406, |
| "grad_norm": 0.0011187719646841288, |
| "learning_rate": 1.4189624786376955e-05, |
| "lookahead_loss": 6.385282723426819, |
| "loss": 0.6012, |
| "step": 375500 |
| }, |
| { |
| "base_loss": 0.5865643661618233, |
| "epoch": 6.0019073486328125, |
| "grad_norm": 0.0011442394461482763, |
| "learning_rate": 1.4141941070556641e-05, |
| "lookahead_loss": 6.220819156169892, |
| "loss": 0.5983, |
| "step": 376000 |
| }, |
| { |
| "base_loss": 0.6043179650306701, |
| "epoch": 6.002861022949219, |
| "grad_norm": 0.0011097525712102652, |
| "learning_rate": 1.4094257354736328e-05, |
| "lookahead_loss": 6.230756608486176, |
| "loss": 0.613, |
| "step": 376500 |
| }, |
| { |
| "base_loss": 0.6149949839115143, |
| "epoch": 6.003814697265625, |
| "grad_norm": 0.0010844056960195303, |
| "learning_rate": 1.4046573638916017e-05, |
| "lookahead_loss": 6.254981481552124, |
| "loss": 0.6229, |
| "step": 377000 |
| }, |
| { |
| "base_loss": 0.603040216743946, |
| "epoch": 6.004768371582031, |
| "grad_norm": 0.001105528324842453, |
| "learning_rate": 1.3998889923095704e-05, |
| "lookahead_loss": 6.231416835784912, |
| "loss": 0.6098, |
| "step": 377500 |
| }, |
| { |
| "base_loss": 0.590958813726902, |
| "epoch": 6.0057220458984375, |
| "grad_norm": 0.0010404631029814482, |
| "learning_rate": 1.3951206207275392e-05, |
| "lookahead_loss": 6.355237211227417, |
| "loss": 0.5985, |
| "step": 378000 |
| }, |
| { |
| "base_loss": 0.5807944664359093, |
| "epoch": 6.006675720214844, |
| "grad_norm": 0.0010901595233008265, |
| "learning_rate": 1.3903522491455079e-05, |
| "lookahead_loss": 6.2100413432121275, |
| "loss": 0.5965, |
| "step": 378500 |
| }, |
| { |
| "base_loss": 0.6047119013071061, |
| "epoch": 6.00762939453125, |
| "grad_norm": 0.0011048950254917145, |
| "learning_rate": 1.3855838775634766e-05, |
| "lookahead_loss": 6.267502905368805, |
| "loss": 0.6165, |
| "step": 379000 |
| }, |
| { |
| "base_loss": 0.6036030429005623, |
| "epoch": 6.008583068847656, |
| "grad_norm": 0.0010420246981084347, |
| "learning_rate": 1.3808155059814454e-05, |
| "lookahead_loss": 6.271774408817291, |
| "loss": 0.6077, |
| "step": 379500 |
| }, |
| { |
| "base_loss": 0.5903495861887932, |
| "epoch": 6.0095367431640625, |
| "grad_norm": 0.001150952186435461, |
| "learning_rate": 1.3760471343994141e-05, |
| "lookahead_loss": 6.283748052120209, |
| "loss": 0.6068, |
| "step": 380000 |
| }, |
| { |
| "epoch": 6.0095367431640625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.292171517880961, |
| "eval_lookahead_perplexity": 540.3253822860049, |
| "eval_loss": 0.20716047286987305, |
| "eval_perplexity": 1.2301799664517612, |
| "eval_runtime": 91.422, |
| "eval_samples_per_second": 54.691, |
| "eval_steps_per_second": 1.717, |
| "step": 380000 |
| }, |
| { |
| "base_loss": 0.592279341518879, |
| "epoch": 6.010490417480469, |
| "grad_norm": 0.0010976491030305624, |
| "learning_rate": 1.371278762817383e-05, |
| "lookahead_loss": 6.236666479110718, |
| "loss": 0.6037, |
| "step": 380500 |
| }, |
| { |
| "base_loss": 0.5882293724417686, |
| "epoch": 6.011444091796875, |
| "grad_norm": 0.0011407588608562946, |
| "learning_rate": 1.3665103912353516e-05, |
| "lookahead_loss": 6.257264825820923, |
| "loss": 0.6041, |
| "step": 381000 |
| }, |
| { |
| "base_loss": 0.6102479678988457, |
| "epoch": 6.012397766113281, |
| "grad_norm": 0.0010918615153059363, |
| "learning_rate": 1.3617420196533203e-05, |
| "lookahead_loss": 6.255623918533325, |
| "loss": 0.6214, |
| "step": 381500 |
| }, |
| { |
| "base_loss": 0.5958946568965912, |
| "epoch": 6.0133514404296875, |
| "grad_norm": 0.001107473624870181, |
| "learning_rate": 1.3569736480712892e-05, |
| "lookahead_loss": 6.330819105148316, |
| "loss": 0.6088, |
| "step": 382000 |
| }, |
| { |
| "base_loss": 0.5949127861857414, |
| "epoch": 6.014305114746094, |
| "grad_norm": 0.0011172577505931258, |
| "learning_rate": 1.3522052764892579e-05, |
| "lookahead_loss": 6.318566505432129, |
| "loss": 0.607, |
| "step": 382500 |
| }, |
| { |
| "base_loss": 0.5792632920742035, |
| "epoch": 6.0152587890625, |
| "grad_norm": 0.0011169774224981666, |
| "learning_rate": 1.3474369049072265e-05, |
| "lookahead_loss": 6.247372644424439, |
| "loss": 0.5868, |
| "step": 383000 |
| }, |
| { |
| "base_loss": 0.6084413604736328, |
| "epoch": 6.016212463378906, |
| "grad_norm": 0.0011083075078204274, |
| "learning_rate": 1.3426685333251954e-05, |
| "lookahead_loss": 6.301636509895324, |
| "loss": 0.6167, |
| "step": 383500 |
| }, |
| { |
| "base_loss": 0.6049468902349472, |
| "epoch": 6.0171661376953125, |
| "grad_norm": 0.001102580688893795, |
| "learning_rate": 1.337900161743164e-05, |
| "lookahead_loss": 6.3453411102294925, |
| "loss": 0.6096, |
| "step": 384000 |
| }, |
| { |
| "base_loss": 0.5921321566104889, |
| "epoch": 6.018119812011719, |
| "grad_norm": 0.0010313682723790407, |
| "learning_rate": 1.333131790161133e-05, |
| "lookahead_loss": 6.338734080314636, |
| "loss": 0.6005, |
| "step": 384500 |
| }, |
| { |
| "base_loss": 0.5876540603637695, |
| "epoch": 6.019073486328125, |
| "grad_norm": 0.001066872850060463, |
| "learning_rate": 1.3283634185791016e-05, |
| "lookahead_loss": 6.364590894699097, |
| "loss": 0.5974, |
| "step": 385000 |
| }, |
| { |
| "epoch": 6.019073486328125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.2893890938438926, |
| "eval_lookahead_perplexity": 538.8240575840532, |
| "eval_loss": 0.20715488493442535, |
| "eval_perplexity": 1.2301730923047258, |
| "eval_runtime": 92.0267, |
| "eval_samples_per_second": 54.332, |
| "eval_steps_per_second": 1.706, |
| "step": 385000 |
| }, |
| { |
| "base_loss": 0.5890076372027397, |
| "epoch": 6.020027160644531, |
| "grad_norm": 0.0011358478805050254, |
| "learning_rate": 1.3235950469970703e-05, |
| "lookahead_loss": 6.209294054985047, |
| "loss": 0.603, |
| "step": 385500 |
| }, |
| { |
| "base_loss": 0.6188538994193077, |
| "epoch": 6.0209808349609375, |
| "grad_norm": 0.0010753298411145806, |
| "learning_rate": 1.3188266754150391e-05, |
| "lookahead_loss": 6.266224889755249, |
| "loss": 0.6263, |
| "step": 386000 |
| }, |
| { |
| "base_loss": 0.596300050675869, |
| "epoch": 6.021934509277344, |
| "grad_norm": 0.0011278757592663169, |
| "learning_rate": 1.3140583038330078e-05, |
| "lookahead_loss": 6.252052826881409, |
| "loss": 0.6084, |
| "step": 386500 |
| }, |
| { |
| "base_loss": 0.5962796038985252, |
| "epoch": 6.02288818359375, |
| "grad_norm": 0.0011039053788408637, |
| "learning_rate": 1.3092899322509767e-05, |
| "lookahead_loss": 6.271143918991089, |
| "loss": 0.6041, |
| "step": 387000 |
| }, |
| { |
| "base_loss": 0.5983630774617195, |
| "epoch": 6.023841857910156, |
| "grad_norm": 0.0011156456312164664, |
| "learning_rate": 1.3045215606689454e-05, |
| "lookahead_loss": 6.247088316917419, |
| "loss": 0.6052, |
| "step": 387500 |
| }, |
| { |
| "base_loss": 0.6114985771775245, |
| "epoch": 6.0247955322265625, |
| "grad_norm": 0.0010853740386664867, |
| "learning_rate": 1.299753189086914e-05, |
| "lookahead_loss": 6.26153412771225, |
| "loss": 0.6224, |
| "step": 388000 |
| }, |
| { |
| "base_loss": 0.6061187900304794, |
| "epoch": 6.025749206542969, |
| "grad_norm": 0.001081890077330172, |
| "learning_rate": 1.2949848175048829e-05, |
| "lookahead_loss": 6.220615880966187, |
| "loss": 0.6158, |
| "step": 388500 |
| }, |
| { |
| "base_loss": 0.5953421378135682, |
| "epoch": 6.026702880859375, |
| "grad_norm": 0.0011105469893664122, |
| "learning_rate": 1.2902164459228516e-05, |
| "lookahead_loss": 6.231943651199341, |
| "loss": 0.605, |
| "step": 389000 |
| }, |
| { |
| "base_loss": 0.5930399923324585, |
| "epoch": 6.027656555175781, |
| "grad_norm": 0.0010852537816390395, |
| "learning_rate": 1.2854480743408204e-05, |
| "lookahead_loss": 6.355642718315124, |
| "loss": 0.6037, |
| "step": 389500 |
| }, |
| { |
| "base_loss": 0.6187563810944557, |
| "epoch": 6.0286102294921875, |
| "grad_norm": 0.0010983969550579786, |
| "learning_rate": 1.2806797027587891e-05, |
| "lookahead_loss": 6.366688157558441, |
| "loss": 0.6267, |
| "step": 390000 |
| }, |
| { |
| "epoch": 6.0286102294921875, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.286968604444315, |
| "eval_lookahead_perplexity": 537.5214168145098, |
| "eval_loss": 0.2071501761674881, |
| "eval_perplexity": 1.2301672997199797, |
| "eval_runtime": 91.251, |
| "eval_samples_per_second": 54.794, |
| "eval_steps_per_second": 1.721, |
| "step": 390000 |
| }, |
| { |
| "base_loss": 0.599323958337307, |
| "epoch": 6.029563903808594, |
| "grad_norm": 0.001101642264984548, |
| "learning_rate": 1.2759113311767578e-05, |
| "lookahead_loss": 6.299301620960236, |
| "loss": 0.6094, |
| "step": 390500 |
| }, |
| { |
| "base_loss": 0.593437955737114, |
| "epoch": 6.030517578125, |
| "grad_norm": 0.0010927050607278943, |
| "learning_rate": 1.2711429595947266e-05, |
| "lookahead_loss": 6.304755561351776, |
| "loss": 0.6065, |
| "step": 391000 |
| }, |
| { |
| "base_loss": 0.5899879291653634, |
| "epoch": 6.031471252441406, |
| "grad_norm": 0.0011103905271738768, |
| "learning_rate": 1.2663745880126953e-05, |
| "lookahead_loss": 6.316039553165436, |
| "loss": 0.6029, |
| "step": 391500 |
| }, |
| { |
| "base_loss": 0.609737318277359, |
| "epoch": 6.0324249267578125, |
| "grad_norm": 0.0011069606989622116, |
| "learning_rate": 1.2616062164306642e-05, |
| "lookahead_loss": 6.261803119182587, |
| "loss": 0.626, |
| "step": 392000 |
| }, |
| { |
| "base_loss": 0.5963039031624794, |
| "epoch": 6.033378601074219, |
| "grad_norm": 0.0011312129208818078, |
| "learning_rate": 1.2568378448486329e-05, |
| "lookahead_loss": 6.314544658184052, |
| "loss": 0.6073, |
| "step": 392500 |
| }, |
| { |
| "base_loss": 0.5939543527364731, |
| "epoch": 6.034332275390625, |
| "grad_norm": 0.001062846858985722, |
| "learning_rate": 1.2520694732666015e-05, |
| "lookahead_loss": 6.379460736274719, |
| "loss": 0.607, |
| "step": 393000 |
| }, |
| { |
| "base_loss": 0.5901711618304253, |
| "epoch": 6.035285949707031, |
| "grad_norm": 0.0011185839539393783, |
| "learning_rate": 1.2473011016845704e-05, |
| "lookahead_loss": 6.221417744636535, |
| "loss": 0.6018, |
| "step": 393500 |
| }, |
| { |
| "base_loss": 0.6095206972360611, |
| "epoch": 6.0362396240234375, |
| "grad_norm": 0.0011291452683508396, |
| "learning_rate": 1.242532730102539e-05, |
| "lookahead_loss": 6.328900773048401, |
| "loss": 0.6202, |
| "step": 394000 |
| }, |
| { |
| "base_loss": 0.6031560533642769, |
| "epoch": 6.037193298339844, |
| "grad_norm": 0.0011474916245788336, |
| "learning_rate": 1.237764358520508e-05, |
| "lookahead_loss": 6.284544788837433, |
| "loss": 0.6122, |
| "step": 394500 |
| }, |
| { |
| "base_loss": 0.589473883986473, |
| "epoch": 6.03814697265625, |
| "grad_norm": 0.0010779218282550573, |
| "learning_rate": 1.2329959869384766e-05, |
| "lookahead_loss": 6.292745516777039, |
| "loss": 0.5994, |
| "step": 395000 |
| }, |
| { |
| "epoch": 6.03814697265625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.28468038479741, |
| "eval_lookahead_perplexity": 536.2928558925989, |
| "eval_loss": 0.20714624226093292, |
| "eval_perplexity": 1.2301624603662942, |
| "eval_runtime": 92.8052, |
| "eval_samples_per_second": 53.876, |
| "eval_steps_per_second": 1.692, |
| "step": 395000 |
| }, |
| { |
| "base_loss": 0.6025843568444252, |
| "epoch": 6.039100646972656, |
| "grad_norm": 0.001086984178982675, |
| "learning_rate": 1.2282276153564453e-05, |
| "lookahead_loss": 6.262492390632629, |
| "loss": 0.6127, |
| "step": 395500 |
| }, |
| { |
| "base_loss": 0.6174288346767426, |
| "epoch": 6.0400543212890625, |
| "grad_norm": 0.0010939656058326364, |
| "learning_rate": 1.2234592437744141e-05, |
| "lookahead_loss": 6.2870379695892336, |
| "loss": 0.6249, |
| "step": 396000 |
| }, |
| { |
| "base_loss": 0.6014795810580253, |
| "epoch": 6.041007995605469, |
| "grad_norm": 0.0011165516916662455, |
| "learning_rate": 1.2186908721923828e-05, |
| "lookahead_loss": 6.232427957057953, |
| "loss": 0.6128, |
| "step": 396500 |
| }, |
| { |
| "base_loss": 0.5805132260918617, |
| "epoch": 6.041961669921875, |
| "grad_norm": 0.001125194481573999, |
| "learning_rate": 1.2139225006103517e-05, |
| "lookahead_loss": 6.307816028594971, |
| "loss": 0.5953, |
| "step": 397000 |
| }, |
| { |
| "base_loss": 0.6034971331357956, |
| "epoch": 6.042915344238281, |
| "grad_norm": 0.0010759946890175343, |
| "learning_rate": 1.2091541290283204e-05, |
| "lookahead_loss": 6.327818301200867, |
| "loss": 0.6143, |
| "step": 397500 |
| }, |
| { |
| "base_loss": 0.6110702828168869, |
| "epoch": 6.0438690185546875, |
| "grad_norm": 0.0011335865128785372, |
| "learning_rate": 1.204385757446289e-05, |
| "lookahead_loss": 6.344313054084778, |
| "loss": 0.6233, |
| "step": 398000 |
| }, |
| { |
| "base_loss": 0.5920091760754586, |
| "epoch": 6.044822692871094, |
| "grad_norm": 0.0011095399968326092, |
| "learning_rate": 1.1996173858642579e-05, |
| "lookahead_loss": 6.279779013633728, |
| "loss": 0.6051, |
| "step": 398500 |
| }, |
| { |
| "base_loss": 0.5922821204066276, |
| "epoch": 6.0457763671875, |
| "grad_norm": 0.001092213555239141, |
| "learning_rate": 1.1948490142822266e-05, |
| "lookahead_loss": 6.276170256614685, |
| "loss": 0.6023, |
| "step": 399000 |
| }, |
| { |
| "base_loss": 0.6183968783020973, |
| "epoch": 6.046730041503906, |
| "grad_norm": 0.0011093751527369022, |
| "learning_rate": 1.1900806427001954e-05, |
| "lookahead_loss": 6.241053509235382, |
| "loss": 0.6271, |
| "step": 399500 |
| }, |
| { |
| "base_loss": 0.6054804750084877, |
| "epoch": 6.0476837158203125, |
| "grad_norm": 0.001156223937869072, |
| "learning_rate": 1.1853122711181641e-05, |
| "lookahead_loss": 6.292171361923217, |
| "loss": 0.6225, |
| "step": 400000 |
| }, |
| { |
| "epoch": 6.0476837158203125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.282726495791548, |
| "eval_lookahead_perplexity": 535.2460222090004, |
| "eval_loss": 0.20714208483695984, |
| "eval_perplexity": 1.2301573460700217, |
| "eval_runtime": 91.3116, |
| "eval_samples_per_second": 54.758, |
| "eval_steps_per_second": 1.719, |
| "step": 400000 |
| }, |
| { |
| "base_loss": 0.5904572269320488, |
| "epoch": 6.048637390136719, |
| "grad_norm": 0.0010891122510656714, |
| "learning_rate": 1.1805438995361328e-05, |
| "lookahead_loss": 6.270732181072235, |
| "loss": 0.6001, |
| "step": 400500 |
| }, |
| { |
| "base_loss": 0.5977823759317398, |
| "epoch": 6.049591064453125, |
| "grad_norm": 0.0011509527685120702, |
| "learning_rate": 1.1757755279541016e-05, |
| "lookahead_loss": 6.241637663841248, |
| "loss": 0.6059, |
| "step": 401000 |
| }, |
| { |
| "base_loss": 0.6174590476155281, |
| "epoch": 6.050544738769531, |
| "grad_norm": 0.0010234012734144926, |
| "learning_rate": 1.1710071563720703e-05, |
| "lookahead_loss": 6.341926455974579, |
| "loss": 0.6282, |
| "step": 401500 |
| }, |
| { |
| "base_loss": 0.5955156463384629, |
| "epoch": 6.0514984130859375, |
| "grad_norm": 0.001154066063463688, |
| "learning_rate": 1.1662387847900392e-05, |
| "lookahead_loss": 6.2955641860961915, |
| "loss": 0.608, |
| "step": 402000 |
| }, |
| { |
| "base_loss": 0.5936241209506988, |
| "epoch": 6.052452087402344, |
| "grad_norm": 0.0011350855929777026, |
| "learning_rate": 1.1614704132080079e-05, |
| "lookahead_loss": 6.227645565032959, |
| "loss": 0.6042, |
| "step": 402500 |
| }, |
| { |
| "base_loss": 0.6117795875668526, |
| "epoch": 6.05340576171875, |
| "grad_norm": 0.001109893317334354, |
| "learning_rate": 1.1567020416259765e-05, |
| "lookahead_loss": 6.238243167877197, |
| "loss": 0.6205, |
| "step": 403000 |
| }, |
| { |
| "base_loss": 0.6093643299937248, |
| "epoch": 6.054359436035156, |
| "grad_norm": 0.0011104453587904572, |
| "learning_rate": 1.1519336700439454e-05, |
| "lookahead_loss": 6.285483991622924, |
| "loss": 0.6236, |
| "step": 403500 |
| }, |
| { |
| "base_loss": 0.5967902150750161, |
| "epoch": 6.0553131103515625, |
| "grad_norm": 0.0011117176618427038, |
| "learning_rate": 1.147165298461914e-05, |
| "lookahead_loss": 6.290799595832825, |
| "loss": 0.6048, |
| "step": 404000 |
| }, |
| { |
| "base_loss": 0.5867149458527565, |
| "epoch": 6.056266784667969, |
| "grad_norm": 0.001066114753484726, |
| "learning_rate": 1.142396926879883e-05, |
| "lookahead_loss": 6.314110722541809, |
| "loss": 0.6003, |
| "step": 404500 |
| }, |
| { |
| "base_loss": 0.6157781246900559, |
| "epoch": 6.057220458984375, |
| "grad_norm": 0.001078008092008531, |
| "learning_rate": 1.1376285552978516e-05, |
| "lookahead_loss": 6.294362932682037, |
| "loss": 0.629, |
| "step": 405000 |
| }, |
| { |
| "epoch": 6.057220458984375, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.280795160573892, |
| "eval_lookahead_perplexity": 534.2132803226017, |
| "eval_loss": 0.20713835954666138, |
| "eval_perplexity": 1.2301527633853309, |
| "eval_runtime": 91.5706, |
| "eval_samples_per_second": 54.603, |
| "eval_steps_per_second": 1.715, |
| "step": 405000 |
| }, |
| { |
| "base_loss": 0.6060819316506386, |
| "epoch": 6.058174133300781, |
| "grad_norm": 0.0011201421730220318, |
| "learning_rate": 1.1328601837158203e-05, |
| "lookahead_loss": 6.31790843296051, |
| "loss": 0.6154, |
| "step": 405500 |
| }, |
| { |
| "base_loss": 0.5837663099765777, |
| "epoch": 6.0591278076171875, |
| "grad_norm": 0.0011035632342100143, |
| "learning_rate": 1.1280918121337891e-05, |
| "lookahead_loss": 6.2060595207214355, |
| "loss": 0.599, |
| "step": 406000 |
| }, |
| { |
| "base_loss": 0.5909119842648506, |
| "epoch": 6.060081481933594, |
| "grad_norm": 0.0010986344423145056, |
| "learning_rate": 1.1233234405517578e-05, |
| "lookahead_loss": 6.279904296875, |
| "loss": 0.6045, |
| "step": 406500 |
| }, |
| { |
| "base_loss": 0.609372730076313, |
| "epoch": 6.06103515625, |
| "grad_norm": 0.001120261033065617, |
| "learning_rate": 1.1185550689697267e-05, |
| "lookahead_loss": 6.231126732826233, |
| "loss": 0.6181, |
| "step": 407000 |
| }, |
| { |
| "base_loss": 0.6013825216889381, |
| "epoch": 6.061988830566406, |
| "grad_norm": 0.0011455032508820295, |
| "learning_rate": 1.1137866973876954e-05, |
| "lookahead_loss": 6.261857038497925, |
| "loss": 0.6139, |
| "step": 407500 |
| }, |
| { |
| "base_loss": 0.59294895529747, |
| "epoch": 6.0629425048828125, |
| "grad_norm": 0.0011340271448716521, |
| "learning_rate": 1.109018325805664e-05, |
| "lookahead_loss": 6.269025160789489, |
| "loss": 0.6062, |
| "step": 408000 |
| }, |
| { |
| "base_loss": 0.6099907007813453, |
| "epoch": 6.063896179199219, |
| "grad_norm": 0.0010581036331132054, |
| "learning_rate": 1.1042499542236329e-05, |
| "lookahead_loss": 6.264707807064056, |
| "loss": 0.6212, |
| "step": 408500 |
| }, |
| { |
| "base_loss": 0.6085181525945663, |
| "epoch": 6.064849853515625, |
| "grad_norm": 0.0011085295118391514, |
| "learning_rate": 1.0994815826416016e-05, |
| "lookahead_loss": 6.328743109703064, |
| "loss": 0.6151, |
| "step": 409000 |
| }, |
| { |
| "base_loss": 0.6055323982238769, |
| "epoch": 6.065803527832031, |
| "grad_norm": 0.0011180423898622394, |
| "learning_rate": 1.0947132110595704e-05, |
| "lookahead_loss": 6.239625741958618, |
| "loss": 0.6142, |
| "step": 409500 |
| }, |
| { |
| "base_loss": 0.6037949919104576, |
| "epoch": 6.0667572021484375, |
| "grad_norm": 0.0011036383220925927, |
| "learning_rate": 1.0899448394775391e-05, |
| "lookahead_loss": 6.2200113606452945, |
| "loss": 0.6107, |
| "step": 410000 |
| }, |
| { |
| "epoch": 6.0667572021484375, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.27883860554558, |
| "eval_lookahead_perplexity": 533.1690844892074, |
| "eval_loss": 0.20713473856449127, |
| "eval_perplexity": 1.2301483090321728, |
| "eval_runtime": 92.0938, |
| "eval_samples_per_second": 54.292, |
| "eval_steps_per_second": 1.705, |
| "step": 410000 |
| }, |
| { |
| "base_loss": 0.6215471062660217, |
| "epoch": 6.067710876464844, |
| "grad_norm": 0.0011463616974651814, |
| "learning_rate": 1.0851764678955078e-05, |
| "lookahead_loss": 6.295929139137268, |
| "loss": 0.6294, |
| "step": 410500 |
| }, |
| { |
| "base_loss": 0.6006580455303192, |
| "epoch": 6.06866455078125, |
| "grad_norm": 0.0010866274824365973, |
| "learning_rate": 1.0804080963134766e-05, |
| "lookahead_loss": 6.255359673500061, |
| "loss": 0.6088, |
| "step": 411000 |
| }, |
| { |
| "base_loss": 0.5863450763821602, |
| "epoch": 6.069618225097656, |
| "grad_norm": 0.0010793661931529641, |
| "learning_rate": 1.0756397247314453e-05, |
| "lookahead_loss": 6.290106748104096, |
| "loss": 0.5996, |
| "step": 411500 |
| }, |
| { |
| "base_loss": 0.6130303152799607, |
| "epoch": 6.0705718994140625, |
| "grad_norm": 0.0011292450362816453, |
| "learning_rate": 1.0708713531494142e-05, |
| "lookahead_loss": 6.193022495269775, |
| "loss": 0.6263, |
| "step": 412000 |
| }, |
| { |
| "base_loss": 0.5976909038424492, |
| "epoch": 6.071525573730469, |
| "grad_norm": 0.0011028555454686284, |
| "learning_rate": 1.0661029815673829e-05, |
| "lookahead_loss": 6.238008366107941, |
| "loss": 0.6101, |
| "step": 412500 |
| }, |
| { |
| "base_loss": 0.5941678040623665, |
| "epoch": 6.072479248046875, |
| "grad_norm": 0.0011350243585184216, |
| "learning_rate": 1.0613346099853515e-05, |
| "lookahead_loss": 6.284453297615051, |
| "loss": 0.6041, |
| "step": 413000 |
| }, |
| { |
| "base_loss": 0.59308890157938, |
| "epoch": 6.073432922363281, |
| "grad_norm": 0.0010425560176372528, |
| "learning_rate": 1.0565662384033204e-05, |
| "lookahead_loss": 6.2601313934326175, |
| "loss": 0.6043, |
| "step": 413500 |
| }, |
| { |
| "base_loss": 0.6148577529788017, |
| "epoch": 6.0743865966796875, |
| "grad_norm": 0.0010980789083987474, |
| "learning_rate": 1.051797866821289e-05, |
| "lookahead_loss": 6.254723328113556, |
| "loss": 0.6223, |
| "step": 414000 |
| }, |
| { |
| "base_loss": 0.5937941102385521, |
| "epoch": 6.075340270996094, |
| "grad_norm": 0.0011371398577466607, |
| "learning_rate": 1.047029495239258e-05, |
| "lookahead_loss": 6.319699608802796, |
| "loss": 0.6057, |
| "step": 414500 |
| }, |
| { |
| "base_loss": 0.5855166874527932, |
| "epoch": 6.0762939453125, |
| "grad_norm": 0.0010461227502673864, |
| "learning_rate": 1.0422611236572266e-05, |
| "lookahead_loss": 6.3037777528762815, |
| "loss": 0.5961, |
| "step": 415000 |
| }, |
| { |
| "epoch": 6.0762939453125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.2767525427638535, |
| "eval_lookahead_perplexity": 532.0580195841749, |
| "eval_loss": 0.20713090896606445, |
| "eval_perplexity": 1.2301435980671642, |
| "eval_runtime": 90.955, |
| "eval_samples_per_second": 54.972, |
| "eval_steps_per_second": 1.726, |
| "step": 415000 |
| }, |
| { |
| "base_loss": 0.6238721495270729, |
| "epoch": 6.077247619628906, |
| "grad_norm": 0.0010918197222054005, |
| "learning_rate": 1.0374927520751953e-05, |
| "lookahead_loss": 6.2675533857345584, |
| "loss": 0.6338, |
| "step": 415500 |
| }, |
| { |
| "base_loss": 0.5962715279459954, |
| "epoch": 6.0782012939453125, |
| "grad_norm": 0.0011410253355279565, |
| "learning_rate": 1.0327243804931641e-05, |
| "lookahead_loss": 6.27958661365509, |
| "loss": 0.6104, |
| "step": 416000 |
| }, |
| { |
| "base_loss": 0.5984803900718689, |
| "epoch": 6.079154968261719, |
| "grad_norm": 0.0010592457838356495, |
| "learning_rate": 1.0279560089111328e-05, |
| "lookahead_loss": 6.307873971939087, |
| "loss": 0.6081, |
| "step": 416500 |
| }, |
| { |
| "base_loss": 0.6096297157406807, |
| "epoch": 6.080108642578125, |
| "grad_norm": 0.0010972806485369802, |
| "learning_rate": 1.0231876373291017e-05, |
| "lookahead_loss": 6.251428637504578, |
| "loss": 0.62, |
| "step": 417000 |
| }, |
| { |
| "base_loss": 0.6096931555867195, |
| "epoch": 6.081062316894531, |
| "grad_norm": 0.0011736972955986857, |
| "learning_rate": 1.0184192657470704e-05, |
| "lookahead_loss": 6.28662513923645, |
| "loss": 0.6192, |
| "step": 417500 |
| }, |
| { |
| "base_loss": 0.5950792465209961, |
| "epoch": 6.0820159912109375, |
| "grad_norm": 0.0010587719734758139, |
| "learning_rate": 1.013650894165039e-05, |
| "lookahead_loss": 6.306061523914337, |
| "loss": 0.6073, |
| "step": 418000 |
| }, |
| { |
| "base_loss": 0.5926352781057358, |
| "epoch": 6.082969665527344, |
| "grad_norm": 0.0011153679806739092, |
| "learning_rate": 1.0088825225830079e-05, |
| "lookahead_loss": 6.331948488235474, |
| "loss": 0.6044, |
| "step": 418500 |
| }, |
| { |
| "base_loss": 0.618686983525753, |
| "epoch": 6.08392333984375, |
| "grad_norm": 0.0010920428903773427, |
| "learning_rate": 1.0041141510009766e-05, |
| "lookahead_loss": 6.321983070850372, |
| "loss": 0.6292, |
| "step": 419000 |
| }, |
| { |
| "base_loss": 0.5942374100089073, |
| "epoch": 6.084877014160156, |
| "grad_norm": 0.001100387773476541, |
| "learning_rate": 9.993457794189454e-06, |
| "lookahead_loss": 6.271707439422608, |
| "loss": 0.6034, |
| "step": 419500 |
| }, |
| { |
| "base_loss": 0.5854629936814308, |
| "epoch": 6.0858306884765625, |
| "grad_norm": 0.0011402657255530357, |
| "learning_rate": 9.945774078369141e-06, |
| "lookahead_loss": 6.2728576302528385, |
| "loss": 0.5962, |
| "step": 420000 |
| }, |
| { |
| "epoch": 6.0858306884765625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.27458217349677, |
| "eval_lookahead_perplexity": 530.9045094344544, |
| "eval_loss": 0.20712696015834808, |
| "eval_perplexity": 1.2301387404762227, |
| "eval_runtime": 90.6741, |
| "eval_samples_per_second": 55.143, |
| "eval_steps_per_second": 1.731, |
| "step": 420000 |
| }, |
| { |
| "base_loss": 0.5934821357131004, |
| "epoch": 6.086784362792969, |
| "grad_norm": 0.0011842896929010749, |
| "learning_rate": 9.898090362548828e-06, |
| "lookahead_loss": 6.252230400085449, |
| "loss": 0.6039, |
| "step": 420500 |
| }, |
| { |
| "base_loss": 0.6230015980005265, |
| "epoch": 6.087738037109375, |
| "grad_norm": 0.001042664865963161, |
| "learning_rate": 9.850406646728516e-06, |
| "lookahead_loss": 6.282988451957703, |
| "loss": 0.6287, |
| "step": 421000 |
| }, |
| { |
| "base_loss": 0.5975295419692993, |
| "epoch": 6.088691711425781, |
| "grad_norm": 0.0011371064465492964, |
| "learning_rate": 9.802722930908203e-06, |
| "lookahead_loss": 6.272774960517883, |
| "loss": 0.6057, |
| "step": 421500 |
| }, |
| { |
| "base_loss": 0.5983778918385506, |
| "epoch": 6.0896453857421875, |
| "grad_norm": 0.0011219978332519531, |
| "learning_rate": 9.755039215087892e-06, |
| "lookahead_loss": 6.301621433734894, |
| "loss": 0.6089, |
| "step": 422000 |
| }, |
| { |
| "base_loss": 0.5868593170046806, |
| "epoch": 6.090599060058594, |
| "grad_norm": 0.0010826161596924067, |
| "learning_rate": 9.707355499267579e-06, |
| "lookahead_loss": 6.280946511268616, |
| "loss": 0.5982, |
| "step": 422500 |
| }, |
| { |
| "base_loss": 0.5818374307155609, |
| "epoch": 6.091552734375, |
| "grad_norm": 0.0010603091213852167, |
| "learning_rate": 9.659671783447265e-06, |
| "lookahead_loss": 6.2535722813606265, |
| "loss": 0.5917, |
| "step": 423000 |
| }, |
| { |
| "base_loss": 0.616866985142231, |
| "epoch": 6.092506408691406, |
| "grad_norm": 0.0011452294420450926, |
| "learning_rate": 9.611988067626954e-06, |
| "lookahead_loss": 6.298460816383362, |
| "loss": 0.6261, |
| "step": 423500 |
| }, |
| { |
| "base_loss": 0.5975575439333916, |
| "epoch": 6.0934600830078125, |
| "grad_norm": 0.0010709463385865092, |
| "learning_rate": 9.56430435180664e-06, |
| "lookahead_loss": 6.2952479648590085, |
| "loss": 0.6119, |
| "step": 424000 |
| }, |
| { |
| "base_loss": 0.584998004078865, |
| "epoch": 6.094413757324219, |
| "grad_norm": 0.001079953508451581, |
| "learning_rate": 9.51662063598633e-06, |
| "lookahead_loss": 6.284721467971802, |
| "loss": 0.5966, |
| "step": 424500 |
| }, |
| { |
| "base_loss": 0.5778142619729042, |
| "epoch": 6.095367431640625, |
| "grad_norm": 0.001095326617360115, |
| "learning_rate": 9.468936920166016e-06, |
| "lookahead_loss": 6.222258620738983, |
| "loss": 0.5928, |
| "step": 425000 |
| }, |
| { |
| "epoch": 6.095367431640625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.273199597867533, |
| "eval_lookahead_perplexity": 530.171000980557, |
| "eval_loss": 0.20712387561798096, |
| "eval_perplexity": 1.2301349460694726, |
| "eval_runtime": 91.6966, |
| "eval_samples_per_second": 54.528, |
| "eval_steps_per_second": 1.712, |
| "step": 425000 |
| }, |
| { |
| "base_loss": 0.5984759790301323, |
| "epoch": 6.096321105957031, |
| "grad_norm": 0.001100808964110911, |
| "learning_rate": 9.421253204345703e-06, |
| "lookahead_loss": 6.273516724586487, |
| "loss": 0.6115, |
| "step": 425500 |
| }, |
| { |
| "base_loss": 0.6139922738075256, |
| "epoch": 6.0972747802734375, |
| "grad_norm": 0.0011272229021415114, |
| "learning_rate": 9.373569488525391e-06, |
| "lookahead_loss": 6.297957478523254, |
| "loss": 0.6236, |
| "step": 426000 |
| }, |
| { |
| "base_loss": 0.5972231289744377, |
| "epoch": 6.098228454589844, |
| "grad_norm": 0.0010969050927087665, |
| "learning_rate": 9.325885772705078e-06, |
| "lookahead_loss": 6.234860178947449, |
| "loss": 0.6072, |
| "step": 426500 |
| }, |
| { |
| "base_loss": 0.5825409645438194, |
| "epoch": 6.09918212890625, |
| "grad_norm": 0.0011044559068977833, |
| "learning_rate": 9.278202056884767e-06, |
| "lookahead_loss": 6.312386203289032, |
| "loss": 0.5948, |
| "step": 427000 |
| }, |
| { |
| "base_loss": 0.5877818803787231, |
| "epoch": 6.100135803222656, |
| "grad_norm": 0.0011069976026192307, |
| "learning_rate": 9.230518341064454e-06, |
| "lookahead_loss": 6.295519777297974, |
| "loss": 0.6034, |
| "step": 427500 |
| }, |
| { |
| "base_loss": 0.610653886437416, |
| "epoch": 6.1010894775390625, |
| "grad_norm": 0.0010821627220138907, |
| "learning_rate": 9.18283462524414e-06, |
| "lookahead_loss": 6.3413438749313356, |
| "loss": 0.62, |
| "step": 428000 |
| }, |
| { |
| "base_loss": 0.6026809126138687, |
| "epoch": 6.102043151855469, |
| "grad_norm": 0.0011416695779189467, |
| "learning_rate": 9.135150909423829e-06, |
| "lookahead_loss": 6.302958374023437, |
| "loss": 0.6081, |
| "step": 428500 |
| }, |
| { |
| "base_loss": 0.5897697188258171, |
| "epoch": 6.102996826171875, |
| "grad_norm": 0.0010855476139113307, |
| "learning_rate": 9.087467193603516e-06, |
| "lookahead_loss": 6.285157390594483, |
| "loss": 0.6021, |
| "step": 429000 |
| }, |
| { |
| "base_loss": 0.590453925728798, |
| "epoch": 6.103950500488281, |
| "grad_norm": 0.0011181783629581332, |
| "learning_rate": 9.039783477783204e-06, |
| "lookahead_loss": 6.307080610275269, |
| "loss": 0.6019, |
| "step": 429500 |
| }, |
| { |
| "base_loss": 0.6123360496759415, |
| "epoch": 6.1049041748046875, |
| "grad_norm": 0.0010880377376452088, |
| "learning_rate": 8.992099761962891e-06, |
| "lookahead_loss": 6.254195990562439, |
| "loss": 0.6228, |
| "step": 430000 |
| }, |
| { |
| "epoch": 6.1049041748046875, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.271564553720882, |
| "eval_lookahead_perplexity": 529.3048562740563, |
| "eval_loss": 0.20712073147296906, |
| "eval_perplexity": 1.2301310783528983, |
| "eval_runtime": 92.2103, |
| "eval_samples_per_second": 54.224, |
| "eval_steps_per_second": 1.703, |
| "step": 430000 |
| }, |
| { |
| "base_loss": 0.6012233046293258, |
| "epoch": 6.105857849121094, |
| "grad_norm": 0.0010412432020530105, |
| "learning_rate": 8.944416046142578e-06, |
| "lookahead_loss": 6.260348888397217, |
| "loss": 0.6131, |
| "step": 430500 |
| }, |
| { |
| "base_loss": 0.5989438276290894, |
| "epoch": 6.1068115234375, |
| "grad_norm": 0.0010780078591778874, |
| "learning_rate": 8.896732330322266e-06, |
| "lookahead_loss": 6.3281553306579585, |
| "loss": 0.6071, |
| "step": 431000 |
| }, |
| { |
| "base_loss": 0.5841665432453156, |
| "epoch": 6.107765197753906, |
| "grad_norm": 0.0011316650779917836, |
| "learning_rate": 8.849048614501953e-06, |
| "lookahead_loss": 6.234313168525696, |
| "loss": 0.5947, |
| "step": 431500 |
| }, |
| { |
| "base_loss": 0.6041393259763718, |
| "epoch": 6.1087188720703125, |
| "grad_norm": 0.0011483209673315287, |
| "learning_rate": 8.801364898681642e-06, |
| "lookahead_loss": 6.231174618244171, |
| "loss": 0.6173, |
| "step": 432000 |
| }, |
| { |
| "base_loss": 0.6072890778183937, |
| "epoch": 6.109672546386719, |
| "grad_norm": 0.0010701629798859358, |
| "learning_rate": 8.753681182861329e-06, |
| "lookahead_loss": 6.205759740829468, |
| "loss": 0.617, |
| "step": 432500 |
| }, |
| { |
| "base_loss": 0.5975223676562309, |
| "epoch": 6.110626220703125, |
| "grad_norm": 0.0010894860606640577, |
| "learning_rate": 8.705997467041015e-06, |
| "lookahead_loss": 6.183219263553619, |
| "loss": 0.6065, |
| "step": 433000 |
| }, |
| { |
| "base_loss": 0.5839032330513001, |
| "epoch": 6.111579895019531, |
| "grad_norm": 0.0010885415831580758, |
| "learning_rate": 8.658313751220704e-06, |
| "lookahead_loss": 6.2854044160842895, |
| "loss": 0.5963, |
| "step": 433500 |
| }, |
| { |
| "base_loss": 0.6033086371421814, |
| "epoch": 6.1125335693359375, |
| "grad_norm": 0.0010762620950117707, |
| "learning_rate": 8.61063003540039e-06, |
| "lookahead_loss": 6.313331780433654, |
| "loss": 0.613, |
| "step": 434000 |
| }, |
| { |
| "base_loss": 0.6134573189020157, |
| "epoch": 6.113487243652344, |
| "grad_norm": 0.0010942388325929642, |
| "learning_rate": 8.56294631958008e-06, |
| "lookahead_loss": 6.318107944011688, |
| "loss": 0.6263, |
| "step": 434500 |
| }, |
| { |
| "base_loss": 0.5973056275844574, |
| "epoch": 6.11444091796875, |
| "grad_norm": 0.001045436249114573, |
| "learning_rate": 8.515262603759766e-06, |
| "lookahead_loss": 6.20644309425354, |
| "loss": 0.6064, |
| "step": 435000 |
| }, |
| { |
| "epoch": 6.11444091796875, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.270106329323766, |
| "eval_lookahead_perplexity": 528.5335735074714, |
| "eval_loss": 0.20711791515350342, |
| "eval_perplexity": 1.2301276139156756, |
| "eval_runtime": 91.8053, |
| "eval_samples_per_second": 54.463, |
| "eval_steps_per_second": 1.71, |
| "step": 435000 |
| }, |
| { |
| "base_loss": 0.5865064730644226, |
| "epoch": 6.115394592285156, |
| "grad_norm": 0.0011204156326130033, |
| "learning_rate": 8.467578887939453e-06, |
| "lookahead_loss": 6.250656041145325, |
| "loss": 0.5967, |
| "step": 435500 |
| }, |
| { |
| "base_loss": 0.6034195944666862, |
| "epoch": 6.1163482666015625, |
| "grad_norm": 0.0011835613986477256, |
| "learning_rate": 8.419895172119141e-06, |
| "lookahead_loss": 6.255055441856384, |
| "loss": 0.6142, |
| "step": 436000 |
| }, |
| { |
| "base_loss": 0.6118170965909958, |
| "epoch": 6.117301940917969, |
| "grad_norm": 0.0011420606169849634, |
| "learning_rate": 8.372211456298828e-06, |
| "lookahead_loss": 6.259607755661011, |
| "loss": 0.6237, |
| "step": 436500 |
| }, |
| { |
| "base_loss": 0.6024285949468613, |
| "epoch": 6.118255615234375, |
| "grad_norm": 0.0011443018447607756, |
| "learning_rate": 8.324527740478517e-06, |
| "lookahead_loss": 6.2809450225830075, |
| "loss": 0.6109, |
| "step": 437000 |
| }, |
| { |
| "base_loss": 0.5845644298195839, |
| "epoch": 6.119209289550781, |
| "grad_norm": 0.0011274107964709401, |
| "learning_rate": 8.276844024658204e-06, |
| "lookahead_loss": 6.284584188461304, |
| "loss": 0.5948, |
| "step": 437500 |
| }, |
| { |
| "base_loss": 0.5961613509654998, |
| "epoch": 7.000953674316406, |
| "grad_norm": 0.0011183428578078747, |
| "learning_rate": 8.22916030883789e-06, |
| "lookahead_loss": 6.349602223396301, |
| "loss": 0.6029, |
| "step": 438000 |
| }, |
| { |
| "base_loss": 0.5877805910706521, |
| "epoch": 7.0019073486328125, |
| "grad_norm": 0.001147622475400567, |
| "learning_rate": 8.181476593017579e-06, |
| "lookahead_loss": 6.187368534088135, |
| "loss": 0.5973, |
| "step": 438500 |
| }, |
| { |
| "base_loss": 0.6050056391954421, |
| "epoch": 7.002861022949219, |
| "grad_norm": 0.0011125532910227776, |
| "learning_rate": 8.133792877197266e-06, |
| "lookahead_loss": 6.200378650665283, |
| "loss": 0.6131, |
| "step": 439000 |
| }, |
| { |
| "base_loss": 0.6121045120954514, |
| "epoch": 7.003814697265625, |
| "grad_norm": 0.0010899071348831058, |
| "learning_rate": 8.086109161376954e-06, |
| "lookahead_loss": 6.22591156578064, |
| "loss": 0.6207, |
| "step": 439500 |
| }, |
| { |
| "base_loss": 0.6019601293206215, |
| "epoch": 7.004768371582031, |
| "grad_norm": 0.0011169801000505686, |
| "learning_rate": 8.038425445556641e-06, |
| "lookahead_loss": 6.211684448242187, |
| "loss": 0.608, |
| "step": 440000 |
| }, |
| { |
| "epoch": 7.004768371582031, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.268705815933764, |
| "eval_lookahead_perplexity": 527.7938732616643, |
| "eval_loss": 0.207115039229393, |
| "eval_perplexity": 1.2301240761670988, |
| "eval_runtime": 91.6482, |
| "eval_samples_per_second": 54.556, |
| "eval_steps_per_second": 1.713, |
| "step": 440000 |
| }, |
| { |
| "base_loss": 0.5894829227328301, |
| "epoch": 7.0057220458984375, |
| "grad_norm": 0.0010402505286037922, |
| "learning_rate": 7.990741729736328e-06, |
| "lookahead_loss": 6.329894530773163, |
| "loss": 0.5999, |
| "step": 440500 |
| }, |
| { |
| "base_loss": 0.5802138668894767, |
| "epoch": 7.006675720214844, |
| "grad_norm": 0.0010724926833063364, |
| "learning_rate": 7.943058013916016e-06, |
| "lookahead_loss": 6.185600045204162, |
| "loss": 0.5955, |
| "step": 441000 |
| }, |
| { |
| "base_loss": 0.6045641638636589, |
| "epoch": 7.00762939453125, |
| "grad_norm": 0.001113320467993617, |
| "learning_rate": 7.895374298095703e-06, |
| "lookahead_loss": 6.25577535200119, |
| "loss": 0.6166, |
| "step": 441500 |
| }, |
| { |
| "base_loss": 0.6033532832860946, |
| "epoch": 7.008583068847656, |
| "grad_norm": 0.001021145610138774, |
| "learning_rate": 7.847690582275392e-06, |
| "lookahead_loss": 6.258115340709686, |
| "loss": 0.6071, |
| "step": 442000 |
| }, |
| { |
| "base_loss": 0.5892392939925194, |
| "epoch": 7.0095367431640625, |
| "grad_norm": 0.0011431181337684393, |
| "learning_rate": 7.800006866455079e-06, |
| "lookahead_loss": 6.264180626392364, |
| "loss": 0.6052, |
| "step": 442500 |
| }, |
| { |
| "base_loss": 0.5968242118954659, |
| "epoch": 7.010490417480469, |
| "grad_norm": 0.0011058381060138345, |
| "learning_rate": 7.752323150634765e-06, |
| "lookahead_loss": 6.2021826705932614, |
| "loss": 0.6043, |
| "step": 443000 |
| }, |
| { |
| "base_loss": 0.590798145532608, |
| "epoch": 7.011444091796875, |
| "grad_norm": 0.001150521100498736, |
| "learning_rate": 7.704639434814454e-06, |
| "lookahead_loss": 6.237220158100128, |
| "loss": 0.6048, |
| "step": 443500 |
| }, |
| { |
| "base_loss": 0.6112803395986557, |
| "epoch": 7.012397766113281, |
| "grad_norm": 0.0011072250781580806, |
| "learning_rate": 7.65695571899414e-06, |
| "lookahead_loss": 6.236739232063293, |
| "loss": 0.6218, |
| "step": 444000 |
| }, |
| { |
| "base_loss": 0.5957660912275314, |
| "epoch": 7.0133514404296875, |
| "grad_norm": 0.0011012445902451873, |
| "learning_rate": 7.6092720031738284e-06, |
| "lookahead_loss": 6.307263396263123, |
| "loss": 0.6076, |
| "step": 444500 |
| }, |
| { |
| "base_loss": 0.5957861280441284, |
| "epoch": 7.014305114746094, |
| "grad_norm": 0.0011027586879208684, |
| "learning_rate": 7.561588287353516e-06, |
| "lookahead_loss": 6.285342982292176, |
| "loss": 0.6074, |
| "step": 445000 |
| }, |
| { |
| "epoch": 7.014305114746094, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.267345444462932, |
| "eval_lookahead_perplexity": 527.0763656830716, |
| "eval_loss": 0.20711229741573334, |
| "eval_perplexity": 1.2301207034007275, |
| "eval_runtime": 90.8882, |
| "eval_samples_per_second": 55.013, |
| "eval_steps_per_second": 1.727, |
| "step": 445000 |
| }, |
| { |
| "base_loss": 0.5817868651151658, |
| "epoch": 7.0152587890625, |
| "grad_norm": 0.0011148322373628616, |
| "learning_rate": 7.513904571533204e-06, |
| "lookahead_loss": 6.224323974609375, |
| "loss": 0.5886, |
| "step": 445500 |
| }, |
| { |
| "base_loss": 0.6079041356444359, |
| "epoch": 7.016212463378906, |
| "grad_norm": 0.001130820601247251, |
| "learning_rate": 7.466220855712891e-06, |
| "lookahead_loss": 6.284985821247101, |
| "loss": 0.6164, |
| "step": 446000 |
| }, |
| { |
| "base_loss": 0.6032249782681465, |
| "epoch": 7.0171661376953125, |
| "grad_norm": 0.0010859910398721695, |
| "learning_rate": 7.418537139892578e-06, |
| "lookahead_loss": 6.324811381340027, |
| "loss": 0.6089, |
| "step": 446500 |
| }, |
| { |
| "base_loss": 0.5946653738021851, |
| "epoch": 7.018119812011719, |
| "grad_norm": 0.0010381847387179732, |
| "learning_rate": 7.370853424072266e-06, |
| "lookahead_loss": 6.318586661338806, |
| "loss": 0.6029, |
| "step": 447000 |
| }, |
| { |
| "base_loss": 0.5883135892748833, |
| "epoch": 7.019073486328125, |
| "grad_norm": 0.0010651465272530913, |
| "learning_rate": 7.323169708251954e-06, |
| "lookahead_loss": 6.3373530521392825, |
| "loss": 0.5982, |
| "step": 447500 |
| }, |
| { |
| "base_loss": 0.5866989207267761, |
| "epoch": 7.020027160644531, |
| "grad_norm": 0.0011416026391088963, |
| "learning_rate": 7.275485992431641e-06, |
| "lookahead_loss": 6.17793447971344, |
| "loss": 0.6012, |
| "step": 448000 |
| }, |
| { |
| "base_loss": 0.6181823741197586, |
| "epoch": 7.0209808349609375, |
| "grad_norm": 0.0010932744480669498, |
| "learning_rate": 7.227802276611328e-06, |
| "lookahead_loss": 6.233171957969666, |
| "loss": 0.6252, |
| "step": 448500 |
| }, |
| { |
| "base_loss": 0.5960042692422867, |
| "epoch": 7.021934509277344, |
| "grad_norm": 0.0011309271212667227, |
| "learning_rate": 7.180118560791016e-06, |
| "lookahead_loss": 6.224097855091095, |
| "loss": 0.6085, |
| "step": 449000 |
| }, |
| { |
| "base_loss": 0.5992451857328415, |
| "epoch": 7.02288818359375, |
| "grad_norm": 0.0011079860851168633, |
| "learning_rate": 7.1324348449707034e-06, |
| "lookahead_loss": 6.251955393314361, |
| "loss": 0.6062, |
| "step": 449500 |
| }, |
| { |
| "base_loss": 0.5992803901433945, |
| "epoch": 7.023841857910156, |
| "grad_norm": 0.001132696750573814, |
| "learning_rate": 7.084751129150391e-06, |
| "lookahead_loss": 6.2213596034049985, |
| "loss": 0.6062, |
| "step": 450000 |
| }, |
| { |
| "epoch": 7.023841857910156, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.265722547476284, |
| "eval_lookahead_perplexity": 526.221668767587, |
| "eval_loss": 0.2071092277765274, |
| "eval_perplexity": 1.2301169273797838, |
| "eval_runtime": 91.2802, |
| "eval_samples_per_second": 54.776, |
| "eval_steps_per_second": 1.72, |
| "step": 450000 |
| }, |
| { |
| "base_loss": 0.6113395862579346, |
| "epoch": 7.0247955322265625, |
| "grad_norm": 0.0010815602727234364, |
| "learning_rate": 7.037067413330079e-06, |
| "lookahead_loss": 6.2310498752594, |
| "loss": 0.6221, |
| "step": 450500 |
| }, |
| { |
| "base_loss": 0.6042459404468536, |
| "epoch": 7.025749206542969, |
| "grad_norm": 0.0010827596997842193, |
| "learning_rate": 6.989383697509766e-06, |
| "lookahead_loss": 6.193919623374939, |
| "loss": 0.6151, |
| "step": 451000 |
| }, |
| { |
| "base_loss": 0.5966729502677918, |
| "epoch": 7.026702880859375, |
| "grad_norm": 0.0011122706346213818, |
| "learning_rate": 6.941699981689453e-06, |
| "lookahead_loss": 6.21730579662323, |
| "loss": 0.6058, |
| "step": 451500 |
| }, |
| { |
| "base_loss": 0.5946527794599533, |
| "epoch": 7.027656555175781, |
| "grad_norm": 0.0010856961598619819, |
| "learning_rate": 6.894016265869141e-06, |
| "lookahead_loss": 6.3204999446868895, |
| "loss": 0.6041, |
| "step": 452000 |
| }, |
| { |
| "base_loss": 0.6178021001815795, |
| "epoch": 7.0286102294921875, |
| "grad_norm": 0.0010914442827925086, |
| "learning_rate": 6.846332550048829e-06, |
| "lookahead_loss": 6.350885845184326, |
| "loss": 0.6271, |
| "step": 452500 |
| }, |
| { |
| "base_loss": 0.5997619133591652, |
| "epoch": 7.029563903808594, |
| "grad_norm": 0.0010976734338328242, |
| "learning_rate": 6.798648834228516e-06, |
| "lookahead_loss": 6.279042994499206, |
| "loss": 0.6093, |
| "step": 453000 |
| }, |
| { |
| "base_loss": 0.5922289202213288, |
| "epoch": 7.030517578125, |
| "grad_norm": 0.0010889185359701514, |
| "learning_rate": 6.750965118408203e-06, |
| "lookahead_loss": 6.284307628631592, |
| "loss": 0.6053, |
| "step": 453500 |
| }, |
| { |
| "base_loss": 0.5905209797620773, |
| "epoch": 7.031471252441406, |
| "grad_norm": 0.0011072177439928055, |
| "learning_rate": 6.703281402587891e-06, |
| "lookahead_loss": 6.292623271942139, |
| "loss": 0.6026, |
| "step": 454000 |
| }, |
| { |
| "base_loss": 0.6133147512674332, |
| "epoch": 7.0324249267578125, |
| "grad_norm": 0.0011112524662166834, |
| "learning_rate": 6.6555976867675784e-06, |
| "lookahead_loss": 6.244432949066162, |
| "loss": 0.6276, |
| "step": 454500 |
| }, |
| { |
| "base_loss": 0.5965310020446777, |
| "epoch": 7.033378601074219, |
| "grad_norm": 0.001126542454585433, |
| "learning_rate": 6.607913970947266e-06, |
| "lookahead_loss": 6.288736204147339, |
| "loss": 0.6076, |
| "step": 455000 |
| }, |
| { |
| "epoch": 7.033378601074219, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.264621932666523, |
| "eval_lookahead_perplexity": 525.6428200089147, |
| "eval_loss": 0.20710715651512146, |
| "eval_perplexity": 1.2301143794887062, |
| "eval_runtime": 90.1897, |
| "eval_samples_per_second": 55.439, |
| "eval_steps_per_second": 1.741, |
| "step": 455000 |
| }, |
| { |
| "base_loss": 0.5923399785757065, |
| "epoch": 7.034332275390625, |
| "grad_norm": 0.0010525787947699428, |
| "learning_rate": 6.560230255126954e-06, |
| "lookahead_loss": 6.364464108467102, |
| "loss": 0.6063, |
| "step": 455500 |
| }, |
| { |
| "base_loss": 0.5895907972455025, |
| "epoch": 7.035285949707031, |
| "grad_norm": 0.0011163650779053569, |
| "learning_rate": 6.512546539306641e-06, |
| "lookahead_loss": 6.196651001453399, |
| "loss": 0.6011, |
| "step": 456000 |
| }, |
| { |
| "base_loss": 0.6101583961248398, |
| "epoch": 7.0362396240234375, |
| "grad_norm": 0.0011191520607098937, |
| "learning_rate": 6.464862823486328e-06, |
| "lookahead_loss": 6.324227838516236, |
| "loss": 0.6218, |
| "step": 456500 |
| }, |
| { |
| "base_loss": 0.5974237969517708, |
| "epoch": 7.037193298339844, |
| "grad_norm": 0.0011339603224769235, |
| "learning_rate": 6.417179107666016e-06, |
| "lookahead_loss": 6.257159686088562, |
| "loss": 0.6098, |
| "step": 457000 |
| }, |
| { |
| "base_loss": 0.5909910210371018, |
| "epoch": 7.03814697265625, |
| "grad_norm": 0.001096972613595426, |
| "learning_rate": 6.369495391845704e-06, |
| "lookahead_loss": 6.265636465072632, |
| "loss": 0.6028, |
| "step": 457500 |
| }, |
| { |
| "base_loss": 0.5998276071548462, |
| "epoch": 7.039100646972656, |
| "grad_norm": 0.0010910106357187033, |
| "learning_rate": 6.321811676025391e-06, |
| "lookahead_loss": 6.244960191726684, |
| "loss": 0.6124, |
| "step": 458000 |
| }, |
| { |
| "base_loss": 0.6157890763282776, |
| "epoch": 7.0400543212890625, |
| "grad_norm": 0.001121240551583469, |
| "learning_rate": 6.274127960205078e-06, |
| "lookahead_loss": 6.279780546188355, |
| "loss": 0.6236, |
| "step": 458500 |
| }, |
| { |
| "base_loss": 0.600258769273758, |
| "epoch": 7.041007995605469, |
| "grad_norm": 0.0011285766959190369, |
| "learning_rate": 6.226444244384766e-06, |
| "lookahead_loss": 6.209328644752502, |
| "loss": 0.6114, |
| "step": 459000 |
| }, |
| { |
| "base_loss": 0.5789770235419274, |
| "epoch": 7.041961669921875, |
| "grad_norm": 0.0011222581379115582, |
| "learning_rate": 6.1787605285644534e-06, |
| "lookahead_loss": 6.2839519019126895, |
| "loss": 0.5952, |
| "step": 459500 |
| }, |
| { |
| "base_loss": 0.603790655374527, |
| "epoch": 7.042915344238281, |
| "grad_norm": 0.0010850606486201286, |
| "learning_rate": 6.131076812744141e-06, |
| "lookahead_loss": 6.308252753257752, |
| "loss": 0.6147, |
| "step": 460000 |
| }, |
| { |
| "epoch": 7.042915344238281, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.263513925357367, |
| "eval_lookahead_perplexity": 525.060726463843, |
| "eval_loss": 0.2071051001548767, |
| "eval_perplexity": 1.2301118499330004, |
| "eval_runtime": 92.9681, |
| "eval_samples_per_second": 53.782, |
| "eval_steps_per_second": 1.689, |
| "step": 460000 |
| }, |
| { |
| "base_loss": 0.6122875562906265, |
| "epoch": 7.0438690185546875, |
| "grad_norm": 0.001144933863542974, |
| "learning_rate": 6.083393096923829e-06, |
| "lookahead_loss": 6.326288844108581, |
| "loss": 0.6259, |
| "step": 460500 |
| }, |
| { |
| "base_loss": 0.5919451169967651, |
| "epoch": 7.044822692871094, |
| "grad_norm": 0.0011071843327954412, |
| "learning_rate": 6.035709381103516e-06, |
| "lookahead_loss": 6.259445383071899, |
| "loss": 0.6046, |
| "step": 461000 |
| }, |
| { |
| "base_loss": 0.5933005015850067, |
| "epoch": 7.0457763671875, |
| "grad_norm": 0.0010790773667395115, |
| "learning_rate": 5.988025665283203e-06, |
| "lookahead_loss": 6.257792898178101, |
| "loss": 0.6032, |
| "step": 461500 |
| }, |
| { |
| "base_loss": 0.619339332818985, |
| "epoch": 7.046730041503906, |
| "grad_norm": 0.0011052032932639122, |
| "learning_rate": 5.940341949462891e-06, |
| "lookahead_loss": 6.215020411014557, |
| "loss": 0.6282, |
| "step": 462000 |
| }, |
| { |
| "base_loss": 0.6016078860163688, |
| "epoch": 7.0476837158203125, |
| "grad_norm": 0.0011391551233828068, |
| "learning_rate": 5.892658233642579e-06, |
| "lookahead_loss": 6.270429663658142, |
| "loss": 0.6197, |
| "step": 462500 |
| }, |
| { |
| "base_loss": 0.5871764430999756, |
| "epoch": 7.048637390136719, |
| "grad_norm": 0.0010885728988796473, |
| "learning_rate": 5.844974517822266e-06, |
| "lookahead_loss": 6.2495416173934935, |
| "loss": 0.6004, |
| "step": 463000 |
| }, |
| { |
| "base_loss": 0.6003029895424843, |
| "epoch": 7.049591064453125, |
| "grad_norm": 0.0011504755821079016, |
| "learning_rate": 5.797290802001953e-06, |
| "lookahead_loss": 6.2231017370224, |
| "loss": 0.6079, |
| "step": 463500 |
| }, |
| { |
| "base_loss": 0.616134802877903, |
| "epoch": 7.050544738769531, |
| "grad_norm": 0.001045083161443472, |
| "learning_rate": 5.749607086181641e-06, |
| "lookahead_loss": 6.316908567428589, |
| "loss": 0.6265, |
| "step": 464000 |
| }, |
| { |
| "base_loss": 0.5965465674996376, |
| "epoch": 7.0514984130859375, |
| "grad_norm": 0.0011614857940003276, |
| "learning_rate": 5.7019233703613284e-06, |
| "lookahead_loss": 6.286425273895263, |
| "loss": 0.6091, |
| "step": 464500 |
| }, |
| { |
| "base_loss": 0.5948073084950447, |
| "epoch": 7.052452087402344, |
| "grad_norm": 0.001139484578743577, |
| "learning_rate": 5.654239654541016e-06, |
| "lookahead_loss": 6.1960596828460694, |
| "loss": 0.6053, |
| "step": 465000 |
| }, |
| { |
| "epoch": 7.052452087402344, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.262733459472656, |
| "eval_lookahead_perplexity": 524.6510943521738, |
| "eval_loss": 0.2071034163236618, |
| "eval_perplexity": 1.2301097786340136, |
| "eval_runtime": 93.0888, |
| "eval_samples_per_second": 53.712, |
| "eval_steps_per_second": 1.687, |
| "step": 465000 |
| }, |
| { |
| "base_loss": 0.6123905621767044, |
| "epoch": 7.05340576171875, |
| "grad_norm": 0.00109586538746953, |
| "learning_rate": 5.606555938720704e-06, |
| "lookahead_loss": 6.227264610290527, |
| "loss": 0.6202, |
| "step": 465500 |
| }, |
| { |
| "base_loss": 0.6105590149760246, |
| "epoch": 7.054359436035156, |
| "grad_norm": 0.0011044503189623356, |
| "learning_rate": 5.558872222900391e-06, |
| "lookahead_loss": 6.263851017951965, |
| "loss": 0.6254, |
| "step": 466000 |
| }, |
| { |
| "base_loss": 0.5943532618284225, |
| "epoch": 7.0553131103515625, |
| "grad_norm": 0.00111213861964643, |
| "learning_rate": 5.511188507080078e-06, |
| "lookahead_loss": 6.27136710357666, |
| "loss": 0.6029, |
| "step": 466500 |
| }, |
| { |
| "base_loss": 0.5899229286909103, |
| "epoch": 7.056266784667969, |
| "grad_norm": 0.0010533079039305449, |
| "learning_rate": 5.463504791259766e-06, |
| "lookahead_loss": 6.291095352172851, |
| "loss": 0.6048, |
| "step": 467000 |
| }, |
| { |
| "base_loss": 0.6136555113196372, |
| "epoch": 7.057220458984375, |
| "grad_norm": 0.001086205942556262, |
| "learning_rate": 5.415821075439454e-06, |
| "lookahead_loss": 6.285642471790314, |
| "loss": 0.6274, |
| "step": 467500 |
| }, |
| { |
| "base_loss": 0.60705413210392, |
| "epoch": 7.058174133300781, |
| "grad_norm": 0.0011131491046398878, |
| "learning_rate": 5.368137359619141e-06, |
| "lookahead_loss": 6.296099196434021, |
| "loss": 0.6149, |
| "step": 468000 |
| }, |
| { |
| "base_loss": 0.5850182236433029, |
| "epoch": 7.0591278076171875, |
| "grad_norm": 0.0010873244609683752, |
| "learning_rate": 5.320453643798828e-06, |
| "lookahead_loss": 6.182619555473328, |
| "loss": 0.6004, |
| "step": 468500 |
| }, |
| { |
| "base_loss": 0.5940621579289437, |
| "epoch": 7.060081481933594, |
| "grad_norm": 0.0011030533351004124, |
| "learning_rate": 5.272769927978516e-06, |
| "lookahead_loss": 6.263250873088837, |
| "loss": 0.6064, |
| "step": 469000 |
| }, |
| { |
| "base_loss": 0.6068593204021454, |
| "epoch": 7.06103515625, |
| "grad_norm": 0.0011013118783012033, |
| "learning_rate": 5.2250862121582034e-06, |
| "lookahead_loss": 6.212888800621033, |
| "loss": 0.6168, |
| "step": 469500 |
| }, |
| { |
| "base_loss": 0.6027862961888313, |
| "epoch": 7.061988830566406, |
| "grad_norm": 0.001138397492468357, |
| "learning_rate": 5.177402496337891e-06, |
| "lookahead_loss": 6.228745173454285, |
| "loss": 0.6144, |
| "step": 470000 |
| }, |
| { |
| "epoch": 7.061988830566406, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.261695091716779, |
| "eval_lookahead_perplexity": 524.1065963162379, |
| "eval_loss": 0.207101508975029, |
| "eval_perplexity": 1.2301074323880465, |
| "eval_runtime": 90.65, |
| "eval_samples_per_second": 55.157, |
| "eval_steps_per_second": 1.732, |
| "step": 470000 |
| }, |
| { |
| "base_loss": 0.5955105296373367, |
| "epoch": 7.0629425048828125, |
| "grad_norm": 0.001156990067102015, |
| "learning_rate": 5.129718780517579e-06, |
| "lookahead_loss": 6.243498850822449, |
| "loss": 0.6064, |
| "step": 470500 |
| }, |
| { |
| "base_loss": 0.6115499113202095, |
| "epoch": 7.063896179199219, |
| "grad_norm": 0.001073277904652059, |
| "learning_rate": 5.082035064697266e-06, |
| "lookahead_loss": 6.25689557170868, |
| "loss": 0.6233, |
| "step": 471000 |
| }, |
| { |
| "base_loss": 0.6069118053913116, |
| "epoch": 7.064849853515625, |
| "grad_norm": 0.0010952987940981984, |
| "learning_rate": 5.034351348876953e-06, |
| "lookahead_loss": 6.301583794593811, |
| "loss": 0.6156, |
| "step": 471500 |
| }, |
| { |
| "base_loss": 0.6021770805120468, |
| "epoch": 7.065803527832031, |
| "grad_norm": 0.0011316589079797268, |
| "learning_rate": 4.986667633056641e-06, |
| "lookahead_loss": 6.225786661148072, |
| "loss": 0.6123, |
| "step": 472000 |
| }, |
| { |
| "base_loss": 0.6015241233706474, |
| "epoch": 7.0667572021484375, |
| "grad_norm": 0.0011141011491417885, |
| "learning_rate": 4.938983917236329e-06, |
| "lookahead_loss": 6.202967374324799, |
| "loss": 0.6095, |
| "step": 472500 |
| }, |
| { |
| "base_loss": 0.6184404605031013, |
| "epoch": 7.067710876464844, |
| "grad_norm": 0.001143975299783051, |
| "learning_rate": 4.891300201416016e-06, |
| "lookahead_loss": 6.271997055530548, |
| "loss": 0.6279, |
| "step": 473000 |
| }, |
| { |
| "base_loss": 0.5974515009522438, |
| "epoch": 7.06866455078125, |
| "grad_norm": 0.0010884921066462994, |
| "learning_rate": 4.843616485595703e-06, |
| "lookahead_loss": 6.238460997581482, |
| "loss": 0.6056, |
| "step": 473500 |
| }, |
| { |
| "base_loss": 0.5842619987726212, |
| "epoch": 7.069618225097656, |
| "grad_norm": 0.0011051521869376302, |
| "learning_rate": 4.795932769775391e-06, |
| "lookahead_loss": 6.2725230369567875, |
| "loss": 0.5982, |
| "step": 474000 |
| }, |
| { |
| "base_loss": 0.614363546192646, |
| "epoch": 7.0705718994140625, |
| "grad_norm": 0.0011240324238315225, |
| "learning_rate": 4.7482490539550784e-06, |
| "lookahead_loss": 6.1859285850524905, |
| "loss": 0.6267, |
| "step": 474500 |
| }, |
| { |
| "base_loss": 0.6006701437830925, |
| "epoch": 7.071525573730469, |
| "grad_norm": 0.0011095181107521057, |
| "learning_rate": 4.700565338134766e-06, |
| "lookahead_loss": 6.229223669528961, |
| "loss": 0.6112, |
| "step": 475000 |
| }, |
| { |
| "epoch": 7.071525573730469, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.260892772065184, |
| "eval_lookahead_perplexity": 523.6862639374949, |
| "eval_loss": 0.20709997415542603, |
| "eval_perplexity": 1.2301055443964946, |
| "eval_runtime": 91.3111, |
| "eval_samples_per_second": 54.758, |
| "eval_steps_per_second": 1.719, |
| "step": 475000 |
| }, |
| { |
| "base_loss": 0.5963374812602997, |
| "epoch": 7.072479248046875, |
| "grad_norm": 0.0011368774576112628, |
| "learning_rate": 4.652881622314453e-06, |
| "lookahead_loss": 6.267845227241516, |
| "loss": 0.6058, |
| "step": 475500 |
| }, |
| { |
| "base_loss": 0.5906391541361808, |
| "epoch": 7.073432922363281, |
| "grad_norm": 0.0010553834727033973, |
| "learning_rate": 4.605197906494141e-06, |
| "lookahead_loss": 6.2372974953651426, |
| "loss": 0.6034, |
| "step": 476000 |
| }, |
| { |
| "base_loss": 0.6126260715126991, |
| "epoch": 7.0743865966796875, |
| "grad_norm": 0.0010900960769504309, |
| "learning_rate": 4.557514190673828e-06, |
| "lookahead_loss": 6.238807513236999, |
| "loss": 0.6218, |
| "step": 476500 |
| }, |
| { |
| "base_loss": 0.5941922485232353, |
| "epoch": 7.075340270996094, |
| "grad_norm": 0.0010984891559928656, |
| "learning_rate": 4.509830474853516e-06, |
| "lookahead_loss": 6.299616351604461, |
| "loss": 0.6054, |
| "step": 477000 |
| }, |
| { |
| "base_loss": 0.5850859879851341, |
| "epoch": 7.0762939453125, |
| "grad_norm": 0.0010677935788407922, |
| "learning_rate": 4.462146759033204e-06, |
| "lookahead_loss": 6.270443281173706, |
| "loss": 0.5961, |
| "step": 477500 |
| }, |
| { |
| "base_loss": 0.6249460031986237, |
| "epoch": 7.077247619628906, |
| "grad_norm": 0.0010833673877641559, |
| "learning_rate": 4.4144630432128904e-06, |
| "lookahead_loss": 6.2501684432029725, |
| "loss": 0.6356, |
| "step": 478000 |
| }, |
| { |
| "base_loss": 0.5957035277485847, |
| "epoch": 7.0782012939453125, |
| "grad_norm": 0.001110053970478475, |
| "learning_rate": 4.366779327392578e-06, |
| "lookahead_loss": 6.2742936916351315, |
| "loss": 0.6089, |
| "step": 478500 |
| }, |
| { |
| "base_loss": 0.5999418792724609, |
| "epoch": 7.079154968261719, |
| "grad_norm": 0.0010629543103277683, |
| "learning_rate": 4.319095611572266e-06, |
| "lookahead_loss": 6.2945415420532225, |
| "loss": 0.6071, |
| "step": 479000 |
| }, |
| { |
| "base_loss": 0.6110728977918625, |
| "epoch": 7.080108642578125, |
| "grad_norm": 0.0010882082860916853, |
| "learning_rate": 4.2714118957519534e-06, |
| "lookahead_loss": 6.2418430824279785, |
| "loss": 0.6197, |
| "step": 479500 |
| }, |
| { |
| "base_loss": 0.6104652171134949, |
| "epoch": 7.081062316894531, |
| "grad_norm": 0.0011661059688776731, |
| "learning_rate": 4.223728179931641e-06, |
| "lookahead_loss": 6.270271697044373, |
| "loss": 0.6218, |
| "step": 480000 |
| }, |
| { |
| "epoch": 7.081062316894531, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.260015819781124, |
| "eval_lookahead_perplexity": 523.2272173825853, |
| "eval_loss": 0.20709839463233948, |
| "eval_perplexity": 1.2301036014179227, |
| "eval_runtime": 92.6569, |
| "eval_samples_per_second": 53.963, |
| "eval_steps_per_second": 1.694, |
| "step": 480000 |
| }, |
| { |
| "base_loss": 0.5925396049022674, |
| "epoch": 7.0820159912109375, |
| "grad_norm": 0.0010843832278624177, |
| "learning_rate": 4.176044464111328e-06, |
| "lookahead_loss": 6.290804083824158, |
| "loss": 0.6057, |
| "step": 480500 |
| }, |
| { |
| "base_loss": 0.5912622154951096, |
| "epoch": 7.082969665527344, |
| "grad_norm": 0.0011063800193369389, |
| "learning_rate": 4.128360748291016e-06, |
| "lookahead_loss": 6.319176843643189, |
| "loss": 0.6034, |
| "step": 481000 |
| }, |
| { |
| "base_loss": 0.6196577532887458, |
| "epoch": 7.08392333984375, |
| "grad_norm": 0.0010821149917319417, |
| "learning_rate": 4.080677032470703e-06, |
| "lookahead_loss": 6.315744980335236, |
| "loss": 0.6285, |
| "step": 481500 |
| }, |
| { |
| "base_loss": 0.5937823454141616, |
| "epoch": 7.084877014160156, |
| "grad_norm": 0.0010981445666402578, |
| "learning_rate": 4.032993316650391e-06, |
| "lookahead_loss": 6.247111065387726, |
| "loss": 0.603, |
| "step": 482000 |
| }, |
| { |
| "base_loss": 0.5897138588428498, |
| "epoch": 7.0858306884765625, |
| "grad_norm": 0.0011262124171480536, |
| "learning_rate": 3.985309600830079e-06, |
| "lookahead_loss": 6.25641952419281, |
| "loss": 0.5988, |
| "step": 482500 |
| }, |
| { |
| "base_loss": 0.5952436604499817, |
| "epoch": 7.086784362792969, |
| "grad_norm": 0.0011732213897630572, |
| "learning_rate": 3.9376258850097654e-06, |
| "lookahead_loss": 6.227437392234802, |
| "loss": 0.606, |
| "step": 483000 |
| }, |
| { |
| "base_loss": 0.6236630493402481, |
| "epoch": 7.087738037109375, |
| "grad_norm": 0.0010312370723113418, |
| "learning_rate": 3.889942169189453e-06, |
| "lookahead_loss": 6.267721421718598, |
| "loss": 0.6287, |
| "step": 483500 |
| }, |
| { |
| "base_loss": 0.5960297654867173, |
| "epoch": 7.088691711425781, |
| "grad_norm": 0.0011183172464370728, |
| "learning_rate": 3.842258453369141e-06, |
| "lookahead_loss": 6.262453424453735, |
| "loss": 0.6056, |
| "step": 484000 |
| }, |
| { |
| "base_loss": 0.5999740233421326, |
| "epoch": 7.0896453857421875, |
| "grad_norm": 0.0011287264060229063, |
| "learning_rate": 3.7945747375488284e-06, |
| "lookahead_loss": 6.287140043258667, |
| "loss": 0.6091, |
| "step": 484500 |
| }, |
| { |
| "base_loss": 0.5876833364963532, |
| "epoch": 7.090599060058594, |
| "grad_norm": 0.0010590523015707731, |
| "learning_rate": 3.7468910217285157e-06, |
| "lookahead_loss": 6.26017622089386, |
| "loss": 0.5981, |
| "step": 485000 |
| }, |
| { |
| "epoch": 7.090599060058594, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.259280581062975, |
| "eval_lookahead_perplexity": 522.842661861354, |
| "eval_loss": 0.20709699392318726, |
| "eval_perplexity": 1.2301018784017568, |
| "eval_runtime": 89.5464, |
| "eval_samples_per_second": 55.837, |
| "eval_steps_per_second": 1.753, |
| "step": 485000 |
| }, |
| { |
| "base_loss": 0.5815649164319039, |
| "epoch": 7.091552734375, |
| "grad_norm": 0.001043815049342811, |
| "learning_rate": 3.6992073059082034e-06, |
| "lookahead_loss": 6.240018193244934, |
| "loss": 0.5916, |
| "step": 485500 |
| }, |
| { |
| "base_loss": 0.616571400463581, |
| "epoch": 7.092506408691406, |
| "grad_norm": 0.0011240957537665963, |
| "learning_rate": 3.6515235900878906e-06, |
| "lookahead_loss": 6.282034734249115, |
| "loss": 0.6274, |
| "step": 486000 |
| }, |
| { |
| "base_loss": 0.5999781568050384, |
| "epoch": 7.0934600830078125, |
| "grad_norm": 0.0010617803782224655, |
| "learning_rate": 3.6038398742675783e-06, |
| "lookahead_loss": 6.2716433649063115, |
| "loss": 0.6123, |
| "step": 486500 |
| }, |
| { |
| "base_loss": 0.5853218165636063, |
| "epoch": 7.094413757324219, |
| "grad_norm": 0.0010862386552616954, |
| "learning_rate": 3.556156158447266e-06, |
| "lookahead_loss": 6.264246835708618, |
| "loss": 0.5973, |
| "step": 487000 |
| }, |
| { |
| "base_loss": 0.5790126396417618, |
| "epoch": 7.095367431640625, |
| "grad_norm": 0.0010896100429818034, |
| "learning_rate": 3.508472442626953e-06, |
| "lookahead_loss": 6.2038795657157895, |
| "loss": 0.5941, |
| "step": 487500 |
| }, |
| { |
| "base_loss": 0.5979302336573601, |
| "epoch": 7.096321105957031, |
| "grad_norm": 0.001098281005397439, |
| "learning_rate": 3.460788726806641e-06, |
| "lookahead_loss": 6.263394642829895, |
| "loss": 0.6109, |
| "step": 488000 |
| }, |
| { |
| "base_loss": 0.6135610321164131, |
| "epoch": 7.0972747802734375, |
| "grad_norm": 0.0011270649265497923, |
| "learning_rate": 3.413105010986328e-06, |
| "lookahead_loss": 6.286933131217957, |
| "loss": 0.6234, |
| "step": 488500 |
| }, |
| { |
| "base_loss": 0.5961914101839065, |
| "epoch": 7.098228454589844, |
| "grad_norm": 0.0011138232657685876, |
| "learning_rate": 3.3654212951660158e-06, |
| "lookahead_loss": 6.229583405971527, |
| "loss": 0.6052, |
| "step": 489000 |
| }, |
| { |
| "base_loss": 0.5830495541095734, |
| "epoch": 7.09918212890625, |
| "grad_norm": 0.0010958199854940176, |
| "learning_rate": 3.3177375793457034e-06, |
| "lookahead_loss": 6.296017809391022, |
| "loss": 0.5957, |
| "step": 489500 |
| }, |
| { |
| "base_loss": 0.587930432677269, |
| "epoch": 7.100135803222656, |
| "grad_norm": 0.0011185267940163612, |
| "learning_rate": 3.2700538635253907e-06, |
| "lookahead_loss": 6.264325401306152, |
| "loss": 0.6033, |
| "step": 490000 |
| }, |
| { |
| "epoch": 7.100135803222656, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.25881780069857, |
| "eval_lookahead_perplexity": 522.6007565226091, |
| "eval_loss": 0.2070958912372589, |
| "eval_perplexity": 1.230100521986473, |
| "eval_runtime": 91.4681, |
| "eval_samples_per_second": 54.664, |
| "eval_steps_per_second": 1.716, |
| "step": 490000 |
| }, |
| { |
| "base_loss": 0.6116759033799172, |
| "epoch": 7.1010894775390625, |
| "grad_norm": 0.0010914826998487115, |
| "learning_rate": 3.2223701477050784e-06, |
| "lookahead_loss": 6.335911834716797, |
| "loss": 0.6208, |
| "step": 490500 |
| }, |
| { |
| "base_loss": 0.5991794173121452, |
| "epoch": 7.102043151855469, |
| "grad_norm": 0.0011365750106051564, |
| "learning_rate": 3.1746864318847656e-06, |
| "lookahead_loss": 6.299183880805969, |
| "loss": 0.6062, |
| "step": 491000 |
| }, |
| { |
| "base_loss": 0.5893770458698273, |
| "epoch": 7.102996826171875, |
| "grad_norm": 0.0010888108517974615, |
| "learning_rate": 3.1270027160644533e-06, |
| "lookahead_loss": 6.271809469223022, |
| "loss": 0.6033, |
| "step": 491500 |
| }, |
| { |
| "base_loss": 0.5916792218089104, |
| "epoch": 7.103950500488281, |
| "grad_norm": 0.0011168160708621144, |
| "learning_rate": 3.079319000244141e-06, |
| "lookahead_loss": 6.289948089599609, |
| "loss": 0.6027, |
| "step": 492000 |
| }, |
| { |
| "base_loss": 0.6132831824421883, |
| "epoch": 7.1049041748046875, |
| "grad_norm": 0.0010816961294040084, |
| "learning_rate": 3.031635284423828e-06, |
| "lookahead_loss": 6.232145028591156, |
| "loss": 0.6235, |
| "step": 492500 |
| }, |
| { |
| "base_loss": 0.6014310421943665, |
| "epoch": 7.105857849121094, |
| "grad_norm": 0.001060318318195641, |
| "learning_rate": 2.983951568603516e-06, |
| "lookahead_loss": 6.2375462627410885, |
| "loss": 0.6124, |
| "step": 493000 |
| }, |
| { |
| "base_loss": 0.5968408140540123, |
| "epoch": 7.1068115234375, |
| "grad_norm": 0.0010752023663371801, |
| "learning_rate": 2.936267852783203e-06, |
| "lookahead_loss": 6.3170910439491275, |
| "loss": 0.6051, |
| "step": 493500 |
| }, |
| { |
| "base_loss": 0.5824449016451836, |
| "epoch": 7.107765197753906, |
| "grad_norm": 0.0011570702772587538, |
| "learning_rate": 2.8885841369628908e-06, |
| "lookahead_loss": 6.208592594146729, |
| "loss": 0.5949, |
| "step": 494000 |
| }, |
| { |
| "base_loss": 0.6062813322544098, |
| "epoch": 7.1087188720703125, |
| "grad_norm": 0.0011445485288277268, |
| "learning_rate": 2.8409004211425784e-06, |
| "lookahead_loss": 6.210321761131286, |
| "loss": 0.619, |
| "step": 494500 |
| }, |
| { |
| "base_loss": 0.6082264738082885, |
| "epoch": 7.109672546386719, |
| "grad_norm": 0.0010743378661572933, |
| "learning_rate": 2.7932167053222657e-06, |
| "lookahead_loss": 6.190786142349243, |
| "loss": 0.6172, |
| "step": 495000 |
| }, |
| { |
| "epoch": 7.109672546386719, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.258252050929938, |
| "eval_lookahead_perplexity": 522.3051788848935, |
| "eval_loss": 0.20709487795829773, |
| "eval_perplexity": 1.2300992755521254, |
| "eval_runtime": 90.5758, |
| "eval_samples_per_second": 55.202, |
| "eval_steps_per_second": 1.733, |
| "step": 495000 |
| }, |
| { |
| "base_loss": 0.5961957424879074, |
| "epoch": 7.110626220703125, |
| "grad_norm": 0.0011050713947042823, |
| "learning_rate": 2.7455329895019534e-06, |
| "lookahead_loss": 6.164184574127197, |
| "loss": 0.6072, |
| "step": 495500 |
| }, |
| { |
| "base_loss": 0.5848405210375786, |
| "epoch": 7.111579895019531, |
| "grad_norm": 0.0010936327744275331, |
| "learning_rate": 2.6978492736816406e-06, |
| "lookahead_loss": 6.275397553443908, |
| "loss": 0.5972, |
| "step": 496000 |
| }, |
| { |
| "base_loss": 0.6031162394881249, |
| "epoch": 7.1125335693359375, |
| "grad_norm": 0.001085134455934167, |
| "learning_rate": 2.6501655578613283e-06, |
| "lookahead_loss": 6.301357226371765, |
| "loss": 0.6124, |
| "step": 496500 |
| }, |
| { |
| "base_loss": 0.6153186203241349, |
| "epoch": 7.113487243652344, |
| "grad_norm": 0.0011182770831510425, |
| "learning_rate": 2.602481842041016e-06, |
| "lookahead_loss": 6.307302838802338, |
| "loss": 0.6267, |
| "step": 497000 |
| }, |
| { |
| "base_loss": 0.5994082721471786, |
| "epoch": 7.11444091796875, |
| "grad_norm": 0.0010612837504595518, |
| "learning_rate": 2.554798126220703e-06, |
| "lookahead_loss": 6.18713902759552, |
| "loss": 0.6084, |
| "step": 497500 |
| }, |
| { |
| "base_loss": 0.5841842757463456, |
| "epoch": 7.115394592285156, |
| "grad_norm": 0.0011268676025792956, |
| "learning_rate": 2.507114410400391e-06, |
| "lookahead_loss": 6.234672443389893, |
| "loss": 0.5956, |
| "step": 498000 |
| }, |
| { |
| "base_loss": 0.6037292023897171, |
| "epoch": 7.1163482666015625, |
| "grad_norm": 0.001171495416201651, |
| "learning_rate": 2.459430694580078e-06, |
| "lookahead_loss": 6.24148467540741, |
| "loss": 0.6165, |
| "step": 498500 |
| }, |
| { |
| "base_loss": 0.607548170864582, |
| "epoch": 7.117301940917969, |
| "grad_norm": 0.0011324421502649784, |
| "learning_rate": 2.4117469787597658e-06, |
| "lookahead_loss": 6.238188538551331, |
| "loss": 0.6209, |
| "step": 499000 |
| }, |
| { |
| "base_loss": 0.6047358834147454, |
| "epoch": 7.118255615234375, |
| "grad_norm": 0.0011255667777732015, |
| "learning_rate": 2.3640632629394534e-06, |
| "lookahead_loss": 6.277647980690002, |
| "loss": 0.6121, |
| "step": 499500 |
| }, |
| { |
| "base_loss": 0.5826379895210266, |
| "epoch": 7.119209289550781, |
| "grad_norm": 0.0011003295658156276, |
| "learning_rate": 2.3163795471191407e-06, |
| "lookahead_loss": 6.27165866279602, |
| "loss": 0.5926, |
| "step": 500000 |
| }, |
| { |
| "epoch": 7.119209289550781, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.257852252679892, |
| "eval_lookahead_perplexity": 522.0964039250978, |
| "eval_loss": 0.20709407329559326, |
| "eval_perplexity": 1.2300982857375138, |
| "eval_runtime": 92.2348, |
| "eval_samples_per_second": 54.209, |
| "eval_steps_per_second": 1.702, |
| "step": 500000 |
| }, |
| { |
| "base_loss": 0.5965691907405853, |
| "epoch": 8.000953674316406, |
| "grad_norm": 0.0011052964255213737, |
| "learning_rate": 2.2686958312988284e-06, |
| "lookahead_loss": 6.344482093811036, |
| "loss": 0.6026, |
| "step": 500500 |
| }, |
| { |
| "base_loss": 0.5847149593234062, |
| "epoch": 8.001907348632812, |
| "grad_norm": 0.0011188529897481203, |
| "learning_rate": 2.2210121154785156e-06, |
| "lookahead_loss": 6.181741203308105, |
| "loss": 0.5971, |
| "step": 501000 |
| }, |
| { |
| "base_loss": 0.6025487731099128, |
| "epoch": 8.002861022949219, |
| "grad_norm": 0.0011255404679104686, |
| "learning_rate": 2.1733283996582033e-06, |
| "lookahead_loss": 6.201256879806518, |
| "loss": 0.6123, |
| "step": 501500 |
| }, |
| { |
| "base_loss": 0.6159883877038955, |
| "epoch": 8.003814697265625, |
| "grad_norm": 0.0010819864692166448, |
| "learning_rate": 2.125644683837891e-06, |
| "lookahead_loss": 6.209218832969666, |
| "loss": 0.6228, |
| "step": 502000 |
| }, |
| { |
| "base_loss": 0.5985354263782501, |
| "epoch": 8.004768371582031, |
| "grad_norm": 0.0011081405682489276, |
| "learning_rate": 2.077960968017578e-06, |
| "lookahead_loss": 6.19532088470459, |
| "loss": 0.6057, |
| "step": 502500 |
| }, |
| { |
| "base_loss": 0.59137887185812, |
| "epoch": 8.005722045898438, |
| "grad_norm": 0.0010434648720547557, |
| "learning_rate": 2.030277252197266e-06, |
| "lookahead_loss": 6.318549188613892, |
| "loss": 0.6001, |
| "step": 503000 |
| }, |
| { |
| "base_loss": 0.5807731298208236, |
| "epoch": 8.006675720214844, |
| "grad_norm": 0.0010843180352821946, |
| "learning_rate": 1.982593536376953e-06, |
| "lookahead_loss": 6.1740428781509396, |
| "loss": 0.5959, |
| "step": 503500 |
| }, |
| { |
| "base_loss": 0.6059517723321914, |
| "epoch": 8.00762939453125, |
| "grad_norm": 0.0011015934869647026, |
| "learning_rate": 1.9349098205566408e-06, |
| "lookahead_loss": 6.244119252204895, |
| "loss": 0.6161, |
| "step": 504000 |
| }, |
| { |
| "base_loss": 0.6048649581670761, |
| "epoch": 8.008583068847656, |
| "grad_norm": 0.0010392587864771485, |
| "learning_rate": 1.8872261047363282e-06, |
| "lookahead_loss": 6.230837811946869, |
| "loss": 0.6093, |
| "step": 504500 |
| }, |
| { |
| "base_loss": 0.593385848402977, |
| "epoch": 8.009536743164062, |
| "grad_norm": 0.001120842294767499, |
| "learning_rate": 1.8395423889160157e-06, |
| "lookahead_loss": 6.241628100395203, |
| "loss": 0.608, |
| "step": 505000 |
| }, |
| { |
| "epoch": 8.009536743164062, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.257503026590561, |
| "eval_lookahead_perplexity": 521.9141060731368, |
| "eval_loss": 0.20709335803985596, |
| "eval_perplexity": 1.2300974059029721, |
| "eval_runtime": 91.6562, |
| "eval_samples_per_second": 54.552, |
| "eval_steps_per_second": 1.713, |
| "step": 505000 |
| }, |
| { |
| "base_loss": 0.5941373434662819, |
| "epoch": 8.010490417480469, |
| "grad_norm": 0.0010929540731012821, |
| "learning_rate": 1.7918586730957031e-06, |
| "lookahead_loss": 6.197162329673767, |
| "loss": 0.6035, |
| "step": 505500 |
| }, |
| { |
| "base_loss": 0.5908739617466927, |
| "epoch": 8.011444091796875, |
| "grad_norm": 0.0011226508067920804, |
| "learning_rate": 1.7441749572753908e-06, |
| "lookahead_loss": 6.22726021194458, |
| "loss": 0.6052, |
| "step": 506000 |
| }, |
| { |
| "base_loss": 0.6081678086519241, |
| "epoch": 8.012397766113281, |
| "grad_norm": 0.0010954260360449553, |
| "learning_rate": 1.6964912414550783e-06, |
| "lookahead_loss": 6.215122665405273, |
| "loss": 0.6213, |
| "step": 506500 |
| }, |
| { |
| "base_loss": 0.5956925541162491, |
| "epoch": 8.013351440429688, |
| "grad_norm": 0.0011058412492275238, |
| "learning_rate": 1.6488075256347657e-06, |
| "lookahead_loss": 6.291760811805725, |
| "loss": 0.6083, |
| "step": 507000 |
| }, |
| { |
| "base_loss": 0.5932368034124375, |
| "epoch": 8.014305114746094, |
| "grad_norm": 0.0011063116835430264, |
| "learning_rate": 1.6011238098144532e-06, |
| "lookahead_loss": 6.2699484491348265, |
| "loss": 0.606, |
| "step": 507500 |
| }, |
| { |
| "base_loss": 0.5816438822746277, |
| "epoch": 8.0152587890625, |
| "grad_norm": 0.0011064352001994848, |
| "learning_rate": 1.5534400939941406e-06, |
| "lookahead_loss": 6.210830857753754, |
| "loss": 0.5884, |
| "step": 508000 |
| }, |
| { |
| "base_loss": 0.6093874707818031, |
| "epoch": 8.016212463378906, |
| "grad_norm": 0.0011251309188082814, |
| "learning_rate": 1.505756378173828e-06, |
| "lookahead_loss": 6.271678217887878, |
| "loss": 0.6167, |
| "step": 508500 |
| }, |
| { |
| "base_loss": 0.6042429065108299, |
| "epoch": 8.017166137695312, |
| "grad_norm": 0.001074893050827086, |
| "learning_rate": 1.4580726623535158e-06, |
| "lookahead_loss": 6.3098267641067505, |
| "loss": 0.6111, |
| "step": 509000 |
| }, |
| { |
| "base_loss": 0.5948937609791756, |
| "epoch": 8.018119812011719, |
| "grad_norm": 0.0010518047492951155, |
| "learning_rate": 1.4103889465332032e-06, |
| "lookahead_loss": 6.3096597938537595, |
| "loss": 0.6029, |
| "step": 509500 |
| }, |
| { |
| "base_loss": 0.5853534046411514, |
| "epoch": 8.019073486328125, |
| "grad_norm": 0.0010772800305858254, |
| "learning_rate": 1.3627052307128907e-06, |
| "lookahead_loss": 6.328934656620025, |
| "loss": 0.5975, |
| "step": 510000 |
| }, |
| { |
| "epoch": 8.019073486328125, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.257185920739707, |
| "eval_lookahead_perplexity": 521.7486302945136, |
| "eval_loss": 0.20709271728992462, |
| "eval_perplexity": 1.2300966177183963, |
| "eval_runtime": 89.5208, |
| "eval_samples_per_second": 55.853, |
| "eval_steps_per_second": 1.754, |
| "step": 510000 |
| }, |
| { |
| "base_loss": 0.5883547278642655, |
| "epoch": 8.020027160644531, |
| "grad_norm": 0.0011325281811878085, |
| "learning_rate": 1.3150215148925781e-06, |
| "lookahead_loss": 6.171751028060913, |
| "loss": 0.6033, |
| "step": 510500 |
| }, |
| { |
| "base_loss": 0.6178413493037224, |
| "epoch": 8.020980834960938, |
| "grad_norm": 0.001067935605533421, |
| "learning_rate": 1.2673377990722656e-06, |
| "lookahead_loss": 6.234110792160034, |
| "loss": 0.627, |
| "step": 511000 |
| }, |
| { |
| "base_loss": 0.599259612083435, |
| "epoch": 8.021934509277344, |
| "grad_norm": 0.0011157679837197065, |
| "learning_rate": 1.2196540832519533e-06, |
| "lookahead_loss": 6.210755274772644, |
| "loss": 0.611, |
| "step": 511500 |
| }, |
| { |
| "base_loss": 0.5985046907067298, |
| "epoch": 8.02288818359375, |
| "grad_norm": 0.0010958234779536724, |
| "learning_rate": 1.1719703674316407e-06, |
| "lookahead_loss": 6.240888547897339, |
| "loss": 0.6048, |
| "step": 512000 |
| }, |
| { |
| "base_loss": 0.5953476763367653, |
| "epoch": 8.023841857910156, |
| "grad_norm": 0.001115177758038044, |
| "learning_rate": 1.1242866516113282e-06, |
| "lookahead_loss": 6.217287230491638, |
| "loss": 0.6031, |
| "step": 512500 |
| }, |
| { |
| "base_loss": 0.612789287507534, |
| "epoch": 8.024795532226562, |
| "grad_norm": 0.0010878838365897536, |
| "learning_rate": 1.0766029357910156e-06, |
| "lookahead_loss": 6.225279389858246, |
| "loss": 0.6225, |
| "step": 513000 |
| }, |
| { |
| "base_loss": 0.6079529778957367, |
| "epoch": 8.025749206542969, |
| "grad_norm": 0.001078968751244247, |
| "learning_rate": 1.028919219970703e-06, |
| "lookahead_loss": 6.175548429965973, |
| "loss": 0.6162, |
| "step": 513500 |
| }, |
| { |
| "base_loss": 0.5974433195590972, |
| "epoch": 8.026702880859375, |
| "grad_norm": 0.0011195708066225052, |
| "learning_rate": 9.812355041503908e-07, |
| "lookahead_loss": 6.199098266601562, |
| "loss": 0.6053, |
| "step": 514000 |
| }, |
| { |
| "base_loss": 0.5935445895195007, |
| "epoch": 8.027656555175781, |
| "grad_norm": 0.0010943651432171464, |
| "learning_rate": 9.335517883300781e-07, |
| "lookahead_loss": 6.309714347839355, |
| "loss": 0.6063, |
| "step": 514500 |
| }, |
| { |
| "base_loss": 0.6162749938368798, |
| "epoch": 8.028610229492188, |
| "grad_norm": 0.001078986912034452, |
| "learning_rate": 8.858680725097657e-07, |
| "lookahead_loss": 6.3432404346466065, |
| "loss": 0.6248, |
| "step": 515000 |
| }, |
| { |
| "epoch": 8.028610229492188, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.2569849018852555, |
| "eval_lookahead_perplexity": 521.6437595233942, |
| "eval_loss": 0.20709232985973358, |
| "eval_perplexity": 1.230096141141921, |
| "eval_runtime": 91.7882, |
| "eval_samples_per_second": 54.473, |
| "eval_steps_per_second": 1.71, |
| "step": 515000 |
| }, |
| { |
| "base_loss": 0.6038788543343544, |
| "epoch": 8.029563903808594, |
| "grad_norm": 0.001102432026527822, |
| "learning_rate": 8.381843566894531e-07, |
| "lookahead_loss": 6.264419286251068, |
| "loss": 0.6123, |
| "step": 515500 |
| }, |
| { |
| "base_loss": 0.5950830878019333, |
| "epoch": 8.030517578125, |
| "grad_norm": 0.0011076893424615264, |
| "learning_rate": 7.905006408691407e-07, |
| "lookahead_loss": 6.274682451725006, |
| "loss": 0.6066, |
| "step": 516000 |
| }, |
| { |
| "base_loss": 0.5894574123620987, |
| "epoch": 8.031471252441406, |
| "grad_norm": 0.0011027451837435365, |
| "learning_rate": 7.428169250488282e-07, |
| "lookahead_loss": 6.2933365597724915, |
| "loss": 0.6024, |
| "step": 516500 |
| }, |
| { |
| "base_loss": 0.6123639032840729, |
| "epoch": 8.032424926757812, |
| "grad_norm": 0.001126546529121697, |
| "learning_rate": 6.951332092285156e-07, |
| "lookahead_loss": 6.2396877632141114, |
| "loss": 0.6264, |
| "step": 517000 |
| }, |
| { |
| "base_loss": 0.5966498643159867, |
| "epoch": 8.033378601074219, |
| "grad_norm": 0.0011282344348728657, |
| "learning_rate": 6.474494934082032e-07, |
| "lookahead_loss": 6.2721795229911805, |
| "loss": 0.6068, |
| "step": 517500 |
| }, |
| { |
| "base_loss": 0.5952624140977859, |
| "epoch": 8.034332275390625, |
| "grad_norm": 0.0010613331105560064, |
| "learning_rate": 5.997657775878906e-07, |
| "lookahead_loss": 6.34448275566101, |
| "loss": 0.6088, |
| "step": 518000 |
| }, |
| { |
| "base_loss": 0.5912229750752449, |
| "epoch": 8.035285949707031, |
| "grad_norm": 0.0011148882331326604, |
| "learning_rate": 5.520820617675782e-07, |
| "lookahead_loss": 6.200133923053741, |
| "loss": 0.6025, |
| "step": 518500 |
| }, |
| { |
| "base_loss": 0.6111020909547806, |
| "epoch": 8.036239624023438, |
| "grad_norm": 0.0011170883662998676, |
| "learning_rate": 5.043983459472657e-07, |
| "lookahead_loss": 6.301377963066101, |
| "loss": 0.6212, |
| "step": 519000 |
| }, |
| { |
| "base_loss": 0.6032237566113472, |
| "epoch": 8.037193298339844, |
| "grad_norm": 0.0011249141534790397, |
| "learning_rate": 4.5671463012695317e-07, |
| "lookahead_loss": 6.248824975967407, |
| "loss": 0.6122, |
| "step": 519500 |
| }, |
| { |
| "base_loss": 0.5889247298240662, |
| "epoch": 8.03814697265625, |
| "grad_norm": 0.0010875992011278868, |
| "learning_rate": 4.0903091430664063e-07, |
| "lookahead_loss": 6.256915027618408, |
| "loss": 0.6013, |
| "step": 520000 |
| }, |
| { |
| "epoch": 8.03814697265625, |
| "eval_accuracy": 0.0032320939334637964, |
| "eval_base_loss": 0.19372630566834642, |
| "eval_base_perplexity": 1.2137640371544767, |
| "eval_lookahead_loss": 6.256861397252677, |
| "eval_lookahead_perplexity": 521.5793380807926, |
| "eval_loss": 0.20709213614463806, |
| "eval_perplexity": 1.2300959028537526, |
| "eval_runtime": 91.5112, |
| "eval_samples_per_second": 54.638, |
| "eval_steps_per_second": 1.716, |
| "step": 520000 |
| }, |
| { |
| "base_loss": 0.5958697483539581, |
| "epoch": 8.039100646972656, |
| "grad_norm": 0.0011051874607801437, |
| "learning_rate": 3.6134719848632814e-07, |
| "lookahead_loss": 6.230529176712036, |
| "loss": 0.6107, |
| "step": 520500 |
| }, |
| { |
| "base_loss": 0.6128707799315453, |
| "epoch": 8.040054321289062, |
| "grad_norm": 0.001110102515667677, |
| "learning_rate": 3.1366348266601565e-07, |
| "lookahead_loss": 6.245839037895203, |
| "loss": 0.6219, |
| "step": 521000 |
| }, |
| { |
| "base_loss": 0.6014232878088951, |
| "epoch": 8.041007995605469, |
| "grad_norm": 0.0011205815244466066, |
| "learning_rate": 2.6597976684570316e-07, |
| "lookahead_loss": 6.203103644371033, |
| "loss": 0.6107, |
| "step": 521500 |
| }, |
| { |
| "base_loss": 0.579496483206749, |
| "epoch": 8.041961669921875, |
| "grad_norm": 0.0011324465740472078, |
| "learning_rate": 2.1829605102539064e-07, |
| "lookahead_loss": 6.285241819381714, |
| "loss": 0.5951, |
| "step": 522000 |
| }, |
| { |
| "base_loss": 0.6032205757498741, |
| "epoch": 8.042915344238281, |
| "grad_norm": 0.0010729862842708826, |
| "learning_rate": 1.7061233520507813e-07, |
| "lookahead_loss": 6.298031174659729, |
| "loss": 0.6152, |
| "step": 522500 |
| }, |
| { |
| "base_loss": 0.6130943556427956, |
| "epoch": 8.043869018554688, |
| "grad_norm": 0.0011428052093833685, |
| "learning_rate": 1.2292861938476564e-07, |
| "lookahead_loss": 6.320867140769958, |
| "loss": 0.6249, |
| "step": 523000 |
| }, |
| { |
| "base_loss": 0.592569636464119, |
| "epoch": 8.044822692871094, |
| "grad_norm": 0.0011110154446214437, |
| "learning_rate": 7.524490356445312e-08, |
| "lookahead_loss": 6.253005561828613, |
| "loss": 0.6046, |
| "step": 523500 |
| }, |
| { |
| "base_loss": 0.5920621357560157, |
| "epoch": 8.0457763671875, |
| "grad_norm": 0.001082174712792039, |
| "learning_rate": 2.7561187744140627e-08, |
| "lookahead_loss": 6.253642764091492, |
| "loss": 0.603, |
| "step": 524000 |
| }, |
| { |
| "epoch": 8.04632568359375, |
| "step": 524288, |
| "total_flos": 3.966527920280699e+18, |
| "train_loss": 0.6106607067631558, |
| "train_runtime": 117545.2213, |
| "train_samples_per_second": 142.73, |
| "train_steps_per_second": 4.46 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 524288, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.966527920280699e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|