diff --git "a/checkpoint-18000/trainer_state.json" "b/checkpoint-18000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-18000/trainer_state.json" @@ -0,0 +1,126090 @@ +{ + "best_metric": 3.08333420753479, + "best_model_checkpoint": "models/GPTNeoX-160M-minipile-final/checkpoint-18000", + "epoch": 0.7578947368421053, + "eval_steps": 3000, + "global_step": 18000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.210526315789474e-05, + "grad_norm": 3.890625, + "learning_rate": 1.0526315789473683e-06, + "loss": 7.3413, + "step": 1 + }, + { + "epoch": 8.421052631578948e-05, + "grad_norm": 4.5, + "learning_rate": 2.1052631578947366e-06, + "loss": 7.8082, + "step": 2 + }, + { + "epoch": 0.0001263157894736842, + "grad_norm": 3.640625, + "learning_rate": 3.157894736842105e-06, + "loss": 7.7825, + "step": 3 + }, + { + "epoch": 0.00016842105263157895, + "grad_norm": 4.875, + "learning_rate": 4.210526315789473e-06, + "loss": 8.2535, + "step": 4 + }, + { + "epoch": 0.0002105263157894737, + "grad_norm": 3.875, + "learning_rate": 5.263157894736842e-06, + "loss": 7.7617, + "step": 5 + }, + { + "epoch": 0.0002526315789473684, + "grad_norm": 4.0, + "learning_rate": 6.31578947368421e-06, + "loss": 7.7052, + "step": 6 + }, + { + "epoch": 0.00029473684210526316, + "grad_norm": 5.21875, + "learning_rate": 7.368421052631579e-06, + "loss": 8.0528, + "step": 7 + }, + { + "epoch": 0.0003368421052631579, + "grad_norm": 4.4375, + "learning_rate": 8.421052631578947e-06, + "loss": 8.0914, + "step": 8 + }, + { + "epoch": 0.00037894736842105265, + "grad_norm": 4.15625, + "learning_rate": 9.473684210526317e-06, + "loss": 7.847, + "step": 9 + }, + { + "epoch": 0.0004210526315789474, + "grad_norm": 3.40625, + "learning_rate": 1.0526315789473684e-05, + "loss": 7.3733, + "step": 10 + }, + { + "epoch": 0.0004631578947368421, + "grad_norm": 4.46875, + "learning_rate": 1.1578947368421053e-05, + "loss": 8.2678, + "step": 11 + }, + { + "epoch": 0.0005052631578947368, + "grad_norm": 3.890625, + "learning_rate": 1.263157894736842e-05, + "loss": 7.9476, + "step": 12 + }, + { + "epoch": 0.0005473684210526316, + "grad_norm": 4.5, + "learning_rate": 1.368421052631579e-05, + "loss": 8.2657, + "step": 13 + }, + { + "epoch": 0.0005894736842105263, + "grad_norm": 4.625, + "learning_rate": 1.4736842105263159e-05, + "loss": 7.954, + "step": 14 + }, + { + "epoch": 0.0006315789473684211, + "grad_norm": 4.28125, + "learning_rate": 1.5789473684210526e-05, + "loss": 7.2995, + "step": 15 + }, + { + "epoch": 0.0006736842105263158, + "grad_norm": 3.859375, + "learning_rate": 1.6842105263157893e-05, + "loss": 7.876, + "step": 16 + }, + { + "epoch": 0.0007157894736842105, + "grad_norm": 4.3125, + "learning_rate": 1.7894736842105264e-05, + "loss": 7.925, + "step": 17 + }, + { + "epoch": 0.0007578947368421053, + "grad_norm": 3.078125, + "learning_rate": 1.8947368421052634e-05, + "loss": 7.2259, + "step": 18 + }, + { + "epoch": 0.0008, + "grad_norm": 5.34375, + "learning_rate": 2e-05, + "loss": 8.2554, + "step": 19 + }, + { + "epoch": 0.0008421052631578948, + "grad_norm": 3.109375, + "learning_rate": 2.105263157894737e-05, + "loss": 7.42, + "step": 20 + }, + { + "epoch": 0.0008842105263157895, + "grad_norm": 2.921875, + "learning_rate": 2.2105263157894736e-05, + "loss": 7.3183, + "step": 21 + }, + { + "epoch": 0.0009263157894736842, + "grad_norm": 3.21875, + "learning_rate": 2.3157894736842107e-05, + "loss": 7.8771, + "step": 22 + }, + { + "epoch": 0.0009684210526315789, + "grad_norm": 2.734375, + "learning_rate": 2.4210526315789474e-05, + "loss": 7.4952, + "step": 23 + }, + { + "epoch": 0.0010105263157894737, + "grad_norm": 2.71875, + "learning_rate": 2.526315789473684e-05, + "loss": 7.5569, + "step": 24 + }, + { + "epoch": 0.0010526315789473684, + "grad_norm": 2.859375, + "learning_rate": 2.631578947368421e-05, + "loss": 6.8489, + "step": 25 + }, + { + "epoch": 0.0010947368421052631, + "grad_norm": 2.84375, + "learning_rate": 2.736842105263158e-05, + "loss": 7.8147, + "step": 26 + }, + { + "epoch": 0.0011368421052631579, + "grad_norm": 1.9296875, + "learning_rate": 2.842105263157895e-05, + "loss": 6.961, + "step": 27 + }, + { + "epoch": 0.0011789473684210526, + "grad_norm": 2.625, + "learning_rate": 2.9473684210526317e-05, + "loss": 7.421, + "step": 28 + }, + { + "epoch": 0.0012210526315789474, + "grad_norm": 2.03125, + "learning_rate": 3.0526315789473684e-05, + "loss": 6.9197, + "step": 29 + }, + { + "epoch": 0.0012631578947368421, + "grad_norm": 2.703125, + "learning_rate": 3.157894736842105e-05, + "loss": 7.7633, + "step": 30 + }, + { + "epoch": 0.0013052631578947369, + "grad_norm": 2.0625, + "learning_rate": 3.263157894736842e-05, + "loss": 6.9348, + "step": 31 + }, + { + "epoch": 0.0013473684210526316, + "grad_norm": 1.875, + "learning_rate": 3.3684210526315786e-05, + "loss": 6.9246, + "step": 32 + }, + { + "epoch": 0.0013894736842105264, + "grad_norm": 2.1875, + "learning_rate": 3.473684210526316e-05, + "loss": 7.4443, + "step": 33 + }, + { + "epoch": 0.001431578947368421, + "grad_norm": 1.65625, + "learning_rate": 3.578947368421053e-05, + "loss": 7.2099, + "step": 34 + }, + { + "epoch": 0.0014736842105263158, + "grad_norm": 1.859375, + "learning_rate": 3.6842105263157895e-05, + "loss": 6.8235, + "step": 35 + }, + { + "epoch": 0.0015157894736842106, + "grad_norm": 1.953125, + "learning_rate": 3.789473684210527e-05, + "loss": 6.9099, + "step": 36 + }, + { + "epoch": 0.0015578947368421053, + "grad_norm": 1.90625, + "learning_rate": 3.8947368421052636e-05, + "loss": 6.941, + "step": 37 + }, + { + "epoch": 0.0016, + "grad_norm": 1.6328125, + "learning_rate": 4e-05, + "loss": 6.6937, + "step": 38 + }, + { + "epoch": 0.0016421052631578948, + "grad_norm": 1.734375, + "learning_rate": 4.105263157894737e-05, + "loss": 7.0925, + "step": 39 + }, + { + "epoch": 0.0016842105263157896, + "grad_norm": 1.4609375, + "learning_rate": 4.210526315789474e-05, + "loss": 6.8102, + "step": 40 + }, + { + "epoch": 0.0017263157894736843, + "grad_norm": 2.078125, + "learning_rate": 4.3157894736842105e-05, + "loss": 6.9307, + "step": 41 + }, + { + "epoch": 0.001768421052631579, + "grad_norm": 1.6796875, + "learning_rate": 4.421052631578947e-05, + "loss": 6.3676, + "step": 42 + }, + { + "epoch": 0.0018105263157894736, + "grad_norm": 1.78125, + "learning_rate": 4.5263157894736846e-05, + "loss": 7.1898, + "step": 43 + }, + { + "epoch": 0.0018526315789473683, + "grad_norm": 1.4765625, + "learning_rate": 4.6315789473684214e-05, + "loss": 6.763, + "step": 44 + }, + { + "epoch": 0.001894736842105263, + "grad_norm": 1.34375, + "learning_rate": 4.736842105263158e-05, + "loss": 6.5191, + "step": 45 + }, + { + "epoch": 0.0019368421052631578, + "grad_norm": 1.578125, + "learning_rate": 4.842105263157895e-05, + "loss": 6.8033, + "step": 46 + }, + { + "epoch": 0.0019789473684210528, + "grad_norm": 1.765625, + "learning_rate": 4.9473684210526315e-05, + "loss": 6.9758, + "step": 47 + }, + { + "epoch": 0.0020210526315789473, + "grad_norm": 1.8359375, + "learning_rate": 5.052631578947368e-05, + "loss": 7.1849, + "step": 48 + }, + { + "epoch": 0.0020631578947368423, + "grad_norm": 2.296875, + "learning_rate": 5.157894736842105e-05, + "loss": 5.9802, + "step": 49 + }, + { + "epoch": 0.002105263157894737, + "grad_norm": 1.4453125, + "learning_rate": 5.263157894736842e-05, + "loss": 6.4116, + "step": 50 + }, + { + "epoch": 0.0021473684210526318, + "grad_norm": 1.2890625, + "learning_rate": 5.368421052631579e-05, + "loss": 6.084, + "step": 51 + }, + { + "epoch": 0.0021894736842105263, + "grad_norm": 1.3828125, + "learning_rate": 5.473684210526316e-05, + "loss": 6.541, + "step": 52 + }, + { + "epoch": 0.0022315789473684212, + "grad_norm": 1.25, + "learning_rate": 5.5789473684210526e-05, + "loss": 6.3551, + "step": 53 + }, + { + "epoch": 0.0022736842105263158, + "grad_norm": 1.640625, + "learning_rate": 5.68421052631579e-05, + "loss": 6.6938, + "step": 54 + }, + { + "epoch": 0.0023157894736842107, + "grad_norm": 1.265625, + "learning_rate": 5.789473684210527e-05, + "loss": 6.2223, + "step": 55 + }, + { + "epoch": 0.0023578947368421053, + "grad_norm": 1.265625, + "learning_rate": 5.8947368421052634e-05, + "loss": 6.2336, + "step": 56 + }, + { + "epoch": 0.0024, + "grad_norm": 1.0703125, + "learning_rate": 6e-05, + "loss": 6.0368, + "step": 57 + }, + { + "epoch": 0.0024421052631578948, + "grad_norm": 1.1484375, + "learning_rate": 6.105263157894737e-05, + "loss": 6.2726, + "step": 58 + }, + { + "epoch": 0.0024842105263157893, + "grad_norm": 1.578125, + "learning_rate": 6.210526315789474e-05, + "loss": 6.8905, + "step": 59 + }, + { + "epoch": 0.0025263157894736842, + "grad_norm": 1.1328125, + "learning_rate": 6.31578947368421e-05, + "loss": 6.0762, + "step": 60 + }, + { + "epoch": 0.0025684210526315788, + "grad_norm": 1.6640625, + "learning_rate": 6.421052631578946e-05, + "loss": 6.2807, + "step": 61 + }, + { + "epoch": 0.0026105263157894737, + "grad_norm": 1.703125, + "learning_rate": 6.526315789473684e-05, + "loss": 6.2477, + "step": 62 + }, + { + "epoch": 0.0026526315789473683, + "grad_norm": 1.2421875, + "learning_rate": 6.631578947368421e-05, + "loss": 6.0277, + "step": 63 + }, + { + "epoch": 0.0026947368421052632, + "grad_norm": 1.5234375, + "learning_rate": 6.736842105263157e-05, + "loss": 5.8858, + "step": 64 + }, + { + "epoch": 0.0027368421052631577, + "grad_norm": 1.0546875, + "learning_rate": 6.842105263157896e-05, + "loss": 6.3088, + "step": 65 + }, + { + "epoch": 0.0027789473684210527, + "grad_norm": 1.03125, + "learning_rate": 6.947368421052632e-05, + "loss": 6.5203, + "step": 66 + }, + { + "epoch": 0.0028210526315789472, + "grad_norm": 1.3671875, + "learning_rate": 7.05263157894737e-05, + "loss": 6.5611, + "step": 67 + }, + { + "epoch": 0.002863157894736842, + "grad_norm": 1.0546875, + "learning_rate": 7.157894736842105e-05, + "loss": 6.0409, + "step": 68 + }, + { + "epoch": 0.0029052631578947367, + "grad_norm": 0.91015625, + "learning_rate": 7.263157894736843e-05, + "loss": 5.9745, + "step": 69 + }, + { + "epoch": 0.0029473684210526317, + "grad_norm": 1.15625, + "learning_rate": 7.368421052631579e-05, + "loss": 5.946, + "step": 70 + }, + { + "epoch": 0.002989473684210526, + "grad_norm": 1.1953125, + "learning_rate": 7.473684210526316e-05, + "loss": 6.0341, + "step": 71 + }, + { + "epoch": 0.003031578947368421, + "grad_norm": 0.91015625, + "learning_rate": 7.578947368421054e-05, + "loss": 5.6885, + "step": 72 + }, + { + "epoch": 0.0030736842105263157, + "grad_norm": 1.2109375, + "learning_rate": 7.68421052631579e-05, + "loss": 6.4778, + "step": 73 + }, + { + "epoch": 0.0031157894736842107, + "grad_norm": 0.9921875, + "learning_rate": 7.789473684210527e-05, + "loss": 6.1629, + "step": 74 + }, + { + "epoch": 0.003157894736842105, + "grad_norm": 1.140625, + "learning_rate": 7.894736842105263e-05, + "loss": 5.6599, + "step": 75 + }, + { + "epoch": 0.0032, + "grad_norm": 2.0, + "learning_rate": 8e-05, + "loss": 5.901, + "step": 76 + }, + { + "epoch": 0.0032421052631578947, + "grad_norm": 1.09375, + "learning_rate": 8.105263157894737e-05, + "loss": 6.076, + "step": 77 + }, + { + "epoch": 0.0032842105263157896, + "grad_norm": 0.8203125, + "learning_rate": 8.210526315789474e-05, + "loss": 5.7718, + "step": 78 + }, + { + "epoch": 0.003326315789473684, + "grad_norm": 0.88671875, + "learning_rate": 8.315789473684212e-05, + "loss": 5.7908, + "step": 79 + }, + { + "epoch": 0.003368421052631579, + "grad_norm": 1.09375, + "learning_rate": 8.421052631578948e-05, + "loss": 5.9521, + "step": 80 + }, + { + "epoch": 0.0034105263157894737, + "grad_norm": 1.0390625, + "learning_rate": 8.526315789473685e-05, + "loss": 6.0692, + "step": 81 + }, + { + "epoch": 0.0034526315789473686, + "grad_norm": 0.8046875, + "learning_rate": 8.631578947368421e-05, + "loss": 5.7741, + "step": 82 + }, + { + "epoch": 0.003494736842105263, + "grad_norm": 1.1640625, + "learning_rate": 8.736842105263158e-05, + "loss": 5.6126, + "step": 83 + }, + { + "epoch": 0.003536842105263158, + "grad_norm": 1.2890625, + "learning_rate": 8.842105263157894e-05, + "loss": 5.8609, + "step": 84 + }, + { + "epoch": 0.0035789473684210526, + "grad_norm": 0.84765625, + "learning_rate": 8.947368421052632e-05, + "loss": 5.8477, + "step": 85 + }, + { + "epoch": 0.003621052631578947, + "grad_norm": 0.8671875, + "learning_rate": 9.052631578947369e-05, + "loss": 5.8577, + "step": 86 + }, + { + "epoch": 0.003663157894736842, + "grad_norm": 1.2109375, + "learning_rate": 9.157894736842105e-05, + "loss": 5.9344, + "step": 87 + }, + { + "epoch": 0.0037052631578947367, + "grad_norm": 1.5390625, + "learning_rate": 9.263157894736843e-05, + "loss": 6.1071, + "step": 88 + }, + { + "epoch": 0.0037473684210526316, + "grad_norm": 1.046875, + "learning_rate": 9.368421052631579e-05, + "loss": 5.7145, + "step": 89 + }, + { + "epoch": 0.003789473684210526, + "grad_norm": 0.72265625, + "learning_rate": 9.473684210526316e-05, + "loss": 5.8452, + "step": 90 + }, + { + "epoch": 0.003831578947368421, + "grad_norm": 0.9765625, + "learning_rate": 9.578947368421052e-05, + "loss": 5.6488, + "step": 91 + }, + { + "epoch": 0.0038736842105263156, + "grad_norm": 0.95703125, + "learning_rate": 9.68421052631579e-05, + "loss": 5.5844, + "step": 92 + }, + { + "epoch": 0.00391578947368421, + "grad_norm": 0.84765625, + "learning_rate": 9.789473684210526e-05, + "loss": 5.7262, + "step": 93 + }, + { + "epoch": 0.0039578947368421056, + "grad_norm": 0.9609375, + "learning_rate": 9.894736842105263e-05, + "loss": 5.6465, + "step": 94 + }, + { + "epoch": 0.004, + "grad_norm": 0.96484375, + "learning_rate": 0.0001, + "loss": 5.7792, + "step": 95 + }, + { + "epoch": 0.004042105263157895, + "grad_norm": 1.3203125, + "learning_rate": 0.00010105263157894737, + "loss": 5.8167, + "step": 96 + }, + { + "epoch": 0.004084210526315789, + "grad_norm": 0.84765625, + "learning_rate": 0.00010210526315789474, + "loss": 5.7599, + "step": 97 + }, + { + "epoch": 0.0041263157894736845, + "grad_norm": 1.140625, + "learning_rate": 0.0001031578947368421, + "loss": 5.5593, + "step": 98 + }, + { + "epoch": 0.004168421052631579, + "grad_norm": 0.8828125, + "learning_rate": 0.00010421052631578947, + "loss": 5.6243, + "step": 99 + }, + { + "epoch": 0.004210526315789474, + "grad_norm": 0.66015625, + "learning_rate": 0.00010526315789473683, + "loss": 5.756, + "step": 100 + }, + { + "epoch": 0.004252631578947368, + "grad_norm": 0.69921875, + "learning_rate": 0.00010631578947368421, + "loss": 5.5719, + "step": 101 + }, + { + "epoch": 0.0042947368421052635, + "grad_norm": 0.7109375, + "learning_rate": 0.00010736842105263158, + "loss": 5.418, + "step": 102 + }, + { + "epoch": 0.004336842105263158, + "grad_norm": 1.3046875, + "learning_rate": 0.00010842105263157894, + "loss": 5.5703, + "step": 103 + }, + { + "epoch": 0.004378947368421053, + "grad_norm": 1.0859375, + "learning_rate": 0.00010947368421052632, + "loss": 5.6246, + "step": 104 + }, + { + "epoch": 0.004421052631578947, + "grad_norm": 1.1484375, + "learning_rate": 0.00011052631578947368, + "loss": 5.8189, + "step": 105 + }, + { + "epoch": 0.0044631578947368425, + "grad_norm": 0.82421875, + "learning_rate": 0.00011157894736842105, + "loss": 5.4517, + "step": 106 + }, + { + "epoch": 0.004505263157894737, + "grad_norm": 0.77734375, + "learning_rate": 0.00011263157894736841, + "loss": 5.4263, + "step": 107 + }, + { + "epoch": 0.0045473684210526315, + "grad_norm": 1.015625, + "learning_rate": 0.0001136842105263158, + "loss": 5.5597, + "step": 108 + }, + { + "epoch": 0.004589473684210526, + "grad_norm": 1.15625, + "learning_rate": 0.00011473684210526316, + "loss": 5.6423, + "step": 109 + }, + { + "epoch": 0.0046315789473684215, + "grad_norm": 1.0234375, + "learning_rate": 0.00011578947368421053, + "loss": 5.6542, + "step": 110 + }, + { + "epoch": 0.004673684210526316, + "grad_norm": 1.0859375, + "learning_rate": 0.00011684210526315791, + "loss": 5.5691, + "step": 111 + }, + { + "epoch": 0.0047157894736842105, + "grad_norm": 1.1875, + "learning_rate": 0.00011789473684210527, + "loss": 5.4473, + "step": 112 + }, + { + "epoch": 0.004757894736842105, + "grad_norm": 0.7109375, + "learning_rate": 0.00011894736842105264, + "loss": 5.4498, + "step": 113 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8671875, + "learning_rate": 0.00012, + "loss": 5.67, + "step": 114 + }, + { + "epoch": 0.004842105263157895, + "grad_norm": 1.2734375, + "learning_rate": 0.00012105263157894738, + "loss": 5.3806, + "step": 115 + }, + { + "epoch": 0.0048842105263157895, + "grad_norm": 0.83984375, + "learning_rate": 0.00012210526315789474, + "loss": 5.4164, + "step": 116 + }, + { + "epoch": 0.004926315789473684, + "grad_norm": 0.84765625, + "learning_rate": 0.0001231578947368421, + "loss": 5.2849, + "step": 117 + }, + { + "epoch": 0.0049684210526315786, + "grad_norm": 0.9609375, + "learning_rate": 0.00012421052631578949, + "loss": 5.4942, + "step": 118 + }, + { + "epoch": 0.005010526315789474, + "grad_norm": 0.86328125, + "learning_rate": 0.00012526315789473686, + "loss": 5.7359, + "step": 119 + }, + { + "epoch": 0.0050526315789473685, + "grad_norm": 0.671875, + "learning_rate": 0.0001263157894736842, + "loss": 5.5311, + "step": 120 + }, + { + "epoch": 0.005094736842105263, + "grad_norm": 1.1484375, + "learning_rate": 0.00012736842105263158, + "loss": 5.3792, + "step": 121 + }, + { + "epoch": 0.0051368421052631575, + "grad_norm": 0.74609375, + "learning_rate": 0.00012842105263157893, + "loss": 5.4266, + "step": 122 + }, + { + "epoch": 0.005178947368421053, + "grad_norm": 0.9375, + "learning_rate": 0.00012947368421052633, + "loss": 5.401, + "step": 123 + }, + { + "epoch": 0.0052210526315789475, + "grad_norm": 0.640625, + "learning_rate": 0.00013052631578947368, + "loss": 5.1973, + "step": 124 + }, + { + "epoch": 0.005263157894736842, + "grad_norm": 0.99609375, + "learning_rate": 0.00013157894736842105, + "loss": 5.287, + "step": 125 + }, + { + "epoch": 0.0053052631578947365, + "grad_norm": 1.078125, + "learning_rate": 0.00013263157894736842, + "loss": 5.3064, + "step": 126 + }, + { + "epoch": 0.005347368421052632, + "grad_norm": 1.296875, + "learning_rate": 0.0001336842105263158, + "loss": 5.2671, + "step": 127 + }, + { + "epoch": 0.0053894736842105264, + "grad_norm": 1.28125, + "learning_rate": 0.00013473684210526314, + "loss": 5.3091, + "step": 128 + }, + { + "epoch": 0.005431578947368421, + "grad_norm": 1.0859375, + "learning_rate": 0.00013578947368421052, + "loss": 5.3557, + "step": 129 + }, + { + "epoch": 0.0054736842105263155, + "grad_norm": 0.78515625, + "learning_rate": 0.00013684210526315792, + "loss": 5.4329, + "step": 130 + }, + { + "epoch": 0.005515789473684211, + "grad_norm": 0.82421875, + "learning_rate": 0.00013789473684210527, + "loss": 5.1083, + "step": 131 + }, + { + "epoch": 0.005557894736842105, + "grad_norm": 1.0859375, + "learning_rate": 0.00013894736842105264, + "loss": 5.3215, + "step": 132 + }, + { + "epoch": 0.0056, + "grad_norm": 1.1015625, + "learning_rate": 0.00014000000000000001, + "loss": 5.1231, + "step": 133 + }, + { + "epoch": 0.0056421052631578945, + "grad_norm": 0.82421875, + "learning_rate": 0.0001410526315789474, + "loss": 5.2299, + "step": 134 + }, + { + "epoch": 0.00568421052631579, + "grad_norm": 0.875, + "learning_rate": 0.00014210526315789474, + "loss": 5.3445, + "step": 135 + }, + { + "epoch": 0.005726315789473684, + "grad_norm": 0.75390625, + "learning_rate": 0.0001431578947368421, + "loss": 5.0036, + "step": 136 + }, + { + "epoch": 0.005768421052631579, + "grad_norm": 0.9765625, + "learning_rate": 0.00014421052631578948, + "loss": 4.9686, + "step": 137 + }, + { + "epoch": 0.0058105263157894734, + "grad_norm": 0.8984375, + "learning_rate": 0.00014526315789473686, + "loss": 5.2371, + "step": 138 + }, + { + "epoch": 0.005852631578947369, + "grad_norm": 0.8046875, + "learning_rate": 0.0001463157894736842, + "loss": 5.2915, + "step": 139 + }, + { + "epoch": 0.005894736842105263, + "grad_norm": 0.83984375, + "learning_rate": 0.00014736842105263158, + "loss": 5.2411, + "step": 140 + }, + { + "epoch": 0.005936842105263158, + "grad_norm": 0.80859375, + "learning_rate": 0.00014842105263157895, + "loss": 5.2067, + "step": 141 + }, + { + "epoch": 0.005978947368421052, + "grad_norm": 0.625, + "learning_rate": 0.00014947368421052633, + "loss": 5.2988, + "step": 142 + }, + { + "epoch": 0.006021052631578947, + "grad_norm": 1.234375, + "learning_rate": 0.00015052631578947367, + "loss": 5.2165, + "step": 143 + }, + { + "epoch": 0.006063157894736842, + "grad_norm": 0.94921875, + "learning_rate": 0.00015157894736842108, + "loss": 5.2836, + "step": 144 + }, + { + "epoch": 0.006105263157894737, + "grad_norm": 0.8515625, + "learning_rate": 0.00015263157894736842, + "loss": 5.1639, + "step": 145 + }, + { + "epoch": 0.006147368421052631, + "grad_norm": 0.734375, + "learning_rate": 0.0001536842105263158, + "loss": 5.0287, + "step": 146 + }, + { + "epoch": 0.006189473684210526, + "grad_norm": 0.99609375, + "learning_rate": 0.00015473684210526314, + "loss": 4.9608, + "step": 147 + }, + { + "epoch": 0.006231578947368421, + "grad_norm": 0.7421875, + "learning_rate": 0.00015578947368421054, + "loss": 5.2198, + "step": 148 + }, + { + "epoch": 0.006273684210526316, + "grad_norm": 0.8515625, + "learning_rate": 0.0001568421052631579, + "loss": 5.2419, + "step": 149 + }, + { + "epoch": 0.00631578947368421, + "grad_norm": 0.79296875, + "learning_rate": 0.00015789473684210527, + "loss": 5.0146, + "step": 150 + }, + { + "epoch": 0.006357894736842105, + "grad_norm": 0.95703125, + "learning_rate": 0.00015894736842105264, + "loss": 5.2386, + "step": 151 + }, + { + "epoch": 0.0064, + "grad_norm": 0.96875, + "learning_rate": 0.00016, + "loss": 5.1775, + "step": 152 + }, + { + "epoch": 0.006442105263157895, + "grad_norm": 0.6875, + "learning_rate": 0.00016105263157894736, + "loss": 5.1883, + "step": 153 + }, + { + "epoch": 0.006484210526315789, + "grad_norm": 0.90234375, + "learning_rate": 0.00016210526315789473, + "loss": 5.1067, + "step": 154 + }, + { + "epoch": 0.006526315789473684, + "grad_norm": 0.88671875, + "learning_rate": 0.0001631578947368421, + "loss": 5.0509, + "step": 155 + }, + { + "epoch": 0.006568421052631579, + "grad_norm": 0.91796875, + "learning_rate": 0.00016421052631578948, + "loss": 4.9433, + "step": 156 + }, + { + "epoch": 0.006610526315789474, + "grad_norm": 0.76953125, + "learning_rate": 0.00016526315789473683, + "loss": 5.23, + "step": 157 + }, + { + "epoch": 0.006652631578947368, + "grad_norm": 0.98046875, + "learning_rate": 0.00016631578947368423, + "loss": 5.023, + "step": 158 + }, + { + "epoch": 0.006694736842105263, + "grad_norm": 0.8828125, + "learning_rate": 0.00016736842105263158, + "loss": 5.0362, + "step": 159 + }, + { + "epoch": 0.006736842105263158, + "grad_norm": 0.66015625, + "learning_rate": 0.00016842105263157895, + "loss": 5.2109, + "step": 160 + }, + { + "epoch": 0.006778947368421053, + "grad_norm": 0.84375, + "learning_rate": 0.0001694736842105263, + "loss": 5.1547, + "step": 161 + }, + { + "epoch": 0.006821052631578947, + "grad_norm": 0.79296875, + "learning_rate": 0.0001705263157894737, + "loss": 5.0887, + "step": 162 + }, + { + "epoch": 0.006863157894736842, + "grad_norm": 0.6953125, + "learning_rate": 0.00017157894736842105, + "loss": 5.193, + "step": 163 + }, + { + "epoch": 0.006905263157894737, + "grad_norm": 0.734375, + "learning_rate": 0.00017263157894736842, + "loss": 5.2748, + "step": 164 + }, + { + "epoch": 0.006947368421052632, + "grad_norm": 0.60546875, + "learning_rate": 0.0001736842105263158, + "loss": 5.1514, + "step": 165 + }, + { + "epoch": 0.006989473684210526, + "grad_norm": 0.8125, + "learning_rate": 0.00017473684210526317, + "loss": 4.7939, + "step": 166 + }, + { + "epoch": 0.007031578947368421, + "grad_norm": 0.953125, + "learning_rate": 0.00017578947368421052, + "loss": 4.9078, + "step": 167 + }, + { + "epoch": 0.007073684210526316, + "grad_norm": 0.9375, + "learning_rate": 0.0001768421052631579, + "loss": 5.2173, + "step": 168 + }, + { + "epoch": 0.007115789473684211, + "grad_norm": 0.8671875, + "learning_rate": 0.00017789473684210526, + "loss": 4.913, + "step": 169 + }, + { + "epoch": 0.007157894736842105, + "grad_norm": 0.8125, + "learning_rate": 0.00017894736842105264, + "loss": 5.1521, + "step": 170 + }, + { + "epoch": 0.0072, + "grad_norm": 0.875, + "learning_rate": 0.00017999999999999998, + "loss": 4.9807, + "step": 171 + }, + { + "epoch": 0.007242105263157894, + "grad_norm": 0.6328125, + "learning_rate": 0.00018105263157894739, + "loss": 4.9046, + "step": 172 + }, + { + "epoch": 0.00728421052631579, + "grad_norm": 1.0546875, + "learning_rate": 0.00018210526315789476, + "loss": 5.2114, + "step": 173 + }, + { + "epoch": 0.007326315789473684, + "grad_norm": 1.2421875, + "learning_rate": 0.0001831578947368421, + "loss": 5.0849, + "step": 174 + }, + { + "epoch": 0.007368421052631579, + "grad_norm": 0.99609375, + "learning_rate": 0.00018421052631578948, + "loss": 4.9578, + "step": 175 + }, + { + "epoch": 0.007410526315789473, + "grad_norm": 0.87109375, + "learning_rate": 0.00018526315789473685, + "loss": 5.1629, + "step": 176 + }, + { + "epoch": 0.007452631578947369, + "grad_norm": 0.66015625, + "learning_rate": 0.00018631578947368423, + "loss": 5.0355, + "step": 177 + }, + { + "epoch": 0.007494736842105263, + "grad_norm": 1.1328125, + "learning_rate": 0.00018736842105263158, + "loss": 5.1637, + "step": 178 + }, + { + "epoch": 0.007536842105263158, + "grad_norm": 1.140625, + "learning_rate": 0.00018842105263157895, + "loss": 4.9553, + "step": 179 + }, + { + "epoch": 0.007578947368421052, + "grad_norm": 0.91015625, + "learning_rate": 0.00018947368421052632, + "loss": 4.7738, + "step": 180 + }, + { + "epoch": 0.007621052631578948, + "grad_norm": 0.69140625, + "learning_rate": 0.0001905263157894737, + "loss": 5.2183, + "step": 181 + }, + { + "epoch": 0.007663157894736842, + "grad_norm": 0.99609375, + "learning_rate": 0.00019157894736842104, + "loss": 4.9233, + "step": 182 + }, + { + "epoch": 0.007705263157894737, + "grad_norm": 0.73828125, + "learning_rate": 0.00019263157894736845, + "loss": 5.0219, + "step": 183 + }, + { + "epoch": 0.007747368421052631, + "grad_norm": 1.1484375, + "learning_rate": 0.0001936842105263158, + "loss": 4.6389, + "step": 184 + }, + { + "epoch": 0.007789473684210527, + "grad_norm": 0.76953125, + "learning_rate": 0.00019473684210526317, + "loss": 4.9254, + "step": 185 + }, + { + "epoch": 0.00783157894736842, + "grad_norm": 0.75, + "learning_rate": 0.0001957894736842105, + "loss": 4.8345, + "step": 186 + }, + { + "epoch": 0.007873684210526316, + "grad_norm": 0.609375, + "learning_rate": 0.00019684210526315791, + "loss": 5.1883, + "step": 187 + }, + { + "epoch": 0.007915789473684211, + "grad_norm": 0.703125, + "learning_rate": 0.00019789473684210526, + "loss": 4.916, + "step": 188 + }, + { + "epoch": 0.007957894736842105, + "grad_norm": 0.79296875, + "learning_rate": 0.00019894736842105264, + "loss": 4.886, + "step": 189 + }, + { + "epoch": 0.008, + "grad_norm": 1.109375, + "learning_rate": 0.0002, + "loss": 4.6236, + "step": 190 + }, + { + "epoch": 0.008042105263157896, + "grad_norm": 0.703125, + "learning_rate": 0.00020105263157894738, + "loss": 4.8837, + "step": 191 + }, + { + "epoch": 0.00808421052631579, + "grad_norm": 0.87109375, + "learning_rate": 0.00020210526315789473, + "loss": 4.6736, + "step": 192 + }, + { + "epoch": 0.008126315789473685, + "grad_norm": 0.6875, + "learning_rate": 0.0002031578947368421, + "loss": 4.9368, + "step": 193 + }, + { + "epoch": 0.008168421052631578, + "grad_norm": 0.6875, + "learning_rate": 0.00020421052631578948, + "loss": 5.2258, + "step": 194 + }, + { + "epoch": 0.008210526315789474, + "grad_norm": 0.67578125, + "learning_rate": 0.00020526315789473685, + "loss": 4.7898, + "step": 195 + }, + { + "epoch": 0.008252631578947369, + "grad_norm": 0.8515625, + "learning_rate": 0.0002063157894736842, + "loss": 4.9005, + "step": 196 + }, + { + "epoch": 0.008294736842105263, + "grad_norm": 0.953125, + "learning_rate": 0.0002073684210526316, + "loss": 4.882, + "step": 197 + }, + { + "epoch": 0.008336842105263158, + "grad_norm": 1.140625, + "learning_rate": 0.00020842105263157895, + "loss": 5.1039, + "step": 198 + }, + { + "epoch": 0.008378947368421052, + "grad_norm": 1.125, + "learning_rate": 0.00020947368421052632, + "loss": 4.9803, + "step": 199 + }, + { + "epoch": 0.008421052631578947, + "grad_norm": 1.4296875, + "learning_rate": 0.00021052631578947367, + "loss": 4.8663, + "step": 200 + }, + { + "epoch": 0.008463157894736843, + "grad_norm": 0.79296875, + "learning_rate": 0.00021157894736842107, + "loss": 4.7288, + "step": 201 + }, + { + "epoch": 0.008505263157894736, + "grad_norm": 0.9921875, + "learning_rate": 0.00021263157894736842, + "loss": 4.7374, + "step": 202 + }, + { + "epoch": 0.008547368421052632, + "grad_norm": 0.8828125, + "learning_rate": 0.0002136842105263158, + "loss": 4.6774, + "step": 203 + }, + { + "epoch": 0.008589473684210527, + "grad_norm": 0.9609375, + "learning_rate": 0.00021473684210526316, + "loss": 4.8405, + "step": 204 + }, + { + "epoch": 0.00863157894736842, + "grad_norm": 0.8515625, + "learning_rate": 0.00021578947368421054, + "loss": 4.8519, + "step": 205 + }, + { + "epoch": 0.008673684210526316, + "grad_norm": 0.796875, + "learning_rate": 0.00021684210526315789, + "loss": 5.0526, + "step": 206 + }, + { + "epoch": 0.00871578947368421, + "grad_norm": 0.84375, + "learning_rate": 0.00021789473684210526, + "loss": 5.0063, + "step": 207 + }, + { + "epoch": 0.008757894736842105, + "grad_norm": 0.828125, + "learning_rate": 0.00021894736842105263, + "loss": 5.0228, + "step": 208 + }, + { + "epoch": 0.0088, + "grad_norm": 0.7109375, + "learning_rate": 0.00022, + "loss": 5.1899, + "step": 209 + }, + { + "epoch": 0.008842105263157894, + "grad_norm": 1.125, + "learning_rate": 0.00022105263157894735, + "loss": 4.7837, + "step": 210 + }, + { + "epoch": 0.00888421052631579, + "grad_norm": 0.73046875, + "learning_rate": 0.00022210526315789476, + "loss": 5.0637, + "step": 211 + }, + { + "epoch": 0.008926315789473685, + "grad_norm": 1.1171875, + "learning_rate": 0.0002231578947368421, + "loss": 4.8577, + "step": 212 + }, + { + "epoch": 0.008968421052631579, + "grad_norm": 1.234375, + "learning_rate": 0.00022421052631578948, + "loss": 5.0281, + "step": 213 + }, + { + "epoch": 0.009010526315789474, + "grad_norm": 0.6875, + "learning_rate": 0.00022526315789473682, + "loss": 4.6837, + "step": 214 + }, + { + "epoch": 0.009052631578947368, + "grad_norm": 1.25, + "learning_rate": 0.00022631578947368422, + "loss": 4.7071, + "step": 215 + }, + { + "epoch": 0.009094736842105263, + "grad_norm": 0.73828125, + "learning_rate": 0.0002273684210526316, + "loss": 4.5763, + "step": 216 + }, + { + "epoch": 0.009136842105263158, + "grad_norm": 0.71875, + "learning_rate": 0.00022842105263157895, + "loss": 4.7718, + "step": 217 + }, + { + "epoch": 0.009178947368421052, + "grad_norm": 0.9921875, + "learning_rate": 0.00022947368421052632, + "loss": 4.7802, + "step": 218 + }, + { + "epoch": 0.009221052631578948, + "grad_norm": 0.79296875, + "learning_rate": 0.0002305263157894737, + "loss": 5.1001, + "step": 219 + }, + { + "epoch": 0.009263157894736843, + "grad_norm": 0.94921875, + "learning_rate": 0.00023157894736842107, + "loss": 4.8724, + "step": 220 + }, + { + "epoch": 0.009305263157894737, + "grad_norm": 1.2109375, + "learning_rate": 0.00023263157894736841, + "loss": 4.7159, + "step": 221 + }, + { + "epoch": 0.009347368421052632, + "grad_norm": 0.66796875, + "learning_rate": 0.00023368421052631582, + "loss": 4.8988, + "step": 222 + }, + { + "epoch": 0.009389473684210526, + "grad_norm": 0.828125, + "learning_rate": 0.00023473684210526316, + "loss": 4.8679, + "step": 223 + }, + { + "epoch": 0.009431578947368421, + "grad_norm": 1.1953125, + "learning_rate": 0.00023578947368421054, + "loss": 4.9539, + "step": 224 + }, + { + "epoch": 0.009473684210526316, + "grad_norm": 1.0546875, + "learning_rate": 0.00023684210526315788, + "loss": 4.7077, + "step": 225 + }, + { + "epoch": 0.00951578947368421, + "grad_norm": 1.0234375, + "learning_rate": 0.00023789473684210529, + "loss": 4.9912, + "step": 226 + }, + { + "epoch": 0.009557894736842105, + "grad_norm": 0.84375, + "learning_rate": 0.00023894736842105263, + "loss": 4.8173, + "step": 227 + }, + { + "epoch": 0.0096, + "grad_norm": 0.61328125, + "learning_rate": 0.00024, + "loss": 4.7645, + "step": 228 + }, + { + "epoch": 0.009642105263157895, + "grad_norm": 0.83203125, + "learning_rate": 0.00024105263157894738, + "loss": 4.9278, + "step": 229 + }, + { + "epoch": 0.00968421052631579, + "grad_norm": 0.68359375, + "learning_rate": 0.00024210526315789475, + "loss": 4.8724, + "step": 230 + }, + { + "epoch": 0.009726315789473684, + "grad_norm": 0.64453125, + "learning_rate": 0.0002431578947368421, + "loss": 4.5503, + "step": 231 + }, + { + "epoch": 0.009768421052631579, + "grad_norm": 0.5859375, + "learning_rate": 0.0002442105263157895, + "loss": 4.741, + "step": 232 + }, + { + "epoch": 0.009810526315789474, + "grad_norm": 1.1015625, + "learning_rate": 0.00024526315789473685, + "loss": 4.5287, + "step": 233 + }, + { + "epoch": 0.009852631578947368, + "grad_norm": 0.734375, + "learning_rate": 0.0002463157894736842, + "loss": 4.9546, + "step": 234 + }, + { + "epoch": 0.009894736842105263, + "grad_norm": 0.796875, + "learning_rate": 0.0002473684210526316, + "loss": 4.6637, + "step": 235 + }, + { + "epoch": 0.009936842105263157, + "grad_norm": 0.8046875, + "learning_rate": 0.00024842105263157897, + "loss": 4.7514, + "step": 236 + }, + { + "epoch": 0.009978947368421053, + "grad_norm": 1.0, + "learning_rate": 0.00024947368421052635, + "loss": 4.8299, + "step": 237 + }, + { + "epoch": 0.010021052631578948, + "grad_norm": 1.2109375, + "learning_rate": 0.0002505263157894737, + "loss": 4.6256, + "step": 238 + }, + { + "epoch": 0.010063157894736842, + "grad_norm": 0.96484375, + "learning_rate": 0.0002515789473684211, + "loss": 4.7495, + "step": 239 + }, + { + "epoch": 0.010105263157894737, + "grad_norm": 0.890625, + "learning_rate": 0.0002526315789473684, + "loss": 4.5281, + "step": 240 + }, + { + "epoch": 0.010147368421052632, + "grad_norm": 0.75, + "learning_rate": 0.0002536842105263158, + "loss": 4.8828, + "step": 241 + }, + { + "epoch": 0.010189473684210526, + "grad_norm": 2.421875, + "learning_rate": 0.00025473684210526316, + "loss": 4.6277, + "step": 242 + }, + { + "epoch": 0.010231578947368421, + "grad_norm": 1.171875, + "learning_rate": 0.00025578947368421054, + "loss": 5.1124, + "step": 243 + }, + { + "epoch": 0.010273684210526315, + "grad_norm": 0.828125, + "learning_rate": 0.00025684210526315786, + "loss": 4.9601, + "step": 244 + }, + { + "epoch": 0.01031578947368421, + "grad_norm": 1.046875, + "learning_rate": 0.0002578947368421053, + "loss": 4.7439, + "step": 245 + }, + { + "epoch": 0.010357894736842106, + "grad_norm": 0.91796875, + "learning_rate": 0.00025894736842105266, + "loss": 4.6651, + "step": 246 + }, + { + "epoch": 0.0104, + "grad_norm": 0.8671875, + "learning_rate": 0.00026000000000000003, + "loss": 4.4402, + "step": 247 + }, + { + "epoch": 0.010442105263157895, + "grad_norm": 0.953125, + "learning_rate": 0.00026105263157894735, + "loss": 4.8318, + "step": 248 + }, + { + "epoch": 0.01048421052631579, + "grad_norm": 0.84765625, + "learning_rate": 0.0002621052631578947, + "loss": 4.5234, + "step": 249 + }, + { + "epoch": 0.010526315789473684, + "grad_norm": 0.71875, + "learning_rate": 0.0002631578947368421, + "loss": 4.8966, + "step": 250 + }, + { + "epoch": 0.01056842105263158, + "grad_norm": 0.8515625, + "learning_rate": 0.0002642105263157895, + "loss": 4.5258, + "step": 251 + }, + { + "epoch": 0.010610526315789473, + "grad_norm": 0.9296875, + "learning_rate": 0.00026526315789473685, + "loss": 4.5859, + "step": 252 + }, + { + "epoch": 0.010652631578947368, + "grad_norm": 0.75, + "learning_rate": 0.0002663157894736842, + "loss": 4.5474, + "step": 253 + }, + { + "epoch": 0.010694736842105264, + "grad_norm": 0.8046875, + "learning_rate": 0.0002673684210526316, + "loss": 5.1724, + "step": 254 + }, + { + "epoch": 0.010736842105263157, + "grad_norm": 0.65625, + "learning_rate": 0.00026842105263157897, + "loss": 4.8648, + "step": 255 + }, + { + "epoch": 0.010778947368421053, + "grad_norm": 0.6484375, + "learning_rate": 0.0002694736842105263, + "loss": 4.8057, + "step": 256 + }, + { + "epoch": 0.010821052631578947, + "grad_norm": 0.98828125, + "learning_rate": 0.00027052631578947366, + "loss": 4.8077, + "step": 257 + }, + { + "epoch": 0.010863157894736842, + "grad_norm": 0.703125, + "learning_rate": 0.00027157894736842104, + "loss": 5.0211, + "step": 258 + }, + { + "epoch": 0.010905263157894737, + "grad_norm": 0.80859375, + "learning_rate": 0.00027263157894736847, + "loss": 4.9733, + "step": 259 + }, + { + "epoch": 0.010947368421052631, + "grad_norm": 0.6484375, + "learning_rate": 0.00027368421052631584, + "loss": 4.795, + "step": 260 + }, + { + "epoch": 0.010989473684210526, + "grad_norm": 0.67578125, + "learning_rate": 0.00027473684210526316, + "loss": 4.6317, + "step": 261 + }, + { + "epoch": 0.011031578947368422, + "grad_norm": 1.0390625, + "learning_rate": 0.00027578947368421053, + "loss": 4.4151, + "step": 262 + }, + { + "epoch": 0.011073684210526315, + "grad_norm": 0.76953125, + "learning_rate": 0.0002768421052631579, + "loss": 4.8259, + "step": 263 + }, + { + "epoch": 0.01111578947368421, + "grad_norm": 1.4765625, + "learning_rate": 0.0002778947368421053, + "loss": 4.8518, + "step": 264 + }, + { + "epoch": 0.011157894736842104, + "grad_norm": 0.81640625, + "learning_rate": 0.0002789473684210526, + "loss": 4.4706, + "step": 265 + }, + { + "epoch": 0.0112, + "grad_norm": 0.75390625, + "learning_rate": 0.00028000000000000003, + "loss": 4.7413, + "step": 266 + }, + { + "epoch": 0.011242105263157895, + "grad_norm": 1.03125, + "learning_rate": 0.0002810526315789474, + "loss": 4.7639, + "step": 267 + }, + { + "epoch": 0.011284210526315789, + "grad_norm": 0.57421875, + "learning_rate": 0.0002821052631578948, + "loss": 4.7669, + "step": 268 + }, + { + "epoch": 0.011326315789473684, + "grad_norm": 0.890625, + "learning_rate": 0.0002831578947368421, + "loss": 4.33, + "step": 269 + }, + { + "epoch": 0.01136842105263158, + "grad_norm": 0.68359375, + "learning_rate": 0.00028421052631578947, + "loss": 4.4285, + "step": 270 + }, + { + "epoch": 0.011410526315789473, + "grad_norm": 0.80859375, + "learning_rate": 0.00028526315789473685, + "loss": 4.5333, + "step": 271 + }, + { + "epoch": 0.011452631578947369, + "grad_norm": 0.74609375, + "learning_rate": 0.0002863157894736842, + "loss": 4.6635, + "step": 272 + }, + { + "epoch": 0.011494736842105262, + "grad_norm": 0.63671875, + "learning_rate": 0.0002873684210526316, + "loss": 4.5812, + "step": 273 + }, + { + "epoch": 0.011536842105263158, + "grad_norm": 0.80859375, + "learning_rate": 0.00028842105263157897, + "loss": 4.8065, + "step": 274 + }, + { + "epoch": 0.011578947368421053, + "grad_norm": 1.0546875, + "learning_rate": 0.00028947368421052634, + "loss": 4.6259, + "step": 275 + }, + { + "epoch": 0.011621052631578947, + "grad_norm": 0.6796875, + "learning_rate": 0.0002905263157894737, + "loss": 4.648, + "step": 276 + }, + { + "epoch": 0.011663157894736842, + "grad_norm": 0.91015625, + "learning_rate": 0.00029157894736842104, + "loss": 4.7563, + "step": 277 + }, + { + "epoch": 0.011705263157894738, + "grad_norm": 0.83984375, + "learning_rate": 0.0002926315789473684, + "loss": 4.7388, + "step": 278 + }, + { + "epoch": 0.011747368421052631, + "grad_norm": 0.69921875, + "learning_rate": 0.0002936842105263158, + "loss": 4.5267, + "step": 279 + }, + { + "epoch": 0.011789473684210527, + "grad_norm": 0.97265625, + "learning_rate": 0.00029473684210526316, + "loss": 4.7164, + "step": 280 + }, + { + "epoch": 0.01183157894736842, + "grad_norm": 0.8515625, + "learning_rate": 0.00029578947368421053, + "loss": 4.728, + "step": 281 + }, + { + "epoch": 0.011873684210526316, + "grad_norm": 0.8359375, + "learning_rate": 0.0002968421052631579, + "loss": 4.6658, + "step": 282 + }, + { + "epoch": 0.011915789473684211, + "grad_norm": 0.66796875, + "learning_rate": 0.0002978947368421053, + "loss": 4.8512, + "step": 283 + }, + { + "epoch": 0.011957894736842105, + "grad_norm": 0.7734375, + "learning_rate": 0.00029894736842105265, + "loss": 4.3698, + "step": 284 + }, + { + "epoch": 0.012, + "grad_norm": 0.609375, + "learning_rate": 0.0003, + "loss": 4.5798, + "step": 285 + }, + { + "epoch": 0.012042105263157894, + "grad_norm": 0.74609375, + "learning_rate": 0.00030105263157894735, + "loss": 4.8204, + "step": 286 + }, + { + "epoch": 0.01208421052631579, + "grad_norm": 0.59765625, + "learning_rate": 0.0003021052631578947, + "loss": 4.8963, + "step": 287 + }, + { + "epoch": 0.012126315789473685, + "grad_norm": 0.61328125, + "learning_rate": 0.00030315789473684215, + "loss": 4.581, + "step": 288 + }, + { + "epoch": 0.012168421052631578, + "grad_norm": 0.7265625, + "learning_rate": 0.00030421052631578947, + "loss": 4.5804, + "step": 289 + }, + { + "epoch": 0.012210526315789474, + "grad_norm": 0.921875, + "learning_rate": 0.00030526315789473684, + "loss": 4.5165, + "step": 290 + }, + { + "epoch": 0.01225263157894737, + "grad_norm": 0.63671875, + "learning_rate": 0.0003063157894736842, + "loss": 4.7213, + "step": 291 + }, + { + "epoch": 0.012294736842105263, + "grad_norm": 0.9296875, + "learning_rate": 0.0003073684210526316, + "loss": 4.4559, + "step": 292 + }, + { + "epoch": 0.012336842105263158, + "grad_norm": 0.7109375, + "learning_rate": 0.0003084210526315789, + "loss": 4.6154, + "step": 293 + }, + { + "epoch": 0.012378947368421052, + "grad_norm": 0.8203125, + "learning_rate": 0.0003094736842105263, + "loss": 4.3068, + "step": 294 + }, + { + "epoch": 0.012421052631578947, + "grad_norm": 0.6875, + "learning_rate": 0.0003105263157894737, + "loss": 4.6104, + "step": 295 + }, + { + "epoch": 0.012463157894736843, + "grad_norm": 0.640625, + "learning_rate": 0.0003115789473684211, + "loss": 4.6329, + "step": 296 + }, + { + "epoch": 0.012505263157894736, + "grad_norm": 0.80859375, + "learning_rate": 0.0003126315789473684, + "loss": 4.497, + "step": 297 + }, + { + "epoch": 0.012547368421052632, + "grad_norm": 0.71484375, + "learning_rate": 0.0003136842105263158, + "loss": 4.5875, + "step": 298 + }, + { + "epoch": 0.012589473684210527, + "grad_norm": 0.76953125, + "learning_rate": 0.00031473684210526316, + "loss": 4.1527, + "step": 299 + }, + { + "epoch": 0.01263157894736842, + "grad_norm": 0.88671875, + "learning_rate": 0.00031578947368421053, + "loss": 4.6874, + "step": 300 + }, + { + "epoch": 0.012673684210526316, + "grad_norm": 0.6796875, + "learning_rate": 0.00031684210526315785, + "loss": 4.4477, + "step": 301 + }, + { + "epoch": 0.01271578947368421, + "grad_norm": 0.68359375, + "learning_rate": 0.0003178947368421053, + "loss": 4.5187, + "step": 302 + }, + { + "epoch": 0.012757894736842105, + "grad_norm": 0.66796875, + "learning_rate": 0.00031894736842105265, + "loss": 4.5969, + "step": 303 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6484375, + "learning_rate": 0.00032, + "loss": 4.7898, + "step": 304 + }, + { + "epoch": 0.012842105263157894, + "grad_norm": 0.70703125, + "learning_rate": 0.0003210526315789474, + "loss": 4.7483, + "step": 305 + }, + { + "epoch": 0.01288421052631579, + "grad_norm": 0.765625, + "learning_rate": 0.0003221052631578947, + "loss": 4.2825, + "step": 306 + }, + { + "epoch": 0.012926315789473685, + "grad_norm": 0.67578125, + "learning_rate": 0.0003231578947368421, + "loss": 4.6983, + "step": 307 + }, + { + "epoch": 0.012968421052631579, + "grad_norm": 1.0703125, + "learning_rate": 0.00032421052631578947, + "loss": 4.1973, + "step": 308 + }, + { + "epoch": 0.013010526315789474, + "grad_norm": 0.67578125, + "learning_rate": 0.0003252631578947369, + "loss": 4.4216, + "step": 309 + }, + { + "epoch": 0.013052631578947368, + "grad_norm": 0.9296875, + "learning_rate": 0.0003263157894736842, + "loss": 4.6639, + "step": 310 + }, + { + "epoch": 0.013094736842105263, + "grad_norm": 0.63671875, + "learning_rate": 0.0003273684210526316, + "loss": 4.6799, + "step": 311 + }, + { + "epoch": 0.013136842105263159, + "grad_norm": 0.92578125, + "learning_rate": 0.00032842105263157896, + "loss": 4.2909, + "step": 312 + }, + { + "epoch": 0.013178947368421052, + "grad_norm": 0.77734375, + "learning_rate": 0.00032947368421052634, + "loss": 4.9743, + "step": 313 + }, + { + "epoch": 0.013221052631578948, + "grad_norm": 0.71875, + "learning_rate": 0.00033052631578947366, + "loss": 4.3255, + "step": 314 + }, + { + "epoch": 0.013263157894736841, + "grad_norm": 1.015625, + "learning_rate": 0.00033157894736842103, + "loss": 4.5748, + "step": 315 + }, + { + "epoch": 0.013305263157894737, + "grad_norm": 0.8671875, + "learning_rate": 0.00033263157894736846, + "loss": 4.3673, + "step": 316 + }, + { + "epoch": 0.013347368421052632, + "grad_norm": 0.77734375, + "learning_rate": 0.00033368421052631583, + "loss": 4.656, + "step": 317 + }, + { + "epoch": 0.013389473684210526, + "grad_norm": 0.6171875, + "learning_rate": 0.00033473684210526315, + "loss": 4.5088, + "step": 318 + }, + { + "epoch": 0.013431578947368421, + "grad_norm": 0.91015625, + "learning_rate": 0.00033578947368421053, + "loss": 4.6969, + "step": 319 + }, + { + "epoch": 0.013473684210526317, + "grad_norm": 0.58984375, + "learning_rate": 0.0003368421052631579, + "loss": 4.7857, + "step": 320 + }, + { + "epoch": 0.01351578947368421, + "grad_norm": 0.79296875, + "learning_rate": 0.0003378947368421053, + "loss": 4.6076, + "step": 321 + }, + { + "epoch": 0.013557894736842106, + "grad_norm": 0.87109375, + "learning_rate": 0.0003389473684210526, + "loss": 4.1856, + "step": 322 + }, + { + "epoch": 0.0136, + "grad_norm": 0.7265625, + "learning_rate": 0.00034, + "loss": 4.5701, + "step": 323 + }, + { + "epoch": 0.013642105263157895, + "grad_norm": 0.66796875, + "learning_rate": 0.0003410526315789474, + "loss": 4.4323, + "step": 324 + }, + { + "epoch": 0.01368421052631579, + "grad_norm": 0.71484375, + "learning_rate": 0.00034210526315789477, + "loss": 4.3803, + "step": 325 + }, + { + "epoch": 0.013726315789473684, + "grad_norm": 0.9765625, + "learning_rate": 0.0003431578947368421, + "loss": 4.2215, + "step": 326 + }, + { + "epoch": 0.013768421052631579, + "grad_norm": 0.73828125, + "learning_rate": 0.00034421052631578947, + "loss": 4.7205, + "step": 327 + }, + { + "epoch": 0.013810526315789474, + "grad_norm": 0.78515625, + "learning_rate": 0.00034526315789473684, + "loss": 4.6807, + "step": 328 + }, + { + "epoch": 0.013852631578947368, + "grad_norm": 0.71484375, + "learning_rate": 0.0003463157894736842, + "loss": 4.6929, + "step": 329 + }, + { + "epoch": 0.013894736842105264, + "grad_norm": 0.74609375, + "learning_rate": 0.0003473684210526316, + "loss": 4.3836, + "step": 330 + }, + { + "epoch": 0.013936842105263157, + "grad_norm": 0.61328125, + "learning_rate": 0.00034842105263157896, + "loss": 4.5278, + "step": 331 + }, + { + "epoch": 0.013978947368421053, + "grad_norm": 0.953125, + "learning_rate": 0.00034947368421052634, + "loss": 4.6396, + "step": 332 + }, + { + "epoch": 0.014021052631578948, + "grad_norm": 0.65625, + "learning_rate": 0.0003505263157894737, + "loss": 4.5432, + "step": 333 + }, + { + "epoch": 0.014063157894736842, + "grad_norm": 0.62109375, + "learning_rate": 0.00035157894736842103, + "loss": 4.6171, + "step": 334 + }, + { + "epoch": 0.014105263157894737, + "grad_norm": 0.609375, + "learning_rate": 0.0003526315789473684, + "loss": 4.6272, + "step": 335 + }, + { + "epoch": 0.014147368421052632, + "grad_norm": 0.796875, + "learning_rate": 0.0003536842105263158, + "loss": 4.3247, + "step": 336 + }, + { + "epoch": 0.014189473684210526, + "grad_norm": 0.85546875, + "learning_rate": 0.0003547368421052632, + "loss": 4.3696, + "step": 337 + }, + { + "epoch": 0.014231578947368421, + "grad_norm": 0.60546875, + "learning_rate": 0.0003557894736842105, + "loss": 4.7596, + "step": 338 + }, + { + "epoch": 0.014273684210526315, + "grad_norm": 0.72265625, + "learning_rate": 0.0003568421052631579, + "loss": 4.3865, + "step": 339 + }, + { + "epoch": 0.01431578947368421, + "grad_norm": 0.85546875, + "learning_rate": 0.0003578947368421053, + "loss": 4.4291, + "step": 340 + }, + { + "epoch": 0.014357894736842106, + "grad_norm": 0.74609375, + "learning_rate": 0.00035894736842105265, + "loss": 4.397, + "step": 341 + }, + { + "epoch": 0.0144, + "grad_norm": 0.6875, + "learning_rate": 0.00035999999999999997, + "loss": 4.5021, + "step": 342 + }, + { + "epoch": 0.014442105263157895, + "grad_norm": 0.7734375, + "learning_rate": 0.00036105263157894734, + "loss": 4.3352, + "step": 343 + }, + { + "epoch": 0.014484210526315789, + "grad_norm": 0.828125, + "learning_rate": 0.00036210526315789477, + "loss": 4.3564, + "step": 344 + }, + { + "epoch": 0.014526315789473684, + "grad_norm": 0.79296875, + "learning_rate": 0.00036315789473684214, + "loss": 4.92, + "step": 345 + }, + { + "epoch": 0.01456842105263158, + "grad_norm": 0.8203125, + "learning_rate": 0.0003642105263157895, + "loss": 4.4194, + "step": 346 + }, + { + "epoch": 0.014610526315789473, + "grad_norm": 0.6953125, + "learning_rate": 0.00036526315789473684, + "loss": 4.4044, + "step": 347 + }, + { + "epoch": 0.014652631578947369, + "grad_norm": 0.7421875, + "learning_rate": 0.0003663157894736842, + "loss": 4.3202, + "step": 348 + }, + { + "epoch": 0.014694736842105264, + "grad_norm": 0.6484375, + "learning_rate": 0.0003673684210526316, + "loss": 4.2268, + "step": 349 + }, + { + "epoch": 0.014736842105263158, + "grad_norm": 0.73046875, + "learning_rate": 0.00036842105263157896, + "loss": 4.7891, + "step": 350 + }, + { + "epoch": 0.014778947368421053, + "grad_norm": 0.62109375, + "learning_rate": 0.00036947368421052633, + "loss": 4.4866, + "step": 351 + }, + { + "epoch": 0.014821052631578947, + "grad_norm": 1.03125, + "learning_rate": 0.0003705263157894737, + "loss": 4.3846, + "step": 352 + }, + { + "epoch": 0.014863157894736842, + "grad_norm": 0.8125, + "learning_rate": 0.0003715789473684211, + "loss": 4.5073, + "step": 353 + }, + { + "epoch": 0.014905263157894737, + "grad_norm": 0.84375, + "learning_rate": 0.00037263157894736846, + "loss": 4.4347, + "step": 354 + }, + { + "epoch": 0.014947368421052631, + "grad_norm": 0.828125, + "learning_rate": 0.0003736842105263158, + "loss": 4.293, + "step": 355 + }, + { + "epoch": 0.014989473684210526, + "grad_norm": 0.734375, + "learning_rate": 0.00037473684210526315, + "loss": 4.4316, + "step": 356 + }, + { + "epoch": 0.015031578947368422, + "grad_norm": 0.8046875, + "learning_rate": 0.0003757894736842105, + "loss": 4.4448, + "step": 357 + }, + { + "epoch": 0.015073684210526316, + "grad_norm": 0.765625, + "learning_rate": 0.0003768421052631579, + "loss": 4.4419, + "step": 358 + }, + { + "epoch": 0.015115789473684211, + "grad_norm": 0.7421875, + "learning_rate": 0.00037789473684210527, + "loss": 4.7215, + "step": 359 + }, + { + "epoch": 0.015157894736842105, + "grad_norm": 0.578125, + "learning_rate": 0.00037894736842105265, + "loss": 4.4125, + "step": 360 + }, + { + "epoch": 0.0152, + "grad_norm": 0.6953125, + "learning_rate": 0.00038, + "loss": 4.3999, + "step": 361 + }, + { + "epoch": 0.015242105263157895, + "grad_norm": 0.87109375, + "learning_rate": 0.0003810526315789474, + "loss": 4.7075, + "step": 362 + }, + { + "epoch": 0.015284210526315789, + "grad_norm": 0.65625, + "learning_rate": 0.0003821052631578947, + "loss": 4.631, + "step": 363 + }, + { + "epoch": 0.015326315789473684, + "grad_norm": 0.859375, + "learning_rate": 0.0003831578947368421, + "loss": 4.5804, + "step": 364 + }, + { + "epoch": 0.015368421052631578, + "grad_norm": 0.7578125, + "learning_rate": 0.00038421052631578946, + "loss": 4.5522, + "step": 365 + }, + { + "epoch": 0.015410526315789473, + "grad_norm": 0.75390625, + "learning_rate": 0.0003852631578947369, + "loss": 4.0813, + "step": 366 + }, + { + "epoch": 0.015452631578947369, + "grad_norm": 0.796875, + "learning_rate": 0.0003863157894736842, + "loss": 4.4482, + "step": 367 + }, + { + "epoch": 0.015494736842105263, + "grad_norm": 0.765625, + "learning_rate": 0.0003873684210526316, + "loss": 4.4242, + "step": 368 + }, + { + "epoch": 0.015536842105263158, + "grad_norm": 0.70703125, + "learning_rate": 0.00038842105263157896, + "loss": 4.4984, + "step": 369 + }, + { + "epoch": 0.015578947368421053, + "grad_norm": 0.640625, + "learning_rate": 0.00038947368421052633, + "loss": 4.338, + "step": 370 + }, + { + "epoch": 0.015621052631578947, + "grad_norm": 0.734375, + "learning_rate": 0.00039052631578947365, + "loss": 4.5405, + "step": 371 + }, + { + "epoch": 0.01566315789473684, + "grad_norm": 0.73046875, + "learning_rate": 0.000391578947368421, + "loss": 4.4824, + "step": 372 + }, + { + "epoch": 0.015705263157894736, + "grad_norm": 0.68359375, + "learning_rate": 0.00039263157894736846, + "loss": 4.3563, + "step": 373 + }, + { + "epoch": 0.01574736842105263, + "grad_norm": 0.66015625, + "learning_rate": 0.00039368421052631583, + "loss": 4.4242, + "step": 374 + }, + { + "epoch": 0.015789473684210527, + "grad_norm": 0.70703125, + "learning_rate": 0.00039473684210526315, + "loss": 4.2607, + "step": 375 + }, + { + "epoch": 0.015831578947368422, + "grad_norm": 0.671875, + "learning_rate": 0.0003957894736842105, + "loss": 4.2254, + "step": 376 + }, + { + "epoch": 0.015873684210526314, + "grad_norm": 0.671875, + "learning_rate": 0.0003968421052631579, + "loss": 4.1182, + "step": 377 + }, + { + "epoch": 0.01591578947368421, + "grad_norm": 0.59765625, + "learning_rate": 0.00039789473684210527, + "loss": 4.8709, + "step": 378 + }, + { + "epoch": 0.015957894736842105, + "grad_norm": 0.7890625, + "learning_rate": 0.0003989473684210526, + "loss": 4.3529, + "step": 379 + }, + { + "epoch": 0.016, + "grad_norm": 0.7421875, + "learning_rate": 0.0004, + "loss": 4.0789, + "step": 380 + }, + { + "epoch": 0.016042105263157896, + "grad_norm": 0.5546875, + "learning_rate": 0.0004010526315789474, + "loss": 4.2138, + "step": 381 + }, + { + "epoch": 0.01608421052631579, + "grad_norm": 0.6015625, + "learning_rate": 0.00040210526315789477, + "loss": 4.3717, + "step": 382 + }, + { + "epoch": 0.016126315789473683, + "grad_norm": 0.56640625, + "learning_rate": 0.0004031578947368421, + "loss": 4.3187, + "step": 383 + }, + { + "epoch": 0.01616842105263158, + "grad_norm": 1.25, + "learning_rate": 0.00040421052631578946, + "loss": 4.4462, + "step": 384 + }, + { + "epoch": 0.016210526315789474, + "grad_norm": 0.96484375, + "learning_rate": 0.00040526315789473684, + "loss": 3.9189, + "step": 385 + }, + { + "epoch": 0.01625263157894737, + "grad_norm": 0.5859375, + "learning_rate": 0.0004063157894736842, + "loss": 4.4457, + "step": 386 + }, + { + "epoch": 0.016294736842105265, + "grad_norm": 0.72265625, + "learning_rate": 0.00040736842105263164, + "loss": 4.5699, + "step": 387 + }, + { + "epoch": 0.016336842105263157, + "grad_norm": 0.70703125, + "learning_rate": 0.00040842105263157896, + "loss": 4.2877, + "step": 388 + }, + { + "epoch": 0.016378947368421052, + "grad_norm": 0.59375, + "learning_rate": 0.00040947368421052633, + "loss": 4.271, + "step": 389 + }, + { + "epoch": 0.016421052631578947, + "grad_norm": 0.6171875, + "learning_rate": 0.0004105263157894737, + "loss": 4.2374, + "step": 390 + }, + { + "epoch": 0.016463157894736843, + "grad_norm": 0.80078125, + "learning_rate": 0.0004115789473684211, + "loss": 4.2013, + "step": 391 + }, + { + "epoch": 0.016505263157894738, + "grad_norm": 0.61328125, + "learning_rate": 0.0004126315789473684, + "loss": 4.4356, + "step": 392 + }, + { + "epoch": 0.01654736842105263, + "grad_norm": 0.65234375, + "learning_rate": 0.0004136842105263158, + "loss": 4.4449, + "step": 393 + }, + { + "epoch": 0.016589473684210525, + "grad_norm": 1.1328125, + "learning_rate": 0.0004147368421052632, + "loss": 4.0937, + "step": 394 + }, + { + "epoch": 0.01663157894736842, + "grad_norm": 0.75, + "learning_rate": 0.0004157894736842106, + "loss": 4.2837, + "step": 395 + }, + { + "epoch": 0.016673684210526316, + "grad_norm": 0.671875, + "learning_rate": 0.0004168421052631579, + "loss": 4.4151, + "step": 396 + }, + { + "epoch": 0.01671578947368421, + "grad_norm": 0.75, + "learning_rate": 0.00041789473684210527, + "loss": 4.6205, + "step": 397 + }, + { + "epoch": 0.016757894736842104, + "grad_norm": 0.671875, + "learning_rate": 0.00041894736842105264, + "loss": 4.3583, + "step": 398 + }, + { + "epoch": 0.0168, + "grad_norm": 0.6171875, + "learning_rate": 0.00042, + "loss": 4.3391, + "step": 399 + }, + { + "epoch": 0.016842105263157894, + "grad_norm": 0.60546875, + "learning_rate": 0.00042105263157894734, + "loss": 4.5594, + "step": 400 + }, + { + "epoch": 0.01688421052631579, + "grad_norm": 0.81640625, + "learning_rate": 0.00042210526315789477, + "loss": 4.1017, + "step": 401 + }, + { + "epoch": 0.016926315789473685, + "grad_norm": 0.84765625, + "learning_rate": 0.00042315789473684214, + "loss": 4.1371, + "step": 402 + }, + { + "epoch": 0.01696842105263158, + "grad_norm": 0.6171875, + "learning_rate": 0.0004242105263157895, + "loss": 4.2816, + "step": 403 + }, + { + "epoch": 0.017010526315789472, + "grad_norm": 0.80859375, + "learning_rate": 0.00042526315789473683, + "loss": 4.3082, + "step": 404 + }, + { + "epoch": 0.017052631578947368, + "grad_norm": 0.62890625, + "learning_rate": 0.0004263157894736842, + "loss": 4.4282, + "step": 405 + }, + { + "epoch": 0.017094736842105263, + "grad_norm": 0.6015625, + "learning_rate": 0.0004273684210526316, + "loss": 4.6419, + "step": 406 + }, + { + "epoch": 0.01713684210526316, + "grad_norm": 0.56640625, + "learning_rate": 0.00042842105263157896, + "loss": 4.5408, + "step": 407 + }, + { + "epoch": 0.017178947368421054, + "grad_norm": 0.5703125, + "learning_rate": 0.00042947368421052633, + "loss": 4.3513, + "step": 408 + }, + { + "epoch": 0.017221052631578946, + "grad_norm": 0.578125, + "learning_rate": 0.0004305263157894737, + "loss": 4.3757, + "step": 409 + }, + { + "epoch": 0.01726315789473684, + "grad_norm": 0.6328125, + "learning_rate": 0.0004315789473684211, + "loss": 4.2217, + "step": 410 + }, + { + "epoch": 0.017305263157894737, + "grad_norm": 0.61328125, + "learning_rate": 0.00043263157894736845, + "loss": 4.2977, + "step": 411 + }, + { + "epoch": 0.017347368421052632, + "grad_norm": 0.66015625, + "learning_rate": 0.00043368421052631577, + "loss": 4.2425, + "step": 412 + }, + { + "epoch": 0.017389473684210528, + "grad_norm": 0.5546875, + "learning_rate": 0.00043473684210526315, + "loss": 4.204, + "step": 413 + }, + { + "epoch": 0.01743157894736842, + "grad_norm": 0.68359375, + "learning_rate": 0.0004357894736842105, + "loss": 4.3425, + "step": 414 + }, + { + "epoch": 0.017473684210526315, + "grad_norm": 0.5390625, + "learning_rate": 0.00043684210526315795, + "loss": 4.4786, + "step": 415 + }, + { + "epoch": 0.01751578947368421, + "grad_norm": 1.0625, + "learning_rate": 0.00043789473684210527, + "loss": 4.1803, + "step": 416 + }, + { + "epoch": 0.017557894736842106, + "grad_norm": 0.68359375, + "learning_rate": 0.00043894736842105264, + "loss": 4.3962, + "step": 417 + }, + { + "epoch": 0.0176, + "grad_norm": 0.73828125, + "learning_rate": 0.00044, + "loss": 4.1655, + "step": 418 + }, + { + "epoch": 0.017642105263157896, + "grad_norm": 0.57421875, + "learning_rate": 0.0004410526315789474, + "loss": 4.4406, + "step": 419 + }, + { + "epoch": 0.01768421052631579, + "grad_norm": 0.68359375, + "learning_rate": 0.0004421052631578947, + "loss": 3.8846, + "step": 420 + }, + { + "epoch": 0.017726315789473684, + "grad_norm": 0.75, + "learning_rate": 0.0004431578947368421, + "loss": 4.44, + "step": 421 + }, + { + "epoch": 0.01776842105263158, + "grad_norm": 0.546875, + "learning_rate": 0.0004442105263157895, + "loss": 4.4924, + "step": 422 + }, + { + "epoch": 0.017810526315789475, + "grad_norm": 0.8671875, + "learning_rate": 0.0004452631578947369, + "loss": 4.1203, + "step": 423 + }, + { + "epoch": 0.01785263157894737, + "grad_norm": 0.5625, + "learning_rate": 0.0004463157894736842, + "loss": 4.1306, + "step": 424 + }, + { + "epoch": 0.017894736842105262, + "grad_norm": 0.61328125, + "learning_rate": 0.0004473684210526316, + "loss": 4.5186, + "step": 425 + }, + { + "epoch": 0.017936842105263157, + "grad_norm": 0.59375, + "learning_rate": 0.00044842105263157895, + "loss": 4.1231, + "step": 426 + }, + { + "epoch": 0.017978947368421053, + "grad_norm": 0.55859375, + "learning_rate": 0.00044947368421052633, + "loss": 4.4977, + "step": 427 + }, + { + "epoch": 0.018021052631578948, + "grad_norm": 0.66796875, + "learning_rate": 0.00045052631578947365, + "loss": 4.1505, + "step": 428 + }, + { + "epoch": 0.018063157894736843, + "grad_norm": 0.89453125, + "learning_rate": 0.0004515789473684211, + "loss": 4.3092, + "step": 429 + }, + { + "epoch": 0.018105263157894735, + "grad_norm": 0.578125, + "learning_rate": 0.00045263157894736845, + "loss": 4.2571, + "step": 430 + }, + { + "epoch": 0.01814736842105263, + "grad_norm": 0.71875, + "learning_rate": 0.0004536842105263158, + "loss": 4.574, + "step": 431 + }, + { + "epoch": 0.018189473684210526, + "grad_norm": 0.64453125, + "learning_rate": 0.0004547368421052632, + "loss": 4.4953, + "step": 432 + }, + { + "epoch": 0.01823157894736842, + "grad_norm": 0.58984375, + "learning_rate": 0.0004557894736842105, + "loss": 4.3898, + "step": 433 + }, + { + "epoch": 0.018273684210526317, + "grad_norm": 1.1953125, + "learning_rate": 0.0004568421052631579, + "loss": 4.3192, + "step": 434 + }, + { + "epoch": 0.01831578947368421, + "grad_norm": 0.765625, + "learning_rate": 0.00045789473684210527, + "loss": 4.3572, + "step": 435 + }, + { + "epoch": 0.018357894736842104, + "grad_norm": 1.03125, + "learning_rate": 0.00045894736842105264, + "loss": 4.4823, + "step": 436 + }, + { + "epoch": 0.0184, + "grad_norm": 0.69140625, + "learning_rate": 0.00046, + "loss": 4.3421, + "step": 437 + }, + { + "epoch": 0.018442105263157895, + "grad_norm": 0.6875, + "learning_rate": 0.0004610526315789474, + "loss": 4.2476, + "step": 438 + }, + { + "epoch": 0.01848421052631579, + "grad_norm": 0.78515625, + "learning_rate": 0.00046210526315789476, + "loss": 4.193, + "step": 439 + }, + { + "epoch": 0.018526315789473686, + "grad_norm": 0.6875, + "learning_rate": 0.00046315789473684214, + "loss": 4.476, + "step": 440 + }, + { + "epoch": 0.018568421052631578, + "grad_norm": 0.81640625, + "learning_rate": 0.00046421052631578946, + "loss": 4.8103, + "step": 441 + }, + { + "epoch": 0.018610526315789473, + "grad_norm": 0.8046875, + "learning_rate": 0.00046526315789473683, + "loss": 4.257, + "step": 442 + }, + { + "epoch": 0.01865263157894737, + "grad_norm": 0.7265625, + "learning_rate": 0.0004663157894736842, + "loss": 4.1948, + "step": 443 + }, + { + "epoch": 0.018694736842105264, + "grad_norm": 0.671875, + "learning_rate": 0.00046736842105263163, + "loss": 4.1955, + "step": 444 + }, + { + "epoch": 0.01873684210526316, + "grad_norm": 0.64453125, + "learning_rate": 0.00046842105263157895, + "loss": 4.251, + "step": 445 + }, + { + "epoch": 0.01877894736842105, + "grad_norm": 0.71484375, + "learning_rate": 0.0004694736842105263, + "loss": 4.3446, + "step": 446 + }, + { + "epoch": 0.018821052631578947, + "grad_norm": 0.71484375, + "learning_rate": 0.0004705263157894737, + "loss": 4.2658, + "step": 447 + }, + { + "epoch": 0.018863157894736842, + "grad_norm": 0.5859375, + "learning_rate": 0.0004715789473684211, + "loss": 4.4793, + "step": 448 + }, + { + "epoch": 0.018905263157894737, + "grad_norm": 0.66015625, + "learning_rate": 0.0004726315789473684, + "loss": 4.1144, + "step": 449 + }, + { + "epoch": 0.018947368421052633, + "grad_norm": 0.6640625, + "learning_rate": 0.00047368421052631577, + "loss": 4.1679, + "step": 450 + }, + { + "epoch": 0.018989473684210525, + "grad_norm": 0.64453125, + "learning_rate": 0.0004747368421052632, + "loss": 4.3472, + "step": 451 + }, + { + "epoch": 0.01903157894736842, + "grad_norm": 0.546875, + "learning_rate": 0.00047578947368421057, + "loss": 4.237, + "step": 452 + }, + { + "epoch": 0.019073684210526316, + "grad_norm": 0.6015625, + "learning_rate": 0.0004768421052631579, + "loss": 4.4889, + "step": 453 + }, + { + "epoch": 0.01911578947368421, + "grad_norm": 0.73828125, + "learning_rate": 0.00047789473684210526, + "loss": 4.0149, + "step": 454 + }, + { + "epoch": 0.019157894736842106, + "grad_norm": 0.78515625, + "learning_rate": 0.00047894736842105264, + "loss": 4.1984, + "step": 455 + }, + { + "epoch": 0.0192, + "grad_norm": 0.734375, + "learning_rate": 0.00048, + "loss": 3.8028, + "step": 456 + }, + { + "epoch": 0.019242105263157894, + "grad_norm": 0.92578125, + "learning_rate": 0.00048105263157894733, + "loss": 3.8972, + "step": 457 + }, + { + "epoch": 0.01928421052631579, + "grad_norm": 0.6640625, + "learning_rate": 0.00048210526315789476, + "loss": 4.0164, + "step": 458 + }, + { + "epoch": 0.019326315789473685, + "grad_norm": 0.89453125, + "learning_rate": 0.00048315789473684213, + "loss": 4.0093, + "step": 459 + }, + { + "epoch": 0.01936842105263158, + "grad_norm": 0.64453125, + "learning_rate": 0.0004842105263157895, + "loss": 4.4367, + "step": 460 + }, + { + "epoch": 0.019410526315789475, + "grad_norm": 0.7890625, + "learning_rate": 0.00048526315789473683, + "loss": 4.4371, + "step": 461 + }, + { + "epoch": 0.019452631578947367, + "grad_norm": 0.63671875, + "learning_rate": 0.0004863157894736842, + "loss": 4.1765, + "step": 462 + }, + { + "epoch": 0.019494736842105263, + "grad_norm": 0.5859375, + "learning_rate": 0.0004873684210526316, + "loss": 4.302, + "step": 463 + }, + { + "epoch": 0.019536842105263158, + "grad_norm": 0.6171875, + "learning_rate": 0.000488421052631579, + "loss": 4.3553, + "step": 464 + }, + { + "epoch": 0.019578947368421053, + "grad_norm": 0.58984375, + "learning_rate": 0.0004894736842105264, + "loss": 4.2031, + "step": 465 + }, + { + "epoch": 0.01962105263157895, + "grad_norm": 0.73046875, + "learning_rate": 0.0004905263157894737, + "loss": 4.2276, + "step": 466 + }, + { + "epoch": 0.01966315789473684, + "grad_norm": 0.6640625, + "learning_rate": 0.000491578947368421, + "loss": 4.2627, + "step": 467 + }, + { + "epoch": 0.019705263157894736, + "grad_norm": 0.65234375, + "learning_rate": 0.0004926315789473684, + "loss": 4.4481, + "step": 468 + }, + { + "epoch": 0.01974736842105263, + "grad_norm": 0.6328125, + "learning_rate": 0.0004936842105263158, + "loss": 4.4823, + "step": 469 + }, + { + "epoch": 0.019789473684210527, + "grad_norm": 0.6484375, + "learning_rate": 0.0004947368421052632, + "loss": 4.2895, + "step": 470 + }, + { + "epoch": 0.019831578947368422, + "grad_norm": 0.61328125, + "learning_rate": 0.0004957894736842105, + "loss": 4.1776, + "step": 471 + }, + { + "epoch": 0.019873684210526314, + "grad_norm": 0.78125, + "learning_rate": 0.0004968421052631579, + "loss": 4.1281, + "step": 472 + }, + { + "epoch": 0.01991578947368421, + "grad_norm": 0.56640625, + "learning_rate": 0.0004978947368421053, + "loss": 4.2832, + "step": 473 + }, + { + "epoch": 0.019957894736842105, + "grad_norm": 0.828125, + "learning_rate": 0.0004989473684210527, + "loss": 4.8963, + "step": 474 + }, + { + "epoch": 0.02, + "grad_norm": 0.62890625, + "learning_rate": 0.0005, + "loss": 4.3687, + "step": 475 + }, + { + "epoch": 0.020042105263157896, + "grad_norm": 0.7890625, + "learning_rate": 0.0004999999977226469, + "loss": 4.3004, + "step": 476 + }, + { + "epoch": 0.02008421052631579, + "grad_norm": 0.67578125, + "learning_rate": 0.0004999999908905876, + "loss": 4.3485, + "step": 477 + }, + { + "epoch": 0.020126315789473683, + "grad_norm": 0.66015625, + "learning_rate": 0.0004999999795038221, + "loss": 4.2256, + "step": 478 + }, + { + "epoch": 0.02016842105263158, + "grad_norm": 1.9375, + "learning_rate": 0.0004999999635623508, + "loss": 3.9872, + "step": 479 + }, + { + "epoch": 0.020210526315789474, + "grad_norm": 0.625, + "learning_rate": 0.0004999999430661738, + "loss": 4.0898, + "step": 480 + }, + { + "epoch": 0.02025263157894737, + "grad_norm": 0.59765625, + "learning_rate": 0.0004999999180152917, + "loss": 4.3916, + "step": 481 + }, + { + "epoch": 0.020294736842105265, + "grad_norm": 0.59375, + "learning_rate": 0.0004999998884097047, + "loss": 4.3878, + "step": 482 + }, + { + "epoch": 0.020336842105263157, + "grad_norm": 0.5703125, + "learning_rate": 0.0004999998542494136, + "loss": 4.3222, + "step": 483 + }, + { + "epoch": 0.020378947368421052, + "grad_norm": 0.63671875, + "learning_rate": 0.0004999998155344189, + "loss": 4.4022, + "step": 484 + }, + { + "epoch": 0.020421052631578947, + "grad_norm": 0.60546875, + "learning_rate": 0.0004999997722647211, + "loss": 4.2781, + "step": 485 + }, + { + "epoch": 0.020463157894736843, + "grad_norm": 0.66015625, + "learning_rate": 0.0004999997244403214, + "loss": 4.3358, + "step": 486 + }, + { + "epoch": 0.020505263157894738, + "grad_norm": 0.7578125, + "learning_rate": 0.0004999996720612204, + "loss": 3.7881, + "step": 487 + }, + { + "epoch": 0.02054736842105263, + "grad_norm": 1.8203125, + "learning_rate": 0.000499999615127419, + "loss": 4.8277, + "step": 488 + }, + { + "epoch": 0.020589473684210526, + "grad_norm": 0.640625, + "learning_rate": 0.0004999995536389185, + "loss": 4.2344, + "step": 489 + }, + { + "epoch": 0.02063157894736842, + "grad_norm": 0.60546875, + "learning_rate": 0.0004999994875957198, + "loss": 4.3953, + "step": 490 + }, + { + "epoch": 0.020673684210526316, + "grad_norm": 0.54296875, + "learning_rate": 0.0004999994169978242, + "loss": 4.4188, + "step": 491 + }, + { + "epoch": 0.02071578947368421, + "grad_norm": 0.6015625, + "learning_rate": 0.0004999993418452329, + "loss": 4.385, + "step": 492 + }, + { + "epoch": 0.020757894736842104, + "grad_norm": 0.54296875, + "learning_rate": 0.0004999992621379474, + "loss": 4.3987, + "step": 493 + }, + { + "epoch": 0.0208, + "grad_norm": 0.57421875, + "learning_rate": 0.000499999177875969, + "loss": 4.1516, + "step": 494 + }, + { + "epoch": 0.020842105263157894, + "grad_norm": 0.7734375, + "learning_rate": 0.0004999990890592994, + "loss": 3.6346, + "step": 495 + }, + { + "epoch": 0.02088421052631579, + "grad_norm": 0.6484375, + "learning_rate": 0.0004999989956879402, + "loss": 4.209, + "step": 496 + }, + { + "epoch": 0.020926315789473685, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999988977618929, + "loss": 4.2969, + "step": 497 + }, + { + "epoch": 0.02096842105263158, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999987952811595, + "loss": 4.2258, + "step": 498 + }, + { + "epoch": 0.021010526315789473, + "grad_norm": 0.77734375, + "learning_rate": 0.0004999986882457417, + "loss": 3.8085, + "step": 499 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 0.609375, + "learning_rate": 0.0004999985766556415, + "loss": 4.1657, + "step": 500 + }, + { + "epoch": 0.021094736842105263, + "grad_norm": 0.64453125, + "learning_rate": 0.0004999984605108611, + "loss": 4.2793, + "step": 501 + }, + { + "epoch": 0.02113684210526316, + "grad_norm": 0.91015625, + "learning_rate": 0.0004999983398114024, + "loss": 3.8602, + "step": 502 + }, + { + "epoch": 0.021178947368421054, + "grad_norm": 0.80078125, + "learning_rate": 0.0004999982145572678, + "loss": 4.1351, + "step": 503 + }, + { + "epoch": 0.021221052631578946, + "grad_norm": 0.69140625, + "learning_rate": 0.0004999980847484593, + "loss": 3.9963, + "step": 504 + }, + { + "epoch": 0.02126315789473684, + "grad_norm": 0.890625, + "learning_rate": 0.0004999979503849796, + "loss": 4.3515, + "step": 505 + }, + { + "epoch": 0.021305263157894737, + "grad_norm": 0.6484375, + "learning_rate": 0.0004999978114668309, + "loss": 4.1705, + "step": 506 + }, + { + "epoch": 0.021347368421052632, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999976679940158, + "loss": 4.3977, + "step": 507 + }, + { + "epoch": 0.021389473684210528, + "grad_norm": 0.55078125, + "learning_rate": 0.0004999975199665369, + "loss": 4.8703, + "step": 508 + }, + { + "epoch": 0.02143157894736842, + "grad_norm": 0.60546875, + "learning_rate": 0.000499997367384397, + "loss": 4.594, + "step": 509 + }, + { + "epoch": 0.021473684210526315, + "grad_norm": 0.62109375, + "learning_rate": 0.0004999972102475987, + "loss": 4.0068, + "step": 510 + }, + { + "epoch": 0.02151578947368421, + "grad_norm": 0.69921875, + "learning_rate": 0.0004999970485561451, + "loss": 4.3193, + "step": 511 + }, + { + "epoch": 0.021557894736842106, + "grad_norm": 0.75390625, + "learning_rate": 0.0004999968823100389, + "loss": 4.0248, + "step": 512 + }, + { + "epoch": 0.0216, + "grad_norm": 0.66015625, + "learning_rate": 0.0004999967115092833, + "loss": 4.5359, + "step": 513 + }, + { + "epoch": 0.021642105263157893, + "grad_norm": 0.60546875, + "learning_rate": 0.0004999965361538813, + "loss": 4.3914, + "step": 514 + }, + { + "epoch": 0.02168421052631579, + "grad_norm": 0.52734375, + "learning_rate": 0.0004999963562438361, + "loss": 4.3953, + "step": 515 + }, + { + "epoch": 0.021726315789473684, + "grad_norm": 0.490234375, + "learning_rate": 0.0004999961717791512, + "loss": 4.6134, + "step": 516 + }, + { + "epoch": 0.02176842105263158, + "grad_norm": 0.76171875, + "learning_rate": 0.0004999959827598296, + "loss": 4.6595, + "step": 517 + }, + { + "epoch": 0.021810526315789475, + "grad_norm": 0.66796875, + "learning_rate": 0.000499995789185875, + "loss": 3.9478, + "step": 518 + }, + { + "epoch": 0.02185263157894737, + "grad_norm": 0.53125, + "learning_rate": 0.0004999955910572908, + "loss": 4.493, + "step": 519 + }, + { + "epoch": 0.021894736842105262, + "grad_norm": 0.70703125, + "learning_rate": 0.0004999953883740808, + "loss": 4.1525, + "step": 520 + }, + { + "epoch": 0.021936842105263157, + "grad_norm": 0.6484375, + "learning_rate": 0.0004999951811362484, + "loss": 4.106, + "step": 521 + }, + { + "epoch": 0.021978947368421053, + "grad_norm": 0.58984375, + "learning_rate": 0.0004999949693437976, + "loss": 4.5819, + "step": 522 + }, + { + "epoch": 0.022021052631578948, + "grad_norm": 0.5, + "learning_rate": 0.0004999947529967323, + "loss": 4.3143, + "step": 523 + }, + { + "epoch": 0.022063157894736844, + "grad_norm": 0.55078125, + "learning_rate": 0.0004999945320950562, + "loss": 4.2218, + "step": 524 + }, + { + "epoch": 0.022105263157894735, + "grad_norm": 0.625, + "learning_rate": 0.0004999943066387734, + "loss": 4.0465, + "step": 525 + }, + { + "epoch": 0.02214736842105263, + "grad_norm": 0.59375, + "learning_rate": 0.0004999940766278882, + "loss": 4.2427, + "step": 526 + }, + { + "epoch": 0.022189473684210526, + "grad_norm": 0.7265625, + "learning_rate": 0.0004999938420624046, + "loss": 3.7561, + "step": 527 + }, + { + "epoch": 0.02223157894736842, + "grad_norm": 0.71875, + "learning_rate": 0.0004999936029423269, + "loss": 3.9305, + "step": 528 + }, + { + "epoch": 0.022273684210526317, + "grad_norm": 0.55078125, + "learning_rate": 0.0004999933592676596, + "loss": 4.5952, + "step": 529 + }, + { + "epoch": 0.02231578947368421, + "grad_norm": 0.70703125, + "learning_rate": 0.0004999931110384069, + "loss": 3.8931, + "step": 530 + }, + { + "epoch": 0.022357894736842104, + "grad_norm": 0.68359375, + "learning_rate": 0.0004999928582545734, + "loss": 4.1726, + "step": 531 + }, + { + "epoch": 0.0224, + "grad_norm": 0.60546875, + "learning_rate": 0.0004999926009161639, + "loss": 3.9406, + "step": 532 + }, + { + "epoch": 0.022442105263157895, + "grad_norm": 0.5703125, + "learning_rate": 0.0004999923390231829, + "loss": 4.0424, + "step": 533 + }, + { + "epoch": 0.02248421052631579, + "grad_norm": 0.58203125, + "learning_rate": 0.0004999920725756352, + "loss": 4.2692, + "step": 534 + }, + { + "epoch": 0.022526315789473682, + "grad_norm": 0.5703125, + "learning_rate": 0.0004999918015735256, + "loss": 3.9708, + "step": 535 + }, + { + "epoch": 0.022568421052631578, + "grad_norm": 0.77734375, + "learning_rate": 0.0004999915260168592, + "loss": 3.944, + "step": 536 + }, + { + "epoch": 0.022610526315789473, + "grad_norm": 0.56640625, + "learning_rate": 0.0004999912459056409, + "loss": 4.3285, + "step": 537 + }, + { + "epoch": 0.02265263157894737, + "grad_norm": 0.5234375, + "learning_rate": 0.0004999909612398759, + "loss": 4.3843, + "step": 538 + }, + { + "epoch": 0.022694736842105264, + "grad_norm": 0.71484375, + "learning_rate": 0.0004999906720195692, + "loss": 4.2188, + "step": 539 + }, + { + "epoch": 0.02273684210526316, + "grad_norm": 0.51953125, + "learning_rate": 0.0004999903782447262, + "loss": 4.3927, + "step": 540 + }, + { + "epoch": 0.02277894736842105, + "grad_norm": 0.5, + "learning_rate": 0.0004999900799153522, + "loss": 4.1103, + "step": 541 + }, + { + "epoch": 0.022821052631578947, + "grad_norm": 0.57421875, + "learning_rate": 0.0004999897770314527, + "loss": 3.994, + "step": 542 + }, + { + "epoch": 0.022863157894736842, + "grad_norm": 0.73046875, + "learning_rate": 0.0004999894695930332, + "loss": 4.3853, + "step": 543 + }, + { + "epoch": 0.022905263157894738, + "grad_norm": 0.63671875, + "learning_rate": 0.0004999891576000993, + "loss": 3.9278, + "step": 544 + }, + { + "epoch": 0.022947368421052633, + "grad_norm": 0.58203125, + "learning_rate": 0.0004999888410526566, + "loss": 4.1244, + "step": 545 + }, + { + "epoch": 0.022989473684210525, + "grad_norm": 0.5625, + "learning_rate": 0.000499988519950711, + "loss": 4.4078, + "step": 546 + }, + { + "epoch": 0.02303157894736842, + "grad_norm": 0.58984375, + "learning_rate": 0.0004999881942942683, + "loss": 3.7988, + "step": 547 + }, + { + "epoch": 0.023073684210526316, + "grad_norm": 0.58984375, + "learning_rate": 0.0004999878640833345, + "loss": 4.393, + "step": 548 + }, + { + "epoch": 0.02311578947368421, + "grad_norm": 0.671875, + "learning_rate": 0.0004999875293179155, + "loss": 4.2036, + "step": 549 + }, + { + "epoch": 0.023157894736842106, + "grad_norm": 0.62890625, + "learning_rate": 0.0004999871899980173, + "loss": 3.8576, + "step": 550 + }, + { + "epoch": 0.0232, + "grad_norm": 0.89453125, + "learning_rate": 0.0004999868461236464, + "loss": 3.9127, + "step": 551 + }, + { + "epoch": 0.023242105263157894, + "grad_norm": 0.671875, + "learning_rate": 0.0004999864976948088, + "loss": 4.1617, + "step": 552 + }, + { + "epoch": 0.02328421052631579, + "grad_norm": 0.53125, + "learning_rate": 0.0004999861447115109, + "loss": 4.3166, + "step": 553 + }, + { + "epoch": 0.023326315789473685, + "grad_norm": 0.65234375, + "learning_rate": 0.0004999857871737592, + "loss": 4.219, + "step": 554 + }, + { + "epoch": 0.02336842105263158, + "grad_norm": 0.578125, + "learning_rate": 0.0004999854250815602, + "loss": 4.4197, + "step": 555 + }, + { + "epoch": 0.023410526315789475, + "grad_norm": 0.90234375, + "learning_rate": 0.0004999850584349205, + "loss": 3.9932, + "step": 556 + }, + { + "epoch": 0.023452631578947367, + "grad_norm": 1.0078125, + "learning_rate": 0.0004999846872338467, + "loss": 4.3658, + "step": 557 + }, + { + "epoch": 0.023494736842105263, + "grad_norm": 0.62890625, + "learning_rate": 0.0004999843114783456, + "loss": 4.4909, + "step": 558 + }, + { + "epoch": 0.023536842105263158, + "grad_norm": 0.80078125, + "learning_rate": 0.0004999839311684241, + "loss": 3.5408, + "step": 559 + }, + { + "epoch": 0.023578947368421053, + "grad_norm": 0.59375, + "learning_rate": 0.0004999835463040891, + "loss": 4.0307, + "step": 560 + }, + { + "epoch": 0.02362105263157895, + "grad_norm": 0.8046875, + "learning_rate": 0.0004999831568853476, + "loss": 4.1097, + "step": 561 + }, + { + "epoch": 0.02366315789473684, + "grad_norm": 0.54296875, + "learning_rate": 0.0004999827629122067, + "loss": 4.5409, + "step": 562 + }, + { + "epoch": 0.023705263157894736, + "grad_norm": 0.59375, + "learning_rate": 0.0004999823643846736, + "loss": 4.5116, + "step": 563 + }, + { + "epoch": 0.02374736842105263, + "grad_norm": 0.515625, + "learning_rate": 0.0004999819613027555, + "loss": 4.239, + "step": 564 + }, + { + "epoch": 0.023789473684210527, + "grad_norm": 0.5234375, + "learning_rate": 0.0004999815536664598, + "loss": 4.1938, + "step": 565 + }, + { + "epoch": 0.023831578947368422, + "grad_norm": 0.75, + "learning_rate": 0.0004999811414757939, + "loss": 3.6577, + "step": 566 + }, + { + "epoch": 0.023873684210526314, + "grad_norm": 0.5859375, + "learning_rate": 0.0004999807247307653, + "loss": 4.2733, + "step": 567 + }, + { + "epoch": 0.02391578947368421, + "grad_norm": 0.59375, + "learning_rate": 0.0004999803034313817, + "loss": 3.7278, + "step": 568 + }, + { + "epoch": 0.023957894736842105, + "grad_norm": 0.5078125, + "learning_rate": 0.0004999798775776505, + "loss": 4.1731, + "step": 569 + }, + { + "epoch": 0.024, + "grad_norm": 0.53515625, + "learning_rate": 0.0004999794471695798, + "loss": 4.4026, + "step": 570 + }, + { + "epoch": 0.024042105263157896, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999790122071773, + "loss": 4.2953, + "step": 571 + }, + { + "epoch": 0.024084210526315788, + "grad_norm": 0.6328125, + "learning_rate": 0.0004999785726904508, + "loss": 4.3452, + "step": 572 + }, + { + "epoch": 0.024126315789473683, + "grad_norm": 0.73046875, + "learning_rate": 0.0004999781286194085, + "loss": 3.5627, + "step": 573 + }, + { + "epoch": 0.02416842105263158, + "grad_norm": 0.62890625, + "learning_rate": 0.0004999776799940584, + "loss": 4.3395, + "step": 574 + }, + { + "epoch": 0.024210526315789474, + "grad_norm": 0.60546875, + "learning_rate": 0.0004999772268144087, + "loss": 4.6307, + "step": 575 + }, + { + "epoch": 0.02425263157894737, + "grad_norm": 0.6796875, + "learning_rate": 0.0004999767690804676, + "loss": 4.0194, + "step": 576 + }, + { + "epoch": 0.024294736842105265, + "grad_norm": 1.015625, + "learning_rate": 0.0004999763067922435, + "loss": 3.7996, + "step": 577 + }, + { + "epoch": 0.024336842105263157, + "grad_norm": 0.69921875, + "learning_rate": 0.0004999758399497447, + "loss": 4.2088, + "step": 578 + }, + { + "epoch": 0.024378947368421052, + "grad_norm": 0.59765625, + "learning_rate": 0.00049997536855298, + "loss": 4.4423, + "step": 579 + }, + { + "epoch": 0.024421052631578948, + "grad_norm": 0.68359375, + "learning_rate": 0.0004999748926019577, + "loss": 4.206, + "step": 580 + }, + { + "epoch": 0.024463157894736843, + "grad_norm": 0.75390625, + "learning_rate": 0.0004999744120966865, + "loss": 4.1363, + "step": 581 + }, + { + "epoch": 0.02450526315789474, + "grad_norm": 0.5703125, + "learning_rate": 0.0004999739270371752, + "loss": 4.3203, + "step": 582 + }, + { + "epoch": 0.02454736842105263, + "grad_norm": 0.6484375, + "learning_rate": 0.0004999734374234328, + "loss": 4.4022, + "step": 583 + }, + { + "epoch": 0.024589473684210526, + "grad_norm": 0.69140625, + "learning_rate": 0.000499972943255468, + "loss": 4.241, + "step": 584 + }, + { + "epoch": 0.02463157894736842, + "grad_norm": 0.5546875, + "learning_rate": 0.0004999724445332898, + "loss": 3.9091, + "step": 585 + }, + { + "epoch": 0.024673684210526316, + "grad_norm": 0.58984375, + "learning_rate": 0.0004999719412569075, + "loss": 4.6019, + "step": 586 + }, + { + "epoch": 0.024715789473684212, + "grad_norm": 0.67578125, + "learning_rate": 0.0004999714334263301, + "loss": 3.8187, + "step": 587 + }, + { + "epoch": 0.024757894736842104, + "grad_norm": 1.0703125, + "learning_rate": 0.0004999709210415669, + "loss": 4.2854, + "step": 588 + }, + { + "epoch": 0.0248, + "grad_norm": 0.65625, + "learning_rate": 0.0004999704041026272, + "loss": 4.0036, + "step": 589 + }, + { + "epoch": 0.024842105263157895, + "grad_norm": 0.6328125, + "learning_rate": 0.0004999698826095204, + "loss": 3.6707, + "step": 590 + }, + { + "epoch": 0.02488421052631579, + "grad_norm": 0.66796875, + "learning_rate": 0.000499969356562256, + "loss": 4.102, + "step": 591 + }, + { + "epoch": 0.024926315789473685, + "grad_norm": 0.6015625, + "learning_rate": 0.0004999688259608438, + "loss": 4.2249, + "step": 592 + }, + { + "epoch": 0.024968421052631577, + "grad_norm": 0.5546875, + "learning_rate": 0.0004999682908052932, + "loss": 4.2307, + "step": 593 + }, + { + "epoch": 0.025010526315789473, + "grad_norm": 0.6328125, + "learning_rate": 0.000499967751095614, + "loss": 4.0243, + "step": 594 + }, + { + "epoch": 0.025052631578947368, + "grad_norm": 0.80078125, + "learning_rate": 0.0004999672068318161, + "loss": 3.7756, + "step": 595 + }, + { + "epoch": 0.025094736842105263, + "grad_norm": 0.52734375, + "learning_rate": 0.0004999666580139094, + "loss": 4.2975, + "step": 596 + }, + { + "epoch": 0.02513684210526316, + "grad_norm": 0.58203125, + "learning_rate": 0.000499966104641904, + "loss": 4.1384, + "step": 597 + }, + { + "epoch": 0.025178947368421054, + "grad_norm": 0.703125, + "learning_rate": 0.0004999655467158097, + "loss": 3.7663, + "step": 598 + }, + { + "epoch": 0.025221052631578946, + "grad_norm": 0.5625, + "learning_rate": 0.0004999649842356369, + "loss": 4.2195, + "step": 599 + }, + { + "epoch": 0.02526315789473684, + "grad_norm": 0.54296875, + "learning_rate": 0.0004999644172013957, + "loss": 3.9898, + "step": 600 + }, + { + "epoch": 0.025305263157894737, + "grad_norm": 0.53125, + "learning_rate": 0.0004999638456130967, + "loss": 4.5602, + "step": 601 + }, + { + "epoch": 0.025347368421052632, + "grad_norm": 0.578125, + "learning_rate": 0.00049996326947075, + "loss": 3.6831, + "step": 602 + }, + { + "epoch": 0.025389473684210528, + "grad_norm": 0.52734375, + "learning_rate": 0.0004999626887743663, + "loss": 4.4858, + "step": 603 + }, + { + "epoch": 0.02543157894736842, + "grad_norm": 0.62109375, + "learning_rate": 0.000499962103523956, + "loss": 4.1604, + "step": 604 + }, + { + "epoch": 0.025473684210526315, + "grad_norm": 0.50390625, + "learning_rate": 0.00049996151371953, + "loss": 4.045, + "step": 605 + }, + { + "epoch": 0.02551578947368421, + "grad_norm": 0.59765625, + "learning_rate": 0.0004999609193610988, + "loss": 4.3689, + "step": 606 + }, + { + "epoch": 0.025557894736842106, + "grad_norm": 0.59375, + "learning_rate": 0.0004999603204486734, + "loss": 4.0321, + "step": 607 + }, + { + "epoch": 0.0256, + "grad_norm": 0.63671875, + "learning_rate": 0.0004999597169822646, + "loss": 4.116, + "step": 608 + }, + { + "epoch": 0.025642105263157893, + "grad_norm": 0.48046875, + "learning_rate": 0.0004999591089618835, + "loss": 4.4186, + "step": 609 + }, + { + "epoch": 0.02568421052631579, + "grad_norm": 0.5625, + "learning_rate": 0.000499958496387541, + "loss": 4.0515, + "step": 610 + }, + { + "epoch": 0.025726315789473684, + "grad_norm": 0.484375, + "learning_rate": 0.0004999578792592485, + "loss": 4.1381, + "step": 611 + }, + { + "epoch": 0.02576842105263158, + "grad_norm": 0.49609375, + "learning_rate": 0.0004999572575770172, + "loss": 3.8973, + "step": 612 + }, + { + "epoch": 0.025810526315789475, + "grad_norm": 0.59765625, + "learning_rate": 0.0004999566313408582, + "loss": 3.9779, + "step": 613 + }, + { + "epoch": 0.02585263157894737, + "grad_norm": 0.5546875, + "learning_rate": 0.0004999560005507831, + "loss": 4.0894, + "step": 614 + }, + { + "epoch": 0.025894736842105262, + "grad_norm": 0.4921875, + "learning_rate": 0.0004999553652068034, + "loss": 4.0895, + "step": 615 + }, + { + "epoch": 0.025936842105263157, + "grad_norm": 0.54296875, + "learning_rate": 0.0004999547253089307, + "loss": 4.043, + "step": 616 + }, + { + "epoch": 0.025978947368421053, + "grad_norm": 0.4609375, + "learning_rate": 0.0004999540808571765, + "loss": 4.3555, + "step": 617 + }, + { + "epoch": 0.026021052631578948, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999534318515526, + "loss": 3.954, + "step": 618 + }, + { + "epoch": 0.026063157894736844, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999527782920709, + "loss": 4.2232, + "step": 619 + }, + { + "epoch": 0.026105263157894736, + "grad_norm": 0.6328125, + "learning_rate": 0.0004999521201787433, + "loss": 3.8596, + "step": 620 + }, + { + "epoch": 0.02614736842105263, + "grad_norm": 0.55859375, + "learning_rate": 0.0004999514575115817, + "loss": 4.6642, + "step": 621 + }, + { + "epoch": 0.026189473684210526, + "grad_norm": 0.4921875, + "learning_rate": 0.0004999507902905982, + "loss": 4.1311, + "step": 622 + }, + { + "epoch": 0.026231578947368422, + "grad_norm": 0.5546875, + "learning_rate": 0.000499950118515805, + "loss": 4.2543, + "step": 623 + }, + { + "epoch": 0.026273684210526317, + "grad_norm": 0.51171875, + "learning_rate": 0.0004999494421872144, + "loss": 4.0853, + "step": 624 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 0.6015625, + "learning_rate": 0.0004999487613048386, + "loss": 3.9806, + "step": 625 + }, + { + "epoch": 0.026357894736842104, + "grad_norm": 0.65625, + "learning_rate": 0.00049994807586869, + "loss": 3.8894, + "step": 626 + }, + { + "epoch": 0.0264, + "grad_norm": 0.6171875, + "learning_rate": 0.0004999473858787811, + "loss": 3.9208, + "step": 627 + }, + { + "epoch": 0.026442105263157895, + "grad_norm": 0.5546875, + "learning_rate": 0.0004999466913351246, + "loss": 4.5039, + "step": 628 + }, + { + "epoch": 0.02648421052631579, + "grad_norm": 0.58203125, + "learning_rate": 0.0004999459922377331, + "loss": 4.1046, + "step": 629 + }, + { + "epoch": 0.026526315789473683, + "grad_norm": 0.6484375, + "learning_rate": 0.0004999452885866191, + "loss": 3.8371, + "step": 630 + }, + { + "epoch": 0.026568421052631578, + "grad_norm": 0.66796875, + "learning_rate": 0.0004999445803817958, + "loss": 3.8807, + "step": 631 + }, + { + "epoch": 0.026610526315789473, + "grad_norm": 0.470703125, + "learning_rate": 0.0004999438676232758, + "loss": 4.3093, + "step": 632 + }, + { + "epoch": 0.02665263157894737, + "grad_norm": 0.5390625, + "learning_rate": 0.0004999431503110723, + "loss": 4.2689, + "step": 633 + }, + { + "epoch": 0.026694736842105264, + "grad_norm": 0.61328125, + "learning_rate": 0.0004999424284451982, + "loss": 4.0376, + "step": 634 + }, + { + "epoch": 0.02673684210526316, + "grad_norm": 0.70703125, + "learning_rate": 0.0004999417020256668, + "loss": 3.6316, + "step": 635 + }, + { + "epoch": 0.02677894736842105, + "grad_norm": 0.578125, + "learning_rate": 0.0004999409710524912, + "loss": 4.0667, + "step": 636 + }, + { + "epoch": 0.026821052631578947, + "grad_norm": 0.53125, + "learning_rate": 0.0004999402355256848, + "loss": 3.9578, + "step": 637 + }, + { + "epoch": 0.026863157894736842, + "grad_norm": 0.5078125, + "learning_rate": 0.000499939495445261, + "loss": 4.2133, + "step": 638 + }, + { + "epoch": 0.026905263157894738, + "grad_norm": 0.640625, + "learning_rate": 0.0004999387508112332, + "loss": 3.9077, + "step": 639 + }, + { + "epoch": 0.026947368421052633, + "grad_norm": 0.6015625, + "learning_rate": 0.000499938001623615, + "loss": 3.5485, + "step": 640 + }, + { + "epoch": 0.026989473684210525, + "grad_norm": 0.5703125, + "learning_rate": 0.0004999372478824201, + "loss": 4.1327, + "step": 641 + }, + { + "epoch": 0.02703157894736842, + "grad_norm": 0.55859375, + "learning_rate": 0.0004999364895876623, + "loss": 4.2273, + "step": 642 + }, + { + "epoch": 0.027073684210526316, + "grad_norm": 0.515625, + "learning_rate": 0.0004999357267393552, + "loss": 4.1991, + "step": 643 + }, + { + "epoch": 0.02711578947368421, + "grad_norm": 0.68359375, + "learning_rate": 0.0004999349593375129, + "loss": 3.8135, + "step": 644 + }, + { + "epoch": 0.027157894736842107, + "grad_norm": 0.5234375, + "learning_rate": 0.0004999341873821492, + "loss": 4.0758, + "step": 645 + }, + { + "epoch": 0.0272, + "grad_norm": 0.63671875, + "learning_rate": 0.0004999334108732785, + "loss": 3.8673, + "step": 646 + }, + { + "epoch": 0.027242105263157894, + "grad_norm": 0.63671875, + "learning_rate": 0.0004999326298109145, + "loss": 4.2295, + "step": 647 + }, + { + "epoch": 0.02728421052631579, + "grad_norm": 0.51953125, + "learning_rate": 0.0004999318441950716, + "loss": 4.1584, + "step": 648 + }, + { + "epoch": 0.027326315789473685, + "grad_norm": 0.546875, + "learning_rate": 0.0004999310540257643, + "loss": 4.0127, + "step": 649 + }, + { + "epoch": 0.02736842105263158, + "grad_norm": 0.54296875, + "learning_rate": 0.0004999302593030068, + "loss": 4.3217, + "step": 650 + }, + { + "epoch": 0.027410526315789472, + "grad_norm": 0.91796875, + "learning_rate": 0.0004999294600268137, + "loss": 4.0146, + "step": 651 + }, + { + "epoch": 0.027452631578947367, + "grad_norm": 0.498046875, + "learning_rate": 0.0004999286561971994, + "loss": 4.3334, + "step": 652 + }, + { + "epoch": 0.027494736842105263, + "grad_norm": 0.55078125, + "learning_rate": 0.0004999278478141787, + "loss": 4.462, + "step": 653 + }, + { + "epoch": 0.027536842105263158, + "grad_norm": 0.5078125, + "learning_rate": 0.0004999270348777662, + "loss": 3.9484, + "step": 654 + }, + { + "epoch": 0.027578947368421054, + "grad_norm": 0.5703125, + "learning_rate": 0.0004999262173879769, + "loss": 4.4601, + "step": 655 + }, + { + "epoch": 0.02762105263157895, + "grad_norm": 0.470703125, + "learning_rate": 0.0004999253953448255, + "loss": 4.2206, + "step": 656 + }, + { + "epoch": 0.02766315789473684, + "grad_norm": 0.50390625, + "learning_rate": 0.0004999245687483271, + "loss": 4.341, + "step": 657 + }, + { + "epoch": 0.027705263157894736, + "grad_norm": 0.45703125, + "learning_rate": 0.0004999237375984965, + "loss": 4.1534, + "step": 658 + }, + { + "epoch": 0.02774736842105263, + "grad_norm": 1.1484375, + "learning_rate": 0.0004999229018953493, + "loss": 4.3511, + "step": 659 + }, + { + "epoch": 0.027789473684210527, + "grad_norm": 0.5, + "learning_rate": 0.0004999220616389003, + "loss": 4.3578, + "step": 660 + }, + { + "epoch": 0.027831578947368422, + "grad_norm": 0.59765625, + "learning_rate": 0.000499921216829165, + "loss": 4.2076, + "step": 661 + }, + { + "epoch": 0.027873684210526314, + "grad_norm": 0.5234375, + "learning_rate": 0.0004999203674661589, + "loss": 4.3109, + "step": 662 + }, + { + "epoch": 0.02791578947368421, + "grad_norm": 0.56640625, + "learning_rate": 0.0004999195135498973, + "loss": 4.3867, + "step": 663 + }, + { + "epoch": 0.027957894736842105, + "grad_norm": 0.51953125, + "learning_rate": 0.0004999186550803958, + "loss": 4.0917, + "step": 664 + }, + { + "epoch": 0.028, + "grad_norm": 0.474609375, + "learning_rate": 0.00049991779205767, + "loss": 4.3205, + "step": 665 + }, + { + "epoch": 0.028042105263157896, + "grad_norm": 0.46484375, + "learning_rate": 0.0004999169244817356, + "loss": 3.965, + "step": 666 + }, + { + "epoch": 0.028084210526315788, + "grad_norm": 0.49609375, + "learning_rate": 0.0004999160523526086, + "loss": 4.1982, + "step": 667 + }, + { + "epoch": 0.028126315789473683, + "grad_norm": 0.48046875, + "learning_rate": 0.0004999151756703048, + "loss": 4.1706, + "step": 668 + }, + { + "epoch": 0.02816842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.0004999142944348401, + "loss": 4.3561, + "step": 669 + }, + { + "epoch": 0.028210526315789474, + "grad_norm": 0.55078125, + "learning_rate": 0.0004999134086462305, + "loss": 4.0577, + "step": 670 + }, + { + "epoch": 0.02825263157894737, + "grad_norm": 0.890625, + "learning_rate": 0.0004999125183044923, + "loss": 3.8123, + "step": 671 + }, + { + "epoch": 0.028294736842105265, + "grad_norm": 0.462890625, + "learning_rate": 0.0004999116234096417, + "loss": 4.2543, + "step": 672 + }, + { + "epoch": 0.028336842105263157, + "grad_norm": 0.5625, + "learning_rate": 0.0004999107239616948, + "loss": 4.1865, + "step": 673 + }, + { + "epoch": 0.028378947368421052, + "grad_norm": 0.57421875, + "learning_rate": 0.0004999098199606684, + "loss": 4.2058, + "step": 674 + }, + { + "epoch": 0.028421052631578948, + "grad_norm": 0.51953125, + "learning_rate": 0.0004999089114065784, + "loss": 3.7633, + "step": 675 + }, + { + "epoch": 0.028463157894736843, + "grad_norm": 0.48046875, + "learning_rate": 0.0004999079982994419, + "loss": 4.1803, + "step": 676 + }, + { + "epoch": 0.02850526315789474, + "grad_norm": 0.5078125, + "learning_rate": 0.0004999070806392753, + "loss": 4.0662, + "step": 677 + }, + { + "epoch": 0.02854736842105263, + "grad_norm": 0.6875, + "learning_rate": 0.0004999061584260952, + "loss": 3.5857, + "step": 678 + }, + { + "epoch": 0.028589473684210526, + "grad_norm": 0.6484375, + "learning_rate": 0.0004999052316599185, + "loss": 3.8849, + "step": 679 + }, + { + "epoch": 0.02863157894736842, + "grad_norm": 0.5859375, + "learning_rate": 0.0004999043003407623, + "loss": 3.686, + "step": 680 + }, + { + "epoch": 0.028673684210526316, + "grad_norm": 1.0625, + "learning_rate": 0.0004999033644686432, + "loss": 3.7717, + "step": 681 + }, + { + "epoch": 0.028715789473684212, + "grad_norm": 0.6171875, + "learning_rate": 0.0004999024240435784, + "loss": 3.8231, + "step": 682 + }, + { + "epoch": 0.028757894736842104, + "grad_norm": 0.453125, + "learning_rate": 0.0004999014790655851, + "loss": 4.4855, + "step": 683 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5, + "learning_rate": 0.0004999005295346806, + "loss": 4.2384, + "step": 684 + }, + { + "epoch": 0.028842105263157895, + "grad_norm": 0.5234375, + "learning_rate": 0.000499899575450882, + "loss": 4.2844, + "step": 685 + }, + { + "epoch": 0.02888421052631579, + "grad_norm": 0.69921875, + "learning_rate": 0.0004998986168142066, + "loss": 3.9843, + "step": 686 + }, + { + "epoch": 0.028926315789473685, + "grad_norm": 0.50390625, + "learning_rate": 0.0004998976536246722, + "loss": 4.0072, + "step": 687 + }, + { + "epoch": 0.028968421052631577, + "grad_norm": 0.58203125, + "learning_rate": 0.0004998966858822962, + "loss": 3.4171, + "step": 688 + }, + { + "epoch": 0.029010526315789473, + "grad_norm": 0.490234375, + "learning_rate": 0.0004998957135870961, + "loss": 4.1952, + "step": 689 + }, + { + "epoch": 0.029052631578947368, + "grad_norm": 0.5546875, + "learning_rate": 0.0004998947367390897, + "loss": 4.0596, + "step": 690 + }, + { + "epoch": 0.029094736842105264, + "grad_norm": 0.5, + "learning_rate": 0.0004998937553382948, + "loss": 3.9193, + "step": 691 + }, + { + "epoch": 0.02913684210526316, + "grad_norm": 0.60546875, + "learning_rate": 0.0004998927693847294, + "loss": 4.1471, + "step": 692 + }, + { + "epoch": 0.029178947368421054, + "grad_norm": 0.51171875, + "learning_rate": 0.0004998917788784113, + "loss": 4.4418, + "step": 693 + }, + { + "epoch": 0.029221052631578946, + "grad_norm": 0.7578125, + "learning_rate": 0.0004998907838193586, + "loss": 4.2383, + "step": 694 + }, + { + "epoch": 0.02926315789473684, + "grad_norm": 0.6015625, + "learning_rate": 0.0004998897842075894, + "loss": 4.1382, + "step": 695 + }, + { + "epoch": 0.029305263157894737, + "grad_norm": 0.5, + "learning_rate": 0.000499888780043122, + "loss": 4.1138, + "step": 696 + }, + { + "epoch": 0.029347368421052632, + "grad_norm": 0.546875, + "learning_rate": 0.0004998877713259745, + "loss": 4.0818, + "step": 697 + }, + { + "epoch": 0.029389473684210528, + "grad_norm": 0.5078125, + "learning_rate": 0.0004998867580561655, + "loss": 3.9813, + "step": 698 + }, + { + "epoch": 0.02943157894736842, + "grad_norm": 0.47265625, + "learning_rate": 0.0004998857402337133, + "loss": 4.1947, + "step": 699 + }, + { + "epoch": 0.029473684210526315, + "grad_norm": 0.453125, + "learning_rate": 0.0004998847178586366, + "loss": 4.3387, + "step": 700 + }, + { + "epoch": 0.02951578947368421, + "grad_norm": 0.51953125, + "learning_rate": 0.0004998836909309541, + "loss": 3.9755, + "step": 701 + }, + { + "epoch": 0.029557894736842106, + "grad_norm": 0.498046875, + "learning_rate": 0.0004998826594506842, + "loss": 4.0868, + "step": 702 + }, + { + "epoch": 0.0296, + "grad_norm": 0.5703125, + "learning_rate": 0.0004998816234178458, + "loss": 4.3954, + "step": 703 + }, + { + "epoch": 0.029642105263157893, + "grad_norm": 0.51953125, + "learning_rate": 0.0004998805828324578, + "loss": 4.1477, + "step": 704 + }, + { + "epoch": 0.02968421052631579, + "grad_norm": 0.494140625, + "learning_rate": 0.0004998795376945392, + "loss": 4.2149, + "step": 705 + }, + { + "epoch": 0.029726315789473684, + "grad_norm": 0.51953125, + "learning_rate": 0.0004998784880041091, + "loss": 3.9834, + "step": 706 + }, + { + "epoch": 0.02976842105263158, + "grad_norm": 0.65234375, + "learning_rate": 0.0004998774337611864, + "loss": 4.2129, + "step": 707 + }, + { + "epoch": 0.029810526315789475, + "grad_norm": 0.515625, + "learning_rate": 0.0004998763749657906, + "loss": 4.0314, + "step": 708 + }, + { + "epoch": 0.029852631578947367, + "grad_norm": 0.490234375, + "learning_rate": 0.0004998753116179408, + "loss": 4.1842, + "step": 709 + }, + { + "epoch": 0.029894736842105262, + "grad_norm": 0.46875, + "learning_rate": 0.0004998742437176565, + "loss": 3.9284, + "step": 710 + }, + { + "epoch": 0.029936842105263158, + "grad_norm": 0.490234375, + "learning_rate": 0.0004998731712649569, + "loss": 4.4043, + "step": 711 + }, + { + "epoch": 0.029978947368421053, + "grad_norm": 0.73828125, + "learning_rate": 0.0004998720942598618, + "loss": 3.5528, + "step": 712 + }, + { + "epoch": 0.03002105263157895, + "grad_norm": 0.6328125, + "learning_rate": 0.0004998710127023906, + "loss": 3.768, + "step": 713 + }, + { + "epoch": 0.030063157894736844, + "grad_norm": 0.54296875, + "learning_rate": 0.0004998699265925633, + "loss": 3.937, + "step": 714 + }, + { + "epoch": 0.030105263157894736, + "grad_norm": 0.5234375, + "learning_rate": 0.0004998688359303994, + "loss": 4.2029, + "step": 715 + }, + { + "epoch": 0.03014736842105263, + "grad_norm": 0.59375, + "learning_rate": 0.000499867740715919, + "loss": 3.8871, + "step": 716 + }, + { + "epoch": 0.030189473684210526, + "grad_norm": 1.2578125, + "learning_rate": 0.0004998666409491419, + "loss": 4.0253, + "step": 717 + }, + { + "epoch": 0.030231578947368422, + "grad_norm": 0.51953125, + "learning_rate": 0.0004998655366300882, + "loss": 4.2328, + "step": 718 + }, + { + "epoch": 0.030273684210526317, + "grad_norm": 0.56640625, + "learning_rate": 0.0004998644277587781, + "loss": 3.9334, + "step": 719 + }, + { + "epoch": 0.03031578947368421, + "grad_norm": 0.6328125, + "learning_rate": 0.0004998633143352315, + "loss": 3.8762, + "step": 720 + }, + { + "epoch": 0.030357894736842105, + "grad_norm": 0.46484375, + "learning_rate": 0.0004998621963594691, + "loss": 4.1105, + "step": 721 + }, + { + "epoch": 0.0304, + "grad_norm": 1.46875, + "learning_rate": 0.0004998610738315108, + "loss": 3.9509, + "step": 722 + }, + { + "epoch": 0.030442105263157895, + "grad_norm": 0.57421875, + "learning_rate": 0.0004998599467513774, + "loss": 4.1413, + "step": 723 + }, + { + "epoch": 0.03048421052631579, + "grad_norm": 0.5078125, + "learning_rate": 0.0004998588151190895, + "loss": 4.4055, + "step": 724 + }, + { + "epoch": 0.030526315789473683, + "grad_norm": 0.62890625, + "learning_rate": 0.0004998576789346675, + "loss": 3.7939, + "step": 725 + }, + { + "epoch": 0.030568421052631578, + "grad_norm": 0.68359375, + "learning_rate": 0.000499856538198132, + "loss": 3.8, + "step": 726 + }, + { + "epoch": 0.030610526315789473, + "grad_norm": 0.921875, + "learning_rate": 0.0004998553929095041, + "loss": 3.8837, + "step": 727 + }, + { + "epoch": 0.03065263157894737, + "grad_norm": 0.58203125, + "learning_rate": 0.0004998542430688045, + "loss": 3.6112, + "step": 728 + }, + { + "epoch": 0.030694736842105264, + "grad_norm": 0.55078125, + "learning_rate": 0.0004998530886760542, + "loss": 4.1042, + "step": 729 + }, + { + "epoch": 0.030736842105263156, + "grad_norm": 0.76171875, + "learning_rate": 0.000499851929731274, + "loss": 3.7537, + "step": 730 + }, + { + "epoch": 0.03077894736842105, + "grad_norm": 0.6640625, + "learning_rate": 0.0004998507662344854, + "loss": 3.767, + "step": 731 + }, + { + "epoch": 0.030821052631578947, + "grad_norm": 0.6015625, + "learning_rate": 0.0004998495981857093, + "loss": 4.0155, + "step": 732 + }, + { + "epoch": 0.030863157894736842, + "grad_norm": 0.51953125, + "learning_rate": 0.0004998484255849671, + "loss": 4.1372, + "step": 733 + }, + { + "epoch": 0.030905263157894738, + "grad_norm": 0.61328125, + "learning_rate": 0.0004998472484322802, + "loss": 3.9748, + "step": 734 + }, + { + "epoch": 0.030947368421052633, + "grad_norm": 0.55859375, + "learning_rate": 0.0004998460667276699, + "loss": 3.9526, + "step": 735 + }, + { + "epoch": 0.030989473684210525, + "grad_norm": 0.546875, + "learning_rate": 0.0004998448804711578, + "loss": 3.9407, + "step": 736 + }, + { + "epoch": 0.03103157894736842, + "grad_norm": 0.51171875, + "learning_rate": 0.0004998436896627656, + "loss": 4.1717, + "step": 737 + }, + { + "epoch": 0.031073684210526316, + "grad_norm": 0.58203125, + "learning_rate": 0.000499842494302515, + "loss": 3.7643, + "step": 738 + }, + { + "epoch": 0.03111578947368421, + "grad_norm": 0.51171875, + "learning_rate": 0.0004998412943904276, + "loss": 3.8482, + "step": 739 + }, + { + "epoch": 0.031157894736842107, + "grad_norm": 0.5625, + "learning_rate": 0.0004998400899265254, + "loss": 4.2118, + "step": 740 + }, + { + "epoch": 0.0312, + "grad_norm": 0.46875, + "learning_rate": 0.0004998388809108303, + "loss": 4.3336, + "step": 741 + }, + { + "epoch": 0.031242105263157894, + "grad_norm": 0.5625, + "learning_rate": 0.0004998376673433644, + "loss": 3.9516, + "step": 742 + }, + { + "epoch": 0.03128421052631579, + "grad_norm": 0.765625, + "learning_rate": 0.0004998364492241498, + "loss": 3.7602, + "step": 743 + }, + { + "epoch": 0.03132631578947368, + "grad_norm": 0.4765625, + "learning_rate": 0.0004998352265532085, + "loss": 4.2425, + "step": 744 + }, + { + "epoch": 0.03136842105263158, + "grad_norm": 0.5390625, + "learning_rate": 0.0004998339993305629, + "loss": 4.1772, + "step": 745 + }, + { + "epoch": 0.03141052631578947, + "grad_norm": 0.64453125, + "learning_rate": 0.0004998327675562354, + "loss": 3.8244, + "step": 746 + }, + { + "epoch": 0.03145263157894737, + "grad_norm": 0.56640625, + "learning_rate": 0.0004998315312302485, + "loss": 3.9586, + "step": 747 + }, + { + "epoch": 0.03149473684210526, + "grad_norm": 0.5, + "learning_rate": 0.0004998302903526246, + "loss": 3.7922, + "step": 748 + }, + { + "epoch": 0.031536842105263155, + "grad_norm": 0.52734375, + "learning_rate": 0.0004998290449233863, + "loss": 3.9835, + "step": 749 + }, + { + "epoch": 0.031578947368421054, + "grad_norm": 0.52734375, + "learning_rate": 0.0004998277949425563, + "loss": 4.2068, + "step": 750 + }, + { + "epoch": 0.031621052631578946, + "grad_norm": 0.55859375, + "learning_rate": 0.0004998265404101576, + "loss": 4.128, + "step": 751 + }, + { + "epoch": 0.031663157894736844, + "grad_norm": 0.58984375, + "learning_rate": 0.0004998252813262127, + "loss": 4.2271, + "step": 752 + }, + { + "epoch": 0.031705263157894736, + "grad_norm": 0.72265625, + "learning_rate": 0.0004998240176907448, + "loss": 4.006, + "step": 753 + }, + { + "epoch": 0.03174736842105263, + "grad_norm": 0.48046875, + "learning_rate": 0.0004998227495037768, + "loss": 4.0048, + "step": 754 + }, + { + "epoch": 0.03178947368421053, + "grad_norm": 0.59375, + "learning_rate": 0.0004998214767653319, + "loss": 3.9074, + "step": 755 + }, + { + "epoch": 0.03183157894736842, + "grad_norm": 0.62109375, + "learning_rate": 0.000499820199475433, + "loss": 3.8607, + "step": 756 + }, + { + "epoch": 0.03187368421052632, + "grad_norm": 0.546875, + "learning_rate": 0.0004998189176341038, + "loss": 3.9136, + "step": 757 + }, + { + "epoch": 0.03191578947368421, + "grad_norm": 0.546875, + "learning_rate": 0.0004998176312413674, + "loss": 4.0978, + "step": 758 + }, + { + "epoch": 0.0319578947368421, + "grad_norm": 0.48828125, + "learning_rate": 0.0004998163402972472, + "loss": 4.6721, + "step": 759 + }, + { + "epoch": 0.032, + "grad_norm": 0.48828125, + "learning_rate": 0.0004998150448017669, + "loss": 4.2847, + "step": 760 + }, + { + "epoch": 0.03204210526315789, + "grad_norm": 0.498046875, + "learning_rate": 0.00049981374475495, + "loss": 3.9226, + "step": 761 + }, + { + "epoch": 0.03208421052631579, + "grad_norm": 0.546875, + "learning_rate": 0.0004998124401568201, + "loss": 3.9979, + "step": 762 + }, + { + "epoch": 0.03212631578947368, + "grad_norm": 0.54296875, + "learning_rate": 0.0004998111310074012, + "loss": 4.0338, + "step": 763 + }, + { + "epoch": 0.03216842105263158, + "grad_norm": 0.5390625, + "learning_rate": 0.0004998098173067169, + "loss": 3.9018, + "step": 764 + }, + { + "epoch": 0.032210526315789474, + "grad_norm": 0.52734375, + "learning_rate": 0.0004998084990547913, + "loss": 4.1493, + "step": 765 + }, + { + "epoch": 0.032252631578947366, + "grad_norm": 0.5859375, + "learning_rate": 0.0004998071762516483, + "loss": 3.926, + "step": 766 + }, + { + "epoch": 0.032294736842105265, + "grad_norm": 0.5078125, + "learning_rate": 0.0004998058488973121, + "loss": 4.3805, + "step": 767 + }, + { + "epoch": 0.03233684210526316, + "grad_norm": 0.51953125, + "learning_rate": 0.0004998045169918069, + "loss": 3.7523, + "step": 768 + }, + { + "epoch": 0.032378947368421056, + "grad_norm": 0.66796875, + "learning_rate": 0.0004998031805351569, + "loss": 3.596, + "step": 769 + }, + { + "epoch": 0.03242105263157895, + "grad_norm": 0.494140625, + "learning_rate": 0.0004998018395273863, + "loss": 4.0849, + "step": 770 + }, + { + "epoch": 0.03246315789473684, + "grad_norm": 0.498046875, + "learning_rate": 0.0004998004939685197, + "loss": 3.9699, + "step": 771 + }, + { + "epoch": 0.03250526315789474, + "grad_norm": 0.515625, + "learning_rate": 0.0004997991438585817, + "loss": 4.0008, + "step": 772 + }, + { + "epoch": 0.03254736842105263, + "grad_norm": 0.4609375, + "learning_rate": 0.0004997977891975969, + "loss": 4.0709, + "step": 773 + }, + { + "epoch": 0.03258947368421053, + "grad_norm": 0.490234375, + "learning_rate": 0.0004997964299855897, + "loss": 4.0414, + "step": 774 + }, + { + "epoch": 0.03263157894736842, + "grad_norm": 0.470703125, + "learning_rate": 0.0004997950662225852, + "loss": 4.13, + "step": 775 + }, + { + "epoch": 0.03267368421052631, + "grad_norm": 0.83984375, + "learning_rate": 0.000499793697908608, + "loss": 3.997, + "step": 776 + }, + { + "epoch": 0.03271578947368421, + "grad_norm": 0.5390625, + "learning_rate": 0.0004997923250436831, + "loss": 3.9943, + "step": 777 + }, + { + "epoch": 0.032757894736842104, + "grad_norm": 0.5859375, + "learning_rate": 0.0004997909476278356, + "loss": 3.6848, + "step": 778 + }, + { + "epoch": 0.0328, + "grad_norm": 0.55859375, + "learning_rate": 0.0004997895656610905, + "loss": 4.4947, + "step": 779 + }, + { + "epoch": 0.032842105263157895, + "grad_norm": 1.4921875, + "learning_rate": 0.000499788179143473, + "loss": 3.9596, + "step": 780 + }, + { + "epoch": 0.03288421052631579, + "grad_norm": 0.5703125, + "learning_rate": 0.0004997867880750084, + "loss": 3.921, + "step": 781 + }, + { + "epoch": 0.032926315789473685, + "grad_norm": 0.5546875, + "learning_rate": 0.0004997853924557219, + "loss": 4.0804, + "step": 782 + }, + { + "epoch": 0.03296842105263158, + "grad_norm": 0.69140625, + "learning_rate": 0.0004997839922856391, + "loss": 3.9264, + "step": 783 + }, + { + "epoch": 0.033010526315789476, + "grad_norm": 0.62109375, + "learning_rate": 0.0004997825875647855, + "loss": 3.608, + "step": 784 + }, + { + "epoch": 0.03305263157894737, + "grad_norm": 0.55078125, + "learning_rate": 0.0004997811782931866, + "loss": 4.0276, + "step": 785 + }, + { + "epoch": 0.03309473684210526, + "grad_norm": 0.578125, + "learning_rate": 0.000499779764470868, + "loss": 3.7064, + "step": 786 + }, + { + "epoch": 0.03313684210526316, + "grad_norm": 0.48828125, + "learning_rate": 0.0004997783460978557, + "loss": 4.1538, + "step": 787 + }, + { + "epoch": 0.03317894736842105, + "grad_norm": 0.96484375, + "learning_rate": 0.0004997769231741753, + "loss": 4.189, + "step": 788 + }, + { + "epoch": 0.03322105263157895, + "grad_norm": 0.51953125, + "learning_rate": 0.000499775495699853, + "loss": 4.2725, + "step": 789 + }, + { + "epoch": 0.03326315789473684, + "grad_norm": 0.58984375, + "learning_rate": 0.0004997740636749145, + "loss": 3.6191, + "step": 790 + }, + { + "epoch": 0.033305263157894734, + "grad_norm": 0.55859375, + "learning_rate": 0.0004997726270993861, + "loss": 3.9842, + "step": 791 + }, + { + "epoch": 0.03334736842105263, + "grad_norm": 0.45703125, + "learning_rate": 0.0004997711859732939, + "loss": 4.1349, + "step": 792 + }, + { + "epoch": 0.033389473684210524, + "grad_norm": 0.5390625, + "learning_rate": 0.0004997697402966642, + "loss": 3.8909, + "step": 793 + }, + { + "epoch": 0.03343157894736842, + "grad_norm": 0.484375, + "learning_rate": 0.0004997682900695232, + "loss": 4.1655, + "step": 794 + }, + { + "epoch": 0.033473684210526315, + "grad_norm": 0.466796875, + "learning_rate": 0.0004997668352918975, + "loss": 4.059, + "step": 795 + }, + { + "epoch": 0.03351578947368421, + "grad_norm": 0.62890625, + "learning_rate": 0.0004997653759638135, + "loss": 3.6017, + "step": 796 + }, + { + "epoch": 0.033557894736842106, + "grad_norm": 0.53515625, + "learning_rate": 0.0004997639120852978, + "loss": 3.7744, + "step": 797 + }, + { + "epoch": 0.0336, + "grad_norm": 0.44140625, + "learning_rate": 0.0004997624436563771, + "loss": 4.2117, + "step": 798 + }, + { + "epoch": 0.0336421052631579, + "grad_norm": 0.4609375, + "learning_rate": 0.0004997609706770781, + "loss": 4.047, + "step": 799 + }, + { + "epoch": 0.03368421052631579, + "grad_norm": 0.5078125, + "learning_rate": 0.0004997594931474277, + "loss": 4.1946, + "step": 800 + }, + { + "epoch": 0.03372631578947369, + "grad_norm": 0.498046875, + "learning_rate": 0.0004997580110674528, + "loss": 3.7523, + "step": 801 + }, + { + "epoch": 0.03376842105263158, + "grad_norm": 0.6484375, + "learning_rate": 0.0004997565244371805, + "loss": 4.0859, + "step": 802 + }, + { + "epoch": 0.03381052631578947, + "grad_norm": 0.83984375, + "learning_rate": 0.0004997550332566375, + "loss": 3.7302, + "step": 803 + }, + { + "epoch": 0.03385263157894737, + "grad_norm": 0.515625, + "learning_rate": 0.0004997535375258515, + "loss": 3.7161, + "step": 804 + }, + { + "epoch": 0.03389473684210526, + "grad_norm": 0.453125, + "learning_rate": 0.0004997520372448494, + "loss": 4.2415, + "step": 805 + }, + { + "epoch": 0.03393684210526316, + "grad_norm": 0.462890625, + "learning_rate": 0.0004997505324136586, + "loss": 4.1853, + "step": 806 + }, + { + "epoch": 0.03397894736842105, + "grad_norm": 0.6015625, + "learning_rate": 0.0004997490230323066, + "loss": 3.8699, + "step": 807 + }, + { + "epoch": 0.034021052631578945, + "grad_norm": 0.546875, + "learning_rate": 0.0004997475091008208, + "loss": 4.0161, + "step": 808 + }, + { + "epoch": 0.034063157894736844, + "grad_norm": 0.484375, + "learning_rate": 0.0004997459906192288, + "loss": 3.9752, + "step": 809 + }, + { + "epoch": 0.034105263157894736, + "grad_norm": 0.5859375, + "learning_rate": 0.0004997444675875583, + "loss": 4.1348, + "step": 810 + }, + { + "epoch": 0.034147368421052635, + "grad_norm": 0.55078125, + "learning_rate": 0.0004997429400058371, + "loss": 4.2644, + "step": 811 + }, + { + "epoch": 0.034189473684210527, + "grad_norm": 0.5859375, + "learning_rate": 0.000499741407874093, + "loss": 3.7122, + "step": 812 + }, + { + "epoch": 0.03423157894736842, + "grad_norm": 0.546875, + "learning_rate": 0.0004997398711923537, + "loss": 3.6367, + "step": 813 + }, + { + "epoch": 0.03427368421052632, + "grad_norm": 0.466796875, + "learning_rate": 0.0004997383299606476, + "loss": 3.6281, + "step": 814 + }, + { + "epoch": 0.03431578947368421, + "grad_norm": 0.54296875, + "learning_rate": 0.0004997367841790024, + "loss": 3.7011, + "step": 815 + }, + { + "epoch": 0.03435789473684211, + "grad_norm": 0.63671875, + "learning_rate": 0.0004997352338474465, + "loss": 3.7414, + "step": 816 + }, + { + "epoch": 0.0344, + "grad_norm": 0.51953125, + "learning_rate": 0.0004997336789660081, + "loss": 4.1041, + "step": 817 + }, + { + "epoch": 0.03444210526315789, + "grad_norm": 0.466796875, + "learning_rate": 0.0004997321195347154, + "loss": 3.8263, + "step": 818 + }, + { + "epoch": 0.03448421052631579, + "grad_norm": 0.515625, + "learning_rate": 0.0004997305555535969, + "loss": 4.1188, + "step": 819 + }, + { + "epoch": 0.03452631578947368, + "grad_norm": 0.44921875, + "learning_rate": 0.0004997289870226812, + "loss": 4.0564, + "step": 820 + }, + { + "epoch": 0.03456842105263158, + "grad_norm": 0.5078125, + "learning_rate": 0.0004997274139419967, + "loss": 3.723, + "step": 821 + }, + { + "epoch": 0.034610526315789474, + "grad_norm": 0.69921875, + "learning_rate": 0.0004997258363115721, + "loss": 4.3223, + "step": 822 + }, + { + "epoch": 0.034652631578947365, + "grad_norm": 0.52734375, + "learning_rate": 0.0004997242541314363, + "loss": 3.5808, + "step": 823 + }, + { + "epoch": 0.034694736842105264, + "grad_norm": 0.50390625, + "learning_rate": 0.0004997226674016179, + "loss": 3.8489, + "step": 824 + }, + { + "epoch": 0.034736842105263156, + "grad_norm": 0.482421875, + "learning_rate": 0.000499721076122146, + "loss": 3.9126, + "step": 825 + }, + { + "epoch": 0.034778947368421055, + "grad_norm": 0.5078125, + "learning_rate": 0.0004997194802930494, + "loss": 4.1561, + "step": 826 + }, + { + "epoch": 0.03482105263157895, + "grad_norm": 0.466796875, + "learning_rate": 0.0004997178799143573, + "loss": 3.6641, + "step": 827 + }, + { + "epoch": 0.03486315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.0004997162749860989, + "loss": 4.2048, + "step": 828 + }, + { + "epoch": 0.03490526315789474, + "grad_norm": 0.5078125, + "learning_rate": 0.0004997146655083034, + "loss": 4.0615, + "step": 829 + }, + { + "epoch": 0.03494736842105263, + "grad_norm": 0.478515625, + "learning_rate": 0.000499713051481, + "loss": 3.8288, + "step": 830 + }, + { + "epoch": 0.03498947368421053, + "grad_norm": 0.490234375, + "learning_rate": 0.0004997114329042181, + "loss": 4.1566, + "step": 831 + }, + { + "epoch": 0.03503157894736842, + "grad_norm": 0.46484375, + "learning_rate": 0.0004997098097779875, + "loss": 4.0533, + "step": 832 + }, + { + "epoch": 0.03507368421052631, + "grad_norm": 0.5625, + "learning_rate": 0.0004997081821023375, + "loss": 4.0512, + "step": 833 + }, + { + "epoch": 0.03511578947368421, + "grad_norm": 0.462890625, + "learning_rate": 0.0004997065498772978, + "loss": 3.9289, + "step": 834 + }, + { + "epoch": 0.0351578947368421, + "grad_norm": 0.55078125, + "learning_rate": 0.0004997049131028982, + "loss": 3.4434, + "step": 835 + }, + { + "epoch": 0.0352, + "grad_norm": 0.64453125, + "learning_rate": 0.0004997032717791684, + "loss": 3.7508, + "step": 836 + }, + { + "epoch": 0.035242105263157894, + "grad_norm": 0.462890625, + "learning_rate": 0.0004997016259061384, + "loss": 4.1984, + "step": 837 + }, + { + "epoch": 0.03528421052631579, + "grad_norm": 0.458984375, + "learning_rate": 0.0004996999754838382, + "loss": 4.005, + "step": 838 + }, + { + "epoch": 0.035326315789473685, + "grad_norm": 0.48828125, + "learning_rate": 0.0004996983205122978, + "loss": 4.275, + "step": 839 + }, + { + "epoch": 0.03536842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.0004996966609915474, + "loss": 3.8773, + "step": 840 + }, + { + "epoch": 0.035410526315789476, + "grad_norm": 0.48046875, + "learning_rate": 0.0004996949969216173, + "loss": 3.9692, + "step": 841 + }, + { + "epoch": 0.03545263157894737, + "grad_norm": 0.5, + "learning_rate": 0.0004996933283025376, + "loss": 3.4895, + "step": 842 + }, + { + "epoch": 0.035494736842105266, + "grad_norm": 0.48046875, + "learning_rate": 0.000499691655134339, + "loss": 4.0123, + "step": 843 + }, + { + "epoch": 0.03553684210526316, + "grad_norm": 0.50390625, + "learning_rate": 0.0004996899774170516, + "loss": 3.8246, + "step": 844 + }, + { + "epoch": 0.03557894736842105, + "grad_norm": 0.462890625, + "learning_rate": 0.0004996882951507063, + "loss": 4.1701, + "step": 845 + }, + { + "epoch": 0.03562105263157895, + "grad_norm": 0.50390625, + "learning_rate": 0.0004996866083353335, + "loss": 4.2301, + "step": 846 + }, + { + "epoch": 0.03566315789473684, + "grad_norm": 0.5, + "learning_rate": 0.0004996849169709644, + "loss": 4.0677, + "step": 847 + }, + { + "epoch": 0.03570526315789474, + "grad_norm": 0.51953125, + "learning_rate": 0.0004996832210576291, + "loss": 4.1931, + "step": 848 + }, + { + "epoch": 0.03574736842105263, + "grad_norm": 0.671875, + "learning_rate": 0.0004996815205953591, + "loss": 4.0824, + "step": 849 + }, + { + "epoch": 0.035789473684210524, + "grad_norm": 0.578125, + "learning_rate": 0.0004996798155841851, + "loss": 4.1235, + "step": 850 + }, + { + "epoch": 0.03583157894736842, + "grad_norm": 0.5078125, + "learning_rate": 0.0004996781060241382, + "loss": 3.8044, + "step": 851 + }, + { + "epoch": 0.035873684210526315, + "grad_norm": 0.53125, + "learning_rate": 0.0004996763919152496, + "loss": 3.7707, + "step": 852 + }, + { + "epoch": 0.03591578947368421, + "grad_norm": 0.52734375, + "learning_rate": 0.0004996746732575505, + "loss": 3.7273, + "step": 853 + }, + { + "epoch": 0.035957894736842105, + "grad_norm": 0.6015625, + "learning_rate": 0.0004996729500510722, + "loss": 3.8799, + "step": 854 + }, + { + "epoch": 0.036, + "grad_norm": 0.515625, + "learning_rate": 0.0004996712222958462, + "loss": 3.8271, + "step": 855 + }, + { + "epoch": 0.036042105263157896, + "grad_norm": 0.482421875, + "learning_rate": 0.0004996694899919036, + "loss": 3.9493, + "step": 856 + }, + { + "epoch": 0.03608421052631579, + "grad_norm": 0.59375, + "learning_rate": 0.0004996677531392765, + "loss": 3.37, + "step": 857 + }, + { + "epoch": 0.03612631578947369, + "grad_norm": 0.53515625, + "learning_rate": 0.0004996660117379962, + "loss": 3.8475, + "step": 858 + }, + { + "epoch": 0.03616842105263158, + "grad_norm": 0.51171875, + "learning_rate": 0.0004996642657880946, + "loss": 3.9846, + "step": 859 + }, + { + "epoch": 0.03621052631578947, + "grad_norm": 0.515625, + "learning_rate": 0.0004996625152896033, + "loss": 4.2411, + "step": 860 + }, + { + "epoch": 0.03625263157894737, + "grad_norm": 0.52734375, + "learning_rate": 0.0004996607602425543, + "loss": 3.8842, + "step": 861 + }, + { + "epoch": 0.03629473684210526, + "grad_norm": 0.55078125, + "learning_rate": 0.0004996590006469797, + "loss": 3.9542, + "step": 862 + }, + { + "epoch": 0.03633684210526316, + "grad_norm": 0.51171875, + "learning_rate": 0.0004996572365029114, + "loss": 4.1191, + "step": 863 + }, + { + "epoch": 0.03637894736842105, + "grad_norm": 0.478515625, + "learning_rate": 0.0004996554678103816, + "loss": 4.015, + "step": 864 + }, + { + "epoch": 0.036421052631578944, + "grad_norm": 0.5703125, + "learning_rate": 0.0004996536945694225, + "loss": 3.9961, + "step": 865 + }, + { + "epoch": 0.03646315789473684, + "grad_norm": 0.5390625, + "learning_rate": 0.0004996519167800664, + "loss": 3.6294, + "step": 866 + }, + { + "epoch": 0.036505263157894735, + "grad_norm": 0.66796875, + "learning_rate": 0.0004996501344423456, + "loss": 3.467, + "step": 867 + }, + { + "epoch": 0.036547368421052634, + "grad_norm": 0.4765625, + "learning_rate": 0.0004996483475562929, + "loss": 4.0293, + "step": 868 + }, + { + "epoch": 0.036589473684210526, + "grad_norm": 0.60546875, + "learning_rate": 0.0004996465561219405, + "loss": 3.8073, + "step": 869 + }, + { + "epoch": 0.03663157894736842, + "grad_norm": 0.482421875, + "learning_rate": 0.0004996447601393211, + "loss": 4.0301, + "step": 870 + }, + { + "epoch": 0.03667368421052632, + "grad_norm": 0.59765625, + "learning_rate": 0.0004996429596084676, + "loss": 3.9327, + "step": 871 + }, + { + "epoch": 0.03671578947368421, + "grad_norm": 0.53125, + "learning_rate": 0.0004996411545294127, + "loss": 3.8282, + "step": 872 + }, + { + "epoch": 0.03675789473684211, + "grad_norm": 0.55859375, + "learning_rate": 0.0004996393449021892, + "loss": 3.5362, + "step": 873 + }, + { + "epoch": 0.0368, + "grad_norm": 0.5, + "learning_rate": 0.0004996375307268303, + "loss": 3.8343, + "step": 874 + }, + { + "epoch": 0.03684210526315789, + "grad_norm": 0.55859375, + "learning_rate": 0.0004996357120033688, + "loss": 3.8469, + "step": 875 + }, + { + "epoch": 0.03688421052631579, + "grad_norm": 0.5859375, + "learning_rate": 0.0004996338887318379, + "loss": 3.9204, + "step": 876 + }, + { + "epoch": 0.03692631578947368, + "grad_norm": 0.484375, + "learning_rate": 0.0004996320609122708, + "loss": 4.391, + "step": 877 + }, + { + "epoch": 0.03696842105263158, + "grad_norm": 0.48046875, + "learning_rate": 0.0004996302285447011, + "loss": 4.0084, + "step": 878 + }, + { + "epoch": 0.03701052631578947, + "grad_norm": 0.46484375, + "learning_rate": 0.0004996283916291617, + "loss": 4.096, + "step": 879 + }, + { + "epoch": 0.03705263157894737, + "grad_norm": 0.53515625, + "learning_rate": 0.0004996265501656864, + "loss": 3.7891, + "step": 880 + }, + { + "epoch": 0.037094736842105264, + "grad_norm": 0.7734375, + "learning_rate": 0.0004996247041543087, + "loss": 4.262, + "step": 881 + }, + { + "epoch": 0.037136842105263156, + "grad_norm": 0.515625, + "learning_rate": 0.0004996228535950621, + "loss": 3.758, + "step": 882 + }, + { + "epoch": 0.037178947368421054, + "grad_norm": 0.48828125, + "learning_rate": 0.0004996209984879804, + "loss": 4.0539, + "step": 883 + }, + { + "epoch": 0.037221052631578946, + "grad_norm": 0.546875, + "learning_rate": 0.0004996191388330976, + "loss": 3.8809, + "step": 884 + }, + { + "epoch": 0.037263157894736845, + "grad_norm": 0.50390625, + "learning_rate": 0.0004996172746304471, + "loss": 4.0158, + "step": 885 + }, + { + "epoch": 0.03730526315789474, + "grad_norm": 0.625, + "learning_rate": 0.0004996154058800633, + "loss": 4.023, + "step": 886 + }, + { + "epoch": 0.03734736842105263, + "grad_norm": 0.5625, + "learning_rate": 0.00049961353258198, + "loss": 3.9961, + "step": 887 + }, + { + "epoch": 0.03738947368421053, + "grad_norm": 0.4609375, + "learning_rate": 0.0004996116547362316, + "loss": 4.0106, + "step": 888 + }, + { + "epoch": 0.03743157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.000499609772342852, + "loss": 4.1207, + "step": 889 + }, + { + "epoch": 0.03747368421052632, + "grad_norm": 0.482421875, + "learning_rate": 0.0004996078854018756, + "loss": 3.699, + "step": 890 + }, + { + "epoch": 0.03751578947368421, + "grad_norm": 0.51171875, + "learning_rate": 0.0004996059939133368, + "loss": 4.0467, + "step": 891 + }, + { + "epoch": 0.0375578947368421, + "grad_norm": 0.67578125, + "learning_rate": 0.0004996040978772702, + "loss": 3.7281, + "step": 892 + }, + { + "epoch": 0.0376, + "grad_norm": 0.5078125, + "learning_rate": 0.00049960219729371, + "loss": 4.0397, + "step": 893 + }, + { + "epoch": 0.03764210526315789, + "grad_norm": 0.51953125, + "learning_rate": 0.0004996002921626912, + "loss": 3.7749, + "step": 894 + }, + { + "epoch": 0.03768421052631579, + "grad_norm": 0.478515625, + "learning_rate": 0.0004995983824842483, + "loss": 4.0509, + "step": 895 + }, + { + "epoch": 0.037726315789473684, + "grad_norm": 0.47265625, + "learning_rate": 0.0004995964682584161, + "loss": 3.9461, + "step": 896 + }, + { + "epoch": 0.037768421052631576, + "grad_norm": 0.486328125, + "learning_rate": 0.0004995945494852297, + "loss": 3.9287, + "step": 897 + }, + { + "epoch": 0.037810526315789475, + "grad_norm": 0.48046875, + "learning_rate": 0.0004995926261647236, + "loss": 3.8354, + "step": 898 + }, + { + "epoch": 0.03785263157894737, + "grad_norm": 0.451171875, + "learning_rate": 0.0004995906982969332, + "loss": 4.0678, + "step": 899 + }, + { + "epoch": 0.037894736842105266, + "grad_norm": 0.5078125, + "learning_rate": 0.0004995887658818936, + "loss": 3.6049, + "step": 900 + }, + { + "epoch": 0.03793684210526316, + "grad_norm": 0.5078125, + "learning_rate": 0.00049958682891964, + "loss": 3.9772, + "step": 901 + }, + { + "epoch": 0.03797894736842105, + "grad_norm": 0.546875, + "learning_rate": 0.0004995848874102075, + "loss": 3.5326, + "step": 902 + }, + { + "epoch": 0.03802105263157895, + "grad_norm": 0.52734375, + "learning_rate": 0.0004995829413536316, + "loss": 3.9914, + "step": 903 + }, + { + "epoch": 0.03806315789473684, + "grad_norm": 0.54296875, + "learning_rate": 0.0004995809907499478, + "loss": 4.2027, + "step": 904 + }, + { + "epoch": 0.03810526315789474, + "grad_norm": 0.45703125, + "learning_rate": 0.0004995790355991916, + "loss": 4.1437, + "step": 905 + }, + { + "epoch": 0.03814736842105263, + "grad_norm": 0.5546875, + "learning_rate": 0.0004995770759013985, + "loss": 4.0409, + "step": 906 + }, + { + "epoch": 0.03818947368421052, + "grad_norm": 0.51953125, + "learning_rate": 0.0004995751116566045, + "loss": 4.0225, + "step": 907 + }, + { + "epoch": 0.03823157894736842, + "grad_norm": 0.482421875, + "learning_rate": 0.0004995731428648451, + "loss": 3.89, + "step": 908 + }, + { + "epoch": 0.038273684210526314, + "grad_norm": 0.46875, + "learning_rate": 0.0004995711695261563, + "loss": 4.1284, + "step": 909 + }, + { + "epoch": 0.03831578947368421, + "grad_norm": 0.46875, + "learning_rate": 0.000499569191640574, + "loss": 4.3671, + "step": 910 + }, + { + "epoch": 0.038357894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.0004995672092081343, + "loss": 4.2369, + "step": 911 + }, + { + "epoch": 0.0384, + "grad_norm": 0.44921875, + "learning_rate": 0.0004995652222288732, + "loss": 4.3193, + "step": 912 + }, + { + "epoch": 0.038442105263157896, + "grad_norm": 0.52734375, + "learning_rate": 0.0004995632307028271, + "loss": 3.8291, + "step": 913 + }, + { + "epoch": 0.03848421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.0004995612346300321, + "loss": 3.9642, + "step": 914 + }, + { + "epoch": 0.038526315789473686, + "grad_norm": 0.65625, + "learning_rate": 0.0004995592340105246, + "loss": 3.4461, + "step": 915 + }, + { + "epoch": 0.03856842105263158, + "grad_norm": 0.462890625, + "learning_rate": 0.0004995572288443411, + "loss": 4.0714, + "step": 916 + }, + { + "epoch": 0.03861052631578948, + "grad_norm": 0.48828125, + "learning_rate": 0.0004995552191315181, + "loss": 3.7779, + "step": 917 + }, + { + "epoch": 0.03865263157894737, + "grad_norm": 0.4765625, + "learning_rate": 0.0004995532048720923, + "loss": 3.724, + "step": 918 + }, + { + "epoch": 0.03869473684210526, + "grad_norm": 0.455078125, + "learning_rate": 0.0004995511860661002, + "loss": 4.0334, + "step": 919 + }, + { + "epoch": 0.03873684210526316, + "grad_norm": 0.53125, + "learning_rate": 0.0004995491627135788, + "loss": 4.0845, + "step": 920 + }, + { + "epoch": 0.03877894736842105, + "grad_norm": 0.6015625, + "learning_rate": 0.0004995471348145648, + "loss": 3.9664, + "step": 921 + }, + { + "epoch": 0.03882105263157895, + "grad_norm": 0.55078125, + "learning_rate": 0.0004995451023690952, + "loss": 3.9028, + "step": 922 + }, + { + "epoch": 0.03886315789473684, + "grad_norm": 0.55859375, + "learning_rate": 0.0004995430653772071, + "loss": 3.9435, + "step": 923 + }, + { + "epoch": 0.038905263157894734, + "grad_norm": 0.51171875, + "learning_rate": 0.0004995410238389375, + "loss": 4.0502, + "step": 924 + }, + { + "epoch": 0.03894736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.0004995389777543237, + "loss": 3.8892, + "step": 925 + }, + { + "epoch": 0.038989473684210525, + "grad_norm": 0.49609375, + "learning_rate": 0.0004995369271234029, + "loss": 4.3862, + "step": 926 + }, + { + "epoch": 0.039031578947368424, + "grad_norm": 0.5078125, + "learning_rate": 0.0004995348719462124, + "loss": 3.6519, + "step": 927 + }, + { + "epoch": 0.039073684210526316, + "grad_norm": 0.5390625, + "learning_rate": 0.0004995328122227898, + "loss": 3.7276, + "step": 928 + }, + { + "epoch": 0.03911578947368421, + "grad_norm": 0.515625, + "learning_rate": 0.0004995307479531726, + "loss": 3.676, + "step": 929 + }, + { + "epoch": 0.03915789473684211, + "grad_norm": 0.486328125, + "learning_rate": 0.0004995286791373982, + "loss": 3.8373, + "step": 930 + }, + { + "epoch": 0.0392, + "grad_norm": 0.54296875, + "learning_rate": 0.0004995266057755045, + "loss": 4.1112, + "step": 931 + }, + { + "epoch": 0.0392421052631579, + "grad_norm": 0.6171875, + "learning_rate": 0.0004995245278675293, + "loss": 3.818, + "step": 932 + }, + { + "epoch": 0.03928421052631579, + "grad_norm": 0.52734375, + "learning_rate": 0.0004995224454135104, + "loss": 3.8587, + "step": 933 + }, + { + "epoch": 0.03932631578947368, + "grad_norm": 0.5078125, + "learning_rate": 0.0004995203584134857, + "loss": 4.1682, + "step": 934 + }, + { + "epoch": 0.03936842105263158, + "grad_norm": 0.53125, + "learning_rate": 0.0004995182668674932, + "loss": 3.7342, + "step": 935 + }, + { + "epoch": 0.03941052631578947, + "grad_norm": 0.58203125, + "learning_rate": 0.0004995161707755711, + "loss": 3.6353, + "step": 936 + }, + { + "epoch": 0.03945263157894737, + "grad_norm": 0.56640625, + "learning_rate": 0.0004995140701377574, + "loss": 3.5279, + "step": 937 + }, + { + "epoch": 0.03949473684210526, + "grad_norm": 0.54296875, + "learning_rate": 0.0004995119649540907, + "loss": 3.6527, + "step": 938 + }, + { + "epoch": 0.039536842105263155, + "grad_norm": 0.546875, + "learning_rate": 0.000499509855224609, + "loss": 3.8956, + "step": 939 + }, + { + "epoch": 0.039578947368421054, + "grad_norm": 0.47265625, + "learning_rate": 0.0004995077409493511, + "loss": 4.0267, + "step": 940 + }, + { + "epoch": 0.039621052631578946, + "grad_norm": 0.5234375, + "learning_rate": 0.0004995056221283551, + "loss": 3.7905, + "step": 941 + }, + { + "epoch": 0.039663157894736845, + "grad_norm": 0.486328125, + "learning_rate": 0.0004995034987616599, + "loss": 3.886, + "step": 942 + }, + { + "epoch": 0.03970526315789474, + "grad_norm": 0.57421875, + "learning_rate": 0.0004995013708493041, + "loss": 4.0504, + "step": 943 + }, + { + "epoch": 0.03974736842105263, + "grad_norm": 0.5390625, + "learning_rate": 0.0004994992383913266, + "loss": 3.7585, + "step": 944 + }, + { + "epoch": 0.03978947368421053, + "grad_norm": 0.4609375, + "learning_rate": 0.000499497101387766, + "loss": 3.9558, + "step": 945 + }, + { + "epoch": 0.03983157894736842, + "grad_norm": 0.55859375, + "learning_rate": 0.0004994949598386614, + "loss": 3.8778, + "step": 946 + }, + { + "epoch": 0.03987368421052632, + "grad_norm": 0.474609375, + "learning_rate": 0.0004994928137440518, + "loss": 3.9991, + "step": 947 + }, + { + "epoch": 0.03991578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.0004994906631039761, + "loss": 3.9531, + "step": 948 + }, + { + "epoch": 0.0399578947368421, + "grad_norm": 0.59375, + "learning_rate": 0.0004994885079184739, + "loss": 3.9253, + "step": 949 + }, + { + "epoch": 0.04, + "grad_norm": 0.51171875, + "learning_rate": 0.0004994863481875841, + "loss": 3.8145, + "step": 950 + }, + { + "epoch": 0.04004210526315789, + "grad_norm": 0.578125, + "learning_rate": 0.0004994841839113462, + "loss": 3.9797, + "step": 951 + }, + { + "epoch": 0.04008421052631579, + "grad_norm": 0.58203125, + "learning_rate": 0.0004994820150897996, + "loss": 3.8238, + "step": 952 + }, + { + "epoch": 0.040126315789473684, + "grad_norm": 0.57421875, + "learning_rate": 0.0004994798417229838, + "loss": 3.6275, + "step": 953 + }, + { + "epoch": 0.04016842105263158, + "grad_norm": 0.53125, + "learning_rate": 0.0004994776638109384, + "loss": 3.9721, + "step": 954 + }, + { + "epoch": 0.040210526315789474, + "grad_norm": 0.5234375, + "learning_rate": 0.0004994754813537031, + "loss": 3.9007, + "step": 955 + }, + { + "epoch": 0.040252631578947366, + "grad_norm": 0.64453125, + "learning_rate": 0.0004994732943513177, + "loss": 3.5901, + "step": 956 + }, + { + "epoch": 0.040294736842105265, + "grad_norm": 0.53515625, + "learning_rate": 0.0004994711028038219, + "loss": 4.1228, + "step": 957 + }, + { + "epoch": 0.04033684210526316, + "grad_norm": 0.5390625, + "learning_rate": 0.0004994689067112558, + "loss": 3.806, + "step": 958 + }, + { + "epoch": 0.040378947368421056, + "grad_norm": 0.546875, + "learning_rate": 0.0004994667060736593, + "loss": 3.5774, + "step": 959 + }, + { + "epoch": 0.04042105263157895, + "grad_norm": 0.53515625, + "learning_rate": 0.0004994645008910726, + "loss": 3.1609, + "step": 960 + }, + { + "epoch": 0.04046315789473684, + "grad_norm": 0.474609375, + "learning_rate": 0.0004994622911635357, + "loss": 4.4032, + "step": 961 + }, + { + "epoch": 0.04050526315789474, + "grad_norm": 0.61328125, + "learning_rate": 0.000499460076891089, + "loss": 3.5839, + "step": 962 + }, + { + "epoch": 0.04054736842105263, + "grad_norm": 0.482421875, + "learning_rate": 0.0004994578580737728, + "loss": 3.6538, + "step": 963 + }, + { + "epoch": 0.04058947368421053, + "grad_norm": 0.50390625, + "learning_rate": 0.0004994556347116274, + "loss": 4.1319, + "step": 964 + }, + { + "epoch": 0.04063157894736842, + "grad_norm": 0.494140625, + "learning_rate": 0.0004994534068046936, + "loss": 3.8493, + "step": 965 + }, + { + "epoch": 0.04067368421052631, + "grad_norm": 0.494140625, + "learning_rate": 0.0004994511743530119, + "loss": 3.775, + "step": 966 + }, + { + "epoch": 0.04071578947368421, + "grad_norm": 0.4765625, + "learning_rate": 0.0004994489373566228, + "loss": 3.8811, + "step": 967 + }, + { + "epoch": 0.040757894736842104, + "grad_norm": 0.455078125, + "learning_rate": 0.000499446695815567, + "loss": 4.1439, + "step": 968 + }, + { + "epoch": 0.0408, + "grad_norm": 0.482421875, + "learning_rate": 0.0004994444497298856, + "loss": 3.7446, + "step": 969 + }, + { + "epoch": 0.040842105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.0004994421990996195, + "loss": 3.8694, + "step": 970 + }, + { + "epoch": 0.04088421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.0004994399439248096, + "loss": 3.8993, + "step": 971 + }, + { + "epoch": 0.040926315789473686, + "grad_norm": 0.4765625, + "learning_rate": 0.0004994376842054969, + "loss": 3.8486, + "step": 972 + }, + { + "epoch": 0.04096842105263158, + "grad_norm": 0.486328125, + "learning_rate": 0.0004994354199417227, + "loss": 4.4611, + "step": 973 + }, + { + "epoch": 0.041010526315789476, + "grad_norm": 0.43359375, + "learning_rate": 0.0004994331511335283, + "loss": 4.0182, + "step": 974 + }, + { + "epoch": 0.04105263157894737, + "grad_norm": 0.5390625, + "learning_rate": 0.0004994308777809548, + "loss": 4.131, + "step": 975 + }, + { + "epoch": 0.04109473684210526, + "grad_norm": 0.439453125, + "learning_rate": 0.000499428599884044, + "loss": 3.8359, + "step": 976 + }, + { + "epoch": 0.04113684210526316, + "grad_norm": 0.515625, + "learning_rate": 0.000499426317442837, + "loss": 3.5298, + "step": 977 + }, + { + "epoch": 0.04117894736842105, + "grad_norm": 0.453125, + "learning_rate": 0.0004994240304573756, + "loss": 4.0759, + "step": 978 + }, + { + "epoch": 0.04122105263157895, + "grad_norm": 0.5625, + "learning_rate": 0.0004994217389277015, + "loss": 3.7825, + "step": 979 + }, + { + "epoch": 0.04126315789473684, + "grad_norm": 0.4921875, + "learning_rate": 0.0004994194428538563, + "loss": 3.7227, + "step": 980 + }, + { + "epoch": 0.041305263157894734, + "grad_norm": 0.5, + "learning_rate": 0.0004994171422358819, + "loss": 4.0787, + "step": 981 + }, + { + "epoch": 0.04134736842105263, + "grad_norm": 0.482421875, + "learning_rate": 0.0004994148370738203, + "loss": 3.7981, + "step": 982 + }, + { + "epoch": 0.041389473684210525, + "grad_norm": 0.43359375, + "learning_rate": 0.0004994125273677134, + "loss": 3.5939, + "step": 983 + }, + { + "epoch": 0.04143157894736842, + "grad_norm": 0.45703125, + "learning_rate": 0.0004994102131176033, + "loss": 4.1439, + "step": 984 + }, + { + "epoch": 0.041473684210526315, + "grad_norm": 0.44921875, + "learning_rate": 0.0004994078943235321, + "loss": 4.0923, + "step": 985 + }, + { + "epoch": 0.04151578947368421, + "grad_norm": 0.51953125, + "learning_rate": 0.000499405570985542, + "loss": 3.883, + "step": 986 + }, + { + "epoch": 0.041557894736842106, + "grad_norm": 0.58203125, + "learning_rate": 0.0004994032431036756, + "loss": 3.8801, + "step": 987 + }, + { + "epoch": 0.0416, + "grad_norm": 0.55078125, + "learning_rate": 0.0004994009106779752, + "loss": 3.7071, + "step": 988 + }, + { + "epoch": 0.0416421052631579, + "grad_norm": 0.5078125, + "learning_rate": 0.000499398573708483, + "loss": 4.0476, + "step": 989 + }, + { + "epoch": 0.04168421052631579, + "grad_norm": 0.62890625, + "learning_rate": 0.0004993962321952421, + "loss": 3.3707, + "step": 990 + }, + { + "epoch": 0.04172631578947368, + "grad_norm": 0.5234375, + "learning_rate": 0.0004993938861382947, + "loss": 3.8226, + "step": 991 + }, + { + "epoch": 0.04176842105263158, + "grad_norm": 0.462890625, + "learning_rate": 0.0004993915355376838, + "loss": 3.9579, + "step": 992 + }, + { + "epoch": 0.04181052631578947, + "grad_norm": 0.875, + "learning_rate": 0.0004993891803934522, + "loss": 3.5042, + "step": 993 + }, + { + "epoch": 0.04185263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0004993868207056427, + "loss": 4.0232, + "step": 994 + }, + { + "epoch": 0.04189473684210526, + "grad_norm": 0.61328125, + "learning_rate": 0.0004993844564742982, + "loss": 4.1442, + "step": 995 + }, + { + "epoch": 0.04193684210526316, + "grad_norm": 0.515625, + "learning_rate": 0.000499382087699462, + "loss": 3.9318, + "step": 996 + }, + { + "epoch": 0.04197894736842105, + "grad_norm": 0.5, + "learning_rate": 0.0004993797143811773, + "loss": 3.8454, + "step": 997 + }, + { + "epoch": 0.042021052631578945, + "grad_norm": 0.51953125, + "learning_rate": 0.0004993773365194871, + "loss": 3.6206, + "step": 998 + }, + { + "epoch": 0.042063157894736844, + "grad_norm": 0.51171875, + "learning_rate": 0.0004993749541144349, + "loss": 3.8465, + "step": 999 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 0.52734375, + "learning_rate": 0.000499372567166064, + "loss": 3.843, + "step": 1000 + }, + { + "epoch": 0.042147368421052635, + "grad_norm": 0.58984375, + "learning_rate": 0.0004993701756744179, + "loss": 3.5974, + "step": 1001 + }, + { + "epoch": 0.04218947368421053, + "grad_norm": 0.51953125, + "learning_rate": 0.0004993677796395401, + "loss": 3.8475, + "step": 1002 + }, + { + "epoch": 0.04223157894736842, + "grad_norm": 0.478515625, + "learning_rate": 0.0004993653790614746, + "loss": 3.6086, + "step": 1003 + }, + { + "epoch": 0.04227368421052632, + "grad_norm": 0.447265625, + "learning_rate": 0.0004993629739402647, + "loss": 4.0577, + "step": 1004 + }, + { + "epoch": 0.04231578947368421, + "grad_norm": 0.53515625, + "learning_rate": 0.0004993605642759545, + "loss": 3.6497, + "step": 1005 + }, + { + "epoch": 0.04235789473684211, + "grad_norm": 0.47265625, + "learning_rate": 0.0004993581500685877, + "loss": 3.6451, + "step": 1006 + }, + { + "epoch": 0.0424, + "grad_norm": 0.5390625, + "learning_rate": 0.0004993557313182085, + "loss": 3.9034, + "step": 1007 + }, + { + "epoch": 0.04244210526315789, + "grad_norm": 0.44140625, + "learning_rate": 0.0004993533080248608, + "loss": 3.9603, + "step": 1008 + }, + { + "epoch": 0.04248421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.0004993508801885889, + "loss": 3.7417, + "step": 1009 + }, + { + "epoch": 0.04252631578947368, + "grad_norm": 0.53515625, + "learning_rate": 0.0004993484478094368, + "loss": 3.6999, + "step": 1010 + }, + { + "epoch": 0.04256842105263158, + "grad_norm": 0.49609375, + "learning_rate": 0.000499346010887449, + "loss": 3.9864, + "step": 1011 + }, + { + "epoch": 0.042610526315789474, + "grad_norm": 0.474609375, + "learning_rate": 0.0004993435694226699, + "loss": 3.2921, + "step": 1012 + }, + { + "epoch": 0.042652631578947366, + "grad_norm": 0.5234375, + "learning_rate": 0.0004993411234151439, + "loss": 3.9383, + "step": 1013 + }, + { + "epoch": 0.042694736842105264, + "grad_norm": 0.498046875, + "learning_rate": 0.0004993386728649155, + "loss": 3.7808, + "step": 1014 + }, + { + "epoch": 0.042736842105263156, + "grad_norm": 0.6328125, + "learning_rate": 0.0004993362177720295, + "loss": 3.4354, + "step": 1015 + }, + { + "epoch": 0.042778947368421055, + "grad_norm": 0.51953125, + "learning_rate": 0.0004993337581365307, + "loss": 3.8211, + "step": 1016 + }, + { + "epoch": 0.04282105263157895, + "grad_norm": 0.578125, + "learning_rate": 0.0004993312939584637, + "loss": 3.875, + "step": 1017 + }, + { + "epoch": 0.04286315789473684, + "grad_norm": 0.484375, + "learning_rate": 0.0004993288252378734, + "loss": 3.6614, + "step": 1018 + }, + { + "epoch": 0.04290526315789474, + "grad_norm": 0.66015625, + "learning_rate": 0.0004993263519748049, + "loss": 3.3945, + "step": 1019 + }, + { + "epoch": 0.04294736842105263, + "grad_norm": 0.474609375, + "learning_rate": 0.0004993238741693032, + "loss": 3.7737, + "step": 1020 + }, + { + "epoch": 0.04298947368421053, + "grad_norm": 0.47265625, + "learning_rate": 0.0004993213918214136, + "loss": 3.8601, + "step": 1021 + }, + { + "epoch": 0.04303157894736842, + "grad_norm": 0.49609375, + "learning_rate": 0.000499318904931181, + "loss": 3.7544, + "step": 1022 + }, + { + "epoch": 0.04307368421052631, + "grad_norm": 0.73046875, + "learning_rate": 0.000499316413498651, + "loss": 3.718, + "step": 1023 + }, + { + "epoch": 0.04311578947368421, + "grad_norm": 1.640625, + "learning_rate": 0.0004993139175238688, + "loss": 4.5859, + "step": 1024 + }, + { + "epoch": 0.0431578947368421, + "grad_norm": 0.56640625, + "learning_rate": 0.00049931141700688, + "loss": 3.5511, + "step": 1025 + }, + { + "epoch": 0.0432, + "grad_norm": 0.49609375, + "learning_rate": 0.0004993089119477301, + "loss": 4.1038, + "step": 1026 + }, + { + "epoch": 0.043242105263157894, + "grad_norm": 0.51953125, + "learning_rate": 0.0004993064023464648, + "loss": 3.5809, + "step": 1027 + }, + { + "epoch": 0.043284210526315786, + "grad_norm": 0.447265625, + "learning_rate": 0.0004993038882031296, + "loss": 4.0216, + "step": 1028 + }, + { + "epoch": 0.043326315789473685, + "grad_norm": 0.5703125, + "learning_rate": 0.0004993013695177706, + "loss": 3.6265, + "step": 1029 + }, + { + "epoch": 0.04336842105263158, + "grad_norm": 0.48046875, + "learning_rate": 0.0004992988462904337, + "loss": 3.9212, + "step": 1030 + }, + { + "epoch": 0.043410526315789476, + "grad_norm": 0.5078125, + "learning_rate": 0.0004992963185211645, + "loss": 3.6245, + "step": 1031 + }, + { + "epoch": 0.04345263157894737, + "grad_norm": 0.5078125, + "learning_rate": 0.0004992937862100095, + "loss": 3.7956, + "step": 1032 + }, + { + "epoch": 0.04349473684210527, + "grad_norm": 0.45703125, + "learning_rate": 0.0004992912493570144, + "loss": 3.4924, + "step": 1033 + }, + { + "epoch": 0.04353684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.0004992887079622257, + "loss": 4.0111, + "step": 1034 + }, + { + "epoch": 0.04357894736842105, + "grad_norm": 0.53515625, + "learning_rate": 0.0004992861620256898, + "loss": 3.6975, + "step": 1035 + }, + { + "epoch": 0.04362105263157895, + "grad_norm": 0.58203125, + "learning_rate": 0.0004992836115474528, + "loss": 3.6044, + "step": 1036 + }, + { + "epoch": 0.04366315789473684, + "grad_norm": 0.4609375, + "learning_rate": 0.0004992810565275612, + "loss": 3.8523, + "step": 1037 + }, + { + "epoch": 0.04370526315789474, + "grad_norm": 0.49609375, + "learning_rate": 0.0004992784969660619, + "loss": 3.8583, + "step": 1038 + }, + { + "epoch": 0.04374736842105263, + "grad_norm": 0.59765625, + "learning_rate": 0.0004992759328630011, + "loss": 3.5004, + "step": 1039 + }, + { + "epoch": 0.043789473684210524, + "grad_norm": 0.5234375, + "learning_rate": 0.0004992733642184258, + "loss": 3.7063, + "step": 1040 + }, + { + "epoch": 0.04383157894736842, + "grad_norm": 0.54296875, + "learning_rate": 0.0004992707910323825, + "loss": 3.5979, + "step": 1041 + }, + { + "epoch": 0.043873684210526315, + "grad_norm": 0.6015625, + "learning_rate": 0.0004992682133049184, + "loss": 3.3756, + "step": 1042 + }, + { + "epoch": 0.043915789473684214, + "grad_norm": 0.484375, + "learning_rate": 0.0004992656310360803, + "loss": 4.0856, + "step": 1043 + }, + { + "epoch": 0.043957894736842106, + "grad_norm": 0.47265625, + "learning_rate": 0.0004992630442259153, + "loss": 3.8896, + "step": 1044 + }, + { + "epoch": 0.044, + "grad_norm": 0.498046875, + "learning_rate": 0.0004992604528744705, + "loss": 4.0158, + "step": 1045 + }, + { + "epoch": 0.044042105263157896, + "grad_norm": 0.490234375, + "learning_rate": 0.0004992578569817931, + "loss": 4.2554, + "step": 1046 + }, + { + "epoch": 0.04408421052631579, + "grad_norm": 0.4921875, + "learning_rate": 0.0004992552565479305, + "loss": 3.8884, + "step": 1047 + }, + { + "epoch": 0.04412631578947369, + "grad_norm": 0.58203125, + "learning_rate": 0.0004992526515729299, + "loss": 3.7229, + "step": 1048 + }, + { + "epoch": 0.04416842105263158, + "grad_norm": 0.515625, + "learning_rate": 0.0004992500420568388, + "loss": 4.0046, + "step": 1049 + }, + { + "epoch": 0.04421052631578947, + "grad_norm": 0.5625, + "learning_rate": 0.0004992474279997049, + "loss": 3.837, + "step": 1050 + }, + { + "epoch": 0.04425263157894737, + "grad_norm": 0.5, + "learning_rate": 0.0004992448094015756, + "loss": 3.8705, + "step": 1051 + }, + { + "epoch": 0.04429473684210526, + "grad_norm": 0.515625, + "learning_rate": 0.0004992421862624988, + "loss": 3.7249, + "step": 1052 + }, + { + "epoch": 0.04433684210526316, + "grad_norm": 0.5078125, + "learning_rate": 0.0004992395585825222, + "loss": 3.8203, + "step": 1053 + }, + { + "epoch": 0.04437894736842105, + "grad_norm": 0.5546875, + "learning_rate": 0.0004992369263616937, + "loss": 3.7441, + "step": 1054 + }, + { + "epoch": 0.044421052631578944, + "grad_norm": 0.478515625, + "learning_rate": 0.0004992342896000613, + "loss": 3.5447, + "step": 1055 + }, + { + "epoch": 0.04446315789473684, + "grad_norm": 0.45703125, + "learning_rate": 0.0004992316482976729, + "loss": 4.2132, + "step": 1056 + }, + { + "epoch": 0.044505263157894735, + "grad_norm": 0.478515625, + "learning_rate": 0.0004992290024545767, + "loss": 3.6485, + "step": 1057 + }, + { + "epoch": 0.044547368421052634, + "grad_norm": 0.427734375, + "learning_rate": 0.0004992263520708209, + "loss": 4.137, + "step": 1058 + }, + { + "epoch": 0.044589473684210526, + "grad_norm": 0.51171875, + "learning_rate": 0.0004992236971464539, + "loss": 4.0215, + "step": 1059 + }, + { + "epoch": 0.04463157894736842, + "grad_norm": 0.48828125, + "learning_rate": 0.0004992210376815238, + "loss": 4.052, + "step": 1060 + }, + { + "epoch": 0.04467368421052632, + "grad_norm": 0.5, + "learning_rate": 0.0004992183736760793, + "loss": 3.8045, + "step": 1061 + }, + { + "epoch": 0.04471578947368421, + "grad_norm": 0.466796875, + "learning_rate": 0.0004992157051301689, + "loss": 3.7097, + "step": 1062 + }, + { + "epoch": 0.04475789473684211, + "grad_norm": 0.470703125, + "learning_rate": 0.0004992130320438411, + "loss": 4.0733, + "step": 1063 + }, + { + "epoch": 0.0448, + "grad_norm": 0.49609375, + "learning_rate": 0.0004992103544171447, + "loss": 3.9754, + "step": 1064 + }, + { + "epoch": 0.04484210526315789, + "grad_norm": 0.671875, + "learning_rate": 0.0004992076722501284, + "loss": 3.7527, + "step": 1065 + }, + { + "epoch": 0.04488421052631579, + "grad_norm": 0.55859375, + "learning_rate": 0.0004992049855428412, + "loss": 3.8109, + "step": 1066 + }, + { + "epoch": 0.04492631578947368, + "grad_norm": 0.470703125, + "learning_rate": 0.0004992022942953319, + "loss": 3.4605, + "step": 1067 + }, + { + "epoch": 0.04496842105263158, + "grad_norm": 0.5390625, + "learning_rate": 0.0004991995985076497, + "loss": 3.5055, + "step": 1068 + }, + { + "epoch": 0.04501052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.0004991968981798435, + "loss": 3.9842, + "step": 1069 + }, + { + "epoch": 0.045052631578947365, + "grad_norm": 0.51953125, + "learning_rate": 0.0004991941933119626, + "loss": 3.8903, + "step": 1070 + }, + { + "epoch": 0.045094736842105264, + "grad_norm": 0.462890625, + "learning_rate": 0.0004991914839040563, + "loss": 3.7382, + "step": 1071 + }, + { + "epoch": 0.045136842105263156, + "grad_norm": 0.462890625, + "learning_rate": 0.000499188769956174, + "loss": 3.5897, + "step": 1072 + }, + { + "epoch": 0.045178947368421055, + "grad_norm": 0.451171875, + "learning_rate": 0.0004991860514683651, + "loss": 3.9113, + "step": 1073 + }, + { + "epoch": 0.04522105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.0004991833284406791, + "loss": 3.7818, + "step": 1074 + }, + { + "epoch": 0.045263157894736845, + "grad_norm": 0.466796875, + "learning_rate": 0.0004991806008731657, + "loss": 4.0869, + "step": 1075 + }, + { + "epoch": 0.04530526315789474, + "grad_norm": 0.482421875, + "learning_rate": 0.0004991778687658744, + "loss": 4.0519, + "step": 1076 + }, + { + "epoch": 0.04534736842105263, + "grad_norm": 0.53515625, + "learning_rate": 0.0004991751321188553, + "loss": 3.8419, + "step": 1077 + }, + { + "epoch": 0.04538947368421053, + "grad_norm": 0.4765625, + "learning_rate": 0.0004991723909321578, + "loss": 3.6765, + "step": 1078 + }, + { + "epoch": 0.04543157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.0004991696452058323, + "loss": 4.2501, + "step": 1079 + }, + { + "epoch": 0.04547368421052632, + "grad_norm": 0.470703125, + "learning_rate": 0.0004991668949399286, + "loss": 3.5953, + "step": 1080 + }, + { + "epoch": 0.04551578947368421, + "grad_norm": 0.470703125, + "learning_rate": 0.0004991641401344967, + "loss": 3.5416, + "step": 1081 + }, + { + "epoch": 0.0455578947368421, + "grad_norm": 0.4609375, + "learning_rate": 0.0004991613807895871, + "loss": 4.0286, + "step": 1082 + }, + { + "epoch": 0.0456, + "grad_norm": 0.44921875, + "learning_rate": 0.0004991586169052497, + "loss": 3.7822, + "step": 1083 + }, + { + "epoch": 0.045642105263157894, + "grad_norm": 0.478515625, + "learning_rate": 0.0004991558484815353, + "loss": 3.708, + "step": 1084 + }, + { + "epoch": 0.04568421052631579, + "grad_norm": 0.52734375, + "learning_rate": 0.0004991530755184939, + "loss": 3.5665, + "step": 1085 + }, + { + "epoch": 0.045726315789473684, + "grad_norm": 0.466796875, + "learning_rate": 0.0004991502980161763, + "loss": 3.9086, + "step": 1086 + }, + { + "epoch": 0.045768421052631576, + "grad_norm": 0.50390625, + "learning_rate": 0.0004991475159746329, + "loss": 3.6985, + "step": 1087 + }, + { + "epoch": 0.045810526315789475, + "grad_norm": 0.462890625, + "learning_rate": 0.0004991447293939146, + "loss": 3.6663, + "step": 1088 + }, + { + "epoch": 0.04585263157894737, + "grad_norm": 0.470703125, + "learning_rate": 0.0004991419382740721, + "loss": 3.7988, + "step": 1089 + }, + { + "epoch": 0.045894736842105266, + "grad_norm": 0.462890625, + "learning_rate": 0.000499139142615156, + "loss": 3.5434, + "step": 1090 + }, + { + "epoch": 0.04593684210526316, + "grad_norm": 0.5859375, + "learning_rate": 0.0004991363424172176, + "loss": 3.7934, + "step": 1091 + }, + { + "epoch": 0.04597894736842105, + "grad_norm": 0.515625, + "learning_rate": 0.0004991335376803077, + "loss": 3.4404, + "step": 1092 + }, + { + "epoch": 0.04602105263157895, + "grad_norm": 0.46875, + "learning_rate": 0.0004991307284044775, + "loss": 3.4192, + "step": 1093 + }, + { + "epoch": 0.04606315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.0004991279145897783, + "loss": 3.7616, + "step": 1094 + }, + { + "epoch": 0.04610526315789474, + "grad_norm": 0.439453125, + "learning_rate": 0.000499125096236261, + "loss": 4.0635, + "step": 1095 + }, + { + "epoch": 0.04614736842105263, + "grad_norm": 0.45703125, + "learning_rate": 0.0004991222733439773, + "loss": 3.7645, + "step": 1096 + }, + { + "epoch": 0.04618947368421052, + "grad_norm": 0.45703125, + "learning_rate": 0.0004991194459129784, + "loss": 3.8296, + "step": 1097 + }, + { + "epoch": 0.04623157894736842, + "grad_norm": 0.462890625, + "learning_rate": 0.000499116613943316, + "loss": 3.7204, + "step": 1098 + }, + { + "epoch": 0.046273684210526314, + "grad_norm": 0.490234375, + "learning_rate": 0.0004991137774350414, + "loss": 3.5295, + "step": 1099 + }, + { + "epoch": 0.04631578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.0004991109363882065, + "loss": 3.9595, + "step": 1100 + }, + { + "epoch": 0.046357894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.0004991080908028632, + "loss": 4.3297, + "step": 1101 + }, + { + "epoch": 0.0464, + "grad_norm": 0.423828125, + "learning_rate": 0.0004991052406790631, + "loss": 3.8261, + "step": 1102 + }, + { + "epoch": 0.046442105263157896, + "grad_norm": 0.45703125, + "learning_rate": 0.0004991023860168582, + "loss": 3.8489, + "step": 1103 + }, + { + "epoch": 0.04648421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.0004990995268163005, + "loss": 4.0968, + "step": 1104 + }, + { + "epoch": 0.046526315789473686, + "grad_norm": 0.494140625, + "learning_rate": 0.0004990966630774421, + "loss": 4.3113, + "step": 1105 + }, + { + "epoch": 0.04656842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.0004990937948003351, + "loss": 4.0536, + "step": 1106 + }, + { + "epoch": 0.04661052631578947, + "grad_norm": 1.015625, + "learning_rate": 0.0004990909219850319, + "loss": 3.5195, + "step": 1107 + }, + { + "epoch": 0.04665263157894737, + "grad_norm": 0.6015625, + "learning_rate": 0.0004990880446315848, + "loss": 3.1762, + "step": 1108 + }, + { + "epoch": 0.04669473684210526, + "grad_norm": 0.5234375, + "learning_rate": 0.0004990851627400462, + "loss": 3.4462, + "step": 1109 + }, + { + "epoch": 0.04673684210526316, + "grad_norm": 0.51171875, + "learning_rate": 0.0004990822763104685, + "loss": 3.8925, + "step": 1110 + }, + { + "epoch": 0.04677894736842105, + "grad_norm": 0.46875, + "learning_rate": 0.0004990793853429045, + "loss": 3.8434, + "step": 1111 + }, + { + "epoch": 0.04682105263157895, + "grad_norm": 0.6328125, + "learning_rate": 0.0004990764898374067, + "loss": 4.2752, + "step": 1112 + }, + { + "epoch": 0.04686315789473684, + "grad_norm": 0.478515625, + "learning_rate": 0.0004990735897940279, + "loss": 3.9717, + "step": 1113 + }, + { + "epoch": 0.046905263157894735, + "grad_norm": 0.5703125, + "learning_rate": 0.0004990706852128211, + "loss": 3.6493, + "step": 1114 + }, + { + "epoch": 0.04694736842105263, + "grad_norm": 0.46484375, + "learning_rate": 0.0004990677760938388, + "loss": 3.9603, + "step": 1115 + }, + { + "epoch": 0.046989473684210525, + "grad_norm": 0.65234375, + "learning_rate": 0.0004990648624371344, + "loss": 3.6008, + "step": 1116 + }, + { + "epoch": 0.047031578947368424, + "grad_norm": 0.5078125, + "learning_rate": 0.0004990619442427609, + "loss": 3.6804, + "step": 1117 + }, + { + "epoch": 0.047073684210526316, + "grad_norm": 0.447265625, + "learning_rate": 0.0004990590215107713, + "loss": 3.8254, + "step": 1118 + }, + { + "epoch": 0.04711578947368421, + "grad_norm": 0.72265625, + "learning_rate": 0.000499056094241219, + "loss": 3.6268, + "step": 1119 + }, + { + "epoch": 0.04715789473684211, + "grad_norm": 0.4921875, + "learning_rate": 0.0004990531624341572, + "loss": 3.6993, + "step": 1120 + }, + { + "epoch": 0.0472, + "grad_norm": 0.48046875, + "learning_rate": 0.0004990502260896395, + "loss": 3.9004, + "step": 1121 + }, + { + "epoch": 0.0472421052631579, + "grad_norm": 0.71484375, + "learning_rate": 0.0004990472852077194, + "loss": 4.029, + "step": 1122 + }, + { + "epoch": 0.04728421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.0004990443397884502, + "loss": 4.139, + "step": 1123 + }, + { + "epoch": 0.04732631578947368, + "grad_norm": 0.57421875, + "learning_rate": 0.0004990413898318858, + "loss": 3.8081, + "step": 1124 + }, + { + "epoch": 0.04736842105263158, + "grad_norm": 0.52734375, + "learning_rate": 0.00049903843533808, + "loss": 3.8606, + "step": 1125 + }, + { + "epoch": 0.04741052631578947, + "grad_norm": 0.4765625, + "learning_rate": 0.0004990354763070864, + "loss": 4.0022, + "step": 1126 + }, + { + "epoch": 0.04745263157894737, + "grad_norm": 0.47265625, + "learning_rate": 0.0004990325127389591, + "loss": 3.8362, + "step": 1127 + }, + { + "epoch": 0.04749473684210526, + "grad_norm": 0.51171875, + "learning_rate": 0.000499029544633752, + "loss": 3.7651, + "step": 1128 + }, + { + "epoch": 0.047536842105263155, + "grad_norm": 0.875, + "learning_rate": 0.0004990265719915191, + "loss": 3.7897, + "step": 1129 + }, + { + "epoch": 0.047578947368421054, + "grad_norm": 0.515625, + "learning_rate": 0.0004990235948123148, + "loss": 3.9444, + "step": 1130 + }, + { + "epoch": 0.047621052631578946, + "grad_norm": 0.470703125, + "learning_rate": 0.000499020613096193, + "loss": 3.8025, + "step": 1131 + }, + { + "epoch": 0.047663157894736845, + "grad_norm": 0.51953125, + "learning_rate": 0.0004990176268432083, + "loss": 3.8231, + "step": 1132 + }, + { + "epoch": 0.04770526315789474, + "grad_norm": 0.765625, + "learning_rate": 0.0004990146360534152, + "loss": 4.151, + "step": 1133 + }, + { + "epoch": 0.04774736842105263, + "grad_norm": 0.53515625, + "learning_rate": 0.0004990116407268679, + "loss": 3.8828, + "step": 1134 + }, + { + "epoch": 0.04778947368421053, + "grad_norm": 0.51953125, + "learning_rate": 0.000499008640863621, + "loss": 4.0393, + "step": 1135 + }, + { + "epoch": 0.04783157894736842, + "grad_norm": 0.5, + "learning_rate": 0.0004990056364637293, + "loss": 3.8835, + "step": 1136 + }, + { + "epoch": 0.04787368421052632, + "grad_norm": 0.51171875, + "learning_rate": 0.0004990026275272474, + "loss": 3.6874, + "step": 1137 + }, + { + "epoch": 0.04791578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.0004989996140542303, + "loss": 4.1264, + "step": 1138 + }, + { + "epoch": 0.0479578947368421, + "grad_norm": 0.47265625, + "learning_rate": 0.0004989965960447327, + "loss": 3.621, + "step": 1139 + }, + { + "epoch": 0.048, + "grad_norm": 0.515625, + "learning_rate": 0.0004989935734988098, + "loss": 4.0086, + "step": 1140 + }, + { + "epoch": 0.04804210526315789, + "grad_norm": 0.427734375, + "learning_rate": 0.0004989905464165165, + "loss": 4.1447, + "step": 1141 + }, + { + "epoch": 0.04808421052631579, + "grad_norm": 0.5390625, + "learning_rate": 0.000498987514797908, + "loss": 3.8843, + "step": 1142 + }, + { + "epoch": 0.048126315789473684, + "grad_norm": 0.5546875, + "learning_rate": 0.0004989844786430394, + "loss": 3.6188, + "step": 1143 + }, + { + "epoch": 0.048168421052631576, + "grad_norm": 0.4765625, + "learning_rate": 0.0004989814379519663, + "loss": 3.9116, + "step": 1144 + }, + { + "epoch": 0.048210526315789475, + "grad_norm": 0.474609375, + "learning_rate": 0.0004989783927247439, + "loss": 4.0312, + "step": 1145 + }, + { + "epoch": 0.048252631578947366, + "grad_norm": 0.54296875, + "learning_rate": 0.0004989753429614276, + "loss": 3.8522, + "step": 1146 + }, + { + "epoch": 0.048294736842105265, + "grad_norm": 0.46484375, + "learning_rate": 0.0004989722886620732, + "loss": 3.861, + "step": 1147 + }, + { + "epoch": 0.04833684210526316, + "grad_norm": 0.494140625, + "learning_rate": 0.0004989692298267362, + "loss": 3.996, + "step": 1148 + }, + { + "epoch": 0.048378947368421056, + "grad_norm": 0.48828125, + "learning_rate": 0.0004989661664554724, + "loss": 3.5301, + "step": 1149 + }, + { + "epoch": 0.04842105263157895, + "grad_norm": 0.451171875, + "learning_rate": 0.0004989630985483375, + "loss": 3.5198, + "step": 1150 + }, + { + "epoch": 0.04846315789473684, + "grad_norm": 0.474609375, + "learning_rate": 0.0004989600261053876, + "loss": 3.9163, + "step": 1151 + }, + { + "epoch": 0.04850526315789474, + "grad_norm": 0.51171875, + "learning_rate": 0.0004989569491266785, + "loss": 3.6483, + "step": 1152 + }, + { + "epoch": 0.04854736842105263, + "grad_norm": 0.53125, + "learning_rate": 0.0004989538676122662, + "loss": 3.655, + "step": 1153 + }, + { + "epoch": 0.04858947368421053, + "grad_norm": 0.5234375, + "learning_rate": 0.000498950781562207, + "loss": 3.3422, + "step": 1154 + }, + { + "epoch": 0.04863157894736842, + "grad_norm": 0.478515625, + "learning_rate": 0.000498947690976557, + "loss": 4.0828, + "step": 1155 + }, + { + "epoch": 0.04867368421052631, + "grad_norm": 0.4765625, + "learning_rate": 0.0004989445958553726, + "loss": 4.0799, + "step": 1156 + }, + { + "epoch": 0.04871578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.0004989414961987103, + "loss": 4.135, + "step": 1157 + }, + { + "epoch": 0.048757894736842104, + "grad_norm": 0.53125, + "learning_rate": 0.0004989383920066264, + "loss": 3.7147, + "step": 1158 + }, + { + "epoch": 0.0488, + "grad_norm": 0.494140625, + "learning_rate": 0.0004989352832791774, + "loss": 3.9812, + "step": 1159 + }, + { + "epoch": 0.048842105263157895, + "grad_norm": 0.4609375, + "learning_rate": 0.00049893217001642, + "loss": 3.4013, + "step": 1160 + }, + { + "epoch": 0.04888421052631579, + "grad_norm": 0.7109375, + "learning_rate": 0.000498929052218411, + "loss": 4.0839, + "step": 1161 + }, + { + "epoch": 0.048926315789473686, + "grad_norm": 0.4140625, + "learning_rate": 0.0004989259298852071, + "loss": 3.9679, + "step": 1162 + }, + { + "epoch": 0.04896842105263158, + "grad_norm": 0.484375, + "learning_rate": 0.0004989228030168653, + "loss": 3.6415, + "step": 1163 + }, + { + "epoch": 0.04901052631578948, + "grad_norm": 0.478515625, + "learning_rate": 0.0004989196716134424, + "loss": 3.9239, + "step": 1164 + }, + { + "epoch": 0.04905263157894737, + "grad_norm": 0.404296875, + "learning_rate": 0.0004989165356749956, + "loss": 3.8078, + "step": 1165 + }, + { + "epoch": 0.04909473684210526, + "grad_norm": 0.5078125, + "learning_rate": 0.0004989133952015821, + "loss": 4.3432, + "step": 1166 + }, + { + "epoch": 0.04913684210526316, + "grad_norm": 0.47265625, + "learning_rate": 0.0004989102501932589, + "loss": 4.0647, + "step": 1167 + }, + { + "epoch": 0.04917894736842105, + "grad_norm": 0.4609375, + "learning_rate": 0.0004989071006500834, + "loss": 3.8704, + "step": 1168 + }, + { + "epoch": 0.04922105263157895, + "grad_norm": 0.44921875, + "learning_rate": 0.000498903946572113, + "loss": 3.6771, + "step": 1169 + }, + { + "epoch": 0.04926315789473684, + "grad_norm": 0.5546875, + "learning_rate": 0.0004989007879594051, + "loss": 3.5376, + "step": 1170 + }, + { + "epoch": 0.049305263157894734, + "grad_norm": 0.5078125, + "learning_rate": 0.0004988976248120173, + "loss": 3.8722, + "step": 1171 + }, + { + "epoch": 0.04934736842105263, + "grad_norm": 0.46484375, + "learning_rate": 0.0004988944571300073, + "loss": 4.0583, + "step": 1172 + }, + { + "epoch": 0.049389473684210525, + "grad_norm": 0.51171875, + "learning_rate": 0.0004988912849134326, + "loss": 3.6733, + "step": 1173 + }, + { + "epoch": 0.049431578947368424, + "grad_norm": 0.66015625, + "learning_rate": 0.0004988881081623512, + "loss": 4.1548, + "step": 1174 + }, + { + "epoch": 0.049473684210526316, + "grad_norm": 0.515625, + "learning_rate": 0.000498884926876821, + "loss": 3.4042, + "step": 1175 + }, + { + "epoch": 0.04951578947368421, + "grad_norm": 0.470703125, + "learning_rate": 0.0004988817410568996, + "loss": 3.7907, + "step": 1176 + }, + { + "epoch": 0.049557894736842106, + "grad_norm": 0.45703125, + "learning_rate": 0.0004988785507026455, + "loss": 3.828, + "step": 1177 + }, + { + "epoch": 0.0496, + "grad_norm": 0.51953125, + "learning_rate": 0.0004988753558141165, + "loss": 3.8158, + "step": 1178 + }, + { + "epoch": 0.0496421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.000498872156391371, + "loss": 3.9778, + "step": 1179 + }, + { + "epoch": 0.04968421052631579, + "grad_norm": 0.59375, + "learning_rate": 0.0004988689524344673, + "loss": 3.4973, + "step": 1180 + }, + { + "epoch": 0.04972631578947368, + "grad_norm": 0.47265625, + "learning_rate": 0.0004988657439434637, + "loss": 3.7954, + "step": 1181 + }, + { + "epoch": 0.04976842105263158, + "grad_norm": 0.494140625, + "learning_rate": 0.0004988625309184186, + "loss": 3.8242, + "step": 1182 + }, + { + "epoch": 0.04981052631578947, + "grad_norm": 0.52734375, + "learning_rate": 0.0004988593133593905, + "loss": 3.7245, + "step": 1183 + }, + { + "epoch": 0.04985263157894737, + "grad_norm": 0.490234375, + "learning_rate": 0.0004988560912664382, + "loss": 3.3523, + "step": 1184 + }, + { + "epoch": 0.04989473684210526, + "grad_norm": 0.458984375, + "learning_rate": 0.0004988528646396204, + "loss": 3.8366, + "step": 1185 + }, + { + "epoch": 0.049936842105263154, + "grad_norm": 0.53515625, + "learning_rate": 0.0004988496334789956, + "loss": 3.8055, + "step": 1186 + }, + { + "epoch": 0.04997894736842105, + "grad_norm": 0.53125, + "learning_rate": 0.0004988463977846229, + "loss": 3.6484, + "step": 1187 + }, + { + "epoch": 0.050021052631578945, + "grad_norm": 0.76171875, + "learning_rate": 0.0004988431575565613, + "loss": 3.9612, + "step": 1188 + }, + { + "epoch": 0.050063157894736844, + "grad_norm": 0.53125, + "learning_rate": 0.0004988399127948697, + "loss": 3.8424, + "step": 1189 + }, + { + "epoch": 0.050105263157894736, + "grad_norm": 0.4765625, + "learning_rate": 0.0004988366634996074, + "loss": 4.1103, + "step": 1190 + }, + { + "epoch": 0.050147368421052635, + "grad_norm": 0.6015625, + "learning_rate": 0.0004988334096708332, + "loss": 3.6221, + "step": 1191 + }, + { + "epoch": 0.05018947368421053, + "grad_norm": 0.54296875, + "learning_rate": 0.0004988301513086068, + "loss": 4.1425, + "step": 1192 + }, + { + "epoch": 0.05023157894736842, + "grad_norm": 0.46875, + "learning_rate": 0.0004988268884129874, + "loss": 3.8124, + "step": 1193 + }, + { + "epoch": 0.05027368421052632, + "grad_norm": 0.48046875, + "learning_rate": 0.0004988236209840344, + "loss": 3.7767, + "step": 1194 + }, + { + "epoch": 0.05031578947368421, + "grad_norm": 0.46875, + "learning_rate": 0.0004988203490218075, + "loss": 3.7257, + "step": 1195 + }, + { + "epoch": 0.05035789473684211, + "grad_norm": 0.5078125, + "learning_rate": 0.0004988170725263661, + "loss": 3.6231, + "step": 1196 + }, + { + "epoch": 0.0504, + "grad_norm": 0.498046875, + "learning_rate": 0.00049881379149777, + "loss": 3.7598, + "step": 1197 + }, + { + "epoch": 0.05044210526315789, + "grad_norm": 0.5546875, + "learning_rate": 0.000498810505936079, + "loss": 3.8414, + "step": 1198 + }, + { + "epoch": 0.05048421052631579, + "grad_norm": 0.482421875, + "learning_rate": 0.0004988072158413528, + "loss": 3.775, + "step": 1199 + }, + { + "epoch": 0.05052631578947368, + "grad_norm": 0.458984375, + "learning_rate": 0.0004988039212136516, + "loss": 3.6302, + "step": 1200 + }, + { + "epoch": 0.05056842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.0004988006220530353, + "loss": 3.7299, + "step": 1201 + }, + { + "epoch": 0.050610526315789474, + "grad_norm": 0.6328125, + "learning_rate": 0.0004987973183595639, + "loss": 3.7273, + "step": 1202 + }, + { + "epoch": 0.050652631578947366, + "grad_norm": 0.5703125, + "learning_rate": 0.0004987940101332977, + "loss": 3.6947, + "step": 1203 + }, + { + "epoch": 0.050694736842105265, + "grad_norm": 0.55078125, + "learning_rate": 0.000498790697374297, + "loss": 2.9553, + "step": 1204 + }, + { + "epoch": 0.05073684210526316, + "grad_norm": 0.52734375, + "learning_rate": 0.000498787380082622, + "loss": 3.6512, + "step": 1205 + }, + { + "epoch": 0.050778947368421055, + "grad_norm": 0.458984375, + "learning_rate": 0.0004987840582583334, + "loss": 3.831, + "step": 1206 + }, + { + "epoch": 0.05082105263157895, + "grad_norm": 0.5078125, + "learning_rate": 0.0004987807319014916, + "loss": 3.875, + "step": 1207 + }, + { + "epoch": 0.05086315789473684, + "grad_norm": 0.64453125, + "learning_rate": 0.0004987774010121569, + "loss": 3.1948, + "step": 1208 + }, + { + "epoch": 0.05090526315789474, + "grad_norm": 0.50390625, + "learning_rate": 0.0004987740655903905, + "loss": 3.8225, + "step": 1209 + }, + { + "epoch": 0.05094736842105263, + "grad_norm": 0.466796875, + "learning_rate": 0.0004987707256362529, + "loss": 3.5869, + "step": 1210 + }, + { + "epoch": 0.05098947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.0004987673811498049, + "loss": 3.9492, + "step": 1211 + }, + { + "epoch": 0.05103157894736842, + "grad_norm": 0.7109375, + "learning_rate": 0.0004987640321311075, + "loss": 3.985, + "step": 1212 + }, + { + "epoch": 0.05107368421052631, + "grad_norm": 0.466796875, + "learning_rate": 0.0004987606785802217, + "loss": 3.6899, + "step": 1213 + }, + { + "epoch": 0.05111578947368421, + "grad_norm": 0.5703125, + "learning_rate": 0.0004987573204972087, + "loss": 3.5856, + "step": 1214 + }, + { + "epoch": 0.051157894736842104, + "grad_norm": 0.470703125, + "learning_rate": 0.0004987539578821296, + "loss": 3.689, + "step": 1215 + }, + { + "epoch": 0.0512, + "grad_norm": 0.50390625, + "learning_rate": 0.0004987505907350456, + "loss": 3.7019, + "step": 1216 + }, + { + "epoch": 0.051242105263157894, + "grad_norm": 0.482421875, + "learning_rate": 0.0004987472190560181, + "loss": 3.9487, + "step": 1217 + }, + { + "epoch": 0.051284210526315786, + "grad_norm": 0.5078125, + "learning_rate": 0.0004987438428451087, + "loss": 3.7958, + "step": 1218 + }, + { + "epoch": 0.051326315789473685, + "grad_norm": 0.431640625, + "learning_rate": 0.0004987404621023785, + "loss": 4.2904, + "step": 1219 + }, + { + "epoch": 0.05136842105263158, + "grad_norm": 0.498046875, + "learning_rate": 0.0004987370768278895, + "loss": 3.9304, + "step": 1220 + }, + { + "epoch": 0.051410526315789476, + "grad_norm": 0.470703125, + "learning_rate": 0.0004987336870217032, + "loss": 4.2174, + "step": 1221 + }, + { + "epoch": 0.05145263157894737, + "grad_norm": 0.4921875, + "learning_rate": 0.0004987302926838812, + "loss": 3.825, + "step": 1222 + }, + { + "epoch": 0.05149473684210526, + "grad_norm": 0.486328125, + "learning_rate": 0.0004987268938144858, + "loss": 4.1986, + "step": 1223 + }, + { + "epoch": 0.05153684210526316, + "grad_norm": 0.5078125, + "learning_rate": 0.0004987234904135784, + "loss": 3.8196, + "step": 1224 + }, + { + "epoch": 0.05157894736842105, + "grad_norm": 0.4921875, + "learning_rate": 0.0004987200824812213, + "loss": 3.6349, + "step": 1225 + }, + { + "epoch": 0.05162105263157895, + "grad_norm": 0.4765625, + "learning_rate": 0.0004987166700174766, + "loss": 3.9788, + "step": 1226 + }, + { + "epoch": 0.05166315789473684, + "grad_norm": 0.546875, + "learning_rate": 0.0004987132530224064, + "loss": 3.9148, + "step": 1227 + }, + { + "epoch": 0.05170526315789474, + "grad_norm": 0.482421875, + "learning_rate": 0.0004987098314960729, + "loss": 3.9758, + "step": 1228 + }, + { + "epoch": 0.05174736842105263, + "grad_norm": 1.140625, + "learning_rate": 0.0004987064054385386, + "loss": 3.6117, + "step": 1229 + }, + { + "epoch": 0.051789473684210524, + "grad_norm": 0.51171875, + "learning_rate": 0.0004987029748498656, + "loss": 3.8208, + "step": 1230 + }, + { + "epoch": 0.05183157894736842, + "grad_norm": 0.5078125, + "learning_rate": 0.0004986995397301167, + "loss": 4.0551, + "step": 1231 + }, + { + "epoch": 0.051873684210526315, + "grad_norm": 0.498046875, + "learning_rate": 0.0004986961000793545, + "loss": 3.6894, + "step": 1232 + }, + { + "epoch": 0.051915789473684214, + "grad_norm": 0.42578125, + "learning_rate": 0.0004986926558976416, + "loss": 3.9324, + "step": 1233 + }, + { + "epoch": 0.051957894736842106, + "grad_norm": 0.57421875, + "learning_rate": 0.0004986892071850405, + "loss": 3.8361, + "step": 1234 + }, + { + "epoch": 0.052, + "grad_norm": 0.484375, + "learning_rate": 0.0004986857539416143, + "loss": 3.7379, + "step": 1235 + }, + { + "epoch": 0.052042105263157896, + "grad_norm": 0.44921875, + "learning_rate": 0.0004986822961674259, + "loss": 4.0313, + "step": 1236 + }, + { + "epoch": 0.05208421052631579, + "grad_norm": 0.5078125, + "learning_rate": 0.0004986788338625383, + "loss": 3.3483, + "step": 1237 + }, + { + "epoch": 0.05212631578947369, + "grad_norm": 0.458984375, + "learning_rate": 0.0004986753670270145, + "loss": 3.6726, + "step": 1238 + }, + { + "epoch": 0.05216842105263158, + "grad_norm": 0.45703125, + "learning_rate": 0.0004986718956609177, + "loss": 3.9699, + "step": 1239 + }, + { + "epoch": 0.05221052631578947, + "grad_norm": 0.462890625, + "learning_rate": 0.0004986684197643111, + "loss": 4.0079, + "step": 1240 + }, + { + "epoch": 0.05225263157894737, + "grad_norm": 0.455078125, + "learning_rate": 0.000498664939337258, + "loss": 3.9858, + "step": 1241 + }, + { + "epoch": 0.05229473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.000498661454379822, + "loss": 3.5264, + "step": 1242 + }, + { + "epoch": 0.05233684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.0004986579648920665, + "loss": 3.901, + "step": 1243 + }, + { + "epoch": 0.05237894736842105, + "grad_norm": 0.490234375, + "learning_rate": 0.000498654470874055, + "loss": 4.0074, + "step": 1244 + }, + { + "epoch": 0.052421052631578945, + "grad_norm": 0.43359375, + "learning_rate": 0.0004986509723258511, + "loss": 4.1283, + "step": 1245 + }, + { + "epoch": 0.052463157894736843, + "grad_norm": 0.55078125, + "learning_rate": 0.0004986474692475187, + "loss": 3.6396, + "step": 1246 + }, + { + "epoch": 0.052505263157894735, + "grad_norm": 0.4453125, + "learning_rate": 0.0004986439616391216, + "loss": 3.4812, + "step": 1247 + }, + { + "epoch": 0.052547368421052634, + "grad_norm": 0.515625, + "learning_rate": 0.0004986404495007236, + "loss": 3.998, + "step": 1248 + }, + { + "epoch": 0.052589473684210526, + "grad_norm": 0.46875, + "learning_rate": 0.0004986369328323888, + "loss": 3.7454, + "step": 1249 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.0004986334116341812, + "loss": 3.8636, + "step": 1250 + }, + { + "epoch": 0.05267368421052632, + "grad_norm": 0.451171875, + "learning_rate": 0.0004986298859061649, + "loss": 3.6326, + "step": 1251 + }, + { + "epoch": 0.05271578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.0004986263556484043, + "loss": 3.8106, + "step": 1252 + }, + { + "epoch": 0.05275789473684211, + "grad_norm": 0.41015625, + "learning_rate": 0.0004986228208609636, + "loss": 4.4184, + "step": 1253 + }, + { + "epoch": 0.0528, + "grad_norm": 0.44921875, + "learning_rate": 0.0004986192815439073, + "loss": 3.7713, + "step": 1254 + }, + { + "epoch": 0.05284210526315789, + "grad_norm": 0.53515625, + "learning_rate": 0.0004986157376972997, + "loss": 3.4042, + "step": 1255 + }, + { + "epoch": 0.05288421052631579, + "grad_norm": 0.466796875, + "learning_rate": 0.0004986121893212056, + "loss": 3.9335, + "step": 1256 + }, + { + "epoch": 0.05292631578947368, + "grad_norm": 0.482421875, + "learning_rate": 0.0004986086364156893, + "loss": 4.0119, + "step": 1257 + }, + { + "epoch": 0.05296842105263158, + "grad_norm": 0.439453125, + "learning_rate": 0.0004986050789808159, + "loss": 3.9728, + "step": 1258 + }, + { + "epoch": 0.05301052631578947, + "grad_norm": 0.44921875, + "learning_rate": 0.00049860151701665, + "loss": 3.9399, + "step": 1259 + }, + { + "epoch": 0.053052631578947365, + "grad_norm": 0.5703125, + "learning_rate": 0.0004985979505232566, + "loss": 3.9068, + "step": 1260 + }, + { + "epoch": 0.053094736842105264, + "grad_norm": 0.62109375, + "learning_rate": 0.0004985943795007006, + "loss": 3.2969, + "step": 1261 + }, + { + "epoch": 0.053136842105263156, + "grad_norm": 0.44140625, + "learning_rate": 0.0004985908039490471, + "loss": 3.9107, + "step": 1262 + }, + { + "epoch": 0.053178947368421055, + "grad_norm": 0.50390625, + "learning_rate": 0.0004985872238683613, + "loss": 3.9077, + "step": 1263 + }, + { + "epoch": 0.05322105263157895, + "grad_norm": 0.439453125, + "learning_rate": 0.0004985836392587083, + "loss": 4.3125, + "step": 1264 + }, + { + "epoch": 0.05326315789473684, + "grad_norm": 0.5234375, + "learning_rate": 0.0004985800501201535, + "loss": 3.6438, + "step": 1265 + }, + { + "epoch": 0.05330526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.0004985764564527621, + "loss": 3.9617, + "step": 1266 + }, + { + "epoch": 0.05334736842105263, + "grad_norm": 0.61328125, + "learning_rate": 0.0004985728582565999, + "loss": 3.5043, + "step": 1267 + }, + { + "epoch": 0.05338947368421053, + "grad_norm": 0.5625, + "learning_rate": 0.0004985692555317323, + "loss": 3.3678, + "step": 1268 + }, + { + "epoch": 0.05343157894736842, + "grad_norm": 0.53515625, + "learning_rate": 0.0004985656482782248, + "loss": 3.7745, + "step": 1269 + }, + { + "epoch": 0.05347368421052632, + "grad_norm": 0.486328125, + "learning_rate": 0.0004985620364961433, + "loss": 3.8117, + "step": 1270 + }, + { + "epoch": 0.05351578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0004985584201855535, + "loss": 3.7198, + "step": 1271 + }, + { + "epoch": 0.0535578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.0004985547993465214, + "loss": 3.7349, + "step": 1272 + }, + { + "epoch": 0.0536, + "grad_norm": 0.4765625, + "learning_rate": 0.0004985511739791129, + "loss": 3.7648, + "step": 1273 + }, + { + "epoch": 0.053642105263157894, + "grad_norm": 0.466796875, + "learning_rate": 0.000498547544083394, + "loss": 4.1469, + "step": 1274 + }, + { + "epoch": 0.05368421052631579, + "grad_norm": 0.458984375, + "learning_rate": 0.0004985439096594308, + "loss": 3.9309, + "step": 1275 + }, + { + "epoch": 0.053726315789473685, + "grad_norm": 0.3984375, + "learning_rate": 0.0004985402707072898, + "loss": 3.8224, + "step": 1276 + }, + { + "epoch": 0.053768421052631576, + "grad_norm": 0.43359375, + "learning_rate": 0.000498536627227037, + "loss": 3.6667, + "step": 1277 + }, + { + "epoch": 0.053810526315789475, + "grad_norm": 0.60546875, + "learning_rate": 0.0004985329792187387, + "loss": 3.5203, + "step": 1278 + }, + { + "epoch": 0.05385263157894737, + "grad_norm": 0.53125, + "learning_rate": 0.0004985293266824617, + "loss": 4.2161, + "step": 1279 + }, + { + "epoch": 0.053894736842105266, + "grad_norm": 0.4375, + "learning_rate": 0.0004985256696182724, + "loss": 3.6726, + "step": 1280 + }, + { + "epoch": 0.05393684210526316, + "grad_norm": 0.58984375, + "learning_rate": 0.0004985220080262374, + "loss": 3.4364, + "step": 1281 + }, + { + "epoch": 0.05397894736842105, + "grad_norm": 0.47265625, + "learning_rate": 0.0004985183419064233, + "loss": 4.0991, + "step": 1282 + }, + { + "epoch": 0.05402105263157895, + "grad_norm": 0.470703125, + "learning_rate": 0.0004985146712588971, + "loss": 3.4965, + "step": 1283 + }, + { + "epoch": 0.05406315789473684, + "grad_norm": 0.486328125, + "learning_rate": 0.0004985109960837256, + "loss": 3.9651, + "step": 1284 + }, + { + "epoch": 0.05410526315789474, + "grad_norm": 0.53125, + "learning_rate": 0.0004985073163809757, + "loss": 3.6569, + "step": 1285 + }, + { + "epoch": 0.05414736842105263, + "grad_norm": 0.490234375, + "learning_rate": 0.0004985036321507144, + "loss": 3.7453, + "step": 1286 + }, + { + "epoch": 0.05418947368421052, + "grad_norm": 0.50390625, + "learning_rate": 0.0004984999433930091, + "loss": 3.5282, + "step": 1287 + }, + { + "epoch": 0.05423157894736842, + "grad_norm": 0.54296875, + "learning_rate": 0.0004984962501079266, + "loss": 3.7703, + "step": 1288 + }, + { + "epoch": 0.054273684210526314, + "grad_norm": 0.52734375, + "learning_rate": 0.0004984925522955345, + "loss": 3.8629, + "step": 1289 + }, + { + "epoch": 0.05431578947368421, + "grad_norm": 0.486328125, + "learning_rate": 0.0004984888499559001, + "loss": 3.8146, + "step": 1290 + }, + { + "epoch": 0.054357894736842105, + "grad_norm": 0.51171875, + "learning_rate": 0.0004984851430890907, + "loss": 3.8921, + "step": 1291 + }, + { + "epoch": 0.0544, + "grad_norm": 0.54296875, + "learning_rate": 0.000498481431695174, + "loss": 3.4637, + "step": 1292 + }, + { + "epoch": 0.054442105263157896, + "grad_norm": 0.5078125, + "learning_rate": 0.0004984777157742177, + "loss": 3.7084, + "step": 1293 + }, + { + "epoch": 0.05448421052631579, + "grad_norm": 0.478515625, + "learning_rate": 0.0004984739953262892, + "loss": 3.5203, + "step": 1294 + }, + { + "epoch": 0.05452631578947369, + "grad_norm": 0.466796875, + "learning_rate": 0.0004984702703514565, + "loss": 3.8092, + "step": 1295 + }, + { + "epoch": 0.05456842105263158, + "grad_norm": 0.51171875, + "learning_rate": 0.0004984665408497874, + "loss": 3.7166, + "step": 1296 + }, + { + "epoch": 0.05461052631578947, + "grad_norm": 0.47265625, + "learning_rate": 0.0004984628068213499, + "loss": 3.8697, + "step": 1297 + }, + { + "epoch": 0.05465263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.0004984590682662119, + "loss": 3.8686, + "step": 1298 + }, + { + "epoch": 0.05469473684210526, + "grad_norm": 0.470703125, + "learning_rate": 0.0004984553251844417, + "loss": 4.2682, + "step": 1299 + }, + { + "epoch": 0.05473684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.0004984515775761074, + "loss": 3.7709, + "step": 1300 + }, + { + "epoch": 0.05477894736842105, + "grad_norm": 0.470703125, + "learning_rate": 0.0004984478254412772, + "loss": 3.9018, + "step": 1301 + }, + { + "epoch": 0.054821052631578944, + "grad_norm": 0.4609375, + "learning_rate": 0.0004984440687800197, + "loss": 3.4633, + "step": 1302 + }, + { + "epoch": 0.05486315789473684, + "grad_norm": 0.48046875, + "learning_rate": 0.000498440307592403, + "loss": 3.7377, + "step": 1303 + }, + { + "epoch": 0.054905263157894735, + "grad_norm": 0.5078125, + "learning_rate": 0.0004984365418784958, + "loss": 4.0835, + "step": 1304 + }, + { + "epoch": 0.054947368421052634, + "grad_norm": 0.453125, + "learning_rate": 0.0004984327716383669, + "loss": 3.6445, + "step": 1305 + }, + { + "epoch": 0.054989473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.0004984289968720846, + "loss": 4.3372, + "step": 1306 + }, + { + "epoch": 0.055031578947368424, + "grad_norm": 0.4921875, + "learning_rate": 0.000498425217579718, + "loss": 3.402, + "step": 1307 + }, + { + "epoch": 0.055073684210526316, + "grad_norm": 0.51171875, + "learning_rate": 0.0004984214337613357, + "loss": 3.3674, + "step": 1308 + }, + { + "epoch": 0.05511578947368421, + "grad_norm": 0.462890625, + "learning_rate": 0.0004984176454170068, + "loss": 3.8537, + "step": 1309 + }, + { + "epoch": 0.05515789473684211, + "grad_norm": 0.59375, + "learning_rate": 0.0004984138525468003, + "loss": 3.4536, + "step": 1310 + }, + { + "epoch": 0.0552, + "grad_norm": 0.412109375, + "learning_rate": 0.0004984100551507853, + "loss": 3.814, + "step": 1311 + }, + { + "epoch": 0.0552421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.0004984062532290309, + "loss": 3.6992, + "step": 1312 + }, + { + "epoch": 0.05528421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.0004984024467816065, + "loss": 4.0318, + "step": 1313 + }, + { + "epoch": 0.05532631578947368, + "grad_norm": 0.46484375, + "learning_rate": 0.0004983986358085814, + "loss": 3.8128, + "step": 1314 + }, + { + "epoch": 0.05536842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.000498394820310025, + "loss": 3.4022, + "step": 1315 + }, + { + "epoch": 0.05541052631578947, + "grad_norm": 0.40234375, + "learning_rate": 0.0004983910002860067, + "loss": 4.1656, + "step": 1316 + }, + { + "epoch": 0.05545263157894737, + "grad_norm": 0.486328125, + "learning_rate": 0.0004983871757365963, + "loss": 3.4264, + "step": 1317 + }, + { + "epoch": 0.05549473684210526, + "grad_norm": 1.109375, + "learning_rate": 0.0004983833466618634, + "loss": 4.0115, + "step": 1318 + }, + { + "epoch": 0.055536842105263155, + "grad_norm": 0.4609375, + "learning_rate": 0.0004983795130618778, + "loss": 3.6987, + "step": 1319 + }, + { + "epoch": 0.055578947368421054, + "grad_norm": 0.470703125, + "learning_rate": 0.0004983756749367093, + "loss": 4.0374, + "step": 1320 + }, + { + "epoch": 0.055621052631578946, + "grad_norm": 0.55078125, + "learning_rate": 0.0004983718322864278, + "loss": 3.7045, + "step": 1321 + }, + { + "epoch": 0.055663157894736845, + "grad_norm": 0.498046875, + "learning_rate": 0.0004983679851111034, + "loss": 3.4282, + "step": 1322 + }, + { + "epoch": 0.05570526315789474, + "grad_norm": 0.46875, + "learning_rate": 0.000498364133410806, + "loss": 3.2793, + "step": 1323 + }, + { + "epoch": 0.05574736842105263, + "grad_norm": 0.462890625, + "learning_rate": 0.0004983602771856061, + "loss": 3.9131, + "step": 1324 + }, + { + "epoch": 0.05578947368421053, + "grad_norm": 0.462890625, + "learning_rate": 0.0004983564164355736, + "loss": 3.878, + "step": 1325 + }, + { + "epoch": 0.05583157894736842, + "grad_norm": 0.484375, + "learning_rate": 0.000498352551160779, + "loss": 3.5558, + "step": 1326 + }, + { + "epoch": 0.05587368421052632, + "grad_norm": 0.65625, + "learning_rate": 0.0004983486813612928, + "loss": 3.8112, + "step": 1327 + }, + { + "epoch": 0.05591578947368421, + "grad_norm": 0.453125, + "learning_rate": 0.0004983448070371854, + "loss": 3.7708, + "step": 1328 + }, + { + "epoch": 0.0559578947368421, + "grad_norm": 0.50390625, + "learning_rate": 0.0004983409281885273, + "loss": 3.6721, + "step": 1329 + }, + { + "epoch": 0.056, + "grad_norm": 0.482421875, + "learning_rate": 0.0004983370448153895, + "loss": 4.0084, + "step": 1330 + }, + { + "epoch": 0.05604210526315789, + "grad_norm": 0.5703125, + "learning_rate": 0.0004983331569178424, + "loss": 3.2946, + "step": 1331 + }, + { + "epoch": 0.05608421052631579, + "grad_norm": 0.4921875, + "learning_rate": 0.000498329264495957, + "loss": 3.8263, + "step": 1332 + }, + { + "epoch": 0.056126315789473684, + "grad_norm": 0.498046875, + "learning_rate": 0.0004983253675498042, + "loss": 3.4891, + "step": 1333 + }, + { + "epoch": 0.056168421052631576, + "grad_norm": 0.59375, + "learning_rate": 0.0004983214660794548, + "loss": 3.9235, + "step": 1334 + }, + { + "epoch": 0.056210526315789475, + "grad_norm": 0.486328125, + "learning_rate": 0.0004983175600849802, + "loss": 3.7288, + "step": 1335 + }, + { + "epoch": 0.05625263157894737, + "grad_norm": 0.5234375, + "learning_rate": 0.0004983136495664514, + "loss": 3.3728, + "step": 1336 + }, + { + "epoch": 0.056294736842105265, + "grad_norm": 0.5, + "learning_rate": 0.0004983097345239397, + "loss": 4.0245, + "step": 1337 + }, + { + "epoch": 0.05633684210526316, + "grad_norm": 0.46875, + "learning_rate": 0.0004983058149575163, + "loss": 3.7233, + "step": 1338 + }, + { + "epoch": 0.05637894736842105, + "grad_norm": 0.55078125, + "learning_rate": 0.0004983018908672527, + "loss": 3.6046, + "step": 1339 + }, + { + "epoch": 0.05642105263157895, + "grad_norm": 0.484375, + "learning_rate": 0.0004982979622532204, + "loss": 3.8371, + "step": 1340 + }, + { + "epoch": 0.05646315789473684, + "grad_norm": 0.6640625, + "learning_rate": 0.0004982940291154911, + "loss": 3.9984, + "step": 1341 + }, + { + "epoch": 0.05650526315789474, + "grad_norm": 0.6328125, + "learning_rate": 0.0004982900914541361, + "loss": 3.7613, + "step": 1342 + }, + { + "epoch": 0.05654736842105263, + "grad_norm": 0.515625, + "learning_rate": 0.0004982861492692275, + "loss": 3.3244, + "step": 1343 + }, + { + "epoch": 0.05658947368421053, + "grad_norm": 0.53515625, + "learning_rate": 0.000498282202560837, + "loss": 3.3399, + "step": 1344 + }, + { + "epoch": 0.05663157894736842, + "grad_norm": 0.5078125, + "learning_rate": 0.0004982782513290365, + "loss": 3.7781, + "step": 1345 + }, + { + "epoch": 0.056673684210526314, + "grad_norm": 0.5703125, + "learning_rate": 0.0004982742955738979, + "loss": 4.0354, + "step": 1346 + }, + { + "epoch": 0.05671578947368421, + "grad_norm": 0.55859375, + "learning_rate": 0.0004982703352954934, + "loss": 3.7529, + "step": 1347 + }, + { + "epoch": 0.056757894736842104, + "grad_norm": 0.5234375, + "learning_rate": 0.000498266370493895, + "loss": 3.6758, + "step": 1348 + }, + { + "epoch": 0.0568, + "grad_norm": 0.490234375, + "learning_rate": 0.0004982624011691752, + "loss": 3.8836, + "step": 1349 + }, + { + "epoch": 0.056842105263157895, + "grad_norm": 0.6640625, + "learning_rate": 0.000498258427321406, + "loss": 3.6656, + "step": 1350 + }, + { + "epoch": 0.05688421052631579, + "grad_norm": 0.4765625, + "learning_rate": 0.00049825444895066, + "loss": 3.819, + "step": 1351 + }, + { + "epoch": 0.056926315789473686, + "grad_norm": 0.65625, + "learning_rate": 0.0004982504660570096, + "loss": 3.7427, + "step": 1352 + }, + { + "epoch": 0.05696842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.0004982464786405274, + "loss": 4.0407, + "step": 1353 + }, + { + "epoch": 0.05701052631578948, + "grad_norm": 0.61328125, + "learning_rate": 0.0004982424867012861, + "loss": 3.506, + "step": 1354 + }, + { + "epoch": 0.05705263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.0004982384902393582, + "loss": 3.854, + "step": 1355 + }, + { + "epoch": 0.05709473684210526, + "grad_norm": 0.470703125, + "learning_rate": 0.0004982344892548168, + "loss": 3.7895, + "step": 1356 + }, + { + "epoch": 0.05713684210526316, + "grad_norm": 0.5234375, + "learning_rate": 0.0004982304837477348, + "loss": 3.8546, + "step": 1357 + }, + { + "epoch": 0.05717894736842105, + "grad_norm": 0.498046875, + "learning_rate": 0.0004982264737181848, + "loss": 3.6283, + "step": 1358 + }, + { + "epoch": 0.05722105263157895, + "grad_norm": 0.47265625, + "learning_rate": 0.0004982224591662403, + "loss": 3.6073, + "step": 1359 + }, + { + "epoch": 0.05726315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.0004982184400919742, + "loss": 3.9446, + "step": 1360 + }, + { + "epoch": 0.057305263157894734, + "grad_norm": 0.466796875, + "learning_rate": 0.0004982144164954596, + "loss": 3.8313, + "step": 1361 + }, + { + "epoch": 0.05734736842105263, + "grad_norm": 0.4609375, + "learning_rate": 0.0004982103883767701, + "loss": 3.5819, + "step": 1362 + }, + { + "epoch": 0.057389473684210525, + "grad_norm": 0.44921875, + "learning_rate": 0.0004982063557359789, + "loss": 3.5704, + "step": 1363 + }, + { + "epoch": 0.057431578947368424, + "grad_norm": 0.51171875, + "learning_rate": 0.0004982023185731597, + "loss": 3.7707, + "step": 1364 + }, + { + "epoch": 0.057473684210526316, + "grad_norm": 0.53515625, + "learning_rate": 0.0004981982768883857, + "loss": 3.6571, + "step": 1365 + }, + { + "epoch": 0.05751578947368421, + "grad_norm": 0.5390625, + "learning_rate": 0.0004981942306817308, + "loss": 3.4531, + "step": 1366 + }, + { + "epoch": 0.057557894736842106, + "grad_norm": 0.453125, + "learning_rate": 0.0004981901799532686, + "loss": 4.0188, + "step": 1367 + }, + { + "epoch": 0.0576, + "grad_norm": 0.6171875, + "learning_rate": 0.0004981861247030729, + "loss": 3.5399, + "step": 1368 + }, + { + "epoch": 0.0576421052631579, + "grad_norm": 0.48828125, + "learning_rate": 0.0004981820649312177, + "loss": 3.6424, + "step": 1369 + }, + { + "epoch": 0.05768421052631579, + "grad_norm": 0.5, + "learning_rate": 0.0004981780006377769, + "loss": 3.857, + "step": 1370 + }, + { + "epoch": 0.05772631578947368, + "grad_norm": 0.451171875, + "learning_rate": 0.0004981739318228244, + "loss": 3.5255, + "step": 1371 + }, + { + "epoch": 0.05776842105263158, + "grad_norm": 0.56640625, + "learning_rate": 0.0004981698584864345, + "loss": 3.3467, + "step": 1372 + }, + { + "epoch": 0.05781052631578947, + "grad_norm": 0.61328125, + "learning_rate": 0.0004981657806286813, + "loss": 4.0037, + "step": 1373 + }, + { + "epoch": 0.05785263157894737, + "grad_norm": 0.45703125, + "learning_rate": 0.0004981616982496393, + "loss": 3.5145, + "step": 1374 + }, + { + "epoch": 0.05789473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.0004981576113493827, + "loss": 3.9105, + "step": 1375 + }, + { + "epoch": 0.057936842105263155, + "grad_norm": 0.4921875, + "learning_rate": 0.000498153519927986, + "loss": 3.6701, + "step": 1376 + }, + { + "epoch": 0.057978947368421054, + "grad_norm": 0.453125, + "learning_rate": 0.0004981494239855237, + "loss": 3.8234, + "step": 1377 + }, + { + "epoch": 0.058021052631578945, + "grad_norm": 0.447265625, + "learning_rate": 0.0004981453235220704, + "loss": 3.8154, + "step": 1378 + }, + { + "epoch": 0.058063157894736844, + "grad_norm": 0.5390625, + "learning_rate": 0.000498141218537701, + "loss": 3.2274, + "step": 1379 + }, + { + "epoch": 0.058105263157894736, + "grad_norm": 0.462890625, + "learning_rate": 0.00049813710903249, + "loss": 3.6631, + "step": 1380 + }, + { + "epoch": 0.05814736842105263, + "grad_norm": 0.50390625, + "learning_rate": 0.0004981329950065125, + "loss": 3.4974, + "step": 1381 + }, + { + "epoch": 0.05818947368421053, + "grad_norm": 0.447265625, + "learning_rate": 0.0004981288764598434, + "loss": 4.0409, + "step": 1382 + }, + { + "epoch": 0.05823157894736842, + "grad_norm": 0.478515625, + "learning_rate": 0.0004981247533925577, + "loss": 3.8766, + "step": 1383 + }, + { + "epoch": 0.05827368421052632, + "grad_norm": 0.474609375, + "learning_rate": 0.0004981206258047305, + "loss": 3.843, + "step": 1384 + }, + { + "epoch": 0.05831578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.0004981164936964371, + "loss": 4.0245, + "step": 1385 + }, + { + "epoch": 0.05835789473684211, + "grad_norm": 0.4375, + "learning_rate": 0.0004981123570677526, + "loss": 3.481, + "step": 1386 + }, + { + "epoch": 0.0584, + "grad_norm": 0.443359375, + "learning_rate": 0.0004981082159187525, + "loss": 3.7682, + "step": 1387 + }, + { + "epoch": 0.05844210526315789, + "grad_norm": 0.435546875, + "learning_rate": 0.0004981040702495123, + "loss": 3.8991, + "step": 1388 + }, + { + "epoch": 0.05848421052631579, + "grad_norm": 0.48046875, + "learning_rate": 0.0004980999200601074, + "loss": 3.7622, + "step": 1389 + }, + { + "epoch": 0.05852631578947368, + "grad_norm": 0.47265625, + "learning_rate": 0.0004980957653506134, + "loss": 3.8332, + "step": 1390 + }, + { + "epoch": 0.05856842105263158, + "grad_norm": 0.4609375, + "learning_rate": 0.0004980916061211062, + "loss": 3.6944, + "step": 1391 + }, + { + "epoch": 0.058610526315789474, + "grad_norm": 0.4765625, + "learning_rate": 0.0004980874423716614, + "loss": 3.4223, + "step": 1392 + }, + { + "epoch": 0.058652631578947366, + "grad_norm": 0.5078125, + "learning_rate": 0.0004980832741023547, + "loss": 3.6684, + "step": 1393 + }, + { + "epoch": 0.058694736842105265, + "grad_norm": 0.404296875, + "learning_rate": 0.0004980791013132624, + "loss": 4.2956, + "step": 1394 + }, + { + "epoch": 0.05873684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.0004980749240044603, + "loss": 4.061, + "step": 1395 + }, + { + "epoch": 0.058778947368421056, + "grad_norm": 0.5, + "learning_rate": 0.0004980707421760246, + "loss": 3.5141, + "step": 1396 + }, + { + "epoch": 0.05882105263157895, + "grad_norm": 0.51171875, + "learning_rate": 0.0004980665558280315, + "loss": 3.6213, + "step": 1397 + }, + { + "epoch": 0.05886315789473684, + "grad_norm": 0.49609375, + "learning_rate": 0.0004980623649605571, + "loss": 3.6913, + "step": 1398 + }, + { + "epoch": 0.05890526315789474, + "grad_norm": 0.62109375, + "learning_rate": 0.000498058169573678, + "loss": 3.4471, + "step": 1399 + }, + { + "epoch": 0.05894736842105263, + "grad_norm": 0.48828125, + "learning_rate": 0.0004980539696674704, + "loss": 3.7601, + "step": 1400 + }, + { + "epoch": 0.05898947368421053, + "grad_norm": 0.443359375, + "learning_rate": 0.000498049765242011, + "loss": 3.9941, + "step": 1401 + }, + { + "epoch": 0.05903157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.0004980455562973762, + "loss": 3.8599, + "step": 1402 + }, + { + "epoch": 0.05907368421052631, + "grad_norm": 0.474609375, + "learning_rate": 0.0004980413428336428, + "loss": 3.4384, + "step": 1403 + }, + { + "epoch": 0.05911578947368421, + "grad_norm": 0.5234375, + "learning_rate": 0.0004980371248508878, + "loss": 3.9244, + "step": 1404 + }, + { + "epoch": 0.059157894736842104, + "grad_norm": 0.474609375, + "learning_rate": 0.0004980329023491876, + "loss": 3.8382, + "step": 1405 + }, + { + "epoch": 0.0592, + "grad_norm": 0.435546875, + "learning_rate": 0.0004980286753286195, + "loss": 4.0686, + "step": 1406 + }, + { + "epoch": 0.059242105263157895, + "grad_norm": 0.474609375, + "learning_rate": 0.0004980244437892602, + "loss": 3.5042, + "step": 1407 + }, + { + "epoch": 0.059284210526315786, + "grad_norm": 0.4453125, + "learning_rate": 0.000498020207731187, + "loss": 3.6917, + "step": 1408 + }, + { + "epoch": 0.059326315789473685, + "grad_norm": 0.46875, + "learning_rate": 0.0004980159671544771, + "loss": 3.5638, + "step": 1409 + }, + { + "epoch": 0.05936842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.0004980117220592076, + "loss": 4.0654, + "step": 1410 + }, + { + "epoch": 0.059410526315789476, + "grad_norm": 0.44140625, + "learning_rate": 0.000498007472445456, + "loss": 3.9187, + "step": 1411 + }, + { + "epoch": 0.05945263157894737, + "grad_norm": 0.46875, + "learning_rate": 0.0004980032183132996, + "loss": 3.3682, + "step": 1412 + }, + { + "epoch": 0.05949473684210526, + "grad_norm": 0.52734375, + "learning_rate": 0.000497998959662816, + "loss": 3.9349, + "step": 1413 + }, + { + "epoch": 0.05953684210526316, + "grad_norm": 0.455078125, + "learning_rate": 0.0004979946964940827, + "loss": 4.036, + "step": 1414 + }, + { + "epoch": 0.05957894736842105, + "grad_norm": 0.5703125, + "learning_rate": 0.0004979904288071774, + "loss": 3.2511, + "step": 1415 + }, + { + "epoch": 0.05962105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.0004979861566021779, + "loss": 3.9683, + "step": 1416 + }, + { + "epoch": 0.05966315789473684, + "grad_norm": 0.447265625, + "learning_rate": 0.000497981879879162, + "loss": 3.6874, + "step": 1417 + }, + { + "epoch": 0.059705263157894733, + "grad_norm": 0.41796875, + "learning_rate": 0.0004979775986382076, + "loss": 3.8246, + "step": 1418 + }, + { + "epoch": 0.05974736842105263, + "grad_norm": 0.458984375, + "learning_rate": 0.0004979733128793927, + "loss": 3.8402, + "step": 1419 + }, + { + "epoch": 0.059789473684210524, + "grad_norm": 0.474609375, + "learning_rate": 0.0004979690226027954, + "loss": 3.6947, + "step": 1420 + }, + { + "epoch": 0.05983157894736842, + "grad_norm": 0.9921875, + "learning_rate": 0.0004979647278084939, + "loss": 3.9764, + "step": 1421 + }, + { + "epoch": 0.059873684210526315, + "grad_norm": 0.41796875, + "learning_rate": 0.0004979604284965664, + "loss": 4.0506, + "step": 1422 + }, + { + "epoch": 0.059915789473684214, + "grad_norm": 0.451171875, + "learning_rate": 0.0004979561246670911, + "loss": 4.1389, + "step": 1423 + }, + { + "epoch": 0.059957894736842106, + "grad_norm": 0.609375, + "learning_rate": 0.0004979518163201467, + "loss": 3.7321, + "step": 1424 + }, + { + "epoch": 0.06, + "grad_norm": 0.55859375, + "learning_rate": 0.0004979475034558115, + "loss": 3.6528, + "step": 1425 + }, + { + "epoch": 0.0600421052631579, + "grad_norm": 0.609375, + "learning_rate": 0.0004979431860741641, + "loss": 3.5599, + "step": 1426 + }, + { + "epoch": 0.06008421052631579, + "grad_norm": 0.56640625, + "learning_rate": 0.0004979388641752833, + "loss": 3.9305, + "step": 1427 + }, + { + "epoch": 0.06012631578947369, + "grad_norm": 0.65625, + "learning_rate": 0.0004979345377592475, + "loss": 4.0798, + "step": 1428 + }, + { + "epoch": 0.06016842105263158, + "grad_norm": 0.5546875, + "learning_rate": 0.0004979302068261358, + "loss": 3.3725, + "step": 1429 + }, + { + "epoch": 0.06021052631578947, + "grad_norm": 0.48046875, + "learning_rate": 0.0004979258713760271, + "loss": 3.6621, + "step": 1430 + }, + { + "epoch": 0.06025263157894737, + "grad_norm": 0.796875, + "learning_rate": 0.0004979215314090003, + "loss": 3.1578, + "step": 1431 + }, + { + "epoch": 0.06029473684210526, + "grad_norm": 0.5234375, + "learning_rate": 0.0004979171869251344, + "loss": 3.592, + "step": 1432 + }, + { + "epoch": 0.06033684210526316, + "grad_norm": 0.4609375, + "learning_rate": 0.0004979128379245088, + "loss": 3.9118, + "step": 1433 + }, + { + "epoch": 0.06037894736842105, + "grad_norm": 0.51953125, + "learning_rate": 0.0004979084844072025, + "loss": 3.6324, + "step": 1434 + }, + { + "epoch": 0.060421052631578945, + "grad_norm": 0.46875, + "learning_rate": 0.0004979041263732948, + "loss": 3.9731, + "step": 1435 + }, + { + "epoch": 0.060463157894736844, + "grad_norm": 0.48046875, + "learning_rate": 0.0004978997638228654, + "loss": 3.941, + "step": 1436 + }, + { + "epoch": 0.060505263157894736, + "grad_norm": 0.5, + "learning_rate": 0.0004978953967559933, + "loss": 3.764, + "step": 1437 + }, + { + "epoch": 0.060547368421052634, + "grad_norm": 0.4375, + "learning_rate": 0.0004978910251727586, + "loss": 3.9604, + "step": 1438 + }, + { + "epoch": 0.060589473684210526, + "grad_norm": 0.5234375, + "learning_rate": 0.0004978866490732406, + "loss": 3.8266, + "step": 1439 + }, + { + "epoch": 0.06063157894736842, + "grad_norm": 0.482421875, + "learning_rate": 0.0004978822684575191, + "loss": 3.8518, + "step": 1440 + }, + { + "epoch": 0.06067368421052632, + "grad_norm": 0.439453125, + "learning_rate": 0.0004978778833256739, + "loss": 3.5031, + "step": 1441 + }, + { + "epoch": 0.06071578947368421, + "grad_norm": 0.494140625, + "learning_rate": 0.0004978734936777849, + "loss": 3.803, + "step": 1442 + }, + { + "epoch": 0.06075789473684211, + "grad_norm": 0.494140625, + "learning_rate": 0.0004978690995139321, + "loss": 3.6023, + "step": 1443 + }, + { + "epoch": 0.0608, + "grad_norm": 0.53125, + "learning_rate": 0.0004978647008341956, + "loss": 3.9022, + "step": 1444 + }, + { + "epoch": 0.06084210526315789, + "grad_norm": 0.494140625, + "learning_rate": 0.0004978602976386554, + "loss": 3.6245, + "step": 1445 + }, + { + "epoch": 0.06088421052631579, + "grad_norm": 0.515625, + "learning_rate": 0.0004978558899273918, + "loss": 3.909, + "step": 1446 + }, + { + "epoch": 0.06092631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.0004978514777004851, + "loss": 4.1468, + "step": 1447 + }, + { + "epoch": 0.06096842105263158, + "grad_norm": 0.48046875, + "learning_rate": 0.0004978470609580157, + "loss": 3.6682, + "step": 1448 + }, + { + "epoch": 0.06101052631578947, + "grad_norm": 0.439453125, + "learning_rate": 0.0004978426397000642, + "loss": 4.0666, + "step": 1449 + }, + { + "epoch": 0.061052631578947365, + "grad_norm": 0.484375, + "learning_rate": 0.0004978382139267108, + "loss": 3.8659, + "step": 1450 + }, + { + "epoch": 0.061094736842105264, + "grad_norm": 0.55859375, + "learning_rate": 0.0004978337836380364, + "loss": 3.6098, + "step": 1451 + }, + { + "epoch": 0.061136842105263156, + "grad_norm": 0.47265625, + "learning_rate": 0.0004978293488341217, + "loss": 3.9778, + "step": 1452 + }, + { + "epoch": 0.061178947368421055, + "grad_norm": 0.462890625, + "learning_rate": 0.0004978249095150475, + "loss": 3.7539, + "step": 1453 + }, + { + "epoch": 0.06122105263157895, + "grad_norm": 0.546875, + "learning_rate": 0.0004978204656808944, + "loss": 3.7538, + "step": 1454 + }, + { + "epoch": 0.06126315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.0004978160173317438, + "loss": 3.9262, + "step": 1455 + }, + { + "epoch": 0.06130526315789474, + "grad_norm": 0.46484375, + "learning_rate": 0.0004978115644676765, + "loss": 3.6827, + "step": 1456 + }, + { + "epoch": 0.06134736842105263, + "grad_norm": 0.51953125, + "learning_rate": 0.0004978071070887737, + "loss": 3.4441, + "step": 1457 + }, + { + "epoch": 0.06138947368421053, + "grad_norm": 0.47265625, + "learning_rate": 0.0004978026451951165, + "loss": 3.6408, + "step": 1458 + }, + { + "epoch": 0.06143157894736842, + "grad_norm": 0.46484375, + "learning_rate": 0.0004977981787867862, + "loss": 4.0173, + "step": 1459 + }, + { + "epoch": 0.06147368421052631, + "grad_norm": 0.47265625, + "learning_rate": 0.0004977937078638644, + "loss": 3.8653, + "step": 1460 + }, + { + "epoch": 0.06151578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0004977892324264323, + "loss": 4.0035, + "step": 1461 + }, + { + "epoch": 0.0615578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0004977847524745715, + "loss": 3.7402, + "step": 1462 + }, + { + "epoch": 0.0616, + "grad_norm": 0.494140625, + "learning_rate": 0.0004977802680083637, + "loss": 3.412, + "step": 1463 + }, + { + "epoch": 0.061642105263157894, + "grad_norm": 0.44921875, + "learning_rate": 0.0004977757790278905, + "loss": 3.6565, + "step": 1464 + }, + { + "epoch": 0.06168421052631579, + "grad_norm": 0.5078125, + "learning_rate": 0.0004977712855332337, + "loss": 3.5489, + "step": 1465 + }, + { + "epoch": 0.061726315789473685, + "grad_norm": 0.4375, + "learning_rate": 0.0004977667875244753, + "loss": 3.56, + "step": 1466 + }, + { + "epoch": 0.06176842105263158, + "grad_norm": 0.4921875, + "learning_rate": 0.0004977622850016971, + "loss": 3.7848, + "step": 1467 + }, + { + "epoch": 0.061810526315789475, + "grad_norm": 0.470703125, + "learning_rate": 0.0004977577779649811, + "loss": 3.6628, + "step": 1468 + }, + { + "epoch": 0.06185263157894737, + "grad_norm": 0.55859375, + "learning_rate": 0.0004977532664144096, + "loss": 3.4917, + "step": 1469 + }, + { + "epoch": 0.061894736842105266, + "grad_norm": 0.470703125, + "learning_rate": 0.0004977487503500646, + "loss": 3.9277, + "step": 1470 + }, + { + "epoch": 0.06193684210526316, + "grad_norm": 0.482421875, + "learning_rate": 0.0004977442297720285, + "loss": 4.2681, + "step": 1471 + }, + { + "epoch": 0.06197894736842105, + "grad_norm": 0.515625, + "learning_rate": 0.0004977397046803837, + "loss": 3.7431, + "step": 1472 + }, + { + "epoch": 0.06202105263157895, + "grad_norm": 0.451171875, + "learning_rate": 0.0004977351750752125, + "loss": 3.5858, + "step": 1473 + }, + { + "epoch": 0.06206315789473684, + "grad_norm": 0.46875, + "learning_rate": 0.0004977306409565974, + "loss": 4.2416, + "step": 1474 + }, + { + "epoch": 0.06210526315789474, + "grad_norm": 0.466796875, + "learning_rate": 0.0004977261023246212, + "loss": 3.5031, + "step": 1475 + }, + { + "epoch": 0.06214736842105263, + "grad_norm": 0.484375, + "learning_rate": 0.0004977215591793665, + "loss": 3.6567, + "step": 1476 + }, + { + "epoch": 0.062189473684210524, + "grad_norm": 0.462890625, + "learning_rate": 0.000497717011520916, + "loss": 4.0074, + "step": 1477 + }, + { + "epoch": 0.06223157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.0004977124593493526, + "loss": 3.846, + "step": 1478 + }, + { + "epoch": 0.062273684210526314, + "grad_norm": 0.54296875, + "learning_rate": 0.0004977079026647591, + "loss": 3.6382, + "step": 1479 + }, + { + "epoch": 0.06231578947368421, + "grad_norm": 0.458984375, + "learning_rate": 0.0004977033414672189, + "loss": 3.947, + "step": 1480 + }, + { + "epoch": 0.062357894736842105, + "grad_norm": 0.47265625, + "learning_rate": 0.0004976987757568148, + "loss": 3.5414, + "step": 1481 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4765625, + "learning_rate": 0.00049769420553363, + "loss": 3.843, + "step": 1482 + }, + { + "epoch": 0.062442105263157896, + "grad_norm": 0.439453125, + "learning_rate": 0.0004976896307977477, + "loss": 3.6364, + "step": 1483 + }, + { + "epoch": 0.06248421052631579, + "grad_norm": 0.4765625, + "learning_rate": 0.0004976850515492515, + "loss": 3.8939, + "step": 1484 + }, + { + "epoch": 0.06252631578947368, + "grad_norm": 0.5625, + "learning_rate": 0.0004976804677882246, + "loss": 3.3034, + "step": 1485 + }, + { + "epoch": 0.06256842105263158, + "grad_norm": 0.6875, + "learning_rate": 0.0004976758795147507, + "loss": 3.1922, + "step": 1486 + }, + { + "epoch": 0.06261052631578948, + "grad_norm": 0.498046875, + "learning_rate": 0.0004976712867289132, + "loss": 3.7994, + "step": 1487 + }, + { + "epoch": 0.06265263157894736, + "grad_norm": 0.44140625, + "learning_rate": 0.0004976666894307959, + "loss": 4.0463, + "step": 1488 + }, + { + "epoch": 0.06269473684210526, + "grad_norm": 0.515625, + "learning_rate": 0.0004976620876204824, + "loss": 3.6857, + "step": 1489 + }, + { + "epoch": 0.06273684210526316, + "grad_norm": 0.4453125, + "learning_rate": 0.0004976574812980567, + "loss": 3.6421, + "step": 1490 + }, + { + "epoch": 0.06277894736842106, + "grad_norm": 0.447265625, + "learning_rate": 0.0004976528704636028, + "loss": 3.7976, + "step": 1491 + }, + { + "epoch": 0.06282105263157894, + "grad_norm": 0.466796875, + "learning_rate": 0.0004976482551172044, + "loss": 4.2465, + "step": 1492 + }, + { + "epoch": 0.06286315789473684, + "grad_norm": 0.58203125, + "learning_rate": 0.000497643635258946, + "loss": 3.9085, + "step": 1493 + }, + { + "epoch": 0.06290526315789474, + "grad_norm": 0.470703125, + "learning_rate": 0.0004976390108889113, + "loss": 3.5754, + "step": 1494 + }, + { + "epoch": 0.06294736842105263, + "grad_norm": 0.484375, + "learning_rate": 0.0004976343820071849, + "loss": 3.7551, + "step": 1495 + }, + { + "epoch": 0.06298947368421053, + "grad_norm": 0.478515625, + "learning_rate": 0.000497629748613851, + "loss": 3.6691, + "step": 1496 + }, + { + "epoch": 0.06303157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.0004976251107089941, + "loss": 3.7454, + "step": 1497 + }, + { + "epoch": 0.06307368421052631, + "grad_norm": 0.478515625, + "learning_rate": 0.0004976204682926985, + "loss": 3.7499, + "step": 1498 + }, + { + "epoch": 0.06311578947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.000497615821365049, + "loss": 4.1201, + "step": 1499 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 0.46484375, + "learning_rate": 0.00049761116992613, + "loss": 3.5651, + "step": 1500 + }, + { + "epoch": 0.0632, + "grad_norm": 0.4765625, + "learning_rate": 0.0004976065139760267, + "loss": 3.3507, + "step": 1501 + }, + { + "epoch": 0.06324210526315789, + "grad_norm": 0.427734375, + "learning_rate": 0.0004976018535148235, + "loss": 3.9816, + "step": 1502 + }, + { + "epoch": 0.06328421052631579, + "grad_norm": 0.56640625, + "learning_rate": 0.0004975971885426054, + "loss": 3.1666, + "step": 1503 + }, + { + "epoch": 0.06332631578947369, + "grad_norm": 0.498046875, + "learning_rate": 0.0004975925190594576, + "loss": 3.6817, + "step": 1504 + }, + { + "epoch": 0.06336842105263157, + "grad_norm": 0.4375, + "learning_rate": 0.0004975878450654648, + "loss": 3.6552, + "step": 1505 + }, + { + "epoch": 0.06341052631578947, + "grad_norm": 0.515625, + "learning_rate": 0.0004975831665607125, + "loss": 2.9724, + "step": 1506 + }, + { + "epoch": 0.06345263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.0004975784835452858, + "loss": 4.1568, + "step": 1507 + }, + { + "epoch": 0.06349473684210526, + "grad_norm": 0.4609375, + "learning_rate": 0.00049757379601927, + "loss": 3.7084, + "step": 1508 + }, + { + "epoch": 0.06353684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.0004975691039827505, + "loss": 3.9228, + "step": 1509 + }, + { + "epoch": 0.06357894736842105, + "grad_norm": 0.546875, + "learning_rate": 0.0004975644074358128, + "loss": 3.3696, + "step": 1510 + }, + { + "epoch": 0.06362105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.0004975597063785424, + "loss": 4.021, + "step": 1511 + }, + { + "epoch": 0.06366315789473684, + "grad_norm": 0.4765625, + "learning_rate": 0.0004975550008110252, + "loss": 3.7964, + "step": 1512 + }, + { + "epoch": 0.06370526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.0004975502907333467, + "loss": 3.7025, + "step": 1513 + }, + { + "epoch": 0.06374736842105264, + "grad_norm": 0.6875, + "learning_rate": 0.0004975455761455927, + "loss": 3.7162, + "step": 1514 + }, + { + "epoch": 0.06378947368421052, + "grad_norm": 0.47265625, + "learning_rate": 0.0004975408570478491, + "loss": 3.7441, + "step": 1515 + }, + { + "epoch": 0.06383157894736842, + "grad_norm": 0.5078125, + "learning_rate": 0.0004975361334402021, + "loss": 3.6904, + "step": 1516 + }, + { + "epoch": 0.06387368421052632, + "grad_norm": 0.45703125, + "learning_rate": 0.0004975314053227375, + "loss": 3.6474, + "step": 1517 + }, + { + "epoch": 0.0639157894736842, + "grad_norm": 0.486328125, + "learning_rate": 0.0004975266726955415, + "loss": 3.309, + "step": 1518 + }, + { + "epoch": 0.0639578947368421, + "grad_norm": 0.6171875, + "learning_rate": 0.0004975219355587004, + "loss": 3.7018, + "step": 1519 + }, + { + "epoch": 0.064, + "grad_norm": 0.49609375, + "learning_rate": 0.0004975171939123004, + "loss": 3.8566, + "step": 1520 + }, + { + "epoch": 0.0640421052631579, + "grad_norm": 0.53125, + "learning_rate": 0.0004975124477564282, + "loss": 3.2526, + "step": 1521 + }, + { + "epoch": 0.06408421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.0004975076970911697, + "loss": 3.9706, + "step": 1522 + }, + { + "epoch": 0.06412631578947368, + "grad_norm": 0.447265625, + "learning_rate": 0.0004975029419166119, + "loss": 4.1502, + "step": 1523 + }, + { + "epoch": 0.06416842105263158, + "grad_norm": 0.62890625, + "learning_rate": 0.0004974981822328413, + "loss": 3.4427, + "step": 1524 + }, + { + "epoch": 0.06421052631578947, + "grad_norm": 0.58203125, + "learning_rate": 0.0004974934180399447, + "loss": 3.2783, + "step": 1525 + }, + { + "epoch": 0.06425263157894737, + "grad_norm": 0.50390625, + "learning_rate": 0.0004974886493380086, + "loss": 3.4603, + "step": 1526 + }, + { + "epoch": 0.06429473684210527, + "grad_norm": 0.494140625, + "learning_rate": 0.0004974838761271202, + "loss": 3.3505, + "step": 1527 + }, + { + "epoch": 0.06433684210526316, + "grad_norm": 0.515625, + "learning_rate": 0.0004974790984073665, + "loss": 3.7598, + "step": 1528 + }, + { + "epoch": 0.06437894736842105, + "grad_norm": 0.51953125, + "learning_rate": 0.0004974743161788342, + "loss": 3.6861, + "step": 1529 + }, + { + "epoch": 0.06442105263157895, + "grad_norm": 0.44921875, + "learning_rate": 0.0004974695294416107, + "loss": 3.8653, + "step": 1530 + }, + { + "epoch": 0.06446315789473685, + "grad_norm": 0.447265625, + "learning_rate": 0.0004974647381957832, + "loss": 3.692, + "step": 1531 + }, + { + "epoch": 0.06450526315789473, + "grad_norm": 0.484375, + "learning_rate": 0.0004974599424414389, + "loss": 3.9575, + "step": 1532 + }, + { + "epoch": 0.06454736842105263, + "grad_norm": 0.5, + "learning_rate": 0.0004974551421786652, + "loss": 3.5997, + "step": 1533 + }, + { + "epoch": 0.06458947368421053, + "grad_norm": 0.44921875, + "learning_rate": 0.0004974503374075496, + "loss": 3.7357, + "step": 1534 + }, + { + "epoch": 0.06463157894736841, + "grad_norm": 0.6328125, + "learning_rate": 0.0004974455281281794, + "loss": 3.5105, + "step": 1535 + }, + { + "epoch": 0.06467368421052631, + "grad_norm": 0.431640625, + "learning_rate": 0.0004974407143406426, + "loss": 4.0164, + "step": 1536 + }, + { + "epoch": 0.06471578947368421, + "grad_norm": 0.466796875, + "learning_rate": 0.0004974358960450267, + "loss": 3.8167, + "step": 1537 + }, + { + "epoch": 0.06475789473684211, + "grad_norm": 0.466796875, + "learning_rate": 0.0004974310732414194, + "loss": 3.5038, + "step": 1538 + }, + { + "epoch": 0.0648, + "grad_norm": 0.47265625, + "learning_rate": 0.0004974262459299087, + "loss": 3.4413, + "step": 1539 + }, + { + "epoch": 0.0648421052631579, + "grad_norm": 0.49609375, + "learning_rate": 0.0004974214141105826, + "loss": 3.2818, + "step": 1540 + }, + { + "epoch": 0.0648842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.000497416577783529, + "loss": 4.1847, + "step": 1541 + }, + { + "epoch": 0.06492631578947368, + "grad_norm": 0.46484375, + "learning_rate": 0.0004974117369488361, + "loss": 3.5805, + "step": 1542 + }, + { + "epoch": 0.06496842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.0004974068916065919, + "loss": 3.6437, + "step": 1543 + }, + { + "epoch": 0.06501052631578948, + "grad_norm": 0.455078125, + "learning_rate": 0.0004974020417568849, + "loss": 3.5432, + "step": 1544 + }, + { + "epoch": 0.06505263157894736, + "grad_norm": 0.55078125, + "learning_rate": 0.0004973971873998035, + "loss": 3.7226, + "step": 1545 + }, + { + "epoch": 0.06509473684210526, + "grad_norm": 0.41015625, + "learning_rate": 0.0004973923285354358, + "loss": 4.0023, + "step": 1546 + }, + { + "epoch": 0.06513684210526316, + "grad_norm": 0.482421875, + "learning_rate": 0.0004973874651638707, + "loss": 3.3934, + "step": 1547 + }, + { + "epoch": 0.06517894736842106, + "grad_norm": 0.447265625, + "learning_rate": 0.0004973825972851967, + "loss": 3.8527, + "step": 1548 + }, + { + "epoch": 0.06522105263157894, + "grad_norm": 0.5390625, + "learning_rate": 0.0004973777248995023, + "loss": 3.6853, + "step": 1549 + }, + { + "epoch": 0.06526315789473684, + "grad_norm": 0.51953125, + "learning_rate": 0.0004973728480068765, + "loss": 3.5738, + "step": 1550 + }, + { + "epoch": 0.06530526315789474, + "grad_norm": 0.49609375, + "learning_rate": 0.000497367966607408, + "loss": 3.6356, + "step": 1551 + }, + { + "epoch": 0.06534736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.0004973630807011859, + "loss": 3.7679, + "step": 1552 + }, + { + "epoch": 0.06538947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.0004973581902882989, + "loss": 3.5949, + "step": 1553 + }, + { + "epoch": 0.06543157894736842, + "grad_norm": 0.455078125, + "learning_rate": 0.0004973532953688366, + "loss": 3.6411, + "step": 1554 + }, + { + "epoch": 0.06547368421052631, + "grad_norm": 0.42578125, + "learning_rate": 0.0004973483959428877, + "loss": 3.7853, + "step": 1555 + }, + { + "epoch": 0.06551578947368421, + "grad_norm": 0.53515625, + "learning_rate": 0.0004973434920105416, + "loss": 3.9905, + "step": 1556 + }, + { + "epoch": 0.0655578947368421, + "grad_norm": 0.50390625, + "learning_rate": 0.0004973385835718879, + "loss": 3.5615, + "step": 1557 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5703125, + "learning_rate": 0.0004973336706270156, + "loss": 3.3959, + "step": 1558 + }, + { + "epoch": 0.06564210526315789, + "grad_norm": 0.51171875, + "learning_rate": 0.0004973287531760145, + "loss": 3.8301, + "step": 1559 + }, + { + "epoch": 0.06568421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.0004973238312189742, + "loss": 3.6743, + "step": 1560 + }, + { + "epoch": 0.06572631578947369, + "grad_norm": 0.4609375, + "learning_rate": 0.0004973189047559842, + "loss": 3.6623, + "step": 1561 + }, + { + "epoch": 0.06576842105263157, + "grad_norm": 0.443359375, + "learning_rate": 0.0004973139737871344, + "loss": 3.6671, + "step": 1562 + }, + { + "epoch": 0.06581052631578947, + "grad_norm": 0.546875, + "learning_rate": 0.0004973090383125145, + "loss": 3.7153, + "step": 1563 + }, + { + "epoch": 0.06585263157894737, + "grad_norm": 0.490234375, + "learning_rate": 0.0004973040983322146, + "loss": 4.0499, + "step": 1564 + }, + { + "epoch": 0.06589473684210527, + "grad_norm": 0.51171875, + "learning_rate": 0.0004972991538463246, + "loss": 3.4217, + "step": 1565 + }, + { + "epoch": 0.06593684210526315, + "grad_norm": 0.48046875, + "learning_rate": 0.0004972942048549345, + "loss": 3.7176, + "step": 1566 + }, + { + "epoch": 0.06597894736842105, + "grad_norm": 0.53125, + "learning_rate": 0.0004972892513581345, + "loss": 3.6522, + "step": 1567 + }, + { + "epoch": 0.06602105263157895, + "grad_norm": 0.52734375, + "learning_rate": 0.000497284293356015, + "loss": 3.7333, + "step": 1568 + }, + { + "epoch": 0.06606315789473684, + "grad_norm": 0.54296875, + "learning_rate": 0.0004972793308486662, + "loss": 3.7564, + "step": 1569 + }, + { + "epoch": 0.06610526315789474, + "grad_norm": 0.49609375, + "learning_rate": 0.0004972743638361785, + "loss": 3.4878, + "step": 1570 + }, + { + "epoch": 0.06614736842105264, + "grad_norm": 0.462890625, + "learning_rate": 0.0004972693923186425, + "loss": 3.5301, + "step": 1571 + }, + { + "epoch": 0.06618947368421052, + "grad_norm": 0.482421875, + "learning_rate": 0.0004972644162961486, + "loss": 4.0258, + "step": 1572 + }, + { + "epoch": 0.06623157894736842, + "grad_norm": 0.70703125, + "learning_rate": 0.0004972594357687876, + "loss": 3.8434, + "step": 1573 + }, + { + "epoch": 0.06627368421052632, + "grad_norm": 0.4921875, + "learning_rate": 0.0004972544507366502, + "loss": 3.5917, + "step": 1574 + }, + { + "epoch": 0.06631578947368422, + "grad_norm": 0.4921875, + "learning_rate": 0.0004972494611998272, + "loss": 3.9154, + "step": 1575 + }, + { + "epoch": 0.0663578947368421, + "grad_norm": 0.57421875, + "learning_rate": 0.0004972444671584095, + "loss": 3.4825, + "step": 1576 + }, + { + "epoch": 0.0664, + "grad_norm": 0.4765625, + "learning_rate": 0.0004972394686124882, + "loss": 3.868, + "step": 1577 + }, + { + "epoch": 0.0664421052631579, + "grad_norm": 0.54296875, + "learning_rate": 0.0004972344655621542, + "loss": 3.8297, + "step": 1578 + }, + { + "epoch": 0.06648421052631578, + "grad_norm": 0.44140625, + "learning_rate": 0.0004972294580074989, + "loss": 3.8074, + "step": 1579 + }, + { + "epoch": 0.06652631578947368, + "grad_norm": 0.5, + "learning_rate": 0.0004972244459486132, + "loss": 3.4433, + "step": 1580 + }, + { + "epoch": 0.06656842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0004972194293855885, + "loss": 3.8557, + "step": 1581 + }, + { + "epoch": 0.06661052631578947, + "grad_norm": 0.455078125, + "learning_rate": 0.0004972144083185164, + "loss": 4.0276, + "step": 1582 + }, + { + "epoch": 0.06665263157894737, + "grad_norm": 0.4609375, + "learning_rate": 0.0004972093827474882, + "loss": 3.5499, + "step": 1583 + }, + { + "epoch": 0.06669473684210526, + "grad_norm": 0.451171875, + "learning_rate": 0.0004972043526725955, + "loss": 3.9461, + "step": 1584 + }, + { + "epoch": 0.06673684210526316, + "grad_norm": 0.4453125, + "learning_rate": 0.0004971993180939299, + "loss": 3.9613, + "step": 1585 + }, + { + "epoch": 0.06677894736842105, + "grad_norm": 0.71484375, + "learning_rate": 0.0004971942790115832, + "loss": 3.4843, + "step": 1586 + }, + { + "epoch": 0.06682105263157895, + "grad_norm": 0.466796875, + "learning_rate": 0.0004971892354256473, + "loss": 3.4749, + "step": 1587 + }, + { + "epoch": 0.06686315789473685, + "grad_norm": 0.451171875, + "learning_rate": 0.0004971841873362138, + "loss": 3.5181, + "step": 1588 + }, + { + "epoch": 0.06690526315789473, + "grad_norm": 0.6171875, + "learning_rate": 0.0004971791347433749, + "loss": 3.1767, + "step": 1589 + }, + { + "epoch": 0.06694736842105263, + "grad_norm": 0.455078125, + "learning_rate": 0.0004971740776472227, + "loss": 3.8569, + "step": 1590 + }, + { + "epoch": 0.06698947368421053, + "grad_norm": 0.484375, + "learning_rate": 0.000497169016047849, + "loss": 3.6002, + "step": 1591 + }, + { + "epoch": 0.06703157894736841, + "grad_norm": 0.423828125, + "learning_rate": 0.0004971639499453465, + "loss": 3.8235, + "step": 1592 + }, + { + "epoch": 0.06707368421052631, + "grad_norm": 0.484375, + "learning_rate": 0.000497158879339807, + "loss": 3.5475, + "step": 1593 + }, + { + "epoch": 0.06711578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.0004971538042313232, + "loss": 3.3942, + "step": 1594 + }, + { + "epoch": 0.06715789473684211, + "grad_norm": 0.443359375, + "learning_rate": 0.0004971487246199875, + "loss": 3.2053, + "step": 1595 + }, + { + "epoch": 0.0672, + "grad_norm": 0.455078125, + "learning_rate": 0.0004971436405058925, + "loss": 4.1573, + "step": 1596 + }, + { + "epoch": 0.0672421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.0004971385518891306, + "loss": 3.417, + "step": 1597 + }, + { + "epoch": 0.0672842105263158, + "grad_norm": 0.4609375, + "learning_rate": 0.0004971334587697947, + "loss": 3.1885, + "step": 1598 + }, + { + "epoch": 0.06732631578947368, + "grad_norm": 0.490234375, + "learning_rate": 0.0004971283611479776, + "loss": 3.7261, + "step": 1599 + }, + { + "epoch": 0.06736842105263158, + "grad_norm": 0.48046875, + "learning_rate": 0.0004971232590237721, + "loss": 3.7957, + "step": 1600 + }, + { + "epoch": 0.06741052631578948, + "grad_norm": 0.451171875, + "learning_rate": 0.0004971181523972712, + "loss": 3.4128, + "step": 1601 + }, + { + "epoch": 0.06745263157894738, + "grad_norm": 0.4375, + "learning_rate": 0.0004971130412685679, + "loss": 3.5927, + "step": 1602 + }, + { + "epoch": 0.06749473684210526, + "grad_norm": 0.470703125, + "learning_rate": 0.0004971079256377554, + "loss": 3.6683, + "step": 1603 + }, + { + "epoch": 0.06753684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.0004971028055049267, + "loss": 3.7017, + "step": 1604 + }, + { + "epoch": 0.06757894736842106, + "grad_norm": 0.44921875, + "learning_rate": 0.0004970976808701753, + "loss": 3.9397, + "step": 1605 + }, + { + "epoch": 0.06762105263157894, + "grad_norm": 0.458984375, + "learning_rate": 0.0004970925517335944, + "loss": 3.6519, + "step": 1606 + }, + { + "epoch": 0.06766315789473684, + "grad_norm": 0.455078125, + "learning_rate": 0.0004970874180952776, + "loss": 3.7639, + "step": 1607 + }, + { + "epoch": 0.06770526315789474, + "grad_norm": 0.462890625, + "learning_rate": 0.0004970822799553184, + "loss": 3.2546, + "step": 1608 + }, + { + "epoch": 0.06774736842105263, + "grad_norm": 0.486328125, + "learning_rate": 0.0004970771373138103, + "loss": 3.998, + "step": 1609 + }, + { + "epoch": 0.06778947368421052, + "grad_norm": 0.6953125, + "learning_rate": 0.0004970719901708471, + "loss": 3.7759, + "step": 1610 + }, + { + "epoch": 0.06783157894736842, + "grad_norm": 0.490234375, + "learning_rate": 0.0004970668385265225, + "loss": 3.8989, + "step": 1611 + }, + { + "epoch": 0.06787368421052632, + "grad_norm": 0.484375, + "learning_rate": 0.0004970616823809304, + "loss": 3.6788, + "step": 1612 + }, + { + "epoch": 0.06791578947368421, + "grad_norm": 0.5234375, + "learning_rate": 0.0004970565217341647, + "loss": 3.7188, + "step": 1613 + }, + { + "epoch": 0.0679578947368421, + "grad_norm": 0.46875, + "learning_rate": 0.0004970513565863196, + "loss": 3.9735, + "step": 1614 + }, + { + "epoch": 0.068, + "grad_norm": 0.466796875, + "learning_rate": 0.0004970461869374889, + "loss": 3.5269, + "step": 1615 + }, + { + "epoch": 0.06804210526315789, + "grad_norm": 0.484375, + "learning_rate": 0.000497041012787767, + "loss": 3.755, + "step": 1616 + }, + { + "epoch": 0.06808421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0004970358341372482, + "loss": 4.1299, + "step": 1617 + }, + { + "epoch": 0.06812631578947369, + "grad_norm": 0.46875, + "learning_rate": 0.0004970306509860267, + "loss": 3.6043, + "step": 1618 + }, + { + "epoch": 0.06816842105263157, + "grad_norm": 0.427734375, + "learning_rate": 0.0004970254633341969, + "loss": 3.9046, + "step": 1619 + }, + { + "epoch": 0.06821052631578947, + "grad_norm": 0.462890625, + "learning_rate": 0.0004970202711818535, + "loss": 3.6095, + "step": 1620 + }, + { + "epoch": 0.06825263157894737, + "grad_norm": 0.44921875, + "learning_rate": 0.0004970150745290909, + "loss": 3.499, + "step": 1621 + }, + { + "epoch": 0.06829473684210527, + "grad_norm": 0.5703125, + "learning_rate": 0.0004970098733760039, + "loss": 3.5376, + "step": 1622 + }, + { + "epoch": 0.06833684210526315, + "grad_norm": 0.484375, + "learning_rate": 0.0004970046677226873, + "loss": 4.0292, + "step": 1623 + }, + { + "epoch": 0.06837894736842105, + "grad_norm": 0.46484375, + "learning_rate": 0.0004969994575692358, + "loss": 3.5207, + "step": 1624 + }, + { + "epoch": 0.06842105263157895, + "grad_norm": 0.46484375, + "learning_rate": 0.0004969942429157443, + "loss": 3.3669, + "step": 1625 + }, + { + "epoch": 0.06846315789473684, + "grad_norm": 0.53515625, + "learning_rate": 0.0004969890237623082, + "loss": 3.4828, + "step": 1626 + }, + { + "epoch": 0.06850526315789474, + "grad_norm": 0.484375, + "learning_rate": 0.000496983800109022, + "loss": 3.7579, + "step": 1627 + }, + { + "epoch": 0.06854736842105263, + "grad_norm": 0.470703125, + "learning_rate": 0.0004969785719559813, + "loss": 3.8547, + "step": 1628 + }, + { + "epoch": 0.06858947368421052, + "grad_norm": 0.51171875, + "learning_rate": 0.0004969733393032812, + "loss": 3.5979, + "step": 1629 + }, + { + "epoch": 0.06863157894736842, + "grad_norm": 1.3046875, + "learning_rate": 0.000496968102151017, + "loss": 3.35, + "step": 1630 + }, + { + "epoch": 0.06867368421052632, + "grad_norm": 0.51171875, + "learning_rate": 0.0004969628604992841, + "loss": 3.2912, + "step": 1631 + }, + { + "epoch": 0.06871578947368422, + "grad_norm": 0.54296875, + "learning_rate": 0.0004969576143481781, + "loss": 3.8525, + "step": 1632 + }, + { + "epoch": 0.0687578947368421, + "grad_norm": 0.478515625, + "learning_rate": 0.0004969523636977946, + "loss": 3.9714, + "step": 1633 + }, + { + "epoch": 0.0688, + "grad_norm": 0.482421875, + "learning_rate": 0.0004969471085482291, + "loss": 3.6215, + "step": 1634 + }, + { + "epoch": 0.0688421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.0004969418488995775, + "loss": 3.7858, + "step": 1635 + }, + { + "epoch": 0.06888421052631578, + "grad_norm": 0.51171875, + "learning_rate": 0.0004969365847519356, + "loss": 3.814, + "step": 1636 + }, + { + "epoch": 0.06892631578947368, + "grad_norm": 0.5234375, + "learning_rate": 0.0004969313161053991, + "loss": 3.4875, + "step": 1637 + }, + { + "epoch": 0.06896842105263158, + "grad_norm": 0.498046875, + "learning_rate": 0.0004969260429600643, + "loss": 3.6464, + "step": 1638 + }, + { + "epoch": 0.06901052631578948, + "grad_norm": 0.53515625, + "learning_rate": 0.000496920765316027, + "loss": 3.5257, + "step": 1639 + }, + { + "epoch": 0.06905263157894737, + "grad_norm": 0.44921875, + "learning_rate": 0.0004969154831733836, + "loss": 3.6062, + "step": 1640 + }, + { + "epoch": 0.06909473684210526, + "grad_norm": 0.435546875, + "learning_rate": 0.0004969101965322301, + "loss": 3.8344, + "step": 1641 + }, + { + "epoch": 0.06913684210526316, + "grad_norm": 0.7578125, + "learning_rate": 0.000496904905392663, + "loss": 3.4854, + "step": 1642 + }, + { + "epoch": 0.06917894736842105, + "grad_norm": 0.4921875, + "learning_rate": 0.0004968996097547787, + "loss": 3.5177, + "step": 1643 + }, + { + "epoch": 0.06922105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.0004968943096186735, + "loss": 3.7809, + "step": 1644 + }, + { + "epoch": 0.06926315789473685, + "grad_norm": 0.455078125, + "learning_rate": 0.000496889004984444, + "loss": 3.8504, + "step": 1645 + }, + { + "epoch": 0.06930526315789473, + "grad_norm": 0.458984375, + "learning_rate": 0.000496883695852187, + "loss": 3.9813, + "step": 1646 + }, + { + "epoch": 0.06934736842105263, + "grad_norm": 0.4453125, + "learning_rate": 0.0004968783822219991, + "loss": 3.2324, + "step": 1647 + }, + { + "epoch": 0.06938947368421053, + "grad_norm": 0.69140625, + "learning_rate": 0.0004968730640939773, + "loss": 4.0656, + "step": 1648 + }, + { + "epoch": 0.06943157894736843, + "grad_norm": 0.498046875, + "learning_rate": 0.0004968677414682181, + "loss": 3.6067, + "step": 1649 + }, + { + "epoch": 0.06947368421052631, + "grad_norm": 0.48046875, + "learning_rate": 0.0004968624143448188, + "loss": 3.816, + "step": 1650 + }, + { + "epoch": 0.06951578947368421, + "grad_norm": 0.51953125, + "learning_rate": 0.0004968570827238764, + "loss": 3.3084, + "step": 1651 + }, + { + "epoch": 0.06955789473684211, + "grad_norm": 0.50390625, + "learning_rate": 0.000496851746605488, + "loss": 3.9131, + "step": 1652 + }, + { + "epoch": 0.0696, + "grad_norm": 0.43359375, + "learning_rate": 0.0004968464059897508, + "loss": 3.8056, + "step": 1653 + }, + { + "epoch": 0.0696421052631579, + "grad_norm": 0.515625, + "learning_rate": 0.0004968410608767621, + "loss": 3.2827, + "step": 1654 + }, + { + "epoch": 0.06968421052631579, + "grad_norm": 0.48828125, + "learning_rate": 0.0004968357112666193, + "loss": 3.5338, + "step": 1655 + }, + { + "epoch": 0.06972631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.00049683035715942, + "loss": 3.9412, + "step": 1656 + }, + { + "epoch": 0.06976842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.0004968249985552615, + "loss": 3.8525, + "step": 1657 + }, + { + "epoch": 0.06981052631578948, + "grad_norm": 0.439453125, + "learning_rate": 0.0004968196354542413, + "loss": 3.5625, + "step": 1658 + }, + { + "epoch": 0.06985263157894737, + "grad_norm": 0.46484375, + "learning_rate": 0.0004968142678564577, + "loss": 4.066, + "step": 1659 + }, + { + "epoch": 0.06989473684210526, + "grad_norm": 0.494140625, + "learning_rate": 0.0004968088957620079, + "loss": 4.0047, + "step": 1660 + }, + { + "epoch": 0.06993684210526316, + "grad_norm": 0.494140625, + "learning_rate": 0.0004968035191709901, + "loss": 3.4038, + "step": 1661 + }, + { + "epoch": 0.06997894736842106, + "grad_norm": 0.42578125, + "learning_rate": 0.0004967981380835021, + "loss": 3.6796, + "step": 1662 + }, + { + "epoch": 0.07002105263157894, + "grad_norm": 0.439453125, + "learning_rate": 0.0004967927524996422, + "loss": 3.7461, + "step": 1663 + }, + { + "epoch": 0.07006315789473684, + "grad_norm": 0.4609375, + "learning_rate": 0.0004967873624195081, + "loss": 3.4546, + "step": 1664 + }, + { + "epoch": 0.07010526315789474, + "grad_norm": 0.474609375, + "learning_rate": 0.0004967819678431983, + "loss": 3.6146, + "step": 1665 + }, + { + "epoch": 0.07014736842105262, + "grad_norm": 0.49609375, + "learning_rate": 0.000496776568770811, + "loss": 3.7792, + "step": 1666 + }, + { + "epoch": 0.07018947368421052, + "grad_norm": 0.431640625, + "learning_rate": 0.0004967711652024446, + "loss": 3.7992, + "step": 1667 + }, + { + "epoch": 0.07023157894736842, + "grad_norm": 0.474609375, + "learning_rate": 0.0004967657571381975, + "loss": 3.6278, + "step": 1668 + }, + { + "epoch": 0.07027368421052632, + "grad_norm": 0.455078125, + "learning_rate": 0.0004967603445781683, + "loss": 3.7263, + "step": 1669 + }, + { + "epoch": 0.0703157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.0004967549275224553, + "loss": 3.5737, + "step": 1670 + }, + { + "epoch": 0.0703578947368421, + "grad_norm": 0.46875, + "learning_rate": 0.0004967495059711577, + "loss": 3.9559, + "step": 1671 + }, + { + "epoch": 0.0704, + "grad_norm": 0.42578125, + "learning_rate": 0.0004967440799243739, + "loss": 3.9898, + "step": 1672 + }, + { + "epoch": 0.07044210526315789, + "grad_norm": 0.458984375, + "learning_rate": 0.0004967386493822029, + "loss": 3.8508, + "step": 1673 + }, + { + "epoch": 0.07048421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.0004967332143447436, + "loss": 3.3655, + "step": 1674 + }, + { + "epoch": 0.07052631578947369, + "grad_norm": 0.73828125, + "learning_rate": 0.0004967277748120951, + "loss": 3.6784, + "step": 1675 + }, + { + "epoch": 0.07056842105263159, + "grad_norm": 0.4453125, + "learning_rate": 0.0004967223307843565, + "loss": 3.8198, + "step": 1676 + }, + { + "epoch": 0.07061052631578947, + "grad_norm": 0.4609375, + "learning_rate": 0.0004967168822616268, + "loss": 3.51, + "step": 1677 + }, + { + "epoch": 0.07065263157894737, + "grad_norm": 0.50390625, + "learning_rate": 0.0004967114292440054, + "loss": 3.3994, + "step": 1678 + }, + { + "epoch": 0.07069473684210527, + "grad_norm": 0.453125, + "learning_rate": 0.0004967059717315916, + "loss": 3.5857, + "step": 1679 + }, + { + "epoch": 0.07073684210526315, + "grad_norm": 0.431640625, + "learning_rate": 0.0004967005097244849, + "loss": 3.3842, + "step": 1680 + }, + { + "epoch": 0.07077894736842105, + "grad_norm": 0.48828125, + "learning_rate": 0.0004966950432227848, + "loss": 3.4506, + "step": 1681 + }, + { + "epoch": 0.07082105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.0004966895722265908, + "loss": 3.8047, + "step": 1682 + }, + { + "epoch": 0.07086315789473684, + "grad_norm": 0.515625, + "learning_rate": 0.0004966840967360027, + "loss": 3.6717, + "step": 1683 + }, + { + "epoch": 0.07090526315789474, + "grad_norm": 0.412109375, + "learning_rate": 0.0004966786167511203, + "loss": 3.8388, + "step": 1684 + }, + { + "epoch": 0.07094736842105263, + "grad_norm": 0.470703125, + "learning_rate": 0.0004966731322720431, + "loss": 3.5484, + "step": 1685 + }, + { + "epoch": 0.07098947368421053, + "grad_norm": 0.443359375, + "learning_rate": 0.0004966676432988714, + "loss": 3.2095, + "step": 1686 + }, + { + "epoch": 0.07103157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.0004966621498317051, + "loss": 3.9041, + "step": 1687 + }, + { + "epoch": 0.07107368421052632, + "grad_norm": 0.4609375, + "learning_rate": 0.0004966566518706442, + "loss": 3.6749, + "step": 1688 + }, + { + "epoch": 0.07111578947368422, + "grad_norm": 0.423828125, + "learning_rate": 0.0004966511494157888, + "loss": 4.2981, + "step": 1689 + }, + { + "epoch": 0.0711578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.0004966456424672394, + "loss": 3.5101, + "step": 1690 + }, + { + "epoch": 0.0712, + "grad_norm": 0.416015625, + "learning_rate": 0.0004966401310250962, + "loss": 3.6502, + "step": 1691 + }, + { + "epoch": 0.0712421052631579, + "grad_norm": 0.51171875, + "learning_rate": 0.0004966346150894595, + "loss": 3.2637, + "step": 1692 + }, + { + "epoch": 0.07128421052631578, + "grad_norm": 0.4453125, + "learning_rate": 0.00049662909466043, + "loss": 3.3594, + "step": 1693 + }, + { + "epoch": 0.07132631578947368, + "grad_norm": 0.4453125, + "learning_rate": 0.0004966235697381082, + "loss": 3.831, + "step": 1694 + }, + { + "epoch": 0.07136842105263158, + "grad_norm": 0.474609375, + "learning_rate": 0.0004966180403225946, + "loss": 3.3443, + "step": 1695 + }, + { + "epoch": 0.07141052631578948, + "grad_norm": 0.4453125, + "learning_rate": 0.0004966125064139902, + "loss": 3.6084, + "step": 1696 + }, + { + "epoch": 0.07145263157894736, + "grad_norm": 0.53125, + "learning_rate": 0.0004966069680123955, + "loss": 3.3885, + "step": 1697 + }, + { + "epoch": 0.07149473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.0004966014251179117, + "loss": 3.6821, + "step": 1698 + }, + { + "epoch": 0.07153684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.0004965958777306398, + "loss": 3.2782, + "step": 1699 + }, + { + "epoch": 0.07157894736842105, + "grad_norm": 0.447265625, + "learning_rate": 0.0004965903258506806, + "loss": 4.0316, + "step": 1700 + }, + { + "epoch": 0.07162105263157895, + "grad_norm": 0.58203125, + "learning_rate": 0.0004965847694781355, + "loss": 3.7619, + "step": 1701 + }, + { + "epoch": 0.07166315789473685, + "grad_norm": 0.482421875, + "learning_rate": 0.0004965792086131054, + "loss": 3.5016, + "step": 1702 + }, + { + "epoch": 0.07170526315789473, + "grad_norm": 0.49609375, + "learning_rate": 0.0004965736432556921, + "loss": 3.6455, + "step": 1703 + }, + { + "epoch": 0.07174736842105263, + "grad_norm": 0.5078125, + "learning_rate": 0.0004965680734059966, + "loss": 4.0863, + "step": 1704 + }, + { + "epoch": 0.07178947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.0004965624990641205, + "loss": 3.7722, + "step": 1705 + }, + { + "epoch": 0.07183157894736843, + "grad_norm": 0.4609375, + "learning_rate": 0.0004965569202301655, + "loss": 4.0669, + "step": 1706 + }, + { + "epoch": 0.07187368421052631, + "grad_norm": 0.47265625, + "learning_rate": 0.0004965513369042329, + "loss": 3.6657, + "step": 1707 + }, + { + "epoch": 0.07191578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0004965457490864248, + "loss": 3.5376, + "step": 1708 + }, + { + "epoch": 0.07195789473684211, + "grad_norm": 0.58984375, + "learning_rate": 0.0004965401567768428, + "loss": 3.5939, + "step": 1709 + }, + { + "epoch": 0.072, + "grad_norm": 0.51953125, + "learning_rate": 0.0004965345599755888, + "loss": 3.7482, + "step": 1710 + }, + { + "epoch": 0.0720421052631579, + "grad_norm": 0.50390625, + "learning_rate": 0.0004965289586827647, + "loss": 3.5953, + "step": 1711 + }, + { + "epoch": 0.07208421052631579, + "grad_norm": 0.51171875, + "learning_rate": 0.0004965233528984727, + "loss": 3.8089, + "step": 1712 + }, + { + "epoch": 0.07212631578947368, + "grad_norm": 0.5546875, + "learning_rate": 0.0004965177426228149, + "loss": 3.478, + "step": 1713 + }, + { + "epoch": 0.07216842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.0004965121278558934, + "loss": 3.593, + "step": 1714 + }, + { + "epoch": 0.07221052631578948, + "grad_norm": 0.50390625, + "learning_rate": 0.0004965065085978106, + "loss": 3.3264, + "step": 1715 + }, + { + "epoch": 0.07225263157894737, + "grad_norm": 0.453125, + "learning_rate": 0.0004965008848486689, + "loss": 4.226, + "step": 1716 + }, + { + "epoch": 0.07229473684210526, + "grad_norm": 0.470703125, + "learning_rate": 0.0004964952566085707, + "loss": 3.3619, + "step": 1717 + }, + { + "epoch": 0.07233684210526316, + "grad_norm": 0.515625, + "learning_rate": 0.0004964896238776184, + "loss": 3.2105, + "step": 1718 + }, + { + "epoch": 0.07237894736842106, + "grad_norm": 0.462890625, + "learning_rate": 0.0004964839866559149, + "loss": 3.546, + "step": 1719 + }, + { + "epoch": 0.07242105263157894, + "grad_norm": 0.44921875, + "learning_rate": 0.0004964783449435627, + "loss": 3.5829, + "step": 1720 + }, + { + "epoch": 0.07246315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.0004964726987406648, + "loss": 3.9014, + "step": 1721 + }, + { + "epoch": 0.07250526315789474, + "grad_norm": 0.578125, + "learning_rate": 0.0004964670480473238, + "loss": 2.9922, + "step": 1722 + }, + { + "epoch": 0.07254736842105264, + "grad_norm": 0.423828125, + "learning_rate": 0.0004964613928636428, + "loss": 3.7412, + "step": 1723 + }, + { + "epoch": 0.07258947368421052, + "grad_norm": 0.453125, + "learning_rate": 0.0004964557331897248, + "loss": 4.0009, + "step": 1724 + }, + { + "epoch": 0.07263157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.0004964500690256728, + "loss": 3.964, + "step": 1725 + }, + { + "epoch": 0.07267368421052632, + "grad_norm": 0.4609375, + "learning_rate": 0.0004964444003715902, + "loss": 3.4966, + "step": 1726 + }, + { + "epoch": 0.0727157894736842, + "grad_norm": 0.5703125, + "learning_rate": 0.0004964387272275802, + "loss": 3.8888, + "step": 1727 + }, + { + "epoch": 0.0727578947368421, + "grad_norm": 0.5, + "learning_rate": 0.0004964330495937462, + "loss": 3.4576, + "step": 1728 + }, + { + "epoch": 0.0728, + "grad_norm": 0.462890625, + "learning_rate": 0.0004964273674701916, + "loss": 3.4533, + "step": 1729 + }, + { + "epoch": 0.07284210526315789, + "grad_norm": 0.484375, + "learning_rate": 0.0004964216808570198, + "loss": 3.2211, + "step": 1730 + }, + { + "epoch": 0.07288421052631579, + "grad_norm": 0.490234375, + "learning_rate": 0.0004964159897543345, + "loss": 4.1171, + "step": 1731 + }, + { + "epoch": 0.07292631578947369, + "grad_norm": 0.447265625, + "learning_rate": 0.0004964102941622395, + "loss": 3.5748, + "step": 1732 + }, + { + "epoch": 0.07296842105263159, + "grad_norm": 0.466796875, + "learning_rate": 0.0004964045940808383, + "loss": 3.3029, + "step": 1733 + }, + { + "epoch": 0.07301052631578947, + "grad_norm": 0.458984375, + "learning_rate": 0.000496398889510235, + "loss": 3.5237, + "step": 1734 + }, + { + "epoch": 0.07305263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.0004963931804505335, + "loss": 3.8931, + "step": 1735 + }, + { + "epoch": 0.07309473684210527, + "grad_norm": 0.5390625, + "learning_rate": 0.0004963874669018377, + "loss": 3.3809, + "step": 1736 + }, + { + "epoch": 0.07313684210526315, + "grad_norm": 0.451171875, + "learning_rate": 0.0004963817488642517, + "loss": 4.1551, + "step": 1737 + }, + { + "epoch": 0.07317894736842105, + "grad_norm": 0.578125, + "learning_rate": 0.0004963760263378798, + "loss": 3.2452, + "step": 1738 + }, + { + "epoch": 0.07322105263157895, + "grad_norm": 0.474609375, + "learning_rate": 0.0004963702993228262, + "loss": 3.7427, + "step": 1739 + }, + { + "epoch": 0.07326315789473684, + "grad_norm": 0.466796875, + "learning_rate": 0.0004963645678191951, + "loss": 3.4735, + "step": 1740 + }, + { + "epoch": 0.07330526315789473, + "grad_norm": 0.474609375, + "learning_rate": 0.0004963588318270912, + "loss": 3.3544, + "step": 1741 + }, + { + "epoch": 0.07334736842105263, + "grad_norm": 0.51953125, + "learning_rate": 0.0004963530913466187, + "loss": 3.595, + "step": 1742 + }, + { + "epoch": 0.07338947368421053, + "grad_norm": 0.47265625, + "learning_rate": 0.0004963473463778824, + "loss": 3.8544, + "step": 1743 + }, + { + "epoch": 0.07343157894736842, + "grad_norm": 0.47265625, + "learning_rate": 0.0004963415969209868, + "loss": 3.6384, + "step": 1744 + }, + { + "epoch": 0.07347368421052632, + "grad_norm": 0.52734375, + "learning_rate": 0.0004963358429760368, + "loss": 3.4096, + "step": 1745 + }, + { + "epoch": 0.07351578947368421, + "grad_norm": 0.482421875, + "learning_rate": 0.0004963300845431373, + "loss": 3.9575, + "step": 1746 + }, + { + "epoch": 0.0735578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.0004963243216223929, + "loss": 3.944, + "step": 1747 + }, + { + "epoch": 0.0736, + "grad_norm": 0.50390625, + "learning_rate": 0.0004963185542139089, + "loss": 3.8351, + "step": 1748 + }, + { + "epoch": 0.0736421052631579, + "grad_norm": 0.48828125, + "learning_rate": 0.0004963127823177902, + "loss": 3.7207, + "step": 1749 + }, + { + "epoch": 0.07368421052631578, + "grad_norm": 0.51171875, + "learning_rate": 0.000496307005934142, + "loss": 3.6938, + "step": 1750 + }, + { + "epoch": 0.07372631578947368, + "grad_norm": 0.68359375, + "learning_rate": 0.0004963012250630696, + "loss": 3.3901, + "step": 1751 + }, + { + "epoch": 0.07376842105263158, + "grad_norm": 0.482421875, + "learning_rate": 0.0004962954397046783, + "loss": 3.7485, + "step": 1752 + }, + { + "epoch": 0.07381052631578948, + "grad_norm": 0.458984375, + "learning_rate": 0.0004962896498590734, + "loss": 4.225, + "step": 1753 + }, + { + "epoch": 0.07385263157894736, + "grad_norm": 0.53125, + "learning_rate": 0.0004962838555263604, + "loss": 3.3185, + "step": 1754 + }, + { + "epoch": 0.07389473684210526, + "grad_norm": 0.515625, + "learning_rate": 0.0004962780567066451, + "loss": 3.4933, + "step": 1755 + }, + { + "epoch": 0.07393684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.0004962722534000328, + "loss": 3.4005, + "step": 1756 + }, + { + "epoch": 0.07397894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.0004962664456066295, + "loss": 4.0651, + "step": 1757 + }, + { + "epoch": 0.07402105263157895, + "grad_norm": 0.455078125, + "learning_rate": 0.0004962606333265408, + "loss": 4.0011, + "step": 1758 + }, + { + "epoch": 0.07406315789473684, + "grad_norm": 0.59375, + "learning_rate": 0.0004962548165598728, + "loss": 3.5673, + "step": 1759 + }, + { + "epoch": 0.07410526315789474, + "grad_norm": 0.45703125, + "learning_rate": 0.0004962489953067314, + "loss": 3.8407, + "step": 1760 + }, + { + "epoch": 0.07414736842105263, + "grad_norm": 0.47265625, + "learning_rate": 0.0004962431695672226, + "loss": 3.7317, + "step": 1761 + }, + { + "epoch": 0.07418947368421053, + "grad_norm": 0.458984375, + "learning_rate": 0.0004962373393414526, + "loss": 3.8278, + "step": 1762 + }, + { + "epoch": 0.07423157894736843, + "grad_norm": 0.5546875, + "learning_rate": 0.0004962315046295276, + "loss": 3.7913, + "step": 1763 + }, + { + "epoch": 0.07427368421052631, + "grad_norm": 0.447265625, + "learning_rate": 0.0004962256654315538, + "loss": 3.8082, + "step": 1764 + }, + { + "epoch": 0.07431578947368421, + "grad_norm": 0.4609375, + "learning_rate": 0.0004962198217476378, + "loss": 3.9171, + "step": 1765 + }, + { + "epoch": 0.07435789473684211, + "grad_norm": 0.87890625, + "learning_rate": 0.0004962139735778859, + "loss": 3.9598, + "step": 1766 + }, + { + "epoch": 0.0744, + "grad_norm": 0.515625, + "learning_rate": 0.0004962081209224046, + "loss": 3.6397, + "step": 1767 + }, + { + "epoch": 0.07444210526315789, + "grad_norm": 0.5234375, + "learning_rate": 0.0004962022637813007, + "loss": 3.1378, + "step": 1768 + }, + { + "epoch": 0.07448421052631579, + "grad_norm": 0.466796875, + "learning_rate": 0.0004961964021546808, + "loss": 4.1881, + "step": 1769 + }, + { + "epoch": 0.07452631578947369, + "grad_norm": 0.5234375, + "learning_rate": 0.0004961905360426519, + "loss": 3.4501, + "step": 1770 + }, + { + "epoch": 0.07456842105263158, + "grad_norm": 0.51171875, + "learning_rate": 0.0004961846654453205, + "loss": 3.9726, + "step": 1771 + }, + { + "epoch": 0.07461052631578947, + "grad_norm": 0.48046875, + "learning_rate": 0.0004961787903627938, + "loss": 3.8051, + "step": 1772 + }, + { + "epoch": 0.07465263157894737, + "grad_norm": 0.48046875, + "learning_rate": 0.0004961729107951788, + "loss": 3.8196, + "step": 1773 + }, + { + "epoch": 0.07469473684210526, + "grad_norm": 0.392578125, + "learning_rate": 0.0004961670267425826, + "loss": 3.5223, + "step": 1774 + }, + { + "epoch": 0.07473684210526316, + "grad_norm": 0.3984375, + "learning_rate": 0.0004961611382051123, + "loss": 3.9418, + "step": 1775 + }, + { + "epoch": 0.07477894736842106, + "grad_norm": 0.45703125, + "learning_rate": 0.0004961552451828755, + "loss": 3.6962, + "step": 1776 + }, + { + "epoch": 0.07482105263157894, + "grad_norm": 0.4375, + "learning_rate": 0.0004961493476759792, + "loss": 3.713, + "step": 1777 + }, + { + "epoch": 0.07486315789473684, + "grad_norm": 0.546875, + "learning_rate": 0.000496143445684531, + "loss": 3.0897, + "step": 1778 + }, + { + "epoch": 0.07490526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.0004961375392086386, + "loss": 3.9444, + "step": 1779 + }, + { + "epoch": 0.07494736842105264, + "grad_norm": 0.44921875, + "learning_rate": 0.0004961316282484093, + "loss": 3.5383, + "step": 1780 + }, + { + "epoch": 0.07498947368421052, + "grad_norm": 0.5234375, + "learning_rate": 0.0004961257128039509, + "loss": 3.8087, + "step": 1781 + }, + { + "epoch": 0.07503157894736842, + "grad_norm": 0.490234375, + "learning_rate": 0.0004961197928753712, + "loss": 3.4868, + "step": 1782 + }, + { + "epoch": 0.07507368421052632, + "grad_norm": 0.4609375, + "learning_rate": 0.0004961138684627781, + "loss": 3.7377, + "step": 1783 + }, + { + "epoch": 0.0751157894736842, + "grad_norm": 0.470703125, + "learning_rate": 0.0004961079395662795, + "loss": 3.5204, + "step": 1784 + }, + { + "epoch": 0.0751578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.0004961020061859834, + "loss": 3.6972, + "step": 1785 + }, + { + "epoch": 0.0752, + "grad_norm": 0.3984375, + "learning_rate": 0.0004960960683219978, + "loss": 3.7269, + "step": 1786 + }, + { + "epoch": 0.07524210526315789, + "grad_norm": 0.46875, + "learning_rate": 0.0004960901259744311, + "loss": 3.6722, + "step": 1787 + }, + { + "epoch": 0.07528421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.0004960841791433915, + "loss": 3.8057, + "step": 1788 + }, + { + "epoch": 0.07532631578947369, + "grad_norm": 0.455078125, + "learning_rate": 0.0004960782278289872, + "loss": 3.5651, + "step": 1789 + }, + { + "epoch": 0.07536842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.0004960722720313267, + "loss": 4.2334, + "step": 1790 + }, + { + "epoch": 0.07541052631578947, + "grad_norm": 0.50390625, + "learning_rate": 0.0004960663117505185, + "loss": 3.6041, + "step": 1791 + }, + { + "epoch": 0.07545263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.0004960603469866713, + "loss": 3.3041, + "step": 1792 + }, + { + "epoch": 0.07549473684210527, + "grad_norm": 0.44140625, + "learning_rate": 0.0004960543777398936, + "loss": 3.0062, + "step": 1793 + }, + { + "epoch": 0.07553684210526315, + "grad_norm": 0.423828125, + "learning_rate": 0.0004960484040102943, + "loss": 3.8303, + "step": 1794 + }, + { + "epoch": 0.07557894736842105, + "grad_norm": 0.46875, + "learning_rate": 0.0004960424257979822, + "loss": 3.6535, + "step": 1795 + }, + { + "epoch": 0.07562105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.0004960364431030662, + "loss": 3.8331, + "step": 1796 + }, + { + "epoch": 0.07566315789473685, + "grad_norm": 0.39453125, + "learning_rate": 0.0004960304559256553, + "loss": 3.7033, + "step": 1797 + }, + { + "epoch": 0.07570526315789473, + "grad_norm": 0.447265625, + "learning_rate": 0.0004960244642658585, + "loss": 3.6106, + "step": 1798 + }, + { + "epoch": 0.07574736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.0004960184681237849, + "loss": 3.596, + "step": 1799 + }, + { + "epoch": 0.07578947368421053, + "grad_norm": 0.5390625, + "learning_rate": 0.0004960124674995441, + "loss": 4.1051, + "step": 1800 + }, + { + "epoch": 0.07583157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.0004960064623932451, + "loss": 3.5837, + "step": 1801 + }, + { + "epoch": 0.07587368421052632, + "grad_norm": 0.44921875, + "learning_rate": 0.0004960004528049974, + "loss": 3.7843, + "step": 1802 + }, + { + "epoch": 0.07591578947368421, + "grad_norm": 0.482421875, + "learning_rate": 0.0004959944387349105, + "loss": 3.5628, + "step": 1803 + }, + { + "epoch": 0.0759578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.000495988420183094, + "loss": 4.0603, + "step": 1804 + }, + { + "epoch": 0.076, + "grad_norm": 0.490234375, + "learning_rate": 0.0004959823971496574, + "loss": 3.3416, + "step": 1805 + }, + { + "epoch": 0.0760421052631579, + "grad_norm": 0.48828125, + "learning_rate": 0.0004959763696347106, + "loss": 3.6872, + "step": 1806 + }, + { + "epoch": 0.0760842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.0004959703376383634, + "loss": 3.8652, + "step": 1807 + }, + { + "epoch": 0.07612631578947368, + "grad_norm": 0.46875, + "learning_rate": 0.0004959643011607256, + "loss": 3.5389, + "step": 1808 + }, + { + "epoch": 0.07616842105263158, + "grad_norm": 0.494140625, + "learning_rate": 0.0004959582602019072, + "loss": 3.517, + "step": 1809 + }, + { + "epoch": 0.07621052631578948, + "grad_norm": 0.443359375, + "learning_rate": 0.0004959522147620183, + "loss": 3.8383, + "step": 1810 + }, + { + "epoch": 0.07625263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.0004959461648411692, + "loss": 3.6336, + "step": 1811 + }, + { + "epoch": 0.07629473684210526, + "grad_norm": 0.59375, + "learning_rate": 0.0004959401104394697, + "loss": 3.5757, + "step": 1812 + }, + { + "epoch": 0.07633684210526316, + "grad_norm": 0.466796875, + "learning_rate": 0.0004959340515570305, + "loss": 4.0782, + "step": 1813 + }, + { + "epoch": 0.07637894736842105, + "grad_norm": 0.439453125, + "learning_rate": 0.0004959279881939618, + "loss": 3.6384, + "step": 1814 + }, + { + "epoch": 0.07642105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.0004959219203503741, + "loss": 3.8301, + "step": 1815 + }, + { + "epoch": 0.07646315789473684, + "grad_norm": 0.498046875, + "learning_rate": 0.000495915848026378, + "loss": 3.281, + "step": 1816 + }, + { + "epoch": 0.07650526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.0004959097712220841, + "loss": 3.7455, + "step": 1817 + }, + { + "epoch": 0.07654736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.000495903689937603, + "loss": 4.0063, + "step": 1818 + }, + { + "epoch": 0.07658947368421053, + "grad_norm": 0.451171875, + "learning_rate": 0.0004958976041730457, + "loss": 3.8636, + "step": 1819 + }, + { + "epoch": 0.07663157894736843, + "grad_norm": 0.439453125, + "learning_rate": 0.0004958915139285229, + "loss": 3.7563, + "step": 1820 + }, + { + "epoch": 0.07667368421052631, + "grad_norm": 0.46875, + "learning_rate": 0.0004958854192041456, + "loss": 4.0655, + "step": 1821 + }, + { + "epoch": 0.07671578947368421, + "grad_norm": 0.4921875, + "learning_rate": 0.000495879320000025, + "loss": 4.0406, + "step": 1822 + }, + { + "epoch": 0.07675789473684211, + "grad_norm": 0.421875, + "learning_rate": 0.000495873216316272, + "loss": 3.626, + "step": 1823 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4296875, + "learning_rate": 0.0004958671081529979, + "loss": 3.9336, + "step": 1824 + }, + { + "epoch": 0.07684210526315789, + "grad_norm": 0.447265625, + "learning_rate": 0.0004958609955103139, + "loss": 3.8481, + "step": 1825 + }, + { + "epoch": 0.07688421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.0004958548783883316, + "loss": 3.5711, + "step": 1826 + }, + { + "epoch": 0.07692631578947369, + "grad_norm": 0.455078125, + "learning_rate": 0.0004958487567871621, + "loss": 3.6206, + "step": 1827 + }, + { + "epoch": 0.07696842105263157, + "grad_norm": 0.45703125, + "learning_rate": 0.0004958426307069173, + "loss": 3.7727, + "step": 1828 + }, + { + "epoch": 0.07701052631578947, + "grad_norm": 0.486328125, + "learning_rate": 0.0004958365001477084, + "loss": 3.5596, + "step": 1829 + }, + { + "epoch": 0.07705263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.0004958303651096475, + "loss": 3.4394, + "step": 1830 + }, + { + "epoch": 0.07709473684210526, + "grad_norm": 0.458984375, + "learning_rate": 0.0004958242255928461, + "loss": 3.5369, + "step": 1831 + }, + { + "epoch": 0.07713684210526316, + "grad_norm": 0.478515625, + "learning_rate": 0.0004958180815974162, + "loss": 3.4367, + "step": 1832 + }, + { + "epoch": 0.07717894736842106, + "grad_norm": 0.46875, + "learning_rate": 0.0004958119331234696, + "loss": 2.9811, + "step": 1833 + }, + { + "epoch": 0.07722105263157895, + "grad_norm": 0.486328125, + "learning_rate": 0.0004958057801711185, + "loss": 3.5743, + "step": 1834 + }, + { + "epoch": 0.07726315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.0004957996227404747, + "loss": 3.6136, + "step": 1835 + }, + { + "epoch": 0.07730526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.0004957934608316507, + "loss": 3.6534, + "step": 1836 + }, + { + "epoch": 0.07734736842105264, + "grad_norm": 0.54296875, + "learning_rate": 0.0004957872944447588, + "loss": 3.388, + "step": 1837 + }, + { + "epoch": 0.07738947368421052, + "grad_norm": 0.48046875, + "learning_rate": 0.000495781123579911, + "loss": 3.944, + "step": 1838 + }, + { + "epoch": 0.07743157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.0004957749482372199, + "loss": 3.9884, + "step": 1839 + }, + { + "epoch": 0.07747368421052632, + "grad_norm": 0.59375, + "learning_rate": 0.0004957687684167981, + "loss": 3.8765, + "step": 1840 + }, + { + "epoch": 0.0775157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.000495762584118758, + "loss": 3.8636, + "step": 1841 + }, + { + "epoch": 0.0775578947368421, + "grad_norm": 0.47265625, + "learning_rate": 0.0004957563953432125, + "loss": 3.6129, + "step": 1842 + }, + { + "epoch": 0.0776, + "grad_norm": 0.435546875, + "learning_rate": 0.0004957502020902741, + "loss": 3.6204, + "step": 1843 + }, + { + "epoch": 0.0776421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0004957440043600558, + "loss": 4.1154, + "step": 1844 + }, + { + "epoch": 0.07768421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.0004957378021526705, + "loss": 4.3149, + "step": 1845 + }, + { + "epoch": 0.07772631578947369, + "grad_norm": 0.486328125, + "learning_rate": 0.0004957315954682312, + "loss": 3.4905, + "step": 1846 + }, + { + "epoch": 0.07776842105263158, + "grad_norm": 0.490234375, + "learning_rate": 0.0004957253843068509, + "loss": 3.4097, + "step": 1847 + }, + { + "epoch": 0.07781052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.0004957191686686429, + "loss": 3.6746, + "step": 1848 + }, + { + "epoch": 0.07785263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0004957129485537203, + "loss": 3.4614, + "step": 1849 + }, + { + "epoch": 0.07789473684210527, + "grad_norm": 0.408203125, + "learning_rate": 0.0004957067239621965, + "loss": 3.552, + "step": 1850 + }, + { + "epoch": 0.07793684210526315, + "grad_norm": 0.515625, + "learning_rate": 0.0004957004948941848, + "loss": 3.7481, + "step": 1851 + }, + { + "epoch": 0.07797894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0004956942613497989, + "loss": 3.4045, + "step": 1852 + }, + { + "epoch": 0.07802105263157895, + "grad_norm": 0.5390625, + "learning_rate": 0.0004956880233291521, + "loss": 3.5658, + "step": 1853 + }, + { + "epoch": 0.07806315789473685, + "grad_norm": 0.6796875, + "learning_rate": 0.0004956817808323583, + "loss": 3.382, + "step": 1854 + }, + { + "epoch": 0.07810526315789473, + "grad_norm": 0.4375, + "learning_rate": 0.000495675533859531, + "loss": 3.6634, + "step": 1855 + }, + { + "epoch": 0.07814736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.0004956692824107842, + "loss": 3.3574, + "step": 1856 + }, + { + "epoch": 0.07818947368421053, + "grad_norm": 0.404296875, + "learning_rate": 0.0004956630264862317, + "loss": 3.3688, + "step": 1857 + }, + { + "epoch": 0.07823157894736842, + "grad_norm": 0.7578125, + "learning_rate": 0.0004956567660859876, + "loss": 3.4702, + "step": 1858 + }, + { + "epoch": 0.07827368421052631, + "grad_norm": 0.4609375, + "learning_rate": 0.0004956505012101658, + "loss": 3.3042, + "step": 1859 + }, + { + "epoch": 0.07831578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.0004956442318588805, + "loss": 3.6902, + "step": 1860 + }, + { + "epoch": 0.0783578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0004956379580322459, + "loss": 3.6241, + "step": 1861 + }, + { + "epoch": 0.0784, + "grad_norm": 0.439453125, + "learning_rate": 0.0004956316797303762, + "loss": 3.8502, + "step": 1862 + }, + { + "epoch": 0.0784421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.000495625396953386, + "loss": 3.7562, + "step": 1863 + }, + { + "epoch": 0.0784842105263158, + "grad_norm": 0.48046875, + "learning_rate": 0.0004956191097013898, + "loss": 3.8918, + "step": 1864 + }, + { + "epoch": 0.07852631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.0004956128179745018, + "loss": 3.6625, + "step": 1865 + }, + { + "epoch": 0.07856842105263158, + "grad_norm": 0.396484375, + "learning_rate": 0.000495606521772837, + "loss": 3.9097, + "step": 1866 + }, + { + "epoch": 0.07861052631578948, + "grad_norm": 0.40625, + "learning_rate": 0.0004956002210965098, + "loss": 3.82, + "step": 1867 + }, + { + "epoch": 0.07865263157894736, + "grad_norm": 0.453125, + "learning_rate": 0.0004955939159456353, + "loss": 3.3097, + "step": 1868 + }, + { + "epoch": 0.07869473684210526, + "grad_norm": 1.1796875, + "learning_rate": 0.0004955876063203281, + "loss": 2.8061, + "step": 1869 + }, + { + "epoch": 0.07873684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.0004955812922207033, + "loss": 3.5491, + "step": 1870 + }, + { + "epoch": 0.07877894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.0004955749736468758, + "loss": 3.8769, + "step": 1871 + }, + { + "epoch": 0.07882105263157894, + "grad_norm": 0.490234375, + "learning_rate": 0.000495568650598961, + "loss": 3.2703, + "step": 1872 + }, + { + "epoch": 0.07886315789473684, + "grad_norm": 0.498046875, + "learning_rate": 0.0004955623230770738, + "loss": 3.4897, + "step": 1873 + }, + { + "epoch": 0.07890526315789474, + "grad_norm": 0.546875, + "learning_rate": 0.0004955559910813297, + "loss": 3.4976, + "step": 1874 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 0.48828125, + "learning_rate": 0.0004955496546118439, + "loss": 3.6242, + "step": 1875 + }, + { + "epoch": 0.07898947368421053, + "grad_norm": 0.48046875, + "learning_rate": 0.0004955433136687318, + "loss": 3.8319, + "step": 1876 + }, + { + "epoch": 0.07903157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0004955369682521091, + "loss": 3.6944, + "step": 1877 + }, + { + "epoch": 0.07907368421052631, + "grad_norm": 0.4375, + "learning_rate": 0.0004955306183620915, + "loss": 3.8731, + "step": 1878 + }, + { + "epoch": 0.07911578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.0004955242639987943, + "loss": 3.7115, + "step": 1879 + }, + { + "epoch": 0.07915789473684211, + "grad_norm": 0.46484375, + "learning_rate": 0.0004955179051623335, + "loss": 3.8529, + "step": 1880 + }, + { + "epoch": 0.0792, + "grad_norm": 0.447265625, + "learning_rate": 0.000495511541852825, + "loss": 3.3012, + "step": 1881 + }, + { + "epoch": 0.07924210526315789, + "grad_norm": 0.48046875, + "learning_rate": 0.0004955051740703848, + "loss": 3.3343, + "step": 1882 + }, + { + "epoch": 0.07928421052631579, + "grad_norm": 0.462890625, + "learning_rate": 0.0004954988018151285, + "loss": 3.323, + "step": 1883 + }, + { + "epoch": 0.07932631578947369, + "grad_norm": 0.484375, + "learning_rate": 0.0004954924250871728, + "loss": 4.0457, + "step": 1884 + }, + { + "epoch": 0.07936842105263157, + "grad_norm": 0.53125, + "learning_rate": 0.0004954860438866333, + "loss": 3.6411, + "step": 1885 + }, + { + "epoch": 0.07941052631578947, + "grad_norm": 0.44921875, + "learning_rate": 0.0004954796582136266, + "loss": 3.7893, + "step": 1886 + }, + { + "epoch": 0.07945263157894737, + "grad_norm": 0.451171875, + "learning_rate": 0.0004954732680682689, + "loss": 3.5343, + "step": 1887 + }, + { + "epoch": 0.07949473684210526, + "grad_norm": 0.44921875, + "learning_rate": 0.0004954668734506768, + "loss": 3.7463, + "step": 1888 + }, + { + "epoch": 0.07953684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.0004954604743609666, + "loss": 3.5578, + "step": 1889 + }, + { + "epoch": 0.07957894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.000495454070799255, + "loss": 3.3886, + "step": 1890 + }, + { + "epoch": 0.07962105263157895, + "grad_norm": 0.4921875, + "learning_rate": 0.0004954476627656586, + "loss": 3.4701, + "step": 1891 + }, + { + "epoch": 0.07966315789473684, + "grad_norm": 0.5234375, + "learning_rate": 0.0004954412502602942, + "loss": 3.3631, + "step": 1892 + }, + { + "epoch": 0.07970526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.0004954348332832785, + "loss": 3.7645, + "step": 1893 + }, + { + "epoch": 0.07974736842105264, + "grad_norm": 0.3828125, + "learning_rate": 0.0004954284118347287, + "loss": 3.7881, + "step": 1894 + }, + { + "epoch": 0.07978947368421052, + "grad_norm": 0.4921875, + "learning_rate": 0.0004954219859147614, + "loss": 3.7712, + "step": 1895 + }, + { + "epoch": 0.07983157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.0004954155555234941, + "loss": 3.6624, + "step": 1896 + }, + { + "epoch": 0.07987368421052632, + "grad_norm": 0.52734375, + "learning_rate": 0.0004954091206610435, + "loss": 3.4891, + "step": 1897 + }, + { + "epoch": 0.0799157894736842, + "grad_norm": 0.466796875, + "learning_rate": 0.0004954026813275272, + "loss": 3.681, + "step": 1898 + }, + { + "epoch": 0.0799578947368421, + "grad_norm": 0.458984375, + "learning_rate": 0.0004953962375230623, + "loss": 3.2556, + "step": 1899 + }, + { + "epoch": 0.08, + "grad_norm": 0.46875, + "learning_rate": 0.0004953897892477664, + "loss": 3.9049, + "step": 1900 + }, + { + "epoch": 0.0800421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.0004953833365017568, + "loss": 3.9772, + "step": 1901 + }, + { + "epoch": 0.08008421052631579, + "grad_norm": 0.6328125, + "learning_rate": 0.000495376879285151, + "loss": 3.7265, + "step": 1902 + }, + { + "epoch": 0.08012631578947368, + "grad_norm": 0.46484375, + "learning_rate": 0.0004953704175980669, + "loss": 3.6198, + "step": 1903 + }, + { + "epoch": 0.08016842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.0004953639514406221, + "loss": 3.9809, + "step": 1904 + }, + { + "epoch": 0.08021052631578947, + "grad_norm": 0.76953125, + "learning_rate": 0.0004953574808129344, + "loss": 3.5986, + "step": 1905 + }, + { + "epoch": 0.08025263157894737, + "grad_norm": 0.44921875, + "learning_rate": 0.0004953510057151216, + "loss": 3.7484, + "step": 1906 + }, + { + "epoch": 0.08029473684210527, + "grad_norm": 0.44140625, + "learning_rate": 0.0004953445261473018, + "loss": 3.5561, + "step": 1907 + }, + { + "epoch": 0.08033684210526316, + "grad_norm": 0.5, + "learning_rate": 0.000495338042109593, + "loss": 3.354, + "step": 1908 + }, + { + "epoch": 0.08037894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.0004953315536021134, + "loss": 3.554, + "step": 1909 + }, + { + "epoch": 0.08042105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.000495325060624981, + "loss": 3.6499, + "step": 1910 + }, + { + "epoch": 0.08046315789473685, + "grad_norm": 0.53125, + "learning_rate": 0.0004953185631783143, + "loss": 3.4099, + "step": 1911 + }, + { + "epoch": 0.08050526315789473, + "grad_norm": 0.453125, + "learning_rate": 0.0004953120612622316, + "loss": 3.3944, + "step": 1912 + }, + { + "epoch": 0.08054736842105263, + "grad_norm": 0.4609375, + "learning_rate": 0.0004953055548768514, + "loss": 3.4921, + "step": 1913 + }, + { + "epoch": 0.08058947368421053, + "grad_norm": 0.48046875, + "learning_rate": 0.0004952990440222923, + "loss": 3.6982, + "step": 1914 + }, + { + "epoch": 0.08063157894736842, + "grad_norm": 0.470703125, + "learning_rate": 0.0004952925286986727, + "loss": 3.4956, + "step": 1915 + }, + { + "epoch": 0.08067368421052631, + "grad_norm": 0.44921875, + "learning_rate": 0.0004952860089061116, + "loss": 3.9061, + "step": 1916 + }, + { + "epoch": 0.08071578947368421, + "grad_norm": 0.466796875, + "learning_rate": 0.0004952794846447274, + "loss": 3.4516, + "step": 1917 + }, + { + "epoch": 0.08075789473684211, + "grad_norm": 0.4296875, + "learning_rate": 0.0004952729559146393, + "loss": 3.528, + "step": 1918 + }, + { + "epoch": 0.0808, + "grad_norm": 0.4453125, + "learning_rate": 0.0004952664227159661, + "loss": 3.8554, + "step": 1919 + }, + { + "epoch": 0.0808421052631579, + "grad_norm": 0.50390625, + "learning_rate": 0.0004952598850488268, + "loss": 3.1512, + "step": 1920 + }, + { + "epoch": 0.0808842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.0004952533429133407, + "loss": 3.45, + "step": 1921 + }, + { + "epoch": 0.08092631578947368, + "grad_norm": 0.578125, + "learning_rate": 0.0004952467963096268, + "loss": 3.3787, + "step": 1922 + }, + { + "epoch": 0.08096842105263158, + "grad_norm": 0.490234375, + "learning_rate": 0.0004952402452378043, + "loss": 3.9287, + "step": 1923 + }, + { + "epoch": 0.08101052631578948, + "grad_norm": 0.44140625, + "learning_rate": 0.0004952336896979929, + "loss": 3.8087, + "step": 1924 + }, + { + "epoch": 0.08105263157894736, + "grad_norm": 0.431640625, + "learning_rate": 0.0004952271296903117, + "loss": 3.6194, + "step": 1925 + }, + { + "epoch": 0.08109473684210526, + "grad_norm": 0.470703125, + "learning_rate": 0.0004952205652148802, + "loss": 3.6588, + "step": 1926 + }, + { + "epoch": 0.08113684210526316, + "grad_norm": 0.48046875, + "learning_rate": 0.0004952139962718183, + "loss": 3.461, + "step": 1927 + }, + { + "epoch": 0.08117894736842106, + "grad_norm": 0.5625, + "learning_rate": 0.0004952074228612455, + "loss": 3.3798, + "step": 1928 + }, + { + "epoch": 0.08122105263157894, + "grad_norm": 0.443359375, + "learning_rate": 0.0004952008449832815, + "loss": 4.0183, + "step": 1929 + }, + { + "epoch": 0.08126315789473684, + "grad_norm": 0.51953125, + "learning_rate": 0.0004951942626380462, + "loss": 3.591, + "step": 1930 + }, + { + "epoch": 0.08130526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.0004951876758256594, + "loss": 3.7295, + "step": 1931 + }, + { + "epoch": 0.08134736842105263, + "grad_norm": 0.44921875, + "learning_rate": 0.0004951810845462415, + "loss": 4.018, + "step": 1932 + }, + { + "epoch": 0.08138947368421053, + "grad_norm": 0.41015625, + "learning_rate": 0.0004951744887999121, + "loss": 3.831, + "step": 1933 + }, + { + "epoch": 0.08143157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.0004951678885867917, + "loss": 3.7, + "step": 1934 + }, + { + "epoch": 0.08147368421052631, + "grad_norm": 0.44921875, + "learning_rate": 0.0004951612839070005, + "loss": 3.4071, + "step": 1935 + }, + { + "epoch": 0.08151578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.0004951546747606585, + "loss": 3.9388, + "step": 1936 + }, + { + "epoch": 0.08155789473684211, + "grad_norm": 0.462890625, + "learning_rate": 0.0004951480611478866, + "loss": 3.5639, + "step": 1937 + }, + { + "epoch": 0.0816, + "grad_norm": 0.404296875, + "learning_rate": 0.000495141443068805, + "loss": 3.745, + "step": 1938 + }, + { + "epoch": 0.08164210526315789, + "grad_norm": 0.44140625, + "learning_rate": 0.0004951348205235344, + "loss": 3.8362, + "step": 1939 + }, + { + "epoch": 0.08168421052631579, + "grad_norm": 0.4765625, + "learning_rate": 0.0004951281935121953, + "loss": 3.6108, + "step": 1940 + }, + { + "epoch": 0.08172631578947369, + "grad_norm": 0.7265625, + "learning_rate": 0.0004951215620349086, + "loss": 3.4748, + "step": 1941 + }, + { + "epoch": 0.08176842105263157, + "grad_norm": 0.455078125, + "learning_rate": 0.0004951149260917949, + "loss": 3.7909, + "step": 1942 + }, + { + "epoch": 0.08181052631578947, + "grad_norm": 0.515625, + "learning_rate": 0.0004951082856829753, + "loss": 3.6138, + "step": 1943 + }, + { + "epoch": 0.08185263157894737, + "grad_norm": 0.490234375, + "learning_rate": 0.0004951016408085708, + "loss": 3.6875, + "step": 1944 + }, + { + "epoch": 0.08189473684210526, + "grad_norm": 0.51171875, + "learning_rate": 0.0004950949914687023, + "loss": 2.9941, + "step": 1945 + }, + { + "epoch": 0.08193684210526316, + "grad_norm": 0.5546875, + "learning_rate": 0.0004950883376634911, + "loss": 3.4746, + "step": 1946 + }, + { + "epoch": 0.08197894736842105, + "grad_norm": 0.455078125, + "learning_rate": 0.0004950816793930584, + "loss": 3.4375, + "step": 1947 + }, + { + "epoch": 0.08202105263157895, + "grad_norm": 0.4453125, + "learning_rate": 0.0004950750166575254, + "loss": 3.7499, + "step": 1948 + }, + { + "epoch": 0.08206315789473684, + "grad_norm": 0.45703125, + "learning_rate": 0.0004950683494570135, + "loss": 3.7055, + "step": 1949 + }, + { + "epoch": 0.08210526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.0004950616777916442, + "loss": 3.4872, + "step": 1950 + }, + { + "epoch": 0.08214736842105264, + "grad_norm": 0.59765625, + "learning_rate": 0.0004950550016615392, + "loss": 3.6569, + "step": 1951 + }, + { + "epoch": 0.08218947368421052, + "grad_norm": 0.43359375, + "learning_rate": 0.0004950483210668199, + "loss": 3.7398, + "step": 1952 + }, + { + "epoch": 0.08223157894736842, + "grad_norm": 0.494140625, + "learning_rate": 0.0004950416360076081, + "loss": 3.2988, + "step": 1953 + }, + { + "epoch": 0.08227368421052632, + "grad_norm": 0.48828125, + "learning_rate": 0.0004950349464840257, + "loss": 3.714, + "step": 1954 + }, + { + "epoch": 0.08231578947368422, + "grad_norm": 0.43359375, + "learning_rate": 0.0004950282524961944, + "loss": 3.4952, + "step": 1955 + }, + { + "epoch": 0.0823578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.0004950215540442362, + "loss": 3.7531, + "step": 1956 + }, + { + "epoch": 0.0824, + "grad_norm": 0.4609375, + "learning_rate": 0.0004950148511282732, + "loss": 3.6568, + "step": 1957 + }, + { + "epoch": 0.0824421052631579, + "grad_norm": 0.59765625, + "learning_rate": 0.0004950081437484277, + "loss": 3.3777, + "step": 1958 + }, + { + "epoch": 0.08248421052631578, + "grad_norm": 0.470703125, + "learning_rate": 0.0004950014319048214, + "loss": 3.5235, + "step": 1959 + }, + { + "epoch": 0.08252631578947368, + "grad_norm": 0.455078125, + "learning_rate": 0.000494994715597577, + "loss": 3.3017, + "step": 1960 + }, + { + "epoch": 0.08256842105263158, + "grad_norm": 0.46875, + "learning_rate": 0.0004949879948268167, + "loss": 3.6792, + "step": 1961 + }, + { + "epoch": 0.08261052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.000494981269592663, + "loss": 3.7127, + "step": 1962 + }, + { + "epoch": 0.08265263157894737, + "grad_norm": 0.4765625, + "learning_rate": 0.0004949745398952385, + "loss": 3.4819, + "step": 1963 + }, + { + "epoch": 0.08269473684210527, + "grad_norm": 0.5859375, + "learning_rate": 0.0004949678057346656, + "loss": 3.3384, + "step": 1964 + }, + { + "epoch": 0.08273684210526316, + "grad_norm": 0.466796875, + "learning_rate": 0.0004949610671110672, + "loss": 3.8225, + "step": 1965 + }, + { + "epoch": 0.08277894736842105, + "grad_norm": 0.6484375, + "learning_rate": 0.0004949543240245659, + "loss": 3.2495, + "step": 1966 + }, + { + "epoch": 0.08282105263157895, + "grad_norm": 0.45703125, + "learning_rate": 0.0004949475764752845, + "loss": 3.3999, + "step": 1967 + }, + { + "epoch": 0.08286315789473685, + "grad_norm": 0.4453125, + "learning_rate": 0.0004949408244633464, + "loss": 3.5412, + "step": 1968 + }, + { + "epoch": 0.08290526315789473, + "grad_norm": 0.451171875, + "learning_rate": 0.000494934067988874, + "loss": 3.2931, + "step": 1969 + }, + { + "epoch": 0.08294736842105263, + "grad_norm": 0.466796875, + "learning_rate": 0.0004949273070519906, + "loss": 3.4803, + "step": 1970 + }, + { + "epoch": 0.08298947368421053, + "grad_norm": 0.427734375, + "learning_rate": 0.0004949205416528196, + "loss": 3.8478, + "step": 1971 + }, + { + "epoch": 0.08303157894736841, + "grad_norm": 0.5, + "learning_rate": 0.0004949137717914842, + "loss": 3.6972, + "step": 1972 + }, + { + "epoch": 0.08307368421052631, + "grad_norm": 0.416015625, + "learning_rate": 0.0004949069974681075, + "loss": 3.6583, + "step": 1973 + }, + { + "epoch": 0.08311578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.0004949002186828131, + "loss": 3.9406, + "step": 1974 + }, + { + "epoch": 0.08315789473684211, + "grad_norm": 0.439453125, + "learning_rate": 0.0004948934354357244, + "loss": 3.8938, + "step": 1975 + }, + { + "epoch": 0.0832, + "grad_norm": 0.458984375, + "learning_rate": 0.0004948866477269651, + "loss": 3.6138, + "step": 1976 + }, + { + "epoch": 0.0832421052631579, + "grad_norm": 0.470703125, + "learning_rate": 0.0004948798555566589, + "loss": 3.387, + "step": 1977 + }, + { + "epoch": 0.0832842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.0004948730589249292, + "loss": 3.7547, + "step": 1978 + }, + { + "epoch": 0.08332631578947368, + "grad_norm": 0.4609375, + "learning_rate": 0.0004948662578319003, + "loss": 3.7165, + "step": 1979 + }, + { + "epoch": 0.08336842105263158, + "grad_norm": 0.546875, + "learning_rate": 0.0004948594522776958, + "loss": 3.4931, + "step": 1980 + }, + { + "epoch": 0.08341052631578948, + "grad_norm": 0.41796875, + "learning_rate": 0.0004948526422624398, + "loss": 3.7863, + "step": 1981 + }, + { + "epoch": 0.08345263157894736, + "grad_norm": 0.416015625, + "learning_rate": 0.0004948458277862563, + "loss": 3.8176, + "step": 1982 + }, + { + "epoch": 0.08349473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.0004948390088492696, + "loss": 3.5962, + "step": 1983 + }, + { + "epoch": 0.08353684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.0004948321854516038, + "loss": 3.9091, + "step": 1984 + }, + { + "epoch": 0.08357894736842106, + "grad_norm": 0.59765625, + "learning_rate": 0.0004948253575933832, + "loss": 3.9116, + "step": 1985 + }, + { + "epoch": 0.08362105263157894, + "grad_norm": 0.41015625, + "learning_rate": 0.0004948185252747323, + "loss": 4.0613, + "step": 1986 + }, + { + "epoch": 0.08366315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.0004948116884957754, + "loss": 3.6634, + "step": 1987 + }, + { + "epoch": 0.08370526315789474, + "grad_norm": 0.484375, + "learning_rate": 0.0004948048472566373, + "loss": 3.7137, + "step": 1988 + }, + { + "epoch": 0.08374736842105263, + "grad_norm": 0.5234375, + "learning_rate": 0.0004947980015574425, + "loss": 3.6376, + "step": 1989 + }, + { + "epoch": 0.08378947368421052, + "grad_norm": 0.470703125, + "learning_rate": 0.0004947911513983156, + "loss": 3.6237, + "step": 1990 + }, + { + "epoch": 0.08383157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0004947842967793817, + "loss": 3.8604, + "step": 1991 + }, + { + "epoch": 0.08387368421052632, + "grad_norm": 0.4296875, + "learning_rate": 0.0004947774377007656, + "loss": 3.8249, + "step": 1992 + }, + { + "epoch": 0.08391578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.000494770574162592, + "loss": 3.6279, + "step": 1993 + }, + { + "epoch": 0.0839578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0004947637061649861, + "loss": 3.6857, + "step": 1994 + }, + { + "epoch": 0.084, + "grad_norm": 0.486328125, + "learning_rate": 0.0004947568337080732, + "loss": 3.928, + "step": 1995 + }, + { + "epoch": 0.08404210526315789, + "grad_norm": 0.42578125, + "learning_rate": 0.0004947499567919784, + "loss": 3.6396, + "step": 1996 + }, + { + "epoch": 0.08408421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.0004947430754168268, + "loss": 3.6607, + "step": 1997 + }, + { + "epoch": 0.08412631578947369, + "grad_norm": 0.64453125, + "learning_rate": 0.000494736189582744, + "loss": 3.5467, + "step": 1998 + }, + { + "epoch": 0.08416842105263157, + "grad_norm": 0.44921875, + "learning_rate": 0.0004947292992898554, + "loss": 3.7738, + "step": 1999 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.0004947224045382865, + "loss": 4.0989, + "step": 2000 + }, + { + "epoch": 0.08425263157894737, + "grad_norm": 0.45703125, + "learning_rate": 0.000494715505328163, + "loss": 3.777, + "step": 2001 + }, + { + "epoch": 0.08429473684210527, + "grad_norm": 0.41796875, + "learning_rate": 0.0004947086016596104, + "loss": 3.3926, + "step": 2002 + }, + { + "epoch": 0.08433684210526315, + "grad_norm": 0.423828125, + "learning_rate": 0.0004947016935327546, + "loss": 3.0078, + "step": 2003 + }, + { + "epoch": 0.08437894736842105, + "grad_norm": 0.625, + "learning_rate": 0.0004946947809477216, + "loss": 3.3413, + "step": 2004 + }, + { + "epoch": 0.08442105263157895, + "grad_norm": 0.52734375, + "learning_rate": 0.000494687863904637, + "loss": 3.5306, + "step": 2005 + }, + { + "epoch": 0.08446315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.0004946809424036272, + "loss": 3.9017, + "step": 2006 + }, + { + "epoch": 0.08450526315789474, + "grad_norm": 0.50390625, + "learning_rate": 0.000494674016444818, + "loss": 3.4367, + "step": 2007 + }, + { + "epoch": 0.08454736842105263, + "grad_norm": 0.5703125, + "learning_rate": 0.0004946670860283358, + "loss": 4.0367, + "step": 2008 + }, + { + "epoch": 0.08458947368421052, + "grad_norm": 0.482421875, + "learning_rate": 0.0004946601511543066, + "loss": 3.4446, + "step": 2009 + }, + { + "epoch": 0.08463157894736842, + "grad_norm": 0.470703125, + "learning_rate": 0.0004946532118228571, + "loss": 3.7409, + "step": 2010 + }, + { + "epoch": 0.08467368421052632, + "grad_norm": 0.4609375, + "learning_rate": 0.0004946462680341134, + "loss": 4.0729, + "step": 2011 + }, + { + "epoch": 0.08471578947368422, + "grad_norm": 0.439453125, + "learning_rate": 0.0004946393197882022, + "loss": 3.8701, + "step": 2012 + }, + { + "epoch": 0.0847578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00049463236708525, + "loss": 3.8745, + "step": 2013 + }, + { + "epoch": 0.0848, + "grad_norm": 0.41796875, + "learning_rate": 0.0004946254099253835, + "loss": 3.9885, + "step": 2014 + }, + { + "epoch": 0.0848421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.0004946184483087295, + "loss": 3.3153, + "step": 2015 + }, + { + "epoch": 0.08488421052631578, + "grad_norm": 0.466796875, + "learning_rate": 0.0004946114822354147, + "loss": 3.5487, + "step": 2016 + }, + { + "epoch": 0.08492631578947368, + "grad_norm": 0.447265625, + "learning_rate": 0.0004946045117055662, + "loss": 3.6788, + "step": 2017 + }, + { + "epoch": 0.08496842105263158, + "grad_norm": 0.45703125, + "learning_rate": 0.0004945975367193109, + "loss": 3.8843, + "step": 2018 + }, + { + "epoch": 0.08501052631578947, + "grad_norm": 0.462890625, + "learning_rate": 0.0004945905572767758, + "loss": 2.9441, + "step": 2019 + }, + { + "epoch": 0.08505263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.0004945835733780881, + "loss": 3.5928, + "step": 2020 + }, + { + "epoch": 0.08509473684210526, + "grad_norm": 0.546875, + "learning_rate": 0.0004945765850233752, + "loss": 3.7109, + "step": 2021 + }, + { + "epoch": 0.08513684210526316, + "grad_norm": 0.4765625, + "learning_rate": 0.0004945695922127642, + "loss": 3.631, + "step": 2022 + }, + { + "epoch": 0.08517894736842105, + "grad_norm": 0.515625, + "learning_rate": 0.0004945625949463826, + "loss": 3.331, + "step": 2023 + }, + { + "epoch": 0.08522105263157895, + "grad_norm": 0.4453125, + "learning_rate": 0.0004945555932243578, + "loss": 3.6429, + "step": 2024 + }, + { + "epoch": 0.08526315789473685, + "grad_norm": 0.4453125, + "learning_rate": 0.0004945485870468176, + "loss": 3.4923, + "step": 2025 + }, + { + "epoch": 0.08530526315789473, + "grad_norm": 0.486328125, + "learning_rate": 0.0004945415764138893, + "loss": 3.7182, + "step": 2026 + }, + { + "epoch": 0.08534736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.000494534561325701, + "loss": 3.9691, + "step": 2027 + }, + { + "epoch": 0.08538947368421053, + "grad_norm": 0.451171875, + "learning_rate": 0.0004945275417823802, + "loss": 3.2748, + "step": 2028 + }, + { + "epoch": 0.08543157894736843, + "grad_norm": 0.53515625, + "learning_rate": 0.0004945205177840548, + "loss": 3.4739, + "step": 2029 + }, + { + "epoch": 0.08547368421052631, + "grad_norm": 0.447265625, + "learning_rate": 0.000494513489330853, + "loss": 3.5763, + "step": 2030 + }, + { + "epoch": 0.08551578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0004945064564229026, + "loss": 3.5381, + "step": 2031 + }, + { + "epoch": 0.08555789473684211, + "grad_norm": 0.4296875, + "learning_rate": 0.0004944994190603319, + "loss": 3.1597, + "step": 2032 + }, + { + "epoch": 0.0856, + "grad_norm": 0.43359375, + "learning_rate": 0.0004944923772432691, + "loss": 3.36, + "step": 2033 + }, + { + "epoch": 0.0856421052631579, + "grad_norm": 0.4609375, + "learning_rate": 0.0004944853309718424, + "loss": 3.6844, + "step": 2034 + }, + { + "epoch": 0.0856842105263158, + "grad_norm": 0.47265625, + "learning_rate": 0.0004944782802461803, + "loss": 3.7089, + "step": 2035 + }, + { + "epoch": 0.08572631578947368, + "grad_norm": 0.412109375, + "learning_rate": 0.0004944712250664112, + "loss": 3.9458, + "step": 2036 + }, + { + "epoch": 0.08576842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.0004944641654326635, + "loss": 4.1766, + "step": 2037 + }, + { + "epoch": 0.08581052631578948, + "grad_norm": 0.48046875, + "learning_rate": 0.000494457101345066, + "loss": 3.8978, + "step": 2038 + }, + { + "epoch": 0.08585263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.0004944500328037474, + "loss": 3.4751, + "step": 2039 + }, + { + "epoch": 0.08589473684210526, + "grad_norm": 0.50390625, + "learning_rate": 0.0004944429598088364, + "loss": 3.9218, + "step": 2040 + }, + { + "epoch": 0.08593684210526316, + "grad_norm": 0.490234375, + "learning_rate": 0.0004944358823604617, + "loss": 3.6082, + "step": 2041 + }, + { + "epoch": 0.08597894736842106, + "grad_norm": 0.43359375, + "learning_rate": 0.0004944288004587527, + "loss": 3.7253, + "step": 2042 + }, + { + "epoch": 0.08602105263157894, + "grad_norm": 0.478515625, + "learning_rate": 0.0004944217141038379, + "loss": 3.6628, + "step": 2043 + }, + { + "epoch": 0.08606315789473684, + "grad_norm": 0.4453125, + "learning_rate": 0.0004944146232958468, + "loss": 3.5219, + "step": 2044 + }, + { + "epoch": 0.08610526315789474, + "grad_norm": 0.439453125, + "learning_rate": 0.0004944075280349084, + "loss": 3.3887, + "step": 2045 + }, + { + "epoch": 0.08614736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.000494400428321152, + "loss": 3.8472, + "step": 2046 + }, + { + "epoch": 0.08618947368421052, + "grad_norm": 0.453125, + "learning_rate": 0.000494393324154707, + "loss": 3.5136, + "step": 2047 + }, + { + "epoch": 0.08623157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0004943862155357027, + "loss": 3.5147, + "step": 2048 + }, + { + "epoch": 0.08627368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.0004943791024642687, + "loss": 3.7999, + "step": 2049 + }, + { + "epoch": 0.0863157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.0004943719849405347, + "loss": 3.4143, + "step": 2050 + }, + { + "epoch": 0.0863578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.0004943648629646302, + "loss": 3.9442, + "step": 2051 + }, + { + "epoch": 0.0864, + "grad_norm": 0.41796875, + "learning_rate": 0.0004943577365366849, + "loss": 3.8247, + "step": 2052 + }, + { + "epoch": 0.08644210526315789, + "grad_norm": 0.4140625, + "learning_rate": 0.0004943506056568289, + "loss": 3.3835, + "step": 2053 + }, + { + "epoch": 0.08648421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0004943434703251919, + "loss": 3.6517, + "step": 2054 + }, + { + "epoch": 0.08652631578947369, + "grad_norm": 0.421875, + "learning_rate": 0.000494336330541904, + "loss": 3.9773, + "step": 2055 + }, + { + "epoch": 0.08656842105263157, + "grad_norm": 0.498046875, + "learning_rate": 0.0004943291863070952, + "loss": 3.5868, + "step": 2056 + }, + { + "epoch": 0.08661052631578947, + "grad_norm": 0.46484375, + "learning_rate": 0.0004943220376208957, + "loss": 3.5879, + "step": 2057 + }, + { + "epoch": 0.08665263157894737, + "grad_norm": 0.462890625, + "learning_rate": 0.0004943148844834359, + "loss": 3.6377, + "step": 2058 + }, + { + "epoch": 0.08669473684210527, + "grad_norm": 0.443359375, + "learning_rate": 0.0004943077268948457, + "loss": 3.6065, + "step": 2059 + }, + { + "epoch": 0.08673684210526315, + "grad_norm": 0.53515625, + "learning_rate": 0.0004943005648552559, + "loss": 3.7017, + "step": 2060 + }, + { + "epoch": 0.08677894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.0004942933983647969, + "loss": 3.7522, + "step": 2061 + }, + { + "epoch": 0.08682105263157895, + "grad_norm": 0.49609375, + "learning_rate": 0.0004942862274235991, + "loss": 3.1223, + "step": 2062 + }, + { + "epoch": 0.08686315789473684, + "grad_norm": 0.625, + "learning_rate": 0.0004942790520317934, + "loss": 3.437, + "step": 2063 + }, + { + "epoch": 0.08690526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.0004942718721895102, + "loss": 3.9889, + "step": 2064 + }, + { + "epoch": 0.08694736842105263, + "grad_norm": 0.400390625, + "learning_rate": 0.0004942646878968806, + "loss": 3.6701, + "step": 2065 + }, + { + "epoch": 0.08698947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.0004942574991540354, + "loss": 3.6973, + "step": 2066 + }, + { + "epoch": 0.08703157894736842, + "grad_norm": 0.3984375, + "learning_rate": 0.0004942503059611055, + "loss": 3.7488, + "step": 2067 + }, + { + "epoch": 0.08707368421052632, + "grad_norm": 0.443359375, + "learning_rate": 0.0004942431083182221, + "loss": 3.9706, + "step": 2068 + }, + { + "epoch": 0.08711578947368422, + "grad_norm": 0.458984375, + "learning_rate": 0.0004942359062255162, + "loss": 3.8694, + "step": 2069 + }, + { + "epoch": 0.0871578947368421, + "grad_norm": 0.50390625, + "learning_rate": 0.000494228699683119, + "loss": 3.487, + "step": 2070 + }, + { + "epoch": 0.0872, + "grad_norm": 0.60546875, + "learning_rate": 0.000494221488691162, + "loss": 4.178, + "step": 2071 + }, + { + "epoch": 0.0872421052631579, + "grad_norm": 11.4375, + "learning_rate": 0.0004942142732497762, + "loss": 3.3172, + "step": 2072 + }, + { + "epoch": 0.08728421052631578, + "grad_norm": 0.4921875, + "learning_rate": 0.0004942070533590935, + "loss": 3.5073, + "step": 2073 + }, + { + "epoch": 0.08732631578947368, + "grad_norm": 0.46875, + "learning_rate": 0.0004941998290192451, + "loss": 3.7137, + "step": 2074 + }, + { + "epoch": 0.08736842105263158, + "grad_norm": 0.46484375, + "learning_rate": 0.0004941926002303628, + "loss": 3.9667, + "step": 2075 + }, + { + "epoch": 0.08741052631578948, + "grad_norm": 0.4453125, + "learning_rate": 0.0004941853669925781, + "loss": 3.8093, + "step": 2076 + }, + { + "epoch": 0.08745263157894737, + "grad_norm": 0.482421875, + "learning_rate": 0.000494178129306023, + "loss": 3.6918, + "step": 2077 + }, + { + "epoch": 0.08749473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.0004941708871708294, + "loss": 3.5568, + "step": 2078 + }, + { + "epoch": 0.08753684210526316, + "grad_norm": 0.53125, + "learning_rate": 0.000494163640587129, + "loss": 3.285, + "step": 2079 + }, + { + "epoch": 0.08757894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.000494156389555054, + "loss": 3.6124, + "step": 2080 + }, + { + "epoch": 0.08762105263157895, + "grad_norm": 0.5, + "learning_rate": 0.0004941491340747364, + "loss": 3.7445, + "step": 2081 + }, + { + "epoch": 0.08766315789473685, + "grad_norm": 0.451171875, + "learning_rate": 0.0004941418741463084, + "loss": 3.584, + "step": 2082 + }, + { + "epoch": 0.08770526315789473, + "grad_norm": 0.482421875, + "learning_rate": 0.0004941346097699023, + "loss": 3.3727, + "step": 2083 + }, + { + "epoch": 0.08774736842105263, + "grad_norm": 0.5078125, + "learning_rate": 0.0004941273409456506, + "loss": 3.5898, + "step": 2084 + }, + { + "epoch": 0.08778947368421053, + "grad_norm": 0.439453125, + "learning_rate": 0.0004941200676736855, + "loss": 3.6588, + "step": 2085 + }, + { + "epoch": 0.08783157894736843, + "grad_norm": 0.42578125, + "learning_rate": 0.0004941127899541396, + "loss": 3.5977, + "step": 2086 + }, + { + "epoch": 0.08787368421052631, + "grad_norm": 0.462890625, + "learning_rate": 0.0004941055077871454, + "loss": 3.3557, + "step": 2087 + }, + { + "epoch": 0.08791578947368421, + "grad_norm": 0.51171875, + "learning_rate": 0.0004940982211728359, + "loss": 3.4079, + "step": 2088 + }, + { + "epoch": 0.08795789473684211, + "grad_norm": 0.94921875, + "learning_rate": 0.0004940909301113434, + "loss": 3.249, + "step": 2089 + }, + { + "epoch": 0.088, + "grad_norm": 0.447265625, + "learning_rate": 0.0004940836346028011, + "loss": 2.947, + "step": 2090 + }, + { + "epoch": 0.0880421052631579, + "grad_norm": 0.478515625, + "learning_rate": 0.0004940763346473416, + "loss": 3.5199, + "step": 2091 + }, + { + "epoch": 0.08808421052631579, + "grad_norm": 0.498046875, + "learning_rate": 0.0004940690302450982, + "loss": 4.0699, + "step": 2092 + }, + { + "epoch": 0.08812631578947368, + "grad_norm": 0.55078125, + "learning_rate": 0.0004940617213962038, + "loss": 3.3557, + "step": 2093 + }, + { + "epoch": 0.08816842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0004940544081007915, + "loss": 3.7499, + "step": 2094 + }, + { + "epoch": 0.08821052631578948, + "grad_norm": 0.5390625, + "learning_rate": 0.0004940470903589948, + "loss": 3.6581, + "step": 2095 + }, + { + "epoch": 0.08825263157894737, + "grad_norm": 0.484375, + "learning_rate": 0.0004940397681709467, + "loss": 3.5073, + "step": 2096 + }, + { + "epoch": 0.08829473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.0004940324415367808, + "loss": 3.5221, + "step": 2097 + }, + { + "epoch": 0.08833684210526316, + "grad_norm": 0.5078125, + "learning_rate": 0.0004940251104566306, + "loss": 3.4831, + "step": 2098 + }, + { + "epoch": 0.08837894736842106, + "grad_norm": 0.5859375, + "learning_rate": 0.0004940177749306296, + "loss": 3.818, + "step": 2099 + }, + { + "epoch": 0.08842105263157894, + "grad_norm": 0.419921875, + "learning_rate": 0.0004940104349589114, + "loss": 3.8387, + "step": 2100 + }, + { + "epoch": 0.08846315789473684, + "grad_norm": 0.53515625, + "learning_rate": 0.0004940030905416098, + "loss": 3.7556, + "step": 2101 + }, + { + "epoch": 0.08850526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.0004939957416788586, + "loss": 3.4967, + "step": 2102 + }, + { + "epoch": 0.08854736842105264, + "grad_norm": 0.453125, + "learning_rate": 0.0004939883883707917, + "loss": 3.8585, + "step": 2103 + }, + { + "epoch": 0.08858947368421052, + "grad_norm": 0.453125, + "learning_rate": 0.000493981030617543, + "loss": 3.7497, + "step": 2104 + }, + { + "epoch": 0.08863157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.0004939736684192466, + "loss": 3.755, + "step": 2105 + }, + { + "epoch": 0.08867368421052632, + "grad_norm": 0.44140625, + "learning_rate": 0.0004939663017760365, + "loss": 3.5746, + "step": 2106 + }, + { + "epoch": 0.0887157894736842, + "grad_norm": 0.54296875, + "learning_rate": 0.0004939589306880472, + "loss": 3.689, + "step": 2107 + }, + { + "epoch": 0.0887578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.0004939515551554127, + "loss": 3.5345, + "step": 2108 + }, + { + "epoch": 0.0888, + "grad_norm": 0.419921875, + "learning_rate": 0.0004939441751782676, + "loss": 3.5887, + "step": 2109 + }, + { + "epoch": 0.08884210526315789, + "grad_norm": 0.5546875, + "learning_rate": 0.0004939367907567462, + "loss": 3.1686, + "step": 2110 + }, + { + "epoch": 0.08888421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.0004939294018909831, + "loss": 3.5004, + "step": 2111 + }, + { + "epoch": 0.08892631578947369, + "grad_norm": 0.451171875, + "learning_rate": 0.0004939220085811129, + "loss": 3.7309, + "step": 2112 + }, + { + "epoch": 0.08896842105263159, + "grad_norm": 0.447265625, + "learning_rate": 0.0004939146108272703, + "loss": 3.5626, + "step": 2113 + }, + { + "epoch": 0.08901052631578947, + "grad_norm": 0.50390625, + "learning_rate": 0.0004939072086295901, + "loss": 3.3308, + "step": 2114 + }, + { + "epoch": 0.08905263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.0004938998019882071, + "loss": 3.8362, + "step": 2115 + }, + { + "epoch": 0.08909473684210527, + "grad_norm": 0.478515625, + "learning_rate": 0.0004938923909032562, + "loss": 3.6301, + "step": 2116 + }, + { + "epoch": 0.08913684210526315, + "grad_norm": 0.466796875, + "learning_rate": 0.0004938849753748727, + "loss": 3.7757, + "step": 2117 + }, + { + "epoch": 0.08917894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0004938775554031913, + "loss": 3.7551, + "step": 2118 + }, + { + "epoch": 0.08922105263157895, + "grad_norm": 0.474609375, + "learning_rate": 0.0004938701309883474, + "loss": 3.7182, + "step": 2119 + }, + { + "epoch": 0.08926315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.0004938627021304764, + "loss": 4.178, + "step": 2120 + }, + { + "epoch": 0.08930526315789473, + "grad_norm": 0.404296875, + "learning_rate": 0.0004938552688297134, + "loss": 3.5459, + "step": 2121 + }, + { + "epoch": 0.08934736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.000493847831086194, + "loss": 3.6382, + "step": 2122 + }, + { + "epoch": 0.08938947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.0004938403889000535, + "loss": 3.8423, + "step": 2123 + }, + { + "epoch": 0.08943157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0004938329422714277, + "loss": 3.4575, + "step": 2124 + }, + { + "epoch": 0.08947368421052632, + "grad_norm": 0.404296875, + "learning_rate": 0.0004938254912004522, + "loss": 3.7989, + "step": 2125 + }, + { + "epoch": 0.08951578947368422, + "grad_norm": 0.41796875, + "learning_rate": 0.0004938180356872626, + "loss": 3.4769, + "step": 2126 + }, + { + "epoch": 0.0895578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.000493810575731995, + "loss": 3.3955, + "step": 2127 + }, + { + "epoch": 0.0896, + "grad_norm": 0.42578125, + "learning_rate": 0.0004938031113347852, + "loss": 3.6091, + "step": 2128 + }, + { + "epoch": 0.0896421052631579, + "grad_norm": 0.3984375, + "learning_rate": 0.000493795642495769, + "loss": 3.6599, + "step": 2129 + }, + { + "epoch": 0.08968421052631578, + "grad_norm": 0.400390625, + "learning_rate": 0.0004937881692150827, + "loss": 3.7955, + "step": 2130 + }, + { + "epoch": 0.08972631578947368, + "grad_norm": 0.47265625, + "learning_rate": 0.0004937806914928625, + "loss": 3.1032, + "step": 2131 + }, + { + "epoch": 0.08976842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0004937732093292446, + "loss": 3.592, + "step": 2132 + }, + { + "epoch": 0.08981052631578948, + "grad_norm": 0.455078125, + "learning_rate": 0.000493765722724365, + "loss": 4.0776, + "step": 2133 + }, + { + "epoch": 0.08985263157894736, + "grad_norm": 0.421875, + "learning_rate": 0.0004937582316783604, + "loss": 3.6436, + "step": 2134 + }, + { + "epoch": 0.08989473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.0004937507361913674, + "loss": 3.7856, + "step": 2135 + }, + { + "epoch": 0.08993684210526316, + "grad_norm": 0.435546875, + "learning_rate": 0.0004937432362635222, + "loss": 3.6941, + "step": 2136 + }, + { + "epoch": 0.08997894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.0004937357318949617, + "loss": 3.5192, + "step": 2137 + }, + { + "epoch": 0.09002105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.0004937282230858225, + "loss": 3.4524, + "step": 2138 + }, + { + "epoch": 0.09006315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.0004937207098362416, + "loss": 4.1238, + "step": 2139 + }, + { + "epoch": 0.09010526315789473, + "grad_norm": 0.51171875, + "learning_rate": 0.0004937131921463555, + "loss": 3.8706, + "step": 2140 + }, + { + "epoch": 0.09014736842105263, + "grad_norm": 0.4453125, + "learning_rate": 0.0004937056700163015, + "loss": 3.6403, + "step": 2141 + }, + { + "epoch": 0.09018947368421053, + "grad_norm": 0.5, + "learning_rate": 0.0004936981434462166, + "loss": 3.5454, + "step": 2142 + }, + { + "epoch": 0.09023157894736843, + "grad_norm": 0.5546875, + "learning_rate": 0.0004936906124362378, + "loss": 2.9797, + "step": 2143 + }, + { + "epoch": 0.09027368421052631, + "grad_norm": 0.5390625, + "learning_rate": 0.0004936830769865023, + "loss": 3.6078, + "step": 2144 + }, + { + "epoch": 0.09031578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0004936755370971475, + "loss": 3.3638, + "step": 2145 + }, + { + "epoch": 0.09035789473684211, + "grad_norm": 0.5, + "learning_rate": 0.0004936679927683107, + "loss": 3.2906, + "step": 2146 + }, + { + "epoch": 0.0904, + "grad_norm": 0.443359375, + "learning_rate": 0.0004936604440001295, + "loss": 3.4049, + "step": 2147 + }, + { + "epoch": 0.0904421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.0004936528907927411, + "loss": 3.6378, + "step": 2148 + }, + { + "epoch": 0.09048421052631579, + "grad_norm": 0.52734375, + "learning_rate": 0.0004936453331462835, + "loss": 3.2355, + "step": 2149 + }, + { + "epoch": 0.09052631578947369, + "grad_norm": 0.462890625, + "learning_rate": 0.0004936377710608941, + "loss": 3.1031, + "step": 2150 + }, + { + "epoch": 0.09056842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.0004936302045367107, + "loss": 3.7966, + "step": 2151 + }, + { + "epoch": 0.09061052631578947, + "grad_norm": 0.4453125, + "learning_rate": 0.0004936226335738714, + "loss": 3.812, + "step": 2152 + }, + { + "epoch": 0.09065263157894737, + "grad_norm": 0.44921875, + "learning_rate": 0.0004936150581725139, + "loss": 3.615, + "step": 2153 + }, + { + "epoch": 0.09069473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.0004936074783327762, + "loss": 3.5712, + "step": 2154 + }, + { + "epoch": 0.09073684210526316, + "grad_norm": 0.462890625, + "learning_rate": 0.0004935998940547965, + "loss": 3.1613, + "step": 2155 + }, + { + "epoch": 0.09077894736842106, + "grad_norm": 0.474609375, + "learning_rate": 0.0004935923053387129, + "loss": 3.7646, + "step": 2156 + }, + { + "epoch": 0.09082105263157894, + "grad_norm": 0.439453125, + "learning_rate": 0.0004935847121846638, + "loss": 3.3472, + "step": 2157 + }, + { + "epoch": 0.09086315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.0004935771145927874, + "loss": 3.17, + "step": 2158 + }, + { + "epoch": 0.09090526315789474, + "grad_norm": 0.451171875, + "learning_rate": 0.0004935695125632222, + "loss": 3.614, + "step": 2159 + }, + { + "epoch": 0.09094736842105264, + "grad_norm": 0.416015625, + "learning_rate": 0.0004935619060961067, + "loss": 4.2274, + "step": 2160 + }, + { + "epoch": 0.09098947368421052, + "grad_norm": 0.4296875, + "learning_rate": 0.0004935542951915794, + "loss": 3.4428, + "step": 2161 + }, + { + "epoch": 0.09103157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.000493546679849779, + "loss": 3.5008, + "step": 2162 + }, + { + "epoch": 0.09107368421052632, + "grad_norm": 0.404296875, + "learning_rate": 0.0004935390600708442, + "loss": 3.7241, + "step": 2163 + }, + { + "epoch": 0.0911157894736842, + "grad_norm": 0.451171875, + "learning_rate": 0.0004935314358549139, + "loss": 3.148, + "step": 2164 + }, + { + "epoch": 0.0911578947368421, + "grad_norm": 0.51171875, + "learning_rate": 0.000493523807202127, + "loss": 3.4836, + "step": 2165 + }, + { + "epoch": 0.0912, + "grad_norm": 0.43359375, + "learning_rate": 0.0004935161741126225, + "loss": 3.4649, + "step": 2166 + }, + { + "epoch": 0.09124210526315789, + "grad_norm": 0.4296875, + "learning_rate": 0.0004935085365865393, + "loss": 3.8163, + "step": 2167 + }, + { + "epoch": 0.09128421052631579, + "grad_norm": 0.474609375, + "learning_rate": 0.0004935008946240168, + "loss": 3.69, + "step": 2168 + }, + { + "epoch": 0.09132631578947369, + "grad_norm": 0.515625, + "learning_rate": 0.000493493248225194, + "loss": 3.6096, + "step": 2169 + }, + { + "epoch": 0.09136842105263158, + "grad_norm": 0.490234375, + "learning_rate": 0.0004934855973902105, + "loss": 3.4264, + "step": 2170 + }, + { + "epoch": 0.09141052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.0004934779421192053, + "loss": 3.6218, + "step": 2171 + }, + { + "epoch": 0.09145263157894737, + "grad_norm": 0.6484375, + "learning_rate": 0.0004934702824123182, + "loss": 3.8792, + "step": 2172 + }, + { + "epoch": 0.09149473684210527, + "grad_norm": 0.427734375, + "learning_rate": 0.0004934626182696886, + "loss": 3.2383, + "step": 2173 + }, + { + "epoch": 0.09153684210526315, + "grad_norm": 0.427734375, + "learning_rate": 0.000493454949691456, + "loss": 3.7411, + "step": 2174 + }, + { + "epoch": 0.09157894736842105, + "grad_norm": 0.38671875, + "learning_rate": 0.0004934472766777604, + "loss": 4.013, + "step": 2175 + }, + { + "epoch": 0.09162105263157895, + "grad_norm": 0.46875, + "learning_rate": 0.0004934395992287414, + "loss": 3.7162, + "step": 2176 + }, + { + "epoch": 0.09166315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.000493431917344539, + "loss": 3.3383, + "step": 2177 + }, + { + "epoch": 0.09170526315789473, + "grad_norm": 0.44140625, + "learning_rate": 0.0004934242310252931, + "loss": 3.5431, + "step": 2178 + }, + { + "epoch": 0.09174736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.0004934165402711436, + "loss": 3.6037, + "step": 2179 + }, + { + "epoch": 0.09178947368421053, + "grad_norm": 0.400390625, + "learning_rate": 0.0004934088450822308, + "loss": 3.4008, + "step": 2180 + }, + { + "epoch": 0.09183157894736842, + "grad_norm": 0.396484375, + "learning_rate": 0.0004934011454586949, + "loss": 3.8943, + "step": 2181 + }, + { + "epoch": 0.09187368421052632, + "grad_norm": 0.45703125, + "learning_rate": 0.000493393441400676, + "loss": 4.0653, + "step": 2182 + }, + { + "epoch": 0.09191578947368421, + "grad_norm": 0.69921875, + "learning_rate": 0.0004933857329083147, + "loss": 3.7465, + "step": 2183 + }, + { + "epoch": 0.0919578947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.0004933780199817512, + "loss": 3.7839, + "step": 2184 + }, + { + "epoch": 0.092, + "grad_norm": 0.494140625, + "learning_rate": 0.0004933703026211262, + "loss": 3.198, + "step": 2185 + }, + { + "epoch": 0.0920421052631579, + "grad_norm": 0.48828125, + "learning_rate": 0.0004933625808265802, + "loss": 3.4851, + "step": 2186 + }, + { + "epoch": 0.0920842105263158, + "grad_norm": 0.458984375, + "learning_rate": 0.0004933548545982539, + "loss": 3.4378, + "step": 2187 + }, + { + "epoch": 0.09212631578947368, + "grad_norm": 0.455078125, + "learning_rate": 0.0004933471239362881, + "loss": 3.6771, + "step": 2188 + }, + { + "epoch": 0.09216842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.0004933393888408237, + "loss": 3.4539, + "step": 2189 + }, + { + "epoch": 0.09221052631578948, + "grad_norm": 0.470703125, + "learning_rate": 0.0004933316493120015, + "loss": 3.6512, + "step": 2190 + }, + { + "epoch": 0.09225263157894736, + "grad_norm": 0.435546875, + "learning_rate": 0.0004933239053499625, + "loss": 3.5342, + "step": 2191 + }, + { + "epoch": 0.09229473684210526, + "grad_norm": 0.51953125, + "learning_rate": 0.0004933161569548479, + "loss": 3.9198, + "step": 2192 + }, + { + "epoch": 0.09233684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.0004933084041267988, + "loss": 3.8718, + "step": 2193 + }, + { + "epoch": 0.09237894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.0004933006468659564, + "loss": 3.8724, + "step": 2194 + }, + { + "epoch": 0.09242105263157895, + "grad_norm": 0.44921875, + "learning_rate": 0.0004932928851724621, + "loss": 3.4539, + "step": 2195 + }, + { + "epoch": 0.09246315789473684, + "grad_norm": 0.458984375, + "learning_rate": 0.0004932851190464573, + "loss": 3.4672, + "step": 2196 + }, + { + "epoch": 0.09250526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.0004932773484880835, + "loss": 3.6806, + "step": 2197 + }, + { + "epoch": 0.09254736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.0004932695734974822, + "loss": 3.746, + "step": 2198 + }, + { + "epoch": 0.09258947368421053, + "grad_norm": 0.78515625, + "learning_rate": 0.0004932617940747951, + "loss": 3.318, + "step": 2199 + }, + { + "epoch": 0.09263157894736843, + "grad_norm": 0.404296875, + "learning_rate": 0.0004932540102201641, + "loss": 3.4834, + "step": 2200 + }, + { + "epoch": 0.09267368421052631, + "grad_norm": 0.482421875, + "learning_rate": 0.0004932462219337305, + "loss": 3.6298, + "step": 2201 + }, + { + "epoch": 0.09271578947368421, + "grad_norm": 0.578125, + "learning_rate": 0.0004932384292156369, + "loss": 3.8754, + "step": 2202 + }, + { + "epoch": 0.09275789473684211, + "grad_norm": 0.55078125, + "learning_rate": 0.0004932306320660247, + "loss": 3.4735, + "step": 2203 + }, + { + "epoch": 0.0928, + "grad_norm": 0.46875, + "learning_rate": 0.0004932228304850363, + "loss": 3.9172, + "step": 2204 + }, + { + "epoch": 0.09284210526315789, + "grad_norm": 0.451171875, + "learning_rate": 0.0004932150244728136, + "loss": 3.7408, + "step": 2205 + }, + { + "epoch": 0.09288421052631579, + "grad_norm": 0.54296875, + "learning_rate": 0.0004932072140294989, + "loss": 3.5298, + "step": 2206 + }, + { + "epoch": 0.09292631578947369, + "grad_norm": 0.447265625, + "learning_rate": 0.0004931993991552346, + "loss": 3.4128, + "step": 2207 + }, + { + "epoch": 0.09296842105263158, + "grad_norm": 0.482421875, + "learning_rate": 0.0004931915798501629, + "loss": 3.6295, + "step": 2208 + }, + { + "epoch": 0.09301052631578947, + "grad_norm": 0.453125, + "learning_rate": 0.0004931837561144264, + "loss": 4.0245, + "step": 2209 + }, + { + "epoch": 0.09305263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.0004931759279481676, + "loss": 3.4963, + "step": 2210 + }, + { + "epoch": 0.09309473684210526, + "grad_norm": 0.455078125, + "learning_rate": 0.0004931680953515292, + "loss": 3.6967, + "step": 2211 + }, + { + "epoch": 0.09313684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.0004931602583246537, + "loss": 3.6086, + "step": 2212 + }, + { + "epoch": 0.09317894736842106, + "grad_norm": 0.54296875, + "learning_rate": 0.0004931524168676841, + "loss": 3.4076, + "step": 2213 + }, + { + "epoch": 0.09322105263157894, + "grad_norm": 0.47265625, + "learning_rate": 0.000493144570980763, + "loss": 3.6261, + "step": 2214 + }, + { + "epoch": 0.09326315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.0004931367206640337, + "loss": 3.3957, + "step": 2215 + }, + { + "epoch": 0.09330526315789474, + "grad_norm": 0.60546875, + "learning_rate": 0.0004931288659176388, + "loss": 3.3958, + "step": 2216 + }, + { + "epoch": 0.09334736842105264, + "grad_norm": 0.45703125, + "learning_rate": 0.0004931210067417218, + "loss": 3.6099, + "step": 2217 + }, + { + "epoch": 0.09338947368421052, + "grad_norm": 0.419921875, + "learning_rate": 0.0004931131431364257, + "loss": 3.6716, + "step": 2218 + }, + { + "epoch": 0.09343157894736842, + "grad_norm": 0.466796875, + "learning_rate": 0.0004931052751018937, + "loss": 3.8295, + "step": 2219 + }, + { + "epoch": 0.09347368421052632, + "grad_norm": 0.5, + "learning_rate": 0.0004930974026382693, + "loss": 3.9262, + "step": 2220 + }, + { + "epoch": 0.0935157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.0004930895257456957, + "loss": 3.3495, + "step": 2221 + }, + { + "epoch": 0.0935578947368421, + "grad_norm": 0.54296875, + "learning_rate": 0.0004930816444243167, + "loss": 3.7346, + "step": 2222 + }, + { + "epoch": 0.0936, + "grad_norm": 0.458984375, + "learning_rate": 0.0004930737586742758, + "loss": 3.5685, + "step": 2223 + }, + { + "epoch": 0.0936421052631579, + "grad_norm": 0.5703125, + "learning_rate": 0.0004930658684957164, + "loss": 3.3972, + "step": 2224 + }, + { + "epoch": 0.09368421052631579, + "grad_norm": 0.53125, + "learning_rate": 0.0004930579738887826, + "loss": 3.2524, + "step": 2225 + }, + { + "epoch": 0.09372631578947369, + "grad_norm": 0.474609375, + "learning_rate": 0.0004930500748536182, + "loss": 3.6159, + "step": 2226 + }, + { + "epoch": 0.09376842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.0004930421713903668, + "loss": 3.5875, + "step": 2227 + }, + { + "epoch": 0.09381052631578947, + "grad_norm": 0.59375, + "learning_rate": 0.0004930342634991727, + "loss": 3.1611, + "step": 2228 + }, + { + "epoch": 0.09385263157894737, + "grad_norm": 0.4765625, + "learning_rate": 0.0004930263511801799, + "loss": 3.639, + "step": 2229 + }, + { + "epoch": 0.09389473684210527, + "grad_norm": 0.52734375, + "learning_rate": 0.0004930184344335326, + "loss": 3.1428, + "step": 2230 + }, + { + "epoch": 0.09393684210526315, + "grad_norm": 0.44140625, + "learning_rate": 0.0004930105132593748, + "loss": 3.7563, + "step": 2231 + }, + { + "epoch": 0.09397894736842105, + "grad_norm": 0.486328125, + "learning_rate": 0.000493002587657851, + "loss": 3.37, + "step": 2232 + }, + { + "epoch": 0.09402105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.0004929946576291056, + "loss": 3.644, + "step": 2233 + }, + { + "epoch": 0.09406315789473685, + "grad_norm": 0.44140625, + "learning_rate": 0.000492986723173283, + "loss": 3.8953, + "step": 2234 + }, + { + "epoch": 0.09410526315789473, + "grad_norm": 0.478515625, + "learning_rate": 0.0004929787842905278, + "loss": 3.7189, + "step": 2235 + }, + { + "epoch": 0.09414736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.0004929708409809847, + "loss": 3.628, + "step": 2236 + }, + { + "epoch": 0.09418947368421053, + "grad_norm": 0.68359375, + "learning_rate": 0.0004929628932447984, + "loss": 3.9377, + "step": 2237 + }, + { + "epoch": 0.09423157894736842, + "grad_norm": 0.453125, + "learning_rate": 0.0004929549410821136, + "loss": 3.6712, + "step": 2238 + }, + { + "epoch": 0.09427368421052632, + "grad_norm": 0.56640625, + "learning_rate": 0.0004929469844930752, + "loss": 3.3246, + "step": 2239 + }, + { + "epoch": 0.09431578947368421, + "grad_norm": 0.5078125, + "learning_rate": 0.0004929390234778283, + "loss": 3.7811, + "step": 2240 + }, + { + "epoch": 0.0943578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.0004929310580365178, + "loss": 3.3557, + "step": 2241 + }, + { + "epoch": 0.0944, + "grad_norm": 0.416015625, + "learning_rate": 0.0004929230881692887, + "loss": 3.7594, + "step": 2242 + }, + { + "epoch": 0.0944421052631579, + "grad_norm": 0.55078125, + "learning_rate": 0.0004929151138762866, + "loss": 3.636, + "step": 2243 + }, + { + "epoch": 0.0944842105263158, + "grad_norm": 0.47265625, + "learning_rate": 0.0004929071351576564, + "loss": 3.7785, + "step": 2244 + }, + { + "epoch": 0.09452631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.0004928991520135436, + "loss": 3.8262, + "step": 2245 + }, + { + "epoch": 0.09456842105263158, + "grad_norm": 0.46484375, + "learning_rate": 0.0004928911644440936, + "loss": 3.4451, + "step": 2246 + }, + { + "epoch": 0.09461052631578948, + "grad_norm": 0.46484375, + "learning_rate": 0.000492883172449452, + "loss": 3.7754, + "step": 2247 + }, + { + "epoch": 0.09465263157894736, + "grad_norm": 0.5078125, + "learning_rate": 0.0004928751760297645, + "loss": 3.327, + "step": 2248 + }, + { + "epoch": 0.09469473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.0004928671751851765, + "loss": 3.4821, + "step": 2249 + }, + { + "epoch": 0.09473684210526316, + "grad_norm": 0.46875, + "learning_rate": 0.000492859169915834, + "loss": 3.619, + "step": 2250 + }, + { + "epoch": 0.09477894736842105, + "grad_norm": 0.486328125, + "learning_rate": 0.0004928511602218827, + "loss": 3.7875, + "step": 2251 + }, + { + "epoch": 0.09482105263157894, + "grad_norm": 0.458984375, + "learning_rate": 0.0004928431461034687, + "loss": 3.6277, + "step": 2252 + }, + { + "epoch": 0.09486315789473684, + "grad_norm": 0.455078125, + "learning_rate": 0.0004928351275607379, + "loss": 3.2966, + "step": 2253 + }, + { + "epoch": 0.09490526315789474, + "grad_norm": 0.462890625, + "learning_rate": 0.0004928271045938363, + "loss": 3.3082, + "step": 2254 + }, + { + "epoch": 0.09494736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.0004928190772029102, + "loss": 3.9054, + "step": 2255 + }, + { + "epoch": 0.09498947368421053, + "grad_norm": 0.474609375, + "learning_rate": 0.0004928110453881059, + "loss": 3.4825, + "step": 2256 + }, + { + "epoch": 0.09503157894736843, + "grad_norm": 0.423828125, + "learning_rate": 0.0004928030091495695, + "loss": 3.6731, + "step": 2257 + }, + { + "epoch": 0.09507368421052631, + "grad_norm": 0.4296875, + "learning_rate": 0.0004927949684874477, + "loss": 3.6112, + "step": 2258 + }, + { + "epoch": 0.09511578947368421, + "grad_norm": 0.4921875, + "learning_rate": 0.0004927869234018868, + "loss": 3.0327, + "step": 2259 + }, + { + "epoch": 0.09515789473684211, + "grad_norm": 0.443359375, + "learning_rate": 0.0004927788738930334, + "loss": 3.6778, + "step": 2260 + }, + { + "epoch": 0.0952, + "grad_norm": 0.447265625, + "learning_rate": 0.0004927708199610341, + "loss": 3.5951, + "step": 2261 + }, + { + "epoch": 0.09524210526315789, + "grad_norm": 0.47265625, + "learning_rate": 0.0004927627616060358, + "loss": 3.2553, + "step": 2262 + }, + { + "epoch": 0.09528421052631579, + "grad_norm": 0.48828125, + "learning_rate": 0.0004927546988281853, + "loss": 3.3389, + "step": 2263 + }, + { + "epoch": 0.09532631578947369, + "grad_norm": 0.44921875, + "learning_rate": 0.0004927466316276292, + "loss": 3.8305, + "step": 2264 + }, + { + "epoch": 0.09536842105263157, + "grad_norm": 0.455078125, + "learning_rate": 0.0004927385600045148, + "loss": 3.824, + "step": 2265 + }, + { + "epoch": 0.09541052631578947, + "grad_norm": 0.427734375, + "learning_rate": 0.000492730483958989, + "loss": 3.4389, + "step": 2266 + }, + { + "epoch": 0.09545263157894737, + "grad_norm": 0.47265625, + "learning_rate": 0.000492722403491199, + "loss": 3.6056, + "step": 2267 + }, + { + "epoch": 0.09549473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.0004927143186012922, + "loss": 3.8661, + "step": 2268 + }, + { + "epoch": 0.09553684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.0004927062292894153, + "loss": 3.6917, + "step": 2269 + }, + { + "epoch": 0.09557894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.0004926981355557164, + "loss": 3.6152, + "step": 2270 + }, + { + "epoch": 0.09562105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.0004926900374003424, + "loss": 3.4671, + "step": 2271 + }, + { + "epoch": 0.09566315789473684, + "grad_norm": 0.47265625, + "learning_rate": 0.0004926819348234412, + "loss": 3.482, + "step": 2272 + }, + { + "epoch": 0.09570526315789474, + "grad_norm": 0.40625, + "learning_rate": 0.0004926738278251603, + "loss": 3.4053, + "step": 2273 + }, + { + "epoch": 0.09574736842105264, + "grad_norm": 0.41015625, + "learning_rate": 0.0004926657164056474, + "loss": 3.6836, + "step": 2274 + }, + { + "epoch": 0.09578947368421052, + "grad_norm": 0.43359375, + "learning_rate": 0.0004926576005650501, + "loss": 3.472, + "step": 2275 + }, + { + "epoch": 0.09583157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.0004926494803035166, + "loss": 3.4592, + "step": 2276 + }, + { + "epoch": 0.09587368421052632, + "grad_norm": 0.44921875, + "learning_rate": 0.0004926413556211946, + "loss": 3.8507, + "step": 2277 + }, + { + "epoch": 0.0959157894736842, + "grad_norm": 0.404296875, + "learning_rate": 0.0004926332265182321, + "loss": 3.8539, + "step": 2278 + }, + { + "epoch": 0.0959578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.0004926250929947776, + "loss": 3.2598, + "step": 2279 + }, + { + "epoch": 0.096, + "grad_norm": 0.43359375, + "learning_rate": 0.0004926169550509787, + "loss": 3.3632, + "step": 2280 + }, + { + "epoch": 0.0960421052631579, + "grad_norm": 0.73828125, + "learning_rate": 0.000492608812686984, + "loss": 3.3726, + "step": 2281 + }, + { + "epoch": 0.09608421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.0004926006659029419, + "loss": 3.678, + "step": 2282 + }, + { + "epoch": 0.09612631578947368, + "grad_norm": 0.46875, + "learning_rate": 0.0004925925146990005, + "loss": 3.8254, + "step": 2283 + }, + { + "epoch": 0.09616842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.0004925843590753087, + "loss": 4.0319, + "step": 2284 + }, + { + "epoch": 0.09621052631578947, + "grad_norm": 0.458984375, + "learning_rate": 0.0004925761990320148, + "loss": 3.8538, + "step": 2285 + }, + { + "epoch": 0.09625263157894737, + "grad_norm": 0.46875, + "learning_rate": 0.0004925680345692676, + "loss": 3.7418, + "step": 2286 + }, + { + "epoch": 0.09629473684210527, + "grad_norm": 0.41015625, + "learning_rate": 0.0004925598656872158, + "loss": 3.5188, + "step": 2287 + }, + { + "epoch": 0.09633684210526315, + "grad_norm": 0.55859375, + "learning_rate": 0.0004925516923860083, + "loss": 3.0186, + "step": 2288 + }, + { + "epoch": 0.09637894736842105, + "grad_norm": 0.49609375, + "learning_rate": 0.0004925435146657939, + "loss": 3.8819, + "step": 2289 + }, + { + "epoch": 0.09642105263157895, + "grad_norm": 0.478515625, + "learning_rate": 0.0004925353325267217, + "loss": 3.1003, + "step": 2290 + }, + { + "epoch": 0.09646315789473685, + "grad_norm": 0.53515625, + "learning_rate": 0.0004925271459689406, + "loss": 3.5093, + "step": 2291 + }, + { + "epoch": 0.09650526315789473, + "grad_norm": 0.4296875, + "learning_rate": 0.0004925189549925999, + "loss": 3.5151, + "step": 2292 + }, + { + "epoch": 0.09654736842105263, + "grad_norm": 0.62890625, + "learning_rate": 0.0004925107595978489, + "loss": 3.4106, + "step": 2293 + }, + { + "epoch": 0.09658947368421053, + "grad_norm": 0.47265625, + "learning_rate": 0.0004925025597848367, + "loss": 3.8088, + "step": 2294 + }, + { + "epoch": 0.09663157894736842, + "grad_norm": 0.466796875, + "learning_rate": 0.0004924943555537128, + "loss": 4.0872, + "step": 2295 + }, + { + "epoch": 0.09667368421052631, + "grad_norm": 0.451171875, + "learning_rate": 0.0004924861469046266, + "loss": 3.5939, + "step": 2296 + }, + { + "epoch": 0.09671578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.0004924779338377278, + "loss": 3.5667, + "step": 2297 + }, + { + "epoch": 0.09675789473684211, + "grad_norm": 0.5, + "learning_rate": 0.0004924697163531658, + "loss": 3.5407, + "step": 2298 + }, + { + "epoch": 0.0968, + "grad_norm": 0.431640625, + "learning_rate": 0.0004924614944510907, + "loss": 3.7918, + "step": 2299 + }, + { + "epoch": 0.0968421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.0004924532681316519, + "loss": 3.1883, + "step": 2300 + }, + { + "epoch": 0.0968842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0004924450373949994, + "loss": 3.9175, + "step": 2301 + }, + { + "epoch": 0.09692631578947368, + "grad_norm": 0.41015625, + "learning_rate": 0.0004924368022412832, + "loss": 3.8284, + "step": 2302 + }, + { + "epoch": 0.09696842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.0004924285626706534, + "loss": 3.4134, + "step": 2303 + }, + { + "epoch": 0.09701052631578948, + "grad_norm": 0.427734375, + "learning_rate": 0.0004924203186832599, + "loss": 3.6822, + "step": 2304 + }, + { + "epoch": 0.09705263157894736, + "grad_norm": 0.41796875, + "learning_rate": 0.0004924120702792531, + "loss": 3.8275, + "step": 2305 + }, + { + "epoch": 0.09709473684210526, + "grad_norm": 0.46875, + "learning_rate": 0.0004924038174587833, + "loss": 3.7106, + "step": 2306 + }, + { + "epoch": 0.09713684210526316, + "grad_norm": 0.45703125, + "learning_rate": 0.0004923955602220007, + "loss": 3.8975, + "step": 2307 + }, + { + "epoch": 0.09717894736842106, + "grad_norm": 0.412109375, + "learning_rate": 0.0004923872985690557, + "loss": 3.6689, + "step": 2308 + }, + { + "epoch": 0.09722105263157894, + "grad_norm": 0.423828125, + "learning_rate": 0.000492379032500099, + "loss": 3.1689, + "step": 2309 + }, + { + "epoch": 0.09726315789473684, + "grad_norm": 0.5234375, + "learning_rate": 0.0004923707620152811, + "loss": 3.555, + "step": 2310 + }, + { + "epoch": 0.09730526315789474, + "grad_norm": 0.470703125, + "learning_rate": 0.0004923624871147526, + "loss": 3.2337, + "step": 2311 + }, + { + "epoch": 0.09734736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.0004923542077986644, + "loss": 3.4281, + "step": 2312 + }, + { + "epoch": 0.09738947368421053, + "grad_norm": 0.4140625, + "learning_rate": 0.0004923459240671673, + "loss": 3.3522, + "step": 2313 + }, + { + "epoch": 0.09743157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.0004923376359204123, + "loss": 3.7305, + "step": 2314 + }, + { + "epoch": 0.09747368421052631, + "grad_norm": 0.48046875, + "learning_rate": 0.0004923293433585503, + "loss": 3.1243, + "step": 2315 + }, + { + "epoch": 0.09751578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.0004923210463817322, + "loss": 3.749, + "step": 2316 + }, + { + "epoch": 0.09755789473684211, + "grad_norm": 0.400390625, + "learning_rate": 0.0004923127449901095, + "loss": 3.7855, + "step": 2317 + }, + { + "epoch": 0.0976, + "grad_norm": 0.416015625, + "learning_rate": 0.0004923044391838332, + "loss": 3.8114, + "step": 2318 + }, + { + "epoch": 0.09764210526315789, + "grad_norm": 0.44140625, + "learning_rate": 0.0004922961289630549, + "loss": 3.3494, + "step": 2319 + }, + { + "epoch": 0.09768421052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.0004922878143279256, + "loss": 3.6752, + "step": 2320 + }, + { + "epoch": 0.09772631578947369, + "grad_norm": 0.404296875, + "learning_rate": 0.0004922794952785972, + "loss": 3.9347, + "step": 2321 + }, + { + "epoch": 0.09776842105263157, + "grad_norm": 0.41015625, + "learning_rate": 0.0004922711718152209, + "loss": 3.7123, + "step": 2322 + }, + { + "epoch": 0.09781052631578947, + "grad_norm": 0.390625, + "learning_rate": 0.0004922628439379485, + "loss": 3.6127, + "step": 2323 + }, + { + "epoch": 0.09785263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0004922545116469318, + "loss": 3.7444, + "step": 2324 + }, + { + "epoch": 0.09789473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.0004922461749423226, + "loss": 3.5492, + "step": 2325 + }, + { + "epoch": 0.09793684210526316, + "grad_norm": 0.51171875, + "learning_rate": 0.0004922378338242725, + "loss": 3.0203, + "step": 2326 + }, + { + "epoch": 0.09797894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0004922294882929339, + "loss": 3.1889, + "step": 2327 + }, + { + "epoch": 0.09802105263157895, + "grad_norm": 0.46484375, + "learning_rate": 0.0004922211383484586, + "loss": 3.7574, + "step": 2328 + }, + { + "epoch": 0.09806315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.0004922127839909986, + "loss": 3.5968, + "step": 2329 + }, + { + "epoch": 0.09810526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.0004922044252207065, + "loss": 3.6444, + "step": 2330 + }, + { + "epoch": 0.09814736842105264, + "grad_norm": 0.41796875, + "learning_rate": 0.0004921960620377343, + "loss": 3.5392, + "step": 2331 + }, + { + "epoch": 0.09818947368421052, + "grad_norm": 0.439453125, + "learning_rate": 0.0004921876944422342, + "loss": 2.9158, + "step": 2332 + }, + { + "epoch": 0.09823157894736842, + "grad_norm": 0.48828125, + "learning_rate": 0.0004921793224343591, + "loss": 3.4225, + "step": 2333 + }, + { + "epoch": 0.09827368421052632, + "grad_norm": 0.453125, + "learning_rate": 0.0004921709460142612, + "loss": 3.5111, + "step": 2334 + }, + { + "epoch": 0.0983157894736842, + "grad_norm": 0.5234375, + "learning_rate": 0.0004921625651820933, + "loss": 3.7589, + "step": 2335 + }, + { + "epoch": 0.0983578947368421, + "grad_norm": 0.470703125, + "learning_rate": 0.0004921541799380078, + "loss": 3.2662, + "step": 2336 + }, + { + "epoch": 0.0984, + "grad_norm": 0.41015625, + "learning_rate": 0.0004921457902821578, + "loss": 3.5344, + "step": 2337 + }, + { + "epoch": 0.0984421052631579, + "grad_norm": 0.50390625, + "learning_rate": 0.000492137396214696, + "loss": 3.3708, + "step": 2338 + }, + { + "epoch": 0.09848421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.0004921289977357753, + "loss": 3.5139, + "step": 2339 + }, + { + "epoch": 0.09852631578947368, + "grad_norm": 0.40234375, + "learning_rate": 0.0004921205948455488, + "loss": 3.6257, + "step": 2340 + }, + { + "epoch": 0.09856842105263158, + "grad_norm": 0.5859375, + "learning_rate": 0.0004921121875441695, + "loss": 3.9827, + "step": 2341 + }, + { + "epoch": 0.09861052631578947, + "grad_norm": 0.51171875, + "learning_rate": 0.0004921037758317905, + "loss": 3.6639, + "step": 2342 + }, + { + "epoch": 0.09865263157894737, + "grad_norm": 0.451171875, + "learning_rate": 0.0004920953597085654, + "loss": 3.5262, + "step": 2343 + }, + { + "epoch": 0.09869473684210527, + "grad_norm": 0.48828125, + "learning_rate": 0.0004920869391746472, + "loss": 4.0008, + "step": 2344 + }, + { + "epoch": 0.09873684210526316, + "grad_norm": 0.46875, + "learning_rate": 0.0004920785142301893, + "loss": 3.1332, + "step": 2345 + }, + { + "epoch": 0.09877894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.0004920700848753454, + "loss": 3.6987, + "step": 2346 + }, + { + "epoch": 0.09882105263157895, + "grad_norm": 0.44140625, + "learning_rate": 0.000492061651110269, + "loss": 3.0578, + "step": 2347 + }, + { + "epoch": 0.09886315789473685, + "grad_norm": 0.43359375, + "learning_rate": 0.0004920532129351138, + "loss": 3.6252, + "step": 2348 + }, + { + "epoch": 0.09890526315789473, + "grad_norm": 0.578125, + "learning_rate": 0.0004920447703500333, + "loss": 3.3714, + "step": 2349 + }, + { + "epoch": 0.09894736842105263, + "grad_norm": 0.470703125, + "learning_rate": 0.0004920363233551816, + "loss": 3.0817, + "step": 2350 + }, + { + "epoch": 0.09898947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.0004920278719507123, + "loss": 3.938, + "step": 2351 + }, + { + "epoch": 0.09903157894736841, + "grad_norm": 0.4765625, + "learning_rate": 0.0004920194161367797, + "loss": 3.577, + "step": 2352 + }, + { + "epoch": 0.09907368421052631, + "grad_norm": 0.42578125, + "learning_rate": 0.0004920109559135376, + "loss": 3.6999, + "step": 2353 + }, + { + "epoch": 0.09911578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0004920024912811403, + "loss": 4.1408, + "step": 2354 + }, + { + "epoch": 0.09915789473684211, + "grad_norm": 0.6640625, + "learning_rate": 0.0004919940222397419, + "loss": 3.2699, + "step": 2355 + }, + { + "epoch": 0.0992, + "grad_norm": 0.484375, + "learning_rate": 0.0004919855487894968, + "loss": 3.665, + "step": 2356 + }, + { + "epoch": 0.0992421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.0004919770709305593, + "loss": 3.7943, + "step": 2357 + }, + { + "epoch": 0.0992842105263158, + "grad_norm": 0.796875, + "learning_rate": 0.000491968588663084, + "loss": 3.3793, + "step": 2358 + }, + { + "epoch": 0.09932631578947368, + "grad_norm": 0.427734375, + "learning_rate": 0.000491960101987225, + "loss": 3.5438, + "step": 2359 + }, + { + "epoch": 0.09936842105263158, + "grad_norm": 0.474609375, + "learning_rate": 0.0004919516109031375, + "loss": 4.0149, + "step": 2360 + }, + { + "epoch": 0.09941052631578948, + "grad_norm": 0.484375, + "learning_rate": 0.0004919431154109758, + "loss": 3.5727, + "step": 2361 + }, + { + "epoch": 0.09945263157894736, + "grad_norm": 0.447265625, + "learning_rate": 0.0004919346155108949, + "loss": 3.3026, + "step": 2362 + }, + { + "epoch": 0.09949473684210526, + "grad_norm": 0.455078125, + "learning_rate": 0.0004919261112030495, + "loss": 3.7856, + "step": 2363 + }, + { + "epoch": 0.09953684210526316, + "grad_norm": 0.46484375, + "learning_rate": 0.0004919176024875947, + "loss": 3.9578, + "step": 2364 + }, + { + "epoch": 0.09957894736842106, + "grad_norm": 0.447265625, + "learning_rate": 0.0004919090893646854, + "loss": 3.8835, + "step": 2365 + }, + { + "epoch": 0.09962105263157894, + "grad_norm": 0.4453125, + "learning_rate": 0.0004919005718344767, + "loss": 3.2881, + "step": 2366 + }, + { + "epoch": 0.09966315789473684, + "grad_norm": 0.52734375, + "learning_rate": 0.0004918920498971238, + "loss": 3.4665, + "step": 2367 + }, + { + "epoch": 0.09970526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.0004918835235527819, + "loss": 3.7954, + "step": 2368 + }, + { + "epoch": 0.09974736842105263, + "grad_norm": 0.462890625, + "learning_rate": 0.0004918749928016064, + "loss": 3.0782, + "step": 2369 + }, + { + "epoch": 0.09978947368421053, + "grad_norm": 0.546875, + "learning_rate": 0.0004918664576437528, + "loss": 3.4417, + "step": 2370 + }, + { + "epoch": 0.09983157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.0004918579180793765, + "loss": 3.1913, + "step": 2371 + }, + { + "epoch": 0.09987368421052631, + "grad_norm": 0.419921875, + "learning_rate": 0.0004918493741086331, + "loss": 3.7737, + "step": 2372 + }, + { + "epoch": 0.09991578947368421, + "grad_norm": 0.458984375, + "learning_rate": 0.0004918408257316784, + "loss": 3.4303, + "step": 2373 + }, + { + "epoch": 0.0999578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.0004918322729486677, + "loss": 3.4441, + "step": 2374 + }, + { + "epoch": 0.1, + "grad_norm": 0.470703125, + "learning_rate": 0.0004918237157597574, + "loss": 3.574, + "step": 2375 + }, + { + "epoch": 0.10004210526315789, + "grad_norm": 0.390625, + "learning_rate": 0.000491815154165103, + "loss": 3.3758, + "step": 2376 + }, + { + "epoch": 0.10008421052631579, + "grad_norm": 0.4609375, + "learning_rate": 0.0004918065881648607, + "loss": 3.5957, + "step": 2377 + }, + { + "epoch": 0.10012631578947369, + "grad_norm": 0.443359375, + "learning_rate": 0.0004917980177591864, + "loss": 3.654, + "step": 2378 + }, + { + "epoch": 0.10016842105263157, + "grad_norm": 0.44140625, + "learning_rate": 0.0004917894429482363, + "loss": 3.5593, + "step": 2379 + }, + { + "epoch": 0.10021052631578947, + "grad_norm": 0.4453125, + "learning_rate": 0.0004917808637321667, + "loss": 3.145, + "step": 2380 + }, + { + "epoch": 0.10025263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.0004917722801111337, + "loss": 3.5548, + "step": 2381 + }, + { + "epoch": 0.10029473684210527, + "grad_norm": 0.435546875, + "learning_rate": 0.000491763692085294, + "loss": 2.8117, + "step": 2382 + }, + { + "epoch": 0.10033684210526315, + "grad_norm": 0.421875, + "learning_rate": 0.0004917550996548039, + "loss": 4.078, + "step": 2383 + }, + { + "epoch": 0.10037894736842105, + "grad_norm": 0.4140625, + "learning_rate": 0.0004917465028198199, + "loss": 3.5865, + "step": 2384 + }, + { + "epoch": 0.10042105263157895, + "grad_norm": 0.44921875, + "learning_rate": 0.0004917379015804985, + "loss": 3.3883, + "step": 2385 + }, + { + "epoch": 0.10046315789473684, + "grad_norm": 0.45703125, + "learning_rate": 0.0004917292959369968, + "loss": 3.7073, + "step": 2386 + }, + { + "epoch": 0.10050526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.0004917206858894713, + "loss": 3.8791, + "step": 2387 + }, + { + "epoch": 0.10054736842105264, + "grad_norm": 0.4140625, + "learning_rate": 0.0004917120714380788, + "loss": 3.6739, + "step": 2388 + }, + { + "epoch": 0.10058947368421052, + "grad_norm": 0.423828125, + "learning_rate": 0.0004917034525829764, + "loss": 3.7219, + "step": 2389 + }, + { + "epoch": 0.10063157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0004916948293243211, + "loss": 3.509, + "step": 2390 + }, + { + "epoch": 0.10067368421052632, + "grad_norm": 0.44921875, + "learning_rate": 0.00049168620166227, + "loss": 3.5365, + "step": 2391 + }, + { + "epoch": 0.10071578947368422, + "grad_norm": 0.431640625, + "learning_rate": 0.0004916775695969803, + "loss": 3.7747, + "step": 2392 + }, + { + "epoch": 0.1007578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.0004916689331286091, + "loss": 4.096, + "step": 2393 + }, + { + "epoch": 0.1008, + "grad_norm": 0.478515625, + "learning_rate": 0.000491660292257314, + "loss": 3.8376, + "step": 2394 + }, + { + "epoch": 0.1008421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.0004916516469832524, + "loss": 3.5274, + "step": 2395 + }, + { + "epoch": 0.10088421052631578, + "grad_norm": 0.419921875, + "learning_rate": 0.0004916429973065815, + "loss": 3.6971, + "step": 2396 + }, + { + "epoch": 0.10092631578947368, + "grad_norm": 0.408203125, + "learning_rate": 0.0004916343432274594, + "loss": 3.5289, + "step": 2397 + }, + { + "epoch": 0.10096842105263158, + "grad_norm": 0.45703125, + "learning_rate": 0.0004916256847460433, + "loss": 3.685, + "step": 2398 + }, + { + "epoch": 0.10101052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.0004916170218624911, + "loss": 3.7434, + "step": 2399 + }, + { + "epoch": 0.10105263157894737, + "grad_norm": 0.400390625, + "learning_rate": 0.0004916083545769607, + "loss": 3.5422, + "step": 2400 + }, + { + "epoch": 0.10109473684210527, + "grad_norm": 0.44921875, + "learning_rate": 0.0004915996828896099, + "loss": 3.3303, + "step": 2401 + }, + { + "epoch": 0.10113684210526316, + "grad_norm": 0.49609375, + "learning_rate": 0.0004915910068005968, + "loss": 3.482, + "step": 2402 + }, + { + "epoch": 0.10117894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.0004915823263100794, + "loss": 3.6612, + "step": 2403 + }, + { + "epoch": 0.10122105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.0004915736414182159, + "loss": 3.6481, + "step": 2404 + }, + { + "epoch": 0.10126315789473685, + "grad_norm": 0.451171875, + "learning_rate": 0.0004915649521251644, + "loss": 3.3203, + "step": 2405 + }, + { + "epoch": 0.10130526315789473, + "grad_norm": 0.5234375, + "learning_rate": 0.0004915562584310834, + "loss": 3.3243, + "step": 2406 + }, + { + "epoch": 0.10134736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.0004915475603361312, + "loss": 3.9871, + "step": 2407 + }, + { + "epoch": 0.10138947368421053, + "grad_norm": 0.50390625, + "learning_rate": 0.0004915388578404661, + "loss": 3.5335, + "step": 2408 + }, + { + "epoch": 0.10143157894736841, + "grad_norm": 0.4140625, + "learning_rate": 0.0004915301509442469, + "loss": 3.9484, + "step": 2409 + }, + { + "epoch": 0.10147368421052631, + "grad_norm": 0.390625, + "learning_rate": 0.0004915214396476322, + "loss": 3.4999, + "step": 2410 + }, + { + "epoch": 0.10151578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.0004915127239507806, + "loss": 3.4293, + "step": 2411 + }, + { + "epoch": 0.10155789473684211, + "grad_norm": 0.427734375, + "learning_rate": 0.0004915040038538509, + "loss": 3.7979, + "step": 2412 + }, + { + "epoch": 0.1016, + "grad_norm": 0.50390625, + "learning_rate": 0.000491495279357002, + "loss": 3.7688, + "step": 2413 + }, + { + "epoch": 0.1016421052631579, + "grad_norm": 0.52734375, + "learning_rate": 0.0004914865504603929, + "loss": 4.1445, + "step": 2414 + }, + { + "epoch": 0.1016842105263158, + "grad_norm": 0.4921875, + "learning_rate": 0.0004914778171641826, + "loss": 3.4013, + "step": 2415 + }, + { + "epoch": 0.10172631578947368, + "grad_norm": 0.47265625, + "learning_rate": 0.0004914690794685302, + "loss": 3.2828, + "step": 2416 + }, + { + "epoch": 0.10176842105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.0004914603373735948, + "loss": 3.3925, + "step": 2417 + }, + { + "epoch": 0.10181052631578948, + "grad_norm": 0.455078125, + "learning_rate": 0.0004914515908795358, + "loss": 3.3367, + "step": 2418 + }, + { + "epoch": 0.10185263157894738, + "grad_norm": 0.51953125, + "learning_rate": 0.0004914428399865126, + "loss": 3.6096, + "step": 2419 + }, + { + "epoch": 0.10189473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.0004914340846946843, + "loss": 3.6033, + "step": 2420 + }, + { + "epoch": 0.10193684210526316, + "grad_norm": 0.51953125, + "learning_rate": 0.0004914253250042109, + "loss": 3.0198, + "step": 2421 + }, + { + "epoch": 0.10197894736842106, + "grad_norm": 0.5703125, + "learning_rate": 0.0004914165609152516, + "loss": 3.4526, + "step": 2422 + }, + { + "epoch": 0.10202105263157894, + "grad_norm": 0.4765625, + "learning_rate": 0.0004914077924279663, + "loss": 3.216, + "step": 2423 + }, + { + "epoch": 0.10206315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.0004913990195425146, + "loss": 3.4327, + "step": 2424 + }, + { + "epoch": 0.10210526315789474, + "grad_norm": 0.4609375, + "learning_rate": 0.0004913902422590564, + "loss": 3.4962, + "step": 2425 + }, + { + "epoch": 0.10214736842105263, + "grad_norm": 0.46484375, + "learning_rate": 0.0004913814605777517, + "loss": 3.1794, + "step": 2426 + }, + { + "epoch": 0.10218947368421052, + "grad_norm": 0.453125, + "learning_rate": 0.0004913726744987602, + "loss": 3.7942, + "step": 2427 + }, + { + "epoch": 0.10223157894736842, + "grad_norm": 0.3984375, + "learning_rate": 0.0004913638840222424, + "loss": 3.6939, + "step": 2428 + }, + { + "epoch": 0.10227368421052632, + "grad_norm": 0.40234375, + "learning_rate": 0.0004913550891483581, + "loss": 3.6534, + "step": 2429 + }, + { + "epoch": 0.10231578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.0004913462898772677, + "loss": 3.2149, + "step": 2430 + }, + { + "epoch": 0.1023578947368421, + "grad_norm": 0.50390625, + "learning_rate": 0.0004913374862091314, + "loss": 3.4913, + "step": 2431 + }, + { + "epoch": 0.1024, + "grad_norm": 0.43359375, + "learning_rate": 0.0004913286781441097, + "loss": 3.7831, + "step": 2432 + }, + { + "epoch": 0.10244210526315789, + "grad_norm": 0.423828125, + "learning_rate": 0.0004913198656823631, + "loss": 3.4988, + "step": 2433 + }, + { + "epoch": 0.10248421052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.000491311048824052, + "loss": 3.6084, + "step": 2434 + }, + { + "epoch": 0.10252631578947369, + "grad_norm": 0.419921875, + "learning_rate": 0.0004913022275693372, + "loss": 3.3943, + "step": 2435 + }, + { + "epoch": 0.10256842105263157, + "grad_norm": 0.439453125, + "learning_rate": 0.0004912934019183793, + "loss": 3.4985, + "step": 2436 + }, + { + "epoch": 0.10261052631578947, + "grad_norm": 0.51953125, + "learning_rate": 0.0004912845718713391, + "loss": 3.4263, + "step": 2437 + }, + { + "epoch": 0.10265263157894737, + "grad_norm": 0.59375, + "learning_rate": 0.0004912757374283775, + "loss": 3.5648, + "step": 2438 + }, + { + "epoch": 0.10269473684210527, + "grad_norm": 0.462890625, + "learning_rate": 0.0004912668985896555, + "loss": 3.6311, + "step": 2439 + }, + { + "epoch": 0.10273684210526315, + "grad_norm": 0.474609375, + "learning_rate": 0.000491258055355334, + "loss": 3.0916, + "step": 2440 + }, + { + "epoch": 0.10277894736842105, + "grad_norm": 0.439453125, + "learning_rate": 0.0004912492077255743, + "loss": 3.679, + "step": 2441 + }, + { + "epoch": 0.10282105263157895, + "grad_norm": 0.4609375, + "learning_rate": 0.0004912403557005374, + "loss": 3.87, + "step": 2442 + }, + { + "epoch": 0.10286315789473684, + "grad_norm": 0.380859375, + "learning_rate": 0.0004912314992803847, + "loss": 3.3916, + "step": 2443 + }, + { + "epoch": 0.10290526315789474, + "grad_norm": 0.4375, + "learning_rate": 0.0004912226384652775, + "loss": 3.4416, + "step": 2444 + }, + { + "epoch": 0.10294736842105263, + "grad_norm": 0.44140625, + "learning_rate": 0.0004912137732553772, + "loss": 3.6113, + "step": 2445 + }, + { + "epoch": 0.10298947368421052, + "grad_norm": 0.490234375, + "learning_rate": 0.0004912049036508454, + "loss": 3.6274, + "step": 2446 + }, + { + "epoch": 0.10303157894736842, + "grad_norm": 0.51953125, + "learning_rate": 0.0004911960296518438, + "loss": 3.4642, + "step": 2447 + }, + { + "epoch": 0.10307368421052632, + "grad_norm": 0.41796875, + "learning_rate": 0.0004911871512585337, + "loss": 3.9321, + "step": 2448 + }, + { + "epoch": 0.10311578947368422, + "grad_norm": 0.5625, + "learning_rate": 0.0004911782684710773, + "loss": 3.8722, + "step": 2449 + }, + { + "epoch": 0.1031578947368421, + "grad_norm": 0.474609375, + "learning_rate": 0.0004911693812896361, + "loss": 3.4834, + "step": 2450 + }, + { + "epoch": 0.1032, + "grad_norm": 0.44921875, + "learning_rate": 0.0004911604897143722, + "loss": 3.4988, + "step": 2451 + }, + { + "epoch": 0.1032421052631579, + "grad_norm": 0.455078125, + "learning_rate": 0.0004911515937454474, + "loss": 3.6018, + "step": 2452 + }, + { + "epoch": 0.10328421052631578, + "grad_norm": 0.455078125, + "learning_rate": 0.000491142693383024, + "loss": 3.6775, + "step": 2453 + }, + { + "epoch": 0.10332631578947368, + "grad_norm": 0.44921875, + "learning_rate": 0.0004911337886272641, + "loss": 3.3132, + "step": 2454 + }, + { + "epoch": 0.10336842105263158, + "grad_norm": 0.451171875, + "learning_rate": 0.0004911248794783299, + "loss": 3.5588, + "step": 2455 + }, + { + "epoch": 0.10341052631578948, + "grad_norm": 0.451171875, + "learning_rate": 0.0004911159659363836, + "loss": 3.8202, + "step": 2456 + }, + { + "epoch": 0.10345263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.0004911070480015878, + "loss": 3.5973, + "step": 2457 + }, + { + "epoch": 0.10349473684210526, + "grad_norm": 0.45703125, + "learning_rate": 0.0004910981256741048, + "loss": 3.3257, + "step": 2458 + }, + { + "epoch": 0.10353684210526316, + "grad_norm": 0.451171875, + "learning_rate": 0.0004910891989540973, + "loss": 3.6352, + "step": 2459 + }, + { + "epoch": 0.10357894736842105, + "grad_norm": 0.490234375, + "learning_rate": 0.0004910802678417279, + "loss": 3.6312, + "step": 2460 + }, + { + "epoch": 0.10362105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.0004910713323371591, + "loss": 3.8383, + "step": 2461 + }, + { + "epoch": 0.10366315789473685, + "grad_norm": 0.421875, + "learning_rate": 0.000491062392440554, + "loss": 4.0315, + "step": 2462 + }, + { + "epoch": 0.10370526315789473, + "grad_norm": 0.4296875, + "learning_rate": 0.0004910534481520753, + "loss": 3.4518, + "step": 2463 + }, + { + "epoch": 0.10374736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.0004910444994718861, + "loss": 3.489, + "step": 2464 + }, + { + "epoch": 0.10378947368421053, + "grad_norm": 0.462890625, + "learning_rate": 0.0004910355464001492, + "loss": 3.5052, + "step": 2465 + }, + { + "epoch": 0.10383157894736843, + "grad_norm": 0.44140625, + "learning_rate": 0.0004910265889370279, + "loss": 3.3843, + "step": 2466 + }, + { + "epoch": 0.10387368421052631, + "grad_norm": 0.5234375, + "learning_rate": 0.0004910176270826854, + "loss": 3.4297, + "step": 2467 + }, + { + "epoch": 0.10391578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0004910086608372849, + "loss": 3.7052, + "step": 2468 + }, + { + "epoch": 0.10395789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.0004909996902009897, + "loss": 3.5446, + "step": 2469 + }, + { + "epoch": 0.104, + "grad_norm": 0.474609375, + "learning_rate": 0.0004909907151739633, + "loss": 3.178, + "step": 2470 + }, + { + "epoch": 0.1040421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0004909817357563693, + "loss": 3.6128, + "step": 2471 + }, + { + "epoch": 0.10408421052631579, + "grad_norm": 0.52734375, + "learning_rate": 0.0004909727519483712, + "loss": 3.6213, + "step": 2472 + }, + { + "epoch": 0.10412631578947368, + "grad_norm": 0.4375, + "learning_rate": 0.0004909637637501327, + "loss": 3.6342, + "step": 2473 + }, + { + "epoch": 0.10416842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0004909547711618175, + "loss": 3.5634, + "step": 2474 + }, + { + "epoch": 0.10421052631578948, + "grad_norm": 0.68359375, + "learning_rate": 0.0004909457741835895, + "loss": 3.7878, + "step": 2475 + }, + { + "epoch": 0.10425263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.0004909367728156126, + "loss": 3.2269, + "step": 2476 + }, + { + "epoch": 0.10429473684210526, + "grad_norm": 0.439453125, + "learning_rate": 0.0004909277670580508, + "loss": 3.3652, + "step": 2477 + }, + { + "epoch": 0.10433684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.0004909187569110681, + "loss": 3.4177, + "step": 2478 + }, + { + "epoch": 0.10437894736842106, + "grad_norm": 0.59375, + "learning_rate": 0.0004909097423748288, + "loss": 3.4736, + "step": 2479 + }, + { + "epoch": 0.10442105263157894, + "grad_norm": 0.4609375, + "learning_rate": 0.000490900723449497, + "loss": 3.6626, + "step": 2480 + }, + { + "epoch": 0.10446315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.0004908917001352371, + "loss": 3.4584, + "step": 2481 + }, + { + "epoch": 0.10450526315789474, + "grad_norm": 0.546875, + "learning_rate": 0.0004908826724322134, + "loss": 4.0237, + "step": 2482 + }, + { + "epoch": 0.10454736842105262, + "grad_norm": 0.46484375, + "learning_rate": 0.0004908736403405905, + "loss": 3.6316, + "step": 2483 + }, + { + "epoch": 0.10458947368421052, + "grad_norm": 0.466796875, + "learning_rate": 0.0004908646038605329, + "loss": 3.5141, + "step": 2484 + }, + { + "epoch": 0.10463157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.0004908555629922052, + "loss": 3.7593, + "step": 2485 + }, + { + "epoch": 0.10467368421052632, + "grad_norm": 0.4453125, + "learning_rate": 0.0004908465177357721, + "loss": 3.6262, + "step": 2486 + }, + { + "epoch": 0.1047157894736842, + "grad_norm": 0.482421875, + "learning_rate": 0.0004908374680913984, + "loss": 3.2696, + "step": 2487 + }, + { + "epoch": 0.1047578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.000490828414059249, + "loss": 3.5004, + "step": 2488 + }, + { + "epoch": 0.1048, + "grad_norm": 0.431640625, + "learning_rate": 0.000490819355639489, + "loss": 3.8349, + "step": 2489 + }, + { + "epoch": 0.10484210526315789, + "grad_norm": 0.53125, + "learning_rate": 0.0004908102928322832, + "loss": 3.6042, + "step": 2490 + }, + { + "epoch": 0.10488421052631579, + "grad_norm": 0.48828125, + "learning_rate": 0.0004908012256377969, + "loss": 3.3381, + "step": 2491 + }, + { + "epoch": 0.10492631578947369, + "grad_norm": 0.451171875, + "learning_rate": 0.0004907921540561951, + "loss": 3.089, + "step": 2492 + }, + { + "epoch": 0.10496842105263159, + "grad_norm": 0.435546875, + "learning_rate": 0.0004907830780876431, + "loss": 3.8093, + "step": 2493 + }, + { + "epoch": 0.10501052631578947, + "grad_norm": 0.482421875, + "learning_rate": 0.0004907739977323064, + "loss": 3.778, + "step": 2494 + }, + { + "epoch": 0.10505263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0004907649129903504, + "loss": 4.023, + "step": 2495 + }, + { + "epoch": 0.10509473684210527, + "grad_norm": 0.484375, + "learning_rate": 0.0004907558238619405, + "loss": 3.0695, + "step": 2496 + }, + { + "epoch": 0.10513684210526315, + "grad_norm": 0.4921875, + "learning_rate": 0.0004907467303472423, + "loss": 3.5901, + "step": 2497 + }, + { + "epoch": 0.10517894736842105, + "grad_norm": 0.478515625, + "learning_rate": 0.0004907376324464218, + "loss": 3.4782, + "step": 2498 + }, + { + "epoch": 0.10522105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.0004907285301596442, + "loss": 3.2867, + "step": 2499 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.0004907194234870758, + "loss": 3.462, + "step": 2500 + }, + { + "epoch": 0.10530526315789474, + "grad_norm": 0.55078125, + "learning_rate": 0.0004907103124288823, + "loss": 3.423, + "step": 2501 + }, + { + "epoch": 0.10534736842105263, + "grad_norm": 0.4921875, + "learning_rate": 0.0004907011969852297, + "loss": 3.3231, + "step": 2502 + }, + { + "epoch": 0.10538947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.0004906920771562842, + "loss": 3.5672, + "step": 2503 + }, + { + "epoch": 0.10543157894736842, + "grad_norm": 0.40625, + "learning_rate": 0.0004906829529422118, + "loss": 3.4666, + "step": 2504 + }, + { + "epoch": 0.10547368421052632, + "grad_norm": 0.412109375, + "learning_rate": 0.0004906738243431787, + "loss": 3.8139, + "step": 2505 + }, + { + "epoch": 0.10551578947368422, + "grad_norm": 0.42578125, + "learning_rate": 0.0004906646913593514, + "loss": 3.0866, + "step": 2506 + }, + { + "epoch": 0.1055578947368421, + "grad_norm": 0.4609375, + "learning_rate": 0.0004906555539908962, + "loss": 3.0625, + "step": 2507 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4375, + "learning_rate": 0.0004906464122379796, + "loss": 3.3318, + "step": 2508 + }, + { + "epoch": 0.1056421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.000490637266100768, + "loss": 3.6122, + "step": 2509 + }, + { + "epoch": 0.10568421052631578, + "grad_norm": 0.51171875, + "learning_rate": 0.0004906281155794282, + "loss": 2.7703, + "step": 2510 + }, + { + "epoch": 0.10572631578947368, + "grad_norm": 0.4765625, + "learning_rate": 0.0004906189606741268, + "loss": 3.3437, + "step": 2511 + }, + { + "epoch": 0.10576842105263158, + "grad_norm": 0.5078125, + "learning_rate": 0.0004906098013850307, + "loss": 3.3754, + "step": 2512 + }, + { + "epoch": 0.10581052631578948, + "grad_norm": 0.427734375, + "learning_rate": 0.0004906006377123068, + "loss": 3.4719, + "step": 2513 + }, + { + "epoch": 0.10585263157894736, + "grad_norm": 0.47265625, + "learning_rate": 0.0004905914696561219, + "loss": 3.7294, + "step": 2514 + }, + { + "epoch": 0.10589473684210526, + "grad_norm": 0.451171875, + "learning_rate": 0.0004905822972166432, + "loss": 3.3914, + "step": 2515 + }, + { + "epoch": 0.10593684210526316, + "grad_norm": 0.56640625, + "learning_rate": 0.0004905731203940375, + "loss": 3.5486, + "step": 2516 + }, + { + "epoch": 0.10597894736842105, + "grad_norm": 0.45703125, + "learning_rate": 0.0004905639391884724, + "loss": 3.5795, + "step": 2517 + }, + { + "epoch": 0.10602105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.000490554753600115, + "loss": 3.7017, + "step": 2518 + }, + { + "epoch": 0.10606315789473685, + "grad_norm": 0.51171875, + "learning_rate": 0.0004905455636291324, + "loss": 4.0968, + "step": 2519 + }, + { + "epoch": 0.10610526315789473, + "grad_norm": 0.4375, + "learning_rate": 0.0004905363692756925, + "loss": 3.7371, + "step": 2520 + }, + { + "epoch": 0.10614736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.0004905271705399624, + "loss": 2.97, + "step": 2521 + }, + { + "epoch": 0.10618947368421053, + "grad_norm": 0.478515625, + "learning_rate": 0.00049051796742211, + "loss": 3.3626, + "step": 2522 + }, + { + "epoch": 0.10623157894736843, + "grad_norm": 0.5546875, + "learning_rate": 0.0004905087599223027, + "loss": 3.4704, + "step": 2523 + }, + { + "epoch": 0.10627368421052631, + "grad_norm": 0.4296875, + "learning_rate": 0.0004904995480407086, + "loss": 3.9547, + "step": 2524 + }, + { + "epoch": 0.10631578947368421, + "grad_norm": 0.47265625, + "learning_rate": 0.000490490331777495, + "loss": 3.9125, + "step": 2525 + }, + { + "epoch": 0.10635789473684211, + "grad_norm": 0.5390625, + "learning_rate": 0.0004904811111328303, + "loss": 3.0285, + "step": 2526 + }, + { + "epoch": 0.1064, + "grad_norm": 0.43359375, + "learning_rate": 0.0004904718861068823, + "loss": 3.566, + "step": 2527 + }, + { + "epoch": 0.1064421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.0004904626566998191, + "loss": 3.4938, + "step": 2528 + }, + { + "epoch": 0.10648421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.0004904534229118087, + "loss": 3.5643, + "step": 2529 + }, + { + "epoch": 0.10652631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.0004904441847430196, + "loss": 3.4715, + "step": 2530 + }, + { + "epoch": 0.10656842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.00049043494219362, + "loss": 3.9355, + "step": 2531 + }, + { + "epoch": 0.10661052631578948, + "grad_norm": 0.43359375, + "learning_rate": 0.0004904256952637781, + "loss": 3.3853, + "step": 2532 + }, + { + "epoch": 0.10665263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.0004904164439536626, + "loss": 3.1788, + "step": 2533 + }, + { + "epoch": 0.10669473684210526, + "grad_norm": 0.4609375, + "learning_rate": 0.0004904071882634419, + "loss": 3.379, + "step": 2534 + }, + { + "epoch": 0.10673684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.0004903979281932849, + "loss": 3.947, + "step": 2535 + }, + { + "epoch": 0.10677894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.0004903886637433599, + "loss": 3.6698, + "step": 2536 + }, + { + "epoch": 0.10682105263157894, + "grad_norm": 0.423828125, + "learning_rate": 0.0004903793949138359, + "loss": 3.8876, + "step": 2537 + }, + { + "epoch": 0.10686315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.0004903701217048818, + "loss": 3.754, + "step": 2538 + }, + { + "epoch": 0.10690526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.0004903608441166666, + "loss": 3.6785, + "step": 2539 + }, + { + "epoch": 0.10694736842105264, + "grad_norm": 0.41015625, + "learning_rate": 0.000490351562149359, + "loss": 4.0393, + "step": 2540 + }, + { + "epoch": 0.10698947368421052, + "grad_norm": 0.404296875, + "learning_rate": 0.0004903422758031285, + "loss": 3.7675, + "step": 2541 + }, + { + "epoch": 0.10703157894736842, + "grad_norm": 0.5859375, + "learning_rate": 0.0004903329850781442, + "loss": 3.4964, + "step": 2542 + }, + { + "epoch": 0.10707368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.0004903236899745752, + "loss": 3.729, + "step": 2543 + }, + { + "epoch": 0.1071157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.000490314390492591, + "loss": 3.4319, + "step": 2544 + }, + { + "epoch": 0.1071578947368421, + "grad_norm": 0.392578125, + "learning_rate": 0.0004903050866323608, + "loss": 3.559, + "step": 2545 + }, + { + "epoch": 0.1072, + "grad_norm": 0.447265625, + "learning_rate": 0.0004902957783940544, + "loss": 3.3122, + "step": 2546 + }, + { + "epoch": 0.10724210526315789, + "grad_norm": 0.46875, + "learning_rate": 0.0004902864657778411, + "loss": 3.2039, + "step": 2547 + }, + { + "epoch": 0.10728421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.0004902771487838909, + "loss": 3.7317, + "step": 2548 + }, + { + "epoch": 0.10732631578947369, + "grad_norm": 0.515625, + "learning_rate": 0.0004902678274123734, + "loss": 3.4333, + "step": 2549 + }, + { + "epoch": 0.10736842105263159, + "grad_norm": 0.42578125, + "learning_rate": 0.0004902585016634583, + "loss": 3.4448, + "step": 2550 + }, + { + "epoch": 0.10741052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.0004902491715373155, + "loss": 3.1972, + "step": 2551 + }, + { + "epoch": 0.10745263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.0004902398370341153, + "loss": 3.6141, + "step": 2552 + }, + { + "epoch": 0.10749473684210527, + "grad_norm": 0.421875, + "learning_rate": 0.0004902304981540273, + "loss": 3.4586, + "step": 2553 + }, + { + "epoch": 0.10753684210526315, + "grad_norm": 0.421875, + "learning_rate": 0.000490221154897222, + "loss": 3.4054, + "step": 2554 + }, + { + "epoch": 0.10757894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.0004902118072638695, + "loss": 3.5277, + "step": 2555 + }, + { + "epoch": 0.10762105263157895, + "grad_norm": 0.96875, + "learning_rate": 0.0004902024552541401, + "loss": 3.6185, + "step": 2556 + }, + { + "epoch": 0.10766315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.0004901930988682042, + "loss": 3.6268, + "step": 2557 + }, + { + "epoch": 0.10770526315789473, + "grad_norm": 0.546875, + "learning_rate": 0.0004901837381062322, + "loss": 3.7803, + "step": 2558 + }, + { + "epoch": 0.10774736842105263, + "grad_norm": 0.44921875, + "learning_rate": 0.0004901743729683947, + "loss": 3.164, + "step": 2559 + }, + { + "epoch": 0.10778947368421053, + "grad_norm": 0.416015625, + "learning_rate": 0.0004901650034548625, + "loss": 3.5852, + "step": 2560 + }, + { + "epoch": 0.10783157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.0004901556295658059, + "loss": 3.9368, + "step": 2561 + }, + { + "epoch": 0.10787368421052632, + "grad_norm": 0.4140625, + "learning_rate": 0.000490146251301396, + "loss": 3.9976, + "step": 2562 + }, + { + "epoch": 0.10791578947368421, + "grad_norm": 0.392578125, + "learning_rate": 0.0004901368686618035, + "loss": 3.8391, + "step": 2563 + }, + { + "epoch": 0.1079578947368421, + "grad_norm": 0.39453125, + "learning_rate": 0.0004901274816471995, + "loss": 3.6153, + "step": 2564 + }, + { + "epoch": 0.108, + "grad_norm": 0.4453125, + "learning_rate": 0.0004901180902577548, + "loss": 3.579, + "step": 2565 + }, + { + "epoch": 0.1080421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.0004901086944936407, + "loss": 3.8054, + "step": 2566 + }, + { + "epoch": 0.10808421052631578, + "grad_norm": 0.49609375, + "learning_rate": 0.0004900992943550283, + "loss": 3.3685, + "step": 2567 + }, + { + "epoch": 0.10812631578947368, + "grad_norm": 0.400390625, + "learning_rate": 0.0004900898898420889, + "loss": 3.5342, + "step": 2568 + }, + { + "epoch": 0.10816842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.0004900804809549937, + "loss": 3.3373, + "step": 2569 + }, + { + "epoch": 0.10821052631578948, + "grad_norm": 0.431640625, + "learning_rate": 0.0004900710676939143, + "loss": 3.4235, + "step": 2570 + }, + { + "epoch": 0.10825263157894736, + "grad_norm": 0.421875, + "learning_rate": 0.000490061650059022, + "loss": 3.7912, + "step": 2571 + }, + { + "epoch": 0.10829473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.0004900522280504887, + "loss": 3.8327, + "step": 2572 + }, + { + "epoch": 0.10833684210526316, + "grad_norm": 0.4921875, + "learning_rate": 0.0004900428016684857, + "loss": 3.5385, + "step": 2573 + }, + { + "epoch": 0.10837894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.0004900333709131849, + "loss": 3.7324, + "step": 2574 + }, + { + "epoch": 0.10842105263157895, + "grad_norm": 0.396484375, + "learning_rate": 0.0004900239357847582, + "loss": 3.1748, + "step": 2575 + }, + { + "epoch": 0.10846315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.0004900144962833773, + "loss": 3.3892, + "step": 2576 + }, + { + "epoch": 0.10850526315789474, + "grad_norm": 0.48046875, + "learning_rate": 0.0004900050524092143, + "loss": 3.7106, + "step": 2577 + }, + { + "epoch": 0.10854736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.0004899956041624413, + "loss": 3.1601, + "step": 2578 + }, + { + "epoch": 0.10858947368421053, + "grad_norm": 0.47265625, + "learning_rate": 0.0004899861515432303, + "loss": 3.5534, + "step": 2579 + }, + { + "epoch": 0.10863157894736843, + "grad_norm": 0.45703125, + "learning_rate": 0.0004899766945517537, + "loss": 3.7678, + "step": 2580 + }, + { + "epoch": 0.10867368421052631, + "grad_norm": 0.4375, + "learning_rate": 0.0004899672331881836, + "loss": 3.7374, + "step": 2581 + }, + { + "epoch": 0.10871578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.0004899577674526925, + "loss": 3.9345, + "step": 2582 + }, + { + "epoch": 0.10875789473684211, + "grad_norm": 0.416015625, + "learning_rate": 0.0004899482973454529, + "loss": 3.3318, + "step": 2583 + }, + { + "epoch": 0.1088, + "grad_norm": 0.50390625, + "learning_rate": 0.000489938822866637, + "loss": 3.7552, + "step": 2584 + }, + { + "epoch": 0.10884210526315789, + "grad_norm": 0.419921875, + "learning_rate": 0.0004899293440164178, + "loss": 3.2896, + "step": 2585 + }, + { + "epoch": 0.10888421052631579, + "grad_norm": 0.462890625, + "learning_rate": 0.000489919860794968, + "loss": 3.5565, + "step": 2586 + }, + { + "epoch": 0.10892631578947369, + "grad_norm": 0.478515625, + "learning_rate": 0.0004899103732024601, + "loss": 3.2471, + "step": 2587 + }, + { + "epoch": 0.10896842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.0004899008812390671, + "loss": 3.795, + "step": 2588 + }, + { + "epoch": 0.10901052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.0004898913849049619, + "loss": 3.6525, + "step": 2589 + }, + { + "epoch": 0.10905263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.0004898818842003175, + "loss": 3.3793, + "step": 2590 + }, + { + "epoch": 0.10909473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.000489872379125307, + "loss": 3.6333, + "step": 2591 + }, + { + "epoch": 0.10913684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.0004898628696801037, + "loss": 3.4143, + "step": 2592 + }, + { + "epoch": 0.10917894736842106, + "grad_norm": 0.375, + "learning_rate": 0.0004898533558648807, + "loss": 3.3189, + "step": 2593 + }, + { + "epoch": 0.10922105263157894, + "grad_norm": 0.41015625, + "learning_rate": 0.0004898438376798114, + "loss": 3.565, + "step": 2594 + }, + { + "epoch": 0.10926315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.000489834315125069, + "loss": 3.4209, + "step": 2595 + }, + { + "epoch": 0.10930526315789474, + "grad_norm": 0.458984375, + "learning_rate": 0.0004898247882008273, + "loss": 3.5324, + "step": 2596 + }, + { + "epoch": 0.10934736842105264, + "grad_norm": 0.365234375, + "learning_rate": 0.0004898152569072598, + "loss": 3.5261, + "step": 2597 + }, + { + "epoch": 0.10938947368421052, + "grad_norm": 0.3984375, + "learning_rate": 0.0004898057212445401, + "loss": 3.5859, + "step": 2598 + }, + { + "epoch": 0.10943157894736842, + "grad_norm": 0.451171875, + "learning_rate": 0.0004897961812128418, + "loss": 3.4298, + "step": 2599 + }, + { + "epoch": 0.10947368421052632, + "grad_norm": 0.396484375, + "learning_rate": 0.000489786636812339, + "loss": 3.317, + "step": 2600 + }, + { + "epoch": 0.1095157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.0004897770880432052, + "loss": 3.5587, + "step": 2601 + }, + { + "epoch": 0.1095578947368421, + "grad_norm": 0.4765625, + "learning_rate": 0.0004897675349056147, + "loss": 3.6695, + "step": 2602 + }, + { + "epoch": 0.1096, + "grad_norm": 0.5, + "learning_rate": 0.0004897579773997414, + "loss": 3.3769, + "step": 2603 + }, + { + "epoch": 0.10964210526315789, + "grad_norm": 0.412109375, + "learning_rate": 0.0004897484155257594, + "loss": 3.6357, + "step": 2604 + }, + { + "epoch": 0.10968421052631579, + "grad_norm": 0.39453125, + "learning_rate": 0.0004897388492838432, + "loss": 3.4751, + "step": 2605 + }, + { + "epoch": 0.10972631578947369, + "grad_norm": 0.41796875, + "learning_rate": 0.0004897292786741666, + "loss": 3.2206, + "step": 2606 + }, + { + "epoch": 0.10976842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.0004897197036969044, + "loss": 3.3609, + "step": 2607 + }, + { + "epoch": 0.10981052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.0004897101243522306, + "loss": 3.4097, + "step": 2608 + }, + { + "epoch": 0.10985263157894737, + "grad_norm": 0.447265625, + "learning_rate": 0.0004897005406403202, + "loss": 3.6122, + "step": 2609 + }, + { + "epoch": 0.10989473684210527, + "grad_norm": 0.427734375, + "learning_rate": 0.0004896909525613475, + "loss": 3.6775, + "step": 2610 + }, + { + "epoch": 0.10993684210526315, + "grad_norm": 0.41015625, + "learning_rate": 0.0004896813601154873, + "loss": 3.4825, + "step": 2611 + }, + { + "epoch": 0.10997894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.0004896717633029143, + "loss": 3.9252, + "step": 2612 + }, + { + "epoch": 0.11002105263157895, + "grad_norm": 0.482421875, + "learning_rate": 0.0004896621621238034, + "loss": 3.3762, + "step": 2613 + }, + { + "epoch": 0.11006315789473685, + "grad_norm": 0.419921875, + "learning_rate": 0.0004896525565783296, + "loss": 3.6748, + "step": 2614 + }, + { + "epoch": 0.11010526315789473, + "grad_norm": 0.4609375, + "learning_rate": 0.0004896429466666676, + "loss": 3.1753, + "step": 2615 + }, + { + "epoch": 0.11014736842105263, + "grad_norm": 0.458984375, + "learning_rate": 0.0004896333323889928, + "loss": 3.45, + "step": 2616 + }, + { + "epoch": 0.11018947368421053, + "grad_norm": 0.447265625, + "learning_rate": 0.0004896237137454802, + "loss": 3.7695, + "step": 2617 + }, + { + "epoch": 0.11023157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0004896140907363051, + "loss": 3.924, + "step": 2618 + }, + { + "epoch": 0.11027368421052632, + "grad_norm": 0.404296875, + "learning_rate": 0.0004896044633616428, + "loss": 3.8208, + "step": 2619 + }, + { + "epoch": 0.11031578947368421, + "grad_norm": 0.474609375, + "learning_rate": 0.0004895948316216687, + "loss": 3.3617, + "step": 2620 + }, + { + "epoch": 0.1103578947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.0004895851955165582, + "loss": 3.7592, + "step": 2621 + }, + { + "epoch": 0.1104, + "grad_norm": 0.375, + "learning_rate": 0.0004895755550464871, + "loss": 3.3189, + "step": 2622 + }, + { + "epoch": 0.1104421052631579, + "grad_norm": 0.470703125, + "learning_rate": 0.0004895659102116308, + "loss": 3.3376, + "step": 2623 + }, + { + "epoch": 0.1104842105263158, + "grad_norm": 0.62109375, + "learning_rate": 0.0004895562610121652, + "loss": 3.3798, + "step": 2624 + }, + { + "epoch": 0.11052631578947368, + "grad_norm": 0.466796875, + "learning_rate": 0.0004895466074482657, + "loss": 3.7928, + "step": 2625 + }, + { + "epoch": 0.11056842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0004895369495201086, + "loss": 3.0828, + "step": 2626 + }, + { + "epoch": 0.11061052631578948, + "grad_norm": 0.412109375, + "learning_rate": 0.0004895272872278699, + "loss": 3.6787, + "step": 2627 + }, + { + "epoch": 0.11065263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.0004895176205717254, + "loss": 3.791, + "step": 2628 + }, + { + "epoch": 0.11069473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.000489507949551851, + "loss": 3.423, + "step": 2629 + }, + { + "epoch": 0.11073684210526316, + "grad_norm": 0.458984375, + "learning_rate": 0.0004894982741684235, + "loss": 3.4624, + "step": 2630 + }, + { + "epoch": 0.11077894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.0004894885944216186, + "loss": 3.4961, + "step": 2631 + }, + { + "epoch": 0.11082105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.0004894789103116131, + "loss": 3.6301, + "step": 2632 + }, + { + "epoch": 0.11086315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.0004894692218385832, + "loss": 3.8849, + "step": 2633 + }, + { + "epoch": 0.11090526315789474, + "grad_norm": 0.6171875, + "learning_rate": 0.0004894595290027054, + "loss": 3.2536, + "step": 2634 + }, + { + "epoch": 0.11094736842105263, + "grad_norm": 0.466796875, + "learning_rate": 0.0004894498318041562, + "loss": 3.9343, + "step": 2635 + }, + { + "epoch": 0.11098947368421053, + "grad_norm": 0.474609375, + "learning_rate": 0.0004894401302431126, + "loss": 3.5315, + "step": 2636 + }, + { + "epoch": 0.11103157894736843, + "grad_norm": 0.4453125, + "learning_rate": 0.0004894304243197511, + "loss": 3.3575, + "step": 2637 + }, + { + "epoch": 0.11107368421052631, + "grad_norm": 0.390625, + "learning_rate": 0.0004894207140342486, + "loss": 3.2135, + "step": 2638 + }, + { + "epoch": 0.11111578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.000489410999386782, + "loss": 3.5283, + "step": 2639 + }, + { + "epoch": 0.11115789473684211, + "grad_norm": 0.46875, + "learning_rate": 0.0004894012803775283, + "loss": 3.7674, + "step": 2640 + }, + { + "epoch": 0.1112, + "grad_norm": 0.43359375, + "learning_rate": 0.0004893915570066645, + "loss": 3.4459, + "step": 2641 + }, + { + "epoch": 0.11124210526315789, + "grad_norm": 0.435546875, + "learning_rate": 0.0004893818292743678, + "loss": 3.5808, + "step": 2642 + }, + { + "epoch": 0.11128421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.0004893720971808154, + "loss": 3.3435, + "step": 2643 + }, + { + "epoch": 0.11132631578947369, + "grad_norm": 0.4609375, + "learning_rate": 0.0004893623607261847, + "loss": 3.5229, + "step": 2644 + }, + { + "epoch": 0.11136842105263157, + "grad_norm": 0.388671875, + "learning_rate": 0.0004893526199106531, + "loss": 3.2473, + "step": 2645 + }, + { + "epoch": 0.11141052631578947, + "grad_norm": 0.392578125, + "learning_rate": 0.0004893428747343979, + "loss": 3.7486, + "step": 2646 + }, + { + "epoch": 0.11145263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0004893331251975968, + "loss": 3.56, + "step": 2647 + }, + { + "epoch": 0.11149473684210526, + "grad_norm": 0.48828125, + "learning_rate": 0.0004893233713004272, + "loss": 3.3711, + "step": 2648 + }, + { + "epoch": 0.11153684210526316, + "grad_norm": 0.470703125, + "learning_rate": 0.0004893136130430671, + "loss": 3.1145, + "step": 2649 + }, + { + "epoch": 0.11157894736842106, + "grad_norm": 0.4453125, + "learning_rate": 0.0004893038504256941, + "loss": 3.5733, + "step": 2650 + }, + { + "epoch": 0.11162105263157895, + "grad_norm": 0.62109375, + "learning_rate": 0.0004892940834484862, + "loss": 2.9337, + "step": 2651 + }, + { + "epoch": 0.11166315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.0004892843121116213, + "loss": 3.6927, + "step": 2652 + }, + { + "epoch": 0.11170526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.0004892745364152773, + "loss": 3.8813, + "step": 2653 + }, + { + "epoch": 0.11174736842105264, + "grad_norm": 0.40625, + "learning_rate": 0.0004892647563596323, + "loss": 3.6019, + "step": 2654 + }, + { + "epoch": 0.11178947368421052, + "grad_norm": 0.43359375, + "learning_rate": 0.0004892549719448647, + "loss": 3.5056, + "step": 2655 + }, + { + "epoch": 0.11183157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.0004892451831711525, + "loss": 3.6645, + "step": 2656 + }, + { + "epoch": 0.11187368421052632, + "grad_norm": 0.5390625, + "learning_rate": 0.0004892353900386742, + "loss": 3.4746, + "step": 2657 + }, + { + "epoch": 0.1119157894736842, + "grad_norm": 0.45703125, + "learning_rate": 0.0004892255925476082, + "loss": 3.7596, + "step": 2658 + }, + { + "epoch": 0.1119578947368421, + "grad_norm": 0.466796875, + "learning_rate": 0.000489215790698133, + "loss": 3.4502, + "step": 2659 + }, + { + "epoch": 0.112, + "grad_norm": 0.439453125, + "learning_rate": 0.0004892059844904272, + "loss": 3.1957, + "step": 2660 + }, + { + "epoch": 0.1120421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.0004891961739246694, + "loss": 3.5748, + "step": 2661 + }, + { + "epoch": 0.11208421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0004891863590010383, + "loss": 3.6574, + "step": 2662 + }, + { + "epoch": 0.11212631578947369, + "grad_norm": 0.453125, + "learning_rate": 0.0004891765397197128, + "loss": 3.4892, + "step": 2663 + }, + { + "epoch": 0.11216842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.0004891667160808719, + "loss": 3.8335, + "step": 2664 + }, + { + "epoch": 0.11221052631578947, + "grad_norm": 0.54296875, + "learning_rate": 0.0004891568880846941, + "loss": 3.1086, + "step": 2665 + }, + { + "epoch": 0.11225263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.0004891470557313591, + "loss": 3.1166, + "step": 2666 + }, + { + "epoch": 0.11229473684210527, + "grad_norm": 0.52734375, + "learning_rate": 0.0004891372190210456, + "loss": 3.6589, + "step": 2667 + }, + { + "epoch": 0.11233684210526315, + "grad_norm": 0.431640625, + "learning_rate": 0.000489127377953933, + "loss": 3.8796, + "step": 2668 + }, + { + "epoch": 0.11237894736842105, + "grad_norm": 0.40234375, + "learning_rate": 0.0004891175325302006, + "loss": 3.6773, + "step": 2669 + }, + { + "epoch": 0.11242105263157895, + "grad_norm": 0.388671875, + "learning_rate": 0.0004891076827500276, + "loss": 3.8707, + "step": 2670 + }, + { + "epoch": 0.11246315789473685, + "grad_norm": 0.4296875, + "learning_rate": 0.0004890978286135935, + "loss": 3.5764, + "step": 2671 + }, + { + "epoch": 0.11250526315789473, + "grad_norm": 0.384765625, + "learning_rate": 0.000489087970121078, + "loss": 3.4786, + "step": 2672 + }, + { + "epoch": 0.11254736842105263, + "grad_norm": 0.48828125, + "learning_rate": 0.0004890781072726606, + "loss": 3.2337, + "step": 2673 + }, + { + "epoch": 0.11258947368421053, + "grad_norm": 0.400390625, + "learning_rate": 0.000489068240068521, + "loss": 3.5644, + "step": 2674 + }, + { + "epoch": 0.11263157894736842, + "grad_norm": 0.4921875, + "learning_rate": 0.0004890583685088389, + "loss": 3.9321, + "step": 2675 + }, + { + "epoch": 0.11267368421052631, + "grad_norm": 0.4453125, + "learning_rate": 0.0004890484925937943, + "loss": 3.7626, + "step": 2676 + }, + { + "epoch": 0.11271578947368421, + "grad_norm": 0.404296875, + "learning_rate": 0.000489038612323567, + "loss": 3.5162, + "step": 2677 + }, + { + "epoch": 0.1127578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.000489028727698337, + "loss": 3.3456, + "step": 2678 + }, + { + "epoch": 0.1128, + "grad_norm": 0.43359375, + "learning_rate": 0.0004890188387182843, + "loss": 3.7814, + "step": 2679 + }, + { + "epoch": 0.1128421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.0004890089453835894, + "loss": 3.2508, + "step": 2680 + }, + { + "epoch": 0.1128842105263158, + "grad_norm": 0.486328125, + "learning_rate": 0.0004889990476944322, + "loss": 2.7378, + "step": 2681 + }, + { + "epoch": 0.11292631578947368, + "grad_norm": 0.408203125, + "learning_rate": 0.0004889891456509933, + "loss": 3.6601, + "step": 2682 + }, + { + "epoch": 0.11296842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.0004889792392534529, + "loss": 3.8327, + "step": 2683 + }, + { + "epoch": 0.11301052631578948, + "grad_norm": 0.396484375, + "learning_rate": 0.0004889693285019916, + "loss": 3.8476, + "step": 2684 + }, + { + "epoch": 0.11305263157894736, + "grad_norm": 0.458984375, + "learning_rate": 0.0004889594133967899, + "loss": 3.2264, + "step": 2685 + }, + { + "epoch": 0.11309473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.0004889494939380284, + "loss": 3.5233, + "step": 2686 + }, + { + "epoch": 0.11313684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.0004889395701258879, + "loss": 3.7289, + "step": 2687 + }, + { + "epoch": 0.11317894736842106, + "grad_norm": 0.4609375, + "learning_rate": 0.0004889296419605492, + "loss": 3.5828, + "step": 2688 + }, + { + "epoch": 0.11322105263157894, + "grad_norm": 0.421875, + "learning_rate": 0.0004889197094421931, + "loss": 3.5343, + "step": 2689 + }, + { + "epoch": 0.11326315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.0004889097725710007, + "loss": 3.3791, + "step": 2690 + }, + { + "epoch": 0.11330526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.0004888998313471528, + "loss": 3.8507, + "step": 2691 + }, + { + "epoch": 0.11334736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.0004888898857708309, + "loss": 3.5045, + "step": 2692 + }, + { + "epoch": 0.11338947368421053, + "grad_norm": 0.396484375, + "learning_rate": 0.0004888799358422157, + "loss": 3.6004, + "step": 2693 + }, + { + "epoch": 0.11343157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.0004888699815614888, + "loss": 3.4196, + "step": 2694 + }, + { + "epoch": 0.11347368421052631, + "grad_norm": 0.41015625, + "learning_rate": 0.0004888600229288316, + "loss": 3.849, + "step": 2695 + }, + { + "epoch": 0.11351578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.0004888500599444255, + "loss": 3.2949, + "step": 2696 + }, + { + "epoch": 0.11355789473684211, + "grad_norm": 0.408203125, + "learning_rate": 0.0004888400926084518, + "loss": 3.658, + "step": 2697 + }, + { + "epoch": 0.1136, + "grad_norm": 0.404296875, + "learning_rate": 0.0004888301209210923, + "loss": 3.9084, + "step": 2698 + }, + { + "epoch": 0.11364210526315789, + "grad_norm": 0.453125, + "learning_rate": 0.0004888201448825286, + "loss": 3.1516, + "step": 2699 + }, + { + "epoch": 0.11368421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.0004888101644929424, + "loss": 3.7905, + "step": 2700 + }, + { + "epoch": 0.11372631578947369, + "grad_norm": 0.421875, + "learning_rate": 0.0004888001797525157, + "loss": 3.3805, + "step": 2701 + }, + { + "epoch": 0.11376842105263157, + "grad_norm": 0.408203125, + "learning_rate": 0.0004887901906614301, + "loss": 3.376, + "step": 2702 + }, + { + "epoch": 0.11381052631578947, + "grad_norm": 0.48828125, + "learning_rate": 0.000488780197219868, + "loss": 3.6209, + "step": 2703 + }, + { + "epoch": 0.11385263157894737, + "grad_norm": 0.447265625, + "learning_rate": 0.0004887701994280112, + "loss": 3.6253, + "step": 2704 + }, + { + "epoch": 0.11389473684210526, + "grad_norm": 0.44140625, + "learning_rate": 0.000488760197286042, + "loss": 3.7679, + "step": 2705 + }, + { + "epoch": 0.11393684210526316, + "grad_norm": 0.51171875, + "learning_rate": 0.0004887501907941425, + "loss": 3.6703, + "step": 2706 + }, + { + "epoch": 0.11397894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0004887401799524948, + "loss": 3.5127, + "step": 2707 + }, + { + "epoch": 0.11402105263157895, + "grad_norm": 0.462890625, + "learning_rate": 0.0004887301647612818, + "loss": 3.535, + "step": 2708 + }, + { + "epoch": 0.11406315789473684, + "grad_norm": 0.474609375, + "learning_rate": 0.0004887201452206856, + "loss": 3.7991, + "step": 2709 + }, + { + "epoch": 0.11410526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.0004887101213308889, + "loss": 3.4245, + "step": 2710 + }, + { + "epoch": 0.11414736842105264, + "grad_norm": 0.42578125, + "learning_rate": 0.0004887000930920742, + "loss": 3.8205, + "step": 2711 + }, + { + "epoch": 0.11418947368421052, + "grad_norm": 0.400390625, + "learning_rate": 0.0004886900605044242, + "loss": 3.3188, + "step": 2712 + }, + { + "epoch": 0.11423157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.0004886800235681219, + "loss": 3.961, + "step": 2713 + }, + { + "epoch": 0.11427368421052632, + "grad_norm": 0.44921875, + "learning_rate": 0.0004886699822833499, + "loss": 3.601, + "step": 2714 + }, + { + "epoch": 0.1143157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0004886599366502913, + "loss": 3.4259, + "step": 2715 + }, + { + "epoch": 0.1143578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.000488649886669129, + "loss": 3.6399, + "step": 2716 + }, + { + "epoch": 0.1144, + "grad_norm": 0.41796875, + "learning_rate": 0.0004886398323400463, + "loss": 3.694, + "step": 2717 + }, + { + "epoch": 0.1144421052631579, + "grad_norm": 0.455078125, + "learning_rate": 0.0004886297736632261, + "loss": 3.5827, + "step": 2718 + }, + { + "epoch": 0.11448421052631579, + "grad_norm": 0.390625, + "learning_rate": 0.0004886197106388519, + "loss": 3.6914, + "step": 2719 + }, + { + "epoch": 0.11452631578947368, + "grad_norm": 0.474609375, + "learning_rate": 0.0004886096432671069, + "loss": 4.0794, + "step": 2720 + }, + { + "epoch": 0.11456842105263158, + "grad_norm": 0.478515625, + "learning_rate": 0.0004885995715481746, + "loss": 3.3042, + "step": 2721 + }, + { + "epoch": 0.11461052631578947, + "grad_norm": 0.51953125, + "learning_rate": 0.0004885894954822385, + "loss": 3.8311, + "step": 2722 + }, + { + "epoch": 0.11465263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0004885794150694819, + "loss": 3.9166, + "step": 2723 + }, + { + "epoch": 0.11469473684210527, + "grad_norm": 0.45703125, + "learning_rate": 0.0004885693303100889, + "loss": 3.1083, + "step": 2724 + }, + { + "epoch": 0.11473684210526315, + "grad_norm": 0.41015625, + "learning_rate": 0.0004885592412042428, + "loss": 3.5434, + "step": 2725 + }, + { + "epoch": 0.11477894736842105, + "grad_norm": 0.453125, + "learning_rate": 0.0004885491477521278, + "loss": 3.2582, + "step": 2726 + }, + { + "epoch": 0.11482105263157895, + "grad_norm": 0.48828125, + "learning_rate": 0.0004885390499539276, + "loss": 3.36, + "step": 2727 + }, + { + "epoch": 0.11486315789473685, + "grad_norm": 0.44140625, + "learning_rate": 0.0004885289478098261, + "loss": 3.3487, + "step": 2728 + }, + { + "epoch": 0.11490526315789473, + "grad_norm": 0.396484375, + "learning_rate": 0.0004885188413200074, + "loss": 3.6309, + "step": 2729 + }, + { + "epoch": 0.11494736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.0004885087304846557, + "loss": 3.7837, + "step": 2730 + }, + { + "epoch": 0.11498947368421053, + "grad_norm": 0.439453125, + "learning_rate": 0.0004884986153039553, + "loss": 3.6787, + "step": 2731 + }, + { + "epoch": 0.11503157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0004884884957780902, + "loss": 3.9876, + "step": 2732 + }, + { + "epoch": 0.11507368421052631, + "grad_norm": 0.40625, + "learning_rate": 0.000488478371907245, + "loss": 4.0725, + "step": 2733 + }, + { + "epoch": 0.11511578947368421, + "grad_norm": 0.470703125, + "learning_rate": 0.000488468243691604, + "loss": 3.281, + "step": 2734 + }, + { + "epoch": 0.11515789473684211, + "grad_norm": 0.451171875, + "learning_rate": 0.0004884581111313518, + "loss": 3.6996, + "step": 2735 + }, + { + "epoch": 0.1152, + "grad_norm": 0.474609375, + "learning_rate": 0.000488447974226673, + "loss": 3.57, + "step": 2736 + }, + { + "epoch": 0.1152421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.0004884378329777523, + "loss": 3.2319, + "step": 2737 + }, + { + "epoch": 0.1152842105263158, + "grad_norm": 0.494140625, + "learning_rate": 0.0004884276873847745, + "loss": 3.3286, + "step": 2738 + }, + { + "epoch": 0.11532631578947368, + "grad_norm": 0.447265625, + "learning_rate": 0.0004884175374479243, + "loss": 3.768, + "step": 2739 + }, + { + "epoch": 0.11536842105263158, + "grad_norm": 0.458984375, + "learning_rate": 0.0004884073831673867, + "loss": 3.3615, + "step": 2740 + }, + { + "epoch": 0.11541052631578948, + "grad_norm": 0.44140625, + "learning_rate": 0.0004883972245433466, + "loss": 3.321, + "step": 2741 + }, + { + "epoch": 0.11545263157894736, + "grad_norm": 0.44140625, + "learning_rate": 0.0004883870615759893, + "loss": 3.9683, + "step": 2742 + }, + { + "epoch": 0.11549473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.0004883768942654998, + "loss": 3.2698, + "step": 2743 + }, + { + "epoch": 0.11553684210526316, + "grad_norm": 0.5234375, + "learning_rate": 0.0004883667226120633, + "loss": 3.3373, + "step": 2744 + }, + { + "epoch": 0.11557894736842106, + "grad_norm": 0.45703125, + "learning_rate": 0.0004883565466158652, + "loss": 3.784, + "step": 2745 + }, + { + "epoch": 0.11562105263157894, + "grad_norm": 0.408203125, + "learning_rate": 0.000488346366277091, + "loss": 3.6288, + "step": 2746 + }, + { + "epoch": 0.11566315789473684, + "grad_norm": 0.50390625, + "learning_rate": 0.0004883361815959259, + "loss": 3.5148, + "step": 2747 + }, + { + "epoch": 0.11570526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.0004883259925725557, + "loss": 3.5405, + "step": 2748 + }, + { + "epoch": 0.11574736842105263, + "grad_norm": 0.400390625, + "learning_rate": 0.000488315799207166, + "loss": 3.5724, + "step": 2749 + }, + { + "epoch": 0.11578947368421053, + "grad_norm": 0.42578125, + "learning_rate": 0.0004883056014999423, + "loss": 3.1807, + "step": 2750 + }, + { + "epoch": 0.11583157894736842, + "grad_norm": 0.466796875, + "learning_rate": 0.0004882953994510706, + "loss": 3.8557, + "step": 2751 + }, + { + "epoch": 0.11587368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.0004882851930607368, + "loss": 3.8998, + "step": 2752 + }, + { + "epoch": 0.11591578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0004882749823291266, + "loss": 3.2544, + "step": 2753 + }, + { + "epoch": 0.11595789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.0004882647672564263, + "loss": 3.7189, + "step": 2754 + }, + { + "epoch": 0.116, + "grad_norm": 0.421875, + "learning_rate": 0.0004882545478428218, + "loss": 3.4328, + "step": 2755 + }, + { + "epoch": 0.11604210526315789, + "grad_norm": 0.404296875, + "learning_rate": 0.0004882443240884994, + "loss": 3.3119, + "step": 2756 + }, + { + "epoch": 0.11608421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.00048823409599364537, + "loss": 3.2452, + "step": 2757 + }, + { + "epoch": 0.11612631578947369, + "grad_norm": 0.48046875, + "learning_rate": 0.0004882238635584461, + "loss": 3.264, + "step": 2758 + }, + { + "epoch": 0.11616842105263157, + "grad_norm": 0.404296875, + "learning_rate": 0.00048821362678308776, + "loss": 3.5905, + "step": 2759 + }, + { + "epoch": 0.11621052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.0004882033856677571, + "loss": 3.1328, + "step": 2760 + }, + { + "epoch": 0.11625263157894737, + "grad_norm": 0.40234375, + "learning_rate": 0.00048819314021264064, + "loss": 3.5047, + "step": 2761 + }, + { + "epoch": 0.11629473684210526, + "grad_norm": 0.41015625, + "learning_rate": 0.0004881828904179251, + "loss": 3.4413, + "step": 2762 + }, + { + "epoch": 0.11633684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.00048817263628379707, + "loss": 3.2281, + "step": 2763 + }, + { + "epoch": 0.11637894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.00048816237781044346, + "loss": 3.7667, + "step": 2764 + }, + { + "epoch": 0.11642105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.0004881521149980511, + "loss": 3.384, + "step": 2765 + }, + { + "epoch": 0.11646315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.0004881418478468071, + "loss": 3.4954, + "step": 2766 + }, + { + "epoch": 0.11650526315789474, + "grad_norm": 0.41015625, + "learning_rate": 0.00048813157635689845, + "loss": 3.7482, + "step": 2767 + }, + { + "epoch": 0.11654736842105264, + "grad_norm": 0.435546875, + "learning_rate": 0.0004881213005285123, + "loss": 3.6393, + "step": 2768 + }, + { + "epoch": 0.11658947368421052, + "grad_norm": 0.4140625, + "learning_rate": 0.00048811102036183576, + "loss": 3.4446, + "step": 2769 + }, + { + "epoch": 0.11663157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00048810073585705616, + "loss": 3.6504, + "step": 2770 + }, + { + "epoch": 0.11667368421052632, + "grad_norm": 0.53515625, + "learning_rate": 0.000488090447014361, + "loss": 3.8029, + "step": 2771 + }, + { + "epoch": 0.11671578947368422, + "grad_norm": 0.5234375, + "learning_rate": 0.0004880801538339376, + "loss": 3.0719, + "step": 2772 + }, + { + "epoch": 0.1167578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0004880698563159736, + "loss": 3.4642, + "step": 2773 + }, + { + "epoch": 0.1168, + "grad_norm": 0.427734375, + "learning_rate": 0.0004880595544606564, + "loss": 3.6255, + "step": 2774 + }, + { + "epoch": 0.1168421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.0004880492482681739, + "loss": 3.2833, + "step": 2775 + }, + { + "epoch": 0.11688421052631578, + "grad_norm": 0.4609375, + "learning_rate": 0.0004880389377387138, + "loss": 3.3541, + "step": 2776 + }, + { + "epoch": 0.11692631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.00048802862287246394, + "loss": 3.6164, + "step": 2777 + }, + { + "epoch": 0.11696842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.00048801830366961224, + "loss": 3.9764, + "step": 2778 + }, + { + "epoch": 0.11701052631578947, + "grad_norm": 0.447265625, + "learning_rate": 0.0004880079801303468, + "loss": 3.7477, + "step": 2779 + }, + { + "epoch": 0.11705263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.00048799765225485543, + "loss": 4.0114, + "step": 2780 + }, + { + "epoch": 0.11709473684210527, + "grad_norm": 0.40625, + "learning_rate": 0.0004879873200433266, + "loss": 3.0378, + "step": 2781 + }, + { + "epoch": 0.11713684210526316, + "grad_norm": 0.58203125, + "learning_rate": 0.0004879769834959483, + "loss": 3.3359, + "step": 2782 + }, + { + "epoch": 0.11717894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.0004879666426129091, + "loss": 2.9993, + "step": 2783 + }, + { + "epoch": 0.11722105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.0004879562973943972, + "loss": 4.038, + "step": 2784 + }, + { + "epoch": 0.11726315789473685, + "grad_norm": 0.65625, + "learning_rate": 0.0004879459478406012, + "loss": 3.5074, + "step": 2785 + }, + { + "epoch": 0.11730526315789473, + "grad_norm": 0.427734375, + "learning_rate": 0.00048793559395170953, + "loss": 3.7285, + "step": 2786 + }, + { + "epoch": 0.11734736842105263, + "grad_norm": 0.57421875, + "learning_rate": 0.000487925235727911, + "loss": 3.5123, + "step": 2787 + }, + { + "epoch": 0.11738947368421053, + "grad_norm": 0.48046875, + "learning_rate": 0.00048791487316939415, + "loss": 3.5816, + "step": 2788 + }, + { + "epoch": 0.11743157894736841, + "grad_norm": 0.408203125, + "learning_rate": 0.0004879045062763479, + "loss": 3.5543, + "step": 2789 + }, + { + "epoch": 0.11747368421052631, + "grad_norm": 0.455078125, + "learning_rate": 0.000487894135048961, + "loss": 3.526, + "step": 2790 + }, + { + "epoch": 0.11751578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.00048788375948742254, + "loss": 3.49, + "step": 2791 + }, + { + "epoch": 0.11755789473684211, + "grad_norm": 0.4296875, + "learning_rate": 0.0004878733795919215, + "loss": 3.5965, + "step": 2792 + }, + { + "epoch": 0.1176, + "grad_norm": 0.392578125, + "learning_rate": 0.0004878629953626469, + "loss": 3.8373, + "step": 2793 + }, + { + "epoch": 0.1176421052631579, + "grad_norm": 0.46875, + "learning_rate": 0.00048785260679978803, + "loss": 3.3511, + "step": 2794 + }, + { + "epoch": 0.1176842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.0004878422139035341, + "loss": 3.3516, + "step": 2795 + }, + { + "epoch": 0.11772631578947368, + "grad_norm": 0.39453125, + "learning_rate": 0.0004878318166740745, + "loss": 3.2327, + "step": 2796 + }, + { + "epoch": 0.11776842105263158, + "grad_norm": 0.490234375, + "learning_rate": 0.00048782141511159873, + "loss": 3.0295, + "step": 2797 + }, + { + "epoch": 0.11781052631578948, + "grad_norm": 0.42578125, + "learning_rate": 0.0004878110092162961, + "loss": 3.4337, + "step": 2798 + }, + { + "epoch": 0.11785263157894736, + "grad_norm": 0.498046875, + "learning_rate": 0.0004878005989883564, + "loss": 3.44, + "step": 2799 + }, + { + "epoch": 0.11789473684210526, + "grad_norm": 0.44921875, + "learning_rate": 0.0004877901844279691, + "loss": 3.4228, + "step": 2800 + }, + { + "epoch": 0.11793684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.00048777976553532397, + "loss": 3.546, + "step": 2801 + }, + { + "epoch": 0.11797894736842106, + "grad_norm": 0.396484375, + "learning_rate": 0.00048776934231061105, + "loss": 3.1686, + "step": 2802 + }, + { + "epoch": 0.11802105263157894, + "grad_norm": 0.4140625, + "learning_rate": 0.00048775891475402, + "loss": 3.8679, + "step": 2803 + }, + { + "epoch": 0.11806315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.0004877484828657409, + "loss": 3.788, + "step": 2804 + }, + { + "epoch": 0.11810526315789474, + "grad_norm": 0.408203125, + "learning_rate": 0.00048773804664596377, + "loss": 3.1713, + "step": 2805 + }, + { + "epoch": 0.11814736842105263, + "grad_norm": 0.5078125, + "learning_rate": 0.00048772760609487874, + "loss": 3.7858, + "step": 2806 + }, + { + "epoch": 0.11818947368421052, + "grad_norm": 0.41015625, + "learning_rate": 0.00048771716121267605, + "loss": 3.552, + "step": 2807 + }, + { + "epoch": 0.11823157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.0004877067119995461, + "loss": 4.2121, + "step": 2808 + }, + { + "epoch": 0.11827368421052632, + "grad_norm": 0.439453125, + "learning_rate": 0.000487696258455679, + "loss": 3.6594, + "step": 2809 + }, + { + "epoch": 0.11831578947368421, + "grad_norm": 0.3984375, + "learning_rate": 0.0004876858005812654, + "loss": 2.9684, + "step": 2810 + }, + { + "epoch": 0.1183578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0004876753383764958, + "loss": 3.3871, + "step": 2811 + }, + { + "epoch": 0.1184, + "grad_norm": 0.427734375, + "learning_rate": 0.0004876648718415608, + "loss": 3.8453, + "step": 2812 + }, + { + "epoch": 0.11844210526315789, + "grad_norm": 0.42578125, + "learning_rate": 0.00048765440097665104, + "loss": 3.3664, + "step": 2813 + }, + { + "epoch": 0.11848421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.0004876439257819574, + "loss": 3.5623, + "step": 2814 + }, + { + "epoch": 0.11852631578947369, + "grad_norm": 0.453125, + "learning_rate": 0.0004876334462576706, + "loss": 3.5241, + "step": 2815 + }, + { + "epoch": 0.11856842105263157, + "grad_norm": 0.447265625, + "learning_rate": 0.00048762296240398165, + "loss": 3.3805, + "step": 2816 + }, + { + "epoch": 0.11861052631578947, + "grad_norm": 0.439453125, + "learning_rate": 0.00048761247422108146, + "loss": 3.6599, + "step": 2817 + }, + { + "epoch": 0.11865263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.0004876019817091612, + "loss": 3.5569, + "step": 2818 + }, + { + "epoch": 0.11869473684210527, + "grad_norm": 0.375, + "learning_rate": 0.00048759148486841197, + "loss": 3.6372, + "step": 2819 + }, + { + "epoch": 0.11873684210526315, + "grad_norm": 0.439453125, + "learning_rate": 0.00048758098369902517, + "loss": 3.1221, + "step": 2820 + }, + { + "epoch": 0.11877894736842105, + "grad_norm": 0.39453125, + "learning_rate": 0.0004875704782011918, + "loss": 3.2028, + "step": 2821 + }, + { + "epoch": 0.11882105263157895, + "grad_norm": 0.4453125, + "learning_rate": 0.0004875599683751036, + "loss": 3.3473, + "step": 2822 + }, + { + "epoch": 0.11886315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.00048754945422095186, + "loss": 3.5971, + "step": 2823 + }, + { + "epoch": 0.11890526315789474, + "grad_norm": 0.451171875, + "learning_rate": 0.00048753893573892816, + "loss": 3.4471, + "step": 2824 + }, + { + "epoch": 0.11894736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0004875284129292242, + "loss": 3.1798, + "step": 2825 + }, + { + "epoch": 0.11898947368421052, + "grad_norm": 0.453125, + "learning_rate": 0.0004875178857920316, + "loss": 3.0938, + "step": 2826 + }, + { + "epoch": 0.11903157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00048750735432754217, + "loss": 3.5277, + "step": 2827 + }, + { + "epoch": 0.11907368421052632, + "grad_norm": 0.4140625, + "learning_rate": 0.00048749681853594787, + "loss": 3.5053, + "step": 2828 + }, + { + "epoch": 0.11911578947368422, + "grad_norm": 0.447265625, + "learning_rate": 0.00048748627841744055, + "loss": 3.189, + "step": 2829 + }, + { + "epoch": 0.1191578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.00048747573397221224, + "loss": 3.4325, + "step": 2830 + }, + { + "epoch": 0.1192, + "grad_norm": 0.384765625, + "learning_rate": 0.0004874651852004551, + "loss": 3.7074, + "step": 2831 + }, + { + "epoch": 0.1192421052631579, + "grad_norm": 0.470703125, + "learning_rate": 0.0004874546321023613, + "loss": 3.5767, + "step": 2832 + }, + { + "epoch": 0.11928421052631578, + "grad_norm": 0.423828125, + "learning_rate": 0.0004874440746781231, + "loss": 3.9523, + "step": 2833 + }, + { + "epoch": 0.11932631578947368, + "grad_norm": 0.40625, + "learning_rate": 0.00048743351292793283, + "loss": 3.741, + "step": 2834 + }, + { + "epoch": 0.11936842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0004874229468519829, + "loss": 3.6629, + "step": 2835 + }, + { + "epoch": 0.11941052631578947, + "grad_norm": 0.455078125, + "learning_rate": 0.00048741237645046585, + "loss": 3.4801, + "step": 2836 + }, + { + "epoch": 0.11945263157894737, + "grad_norm": 0.455078125, + "learning_rate": 0.0004874018017235743, + "loss": 3.4843, + "step": 2837 + }, + { + "epoch": 0.11949473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.0004873912226715008, + "loss": 3.7619, + "step": 2838 + }, + { + "epoch": 0.11953684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.00048738063929443823, + "loss": 3.7195, + "step": 2839 + }, + { + "epoch": 0.11957894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.0004873700515925793, + "loss": 3.4025, + "step": 2840 + }, + { + "epoch": 0.11962105263157895, + "grad_norm": 0.53515625, + "learning_rate": 0.00048735945956611686, + "loss": 3.4672, + "step": 2841 + }, + { + "epoch": 0.11966315789473685, + "grad_norm": 0.4453125, + "learning_rate": 0.00048734886321524405, + "loss": 3.3723, + "step": 2842 + }, + { + "epoch": 0.11970526315789473, + "grad_norm": 0.40234375, + "learning_rate": 0.0004873382625401537, + "loss": 3.6559, + "step": 2843 + }, + { + "epoch": 0.11974736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00048732765754103916, + "loss": 3.6285, + "step": 2844 + }, + { + "epoch": 0.11978947368421053, + "grad_norm": 0.4140625, + "learning_rate": 0.00048731704821809355, + "loss": 3.5868, + "step": 2845 + }, + { + "epoch": 0.11983157894736843, + "grad_norm": 0.44921875, + "learning_rate": 0.00048730643457151014, + "loss": 3.3669, + "step": 2846 + }, + { + "epoch": 0.11987368421052631, + "grad_norm": 0.439453125, + "learning_rate": 0.00048729581660148224, + "loss": 3.4807, + "step": 2847 + }, + { + "epoch": 0.11991578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.0004872851943082035, + "loss": 3.4516, + "step": 2848 + }, + { + "epoch": 0.11995789473684211, + "grad_norm": 0.6796875, + "learning_rate": 0.0004872745676918673, + "loss": 3.4762, + "step": 2849 + }, + { + "epoch": 0.12, + "grad_norm": 0.494140625, + "learning_rate": 0.0004872639367526672, + "loss": 3.5862, + "step": 2850 + }, + { + "epoch": 0.1200421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00048725330149079695, + "loss": 3.5795, + "step": 2851 + }, + { + "epoch": 0.1200842105263158, + "grad_norm": 0.38671875, + "learning_rate": 0.0004872426619064503, + "loss": 3.6206, + "step": 2852 + }, + { + "epoch": 0.12012631578947368, + "grad_norm": 0.3828125, + "learning_rate": 0.00048723201799982113, + "loss": 3.2186, + "step": 2853 + }, + { + "epoch": 0.12016842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0004872213697711033, + "loss": 3.8518, + "step": 2854 + }, + { + "epoch": 0.12021052631578948, + "grad_norm": 0.50390625, + "learning_rate": 0.0004872107172204909, + "loss": 3.7301, + "step": 2855 + }, + { + "epoch": 0.12025263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00048720006034817787, + "loss": 3.5111, + "step": 2856 + }, + { + "epoch": 0.12029473684210526, + "grad_norm": 0.65625, + "learning_rate": 0.0004871893991543585, + "loss": 3.4307, + "step": 2857 + }, + { + "epoch": 0.12033684210526316, + "grad_norm": 0.453125, + "learning_rate": 0.00048717873363922695, + "loss": 2.8608, + "step": 2858 + }, + { + "epoch": 0.12037894736842106, + "grad_norm": 0.466796875, + "learning_rate": 0.0004871680638029775, + "loss": 3.2442, + "step": 2859 + }, + { + "epoch": 0.12042105263157894, + "grad_norm": 0.443359375, + "learning_rate": 0.00048715738964580466, + "loss": 3.6237, + "step": 2860 + }, + { + "epoch": 0.12046315789473684, + "grad_norm": 0.46484375, + "learning_rate": 0.00048714671116790277, + "loss": 3.688, + "step": 2861 + }, + { + "epoch": 0.12050526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.0004871360283694665, + "loss": 3.6212, + "step": 2862 + }, + { + "epoch": 0.12054736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.0004871253412506903, + "loss": 3.434, + "step": 2863 + }, + { + "epoch": 0.12058947368421052, + "grad_norm": 0.427734375, + "learning_rate": 0.00048711464981176914, + "loss": 3.51, + "step": 2864 + }, + { + "epoch": 0.12063157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.0004871039540528975, + "loss": 3.8943, + "step": 2865 + }, + { + "epoch": 0.12067368421052632, + "grad_norm": 0.59375, + "learning_rate": 0.0004870932539742705, + "loss": 3.6839, + "step": 2866 + }, + { + "epoch": 0.1207157894736842, + "grad_norm": 0.4765625, + "learning_rate": 0.000487082549576083, + "loss": 3.286, + "step": 2867 + }, + { + "epoch": 0.1207578947368421, + "grad_norm": 0.4609375, + "learning_rate": 0.00048707184085853, + "loss": 3.9478, + "step": 2868 + }, + { + "epoch": 0.1208, + "grad_norm": 0.427734375, + "learning_rate": 0.0004870611278218066, + "loss": 3.3878, + "step": 2869 + }, + { + "epoch": 0.12084210526315789, + "grad_norm": 0.458984375, + "learning_rate": 0.000487050410466108, + "loss": 3.9745, + "step": 2870 + }, + { + "epoch": 0.12088421052631579, + "grad_norm": 0.48046875, + "learning_rate": 0.0004870396887916294, + "loss": 3.292, + "step": 2871 + }, + { + "epoch": 0.12092631578947369, + "grad_norm": 0.5078125, + "learning_rate": 0.0004870289627985662, + "loss": 3.6294, + "step": 2872 + }, + { + "epoch": 0.12096842105263157, + "grad_norm": 0.54296875, + "learning_rate": 0.0004870182324871138, + "loss": 3.6152, + "step": 2873 + }, + { + "epoch": 0.12101052631578947, + "grad_norm": 0.62109375, + "learning_rate": 0.0004870074978574677, + "loss": 3.1454, + "step": 2874 + }, + { + "epoch": 0.12105263157894737, + "grad_norm": 0.466796875, + "learning_rate": 0.00048699675890982345, + "loss": 3.2372, + "step": 2875 + }, + { + "epoch": 0.12109473684210527, + "grad_norm": 0.431640625, + "learning_rate": 0.0004869860156443767, + "loss": 3.4052, + "step": 2876 + }, + { + "epoch": 0.12113684210526315, + "grad_norm": 0.458984375, + "learning_rate": 0.0004869752680613232, + "loss": 3.4905, + "step": 2877 + }, + { + "epoch": 0.12117894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00048696451616085873, + "loss": 3.5956, + "step": 2878 + }, + { + "epoch": 0.12122105263157895, + "grad_norm": 0.44140625, + "learning_rate": 0.0004869537599431793, + "loss": 3.4402, + "step": 2879 + }, + { + "epoch": 0.12126315789473684, + "grad_norm": 0.5625, + "learning_rate": 0.00048694299940848064, + "loss": 3.4183, + "step": 2880 + }, + { + "epoch": 0.12130526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.00048693223455695896, + "loss": 3.8453, + "step": 2881 + }, + { + "epoch": 0.12134736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.00048692146538881036, + "loss": 3.2207, + "step": 2882 + }, + { + "epoch": 0.12138947368421053, + "grad_norm": 0.439453125, + "learning_rate": 0.000486910691904231, + "loss": 3.7992, + "step": 2883 + }, + { + "epoch": 0.12143157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00048689991410341726, + "loss": 3.466, + "step": 2884 + }, + { + "epoch": 0.12147368421052632, + "grad_norm": 0.423828125, + "learning_rate": 0.0004868891319865654, + "loss": 3.4106, + "step": 2885 + }, + { + "epoch": 0.12151578947368422, + "grad_norm": 0.447265625, + "learning_rate": 0.0004868783455538719, + "loss": 3.4247, + "step": 2886 + }, + { + "epoch": 0.1215578947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.00048686755480553326, + "loss": 3.6108, + "step": 2887 + }, + { + "epoch": 0.1216, + "grad_norm": 0.427734375, + "learning_rate": 0.0004868567597417461, + "loss": 3.4444, + "step": 2888 + }, + { + "epoch": 0.1216421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.000486845960362707, + "loss": 3.5251, + "step": 2889 + }, + { + "epoch": 0.12168421052631578, + "grad_norm": 0.5078125, + "learning_rate": 0.0004868351566686129, + "loss": 3.5891, + "step": 2890 + }, + { + "epoch": 0.12172631578947368, + "grad_norm": 0.4453125, + "learning_rate": 0.0004868243486596604, + "loss": 3.5605, + "step": 2891 + }, + { + "epoch": 0.12176842105263158, + "grad_norm": 0.462890625, + "learning_rate": 0.00048681353633604665, + "loss": 3.6923, + "step": 2892 + }, + { + "epoch": 0.12181052631578948, + "grad_norm": 0.42578125, + "learning_rate": 0.0004868027196979684, + "loss": 3.2688, + "step": 2893 + }, + { + "epoch": 0.12185263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.00048679189874562283, + "loss": 3.2256, + "step": 2894 + }, + { + "epoch": 0.12189473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.0004867810734792072, + "loss": 3.4213, + "step": 2895 + }, + { + "epoch": 0.12193684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.0004867702438989185, + "loss": 3.5332, + "step": 2896 + }, + { + "epoch": 0.12197894736842105, + "grad_norm": 0.4765625, + "learning_rate": 0.00048675941000495416, + "loss": 3.1365, + "step": 2897 + }, + { + "epoch": 0.12202105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.0004867485717975116, + "loss": 3.3783, + "step": 2898 + }, + { + "epoch": 0.12206315789473685, + "grad_norm": 0.4375, + "learning_rate": 0.00048673772927678815, + "loss": 3.6153, + "step": 2899 + }, + { + "epoch": 0.12210526315789473, + "grad_norm": 0.447265625, + "learning_rate": 0.0004867268824429815, + "loss": 3.749, + "step": 2900 + }, + { + "epoch": 0.12214736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.00048671603129628923, + "loss": 3.7498, + "step": 2901 + }, + { + "epoch": 0.12218947368421053, + "grad_norm": 0.47265625, + "learning_rate": 0.0004867051758369089, + "loss": 3.1852, + "step": 2902 + }, + { + "epoch": 0.12223157894736843, + "grad_norm": 0.392578125, + "learning_rate": 0.0004866943160650384, + "loss": 3.7043, + "step": 2903 + }, + { + "epoch": 0.12227368421052631, + "grad_norm": 0.39453125, + "learning_rate": 0.00048668345198087565, + "loss": 3.7886, + "step": 2904 + }, + { + "epoch": 0.12231578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.00048667258358461846, + "loss": 3.3222, + "step": 2905 + }, + { + "epoch": 0.12235789473684211, + "grad_norm": 0.44921875, + "learning_rate": 0.00048666171087646484, + "loss": 3.5955, + "step": 2906 + }, + { + "epoch": 0.1224, + "grad_norm": 0.427734375, + "learning_rate": 0.000486650833856613, + "loss": 3.2265, + "step": 2907 + }, + { + "epoch": 0.1224421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00048663995252526095, + "loss": 3.2892, + "step": 2908 + }, + { + "epoch": 0.12248421052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.000486629066882607, + "loss": 3.7501, + "step": 2909 + }, + { + "epoch": 0.12252631578947368, + "grad_norm": 0.455078125, + "learning_rate": 0.00048661817692884947, + "loss": 3.7994, + "step": 2910 + }, + { + "epoch": 0.12256842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.00048660728266418683, + "loss": 4.1502, + "step": 2911 + }, + { + "epoch": 0.12261052631578948, + "grad_norm": 0.4765625, + "learning_rate": 0.00048659638408881745, + "loss": 2.9858, + "step": 2912 + }, + { + "epoch": 0.12265263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00048658548120293995, + "loss": 3.5554, + "step": 2913 + }, + { + "epoch": 0.12269473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00048657457400675297, + "loss": 3.1423, + "step": 2914 + }, + { + "epoch": 0.12273684210526316, + "grad_norm": 0.455078125, + "learning_rate": 0.0004865636625004551, + "loss": 2.8558, + "step": 2915 + }, + { + "epoch": 0.12277894736842106, + "grad_norm": 0.474609375, + "learning_rate": 0.0004865527466842454, + "loss": 3.8267, + "step": 2916 + }, + { + "epoch": 0.12282105263157894, + "grad_norm": 0.439453125, + "learning_rate": 0.00048654182655832255, + "loss": 3.6986, + "step": 2917 + }, + { + "epoch": 0.12286315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.0004865309021228856, + "loss": 3.452, + "step": 2918 + }, + { + "epoch": 0.12290526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.00048651997337813345, + "loss": 3.5154, + "step": 2919 + }, + { + "epoch": 0.12294736842105262, + "grad_norm": 0.486328125, + "learning_rate": 0.00048650904032426534, + "loss": 3.613, + "step": 2920 + }, + { + "epoch": 0.12298947368421052, + "grad_norm": 0.458984375, + "learning_rate": 0.00048649810296148035, + "loss": 3.2096, + "step": 2921 + }, + { + "epoch": 0.12303157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.0004864871612899778, + "loss": 3.2185, + "step": 2922 + }, + { + "epoch": 0.12307368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.00048647621530995705, + "loss": 3.4172, + "step": 2923 + }, + { + "epoch": 0.1231157894736842, + "grad_norm": 0.5078125, + "learning_rate": 0.00048646526502161755, + "loss": 3.9241, + "step": 2924 + }, + { + "epoch": 0.1231578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.00048645431042515866, + "loss": 3.54, + "step": 2925 + }, + { + "epoch": 0.1232, + "grad_norm": 0.42578125, + "learning_rate": 0.0004864433515207801, + "loss": 3.7357, + "step": 2926 + }, + { + "epoch": 0.12324210526315789, + "grad_norm": 0.546875, + "learning_rate": 0.0004864323883086815, + "loss": 3.7652, + "step": 2927 + }, + { + "epoch": 0.12328421052631579, + "grad_norm": 0.458984375, + "learning_rate": 0.0004864214207890626, + "loss": 3.8698, + "step": 2928 + }, + { + "epoch": 0.12332631578947369, + "grad_norm": 0.67578125, + "learning_rate": 0.0004864104489621232, + "loss": 3.6557, + "step": 2929 + }, + { + "epoch": 0.12336842105263159, + "grad_norm": 0.484375, + "learning_rate": 0.00048639947282806315, + "loss": 3.3412, + "step": 2930 + }, + { + "epoch": 0.12341052631578947, + "grad_norm": 0.451171875, + "learning_rate": 0.0004863884923870825, + "loss": 3.2352, + "step": 2931 + }, + { + "epoch": 0.12345263157894737, + "grad_norm": 0.5, + "learning_rate": 0.0004863775076393812, + "loss": 3.5197, + "step": 2932 + }, + { + "epoch": 0.12349473684210527, + "grad_norm": 0.42578125, + "learning_rate": 0.00048636651858515947, + "loss": 3.3437, + "step": 2933 + }, + { + "epoch": 0.12353684210526315, + "grad_norm": 0.447265625, + "learning_rate": 0.0004863555252246175, + "loss": 3.5777, + "step": 2934 + }, + { + "epoch": 0.12357894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.00048634452755795554, + "loss": 3.2911, + "step": 2935 + }, + { + "epoch": 0.12362105263157895, + "grad_norm": 0.45703125, + "learning_rate": 0.00048633352558537403, + "loss": 3.523, + "step": 2936 + }, + { + "epoch": 0.12366315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.0004863225193070734, + "loss": 3.7026, + "step": 2937 + }, + { + "epoch": 0.12370526315789473, + "grad_norm": 0.451171875, + "learning_rate": 0.000486311508723254, + "loss": 3.6308, + "step": 2938 + }, + { + "epoch": 0.12374736842105263, + "grad_norm": 0.53125, + "learning_rate": 0.00048630049383411667, + "loss": 2.9412, + "step": 2939 + }, + { + "epoch": 0.12378947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00048628947463986197, + "loss": 3.5675, + "step": 2940 + }, + { + "epoch": 0.12383157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.0004862784511406907, + "loss": 3.6594, + "step": 2941 + }, + { + "epoch": 0.12387368421052632, + "grad_norm": 0.494140625, + "learning_rate": 0.0004862674233368036, + "loss": 3.6711, + "step": 2942 + }, + { + "epoch": 0.12391578947368422, + "grad_norm": 0.416015625, + "learning_rate": 0.0004862563912284017, + "loss": 3.1911, + "step": 2943 + }, + { + "epoch": 0.1239578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.0004862453548156859, + "loss": 4.2802, + "step": 2944 + }, + { + "epoch": 0.124, + "grad_norm": 0.6640625, + "learning_rate": 0.0004862343140988573, + "loss": 3.4156, + "step": 2945 + }, + { + "epoch": 0.1240421052631579, + "grad_norm": 0.400390625, + "learning_rate": 0.0004862232690781171, + "loss": 3.6335, + "step": 2946 + }, + { + "epoch": 0.12408421052631578, + "grad_norm": 0.59375, + "learning_rate": 0.00048621221975366646, + "loss": 3.6205, + "step": 2947 + }, + { + "epoch": 0.12412631578947368, + "grad_norm": 0.50390625, + "learning_rate": 0.00048620116612570674, + "loss": 3.3457, + "step": 2948 + }, + { + "epoch": 0.12416842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.0004861901081944393, + "loss": 3.4073, + "step": 2949 + }, + { + "epoch": 0.12421052631578948, + "grad_norm": 0.462890625, + "learning_rate": 0.0004861790459600656, + "loss": 3.4518, + "step": 2950 + }, + { + "epoch": 0.12425263157894736, + "grad_norm": 0.46875, + "learning_rate": 0.0004861679794227872, + "loss": 3.3027, + "step": 2951 + }, + { + "epoch": 0.12429473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.0004861569085828057, + "loss": 3.0642, + "step": 2952 + }, + { + "epoch": 0.12433684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.0004861458334403227, + "loss": 3.5175, + "step": 2953 + }, + { + "epoch": 0.12437894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.0004861347539955402, + "loss": 3.9531, + "step": 2954 + }, + { + "epoch": 0.12442105263157895, + "grad_norm": 0.474609375, + "learning_rate": 0.0004861236702486599, + "loss": 3.1812, + "step": 2955 + }, + { + "epoch": 0.12446315789473684, + "grad_norm": 0.5546875, + "learning_rate": 0.0004861125821998837, + "loss": 3.2253, + "step": 2956 + }, + { + "epoch": 0.12450526315789473, + "grad_norm": 0.74609375, + "learning_rate": 0.0004861014898494137, + "loss": 3.2094, + "step": 2957 + }, + { + "epoch": 0.12454736842105263, + "grad_norm": 0.400390625, + "learning_rate": 0.000486090393197452, + "loss": 3.2554, + "step": 2958 + }, + { + "epoch": 0.12458947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.0004860792922442007, + "loss": 3.3683, + "step": 2959 + }, + { + "epoch": 0.12463157894736843, + "grad_norm": 0.49609375, + "learning_rate": 0.0004860681869898621, + "loss": 2.9148, + "step": 2960 + }, + { + "epoch": 0.12467368421052631, + "grad_norm": 0.451171875, + "learning_rate": 0.0004860570774346384, + "loss": 3.3476, + "step": 2961 + }, + { + "epoch": 0.12471578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00048604596357873227, + "loss": 3.367, + "step": 2962 + }, + { + "epoch": 0.12475789473684211, + "grad_norm": 0.51953125, + "learning_rate": 0.0004860348454223459, + "loss": 3.3253, + "step": 2963 + }, + { + "epoch": 0.1248, + "grad_norm": 0.64453125, + "learning_rate": 0.00048602372296568204, + "loss": 3.1474, + "step": 2964 + }, + { + "epoch": 0.1248421052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.00048601259620894324, + "loss": 3.5001, + "step": 2965 + }, + { + "epoch": 0.12488421052631579, + "grad_norm": 0.466796875, + "learning_rate": 0.00048600146515233224, + "loss": 3.3656, + "step": 2966 + }, + { + "epoch": 0.12492631578947369, + "grad_norm": 0.482421875, + "learning_rate": 0.00048599032979605185, + "loss": 3.1679, + "step": 2967 + }, + { + "epoch": 0.12496842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.0004859791901403049, + "loss": 3.3416, + "step": 2968 + }, + { + "epoch": 0.12501052631578946, + "grad_norm": 0.447265625, + "learning_rate": 0.00048596804618529445, + "loss": 3.5502, + "step": 2969 + }, + { + "epoch": 0.12505263157894736, + "grad_norm": 0.486328125, + "learning_rate": 0.0004859568979312233, + "loss": 3.7684, + "step": 2970 + }, + { + "epoch": 0.12509473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.00048594574537829484, + "loss": 3.3721, + "step": 2971 + }, + { + "epoch": 0.12513684210526316, + "grad_norm": 0.49609375, + "learning_rate": 0.0004859345885267121, + "loss": 3.3118, + "step": 2972 + }, + { + "epoch": 0.12517894736842106, + "grad_norm": 0.466796875, + "learning_rate": 0.00048592342737667835, + "loss": 3.5161, + "step": 2973 + }, + { + "epoch": 0.12522105263157896, + "grad_norm": 0.423828125, + "learning_rate": 0.00048591226192839696, + "loss": 3.6436, + "step": 2974 + }, + { + "epoch": 0.12526315789473685, + "grad_norm": 0.59375, + "learning_rate": 0.00048590109218207134, + "loss": 3.4589, + "step": 2975 + }, + { + "epoch": 0.12530526315789473, + "grad_norm": 0.5390625, + "learning_rate": 0.00048588991813790493, + "loss": 3.3623, + "step": 2976 + }, + { + "epoch": 0.12534736842105262, + "grad_norm": 0.466796875, + "learning_rate": 0.0004858787397961014, + "loss": 3.6315, + "step": 2977 + }, + { + "epoch": 0.12538947368421052, + "grad_norm": 0.423828125, + "learning_rate": 0.0004858675571568645, + "loss": 3.6184, + "step": 2978 + }, + { + "epoch": 0.12543157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.00048585637022039775, + "loss": 3.2735, + "step": 2979 + }, + { + "epoch": 0.12547368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.000485845178986905, + "loss": 3.8127, + "step": 2980 + }, + { + "epoch": 0.12551578947368422, + "grad_norm": 0.46484375, + "learning_rate": 0.0004858339834565902, + "loss": 3.6341, + "step": 2981 + }, + { + "epoch": 0.12555789473684212, + "grad_norm": 0.412109375, + "learning_rate": 0.00048582278362965743, + "loss": 3.5892, + "step": 2982 + }, + { + "epoch": 0.1256, + "grad_norm": 0.44140625, + "learning_rate": 0.00048581157950631056, + "loss": 3.3656, + "step": 2983 + }, + { + "epoch": 0.1256421052631579, + "grad_norm": 0.46484375, + "learning_rate": 0.00048580037108675376, + "loss": 3.484, + "step": 2984 + }, + { + "epoch": 0.1256842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.00048578915837119125, + "loss": 3.3778, + "step": 2985 + }, + { + "epoch": 0.12572631578947369, + "grad_norm": 0.41796875, + "learning_rate": 0.00048577794135982734, + "loss": 3.1639, + "step": 2986 + }, + { + "epoch": 0.12576842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.0004857667200528664, + "loss": 3.4191, + "step": 2987 + }, + { + "epoch": 0.12581052631578948, + "grad_norm": 0.421875, + "learning_rate": 0.00048575549445051275, + "loss": 3.4421, + "step": 2988 + }, + { + "epoch": 0.12585263157894735, + "grad_norm": 0.404296875, + "learning_rate": 0.00048574426455297103, + "loss": 3.8453, + "step": 2989 + }, + { + "epoch": 0.12589473684210525, + "grad_norm": 0.4140625, + "learning_rate": 0.0004857330303604458, + "loss": 3.3646, + "step": 2990 + }, + { + "epoch": 0.12593684210526315, + "grad_norm": 0.396484375, + "learning_rate": 0.00048572179187314176, + "loss": 2.9505, + "step": 2991 + }, + { + "epoch": 0.12597894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.00048571054909126354, + "loss": 3.3229, + "step": 2992 + }, + { + "epoch": 0.12602105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.0004856993020150162, + "loss": 3.2146, + "step": 2993 + }, + { + "epoch": 0.12606315789473685, + "grad_norm": 0.404296875, + "learning_rate": 0.0004856880506446044, + "loss": 3.8323, + "step": 2994 + }, + { + "epoch": 0.12610526315789475, + "grad_norm": 0.4140625, + "learning_rate": 0.0004856767949802333, + "loss": 3.2727, + "step": 2995 + }, + { + "epoch": 0.12614736842105262, + "grad_norm": 0.416015625, + "learning_rate": 0.00048566553502210787, + "loss": 3.3491, + "step": 2996 + }, + { + "epoch": 0.12618947368421052, + "grad_norm": 0.419921875, + "learning_rate": 0.00048565427077043333, + "loss": 3.8218, + "step": 2997 + }, + { + "epoch": 0.12623157894736842, + "grad_norm": 0.400390625, + "learning_rate": 0.0004856430022254148, + "loss": 3.9196, + "step": 2998 + }, + { + "epoch": 0.12627368421052632, + "grad_norm": 0.4296875, + "learning_rate": 0.00048563172938725777, + "loss": 3.5469, + "step": 2999 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 0.4765625, + "learning_rate": 0.00048562045225616733, + "loss": 3.2952, + "step": 3000 + }, + { + "epoch": 0.12631578947368421, + "eval_loss": 3.4825093746185303, + "eval_runtime": 345.0058, + "eval_samples_per_second": 43.478, + "eval_steps_per_second": 5.435, + "step": 3000 + }, + { + "epoch": 0.1263578947368421, + "grad_norm": 0.46484375, + "learning_rate": 0.0004856091708323491, + "loss": 3.5591, + "step": 3001 + }, + { + "epoch": 0.1264, + "grad_norm": 0.46875, + "learning_rate": 0.00048559788511600875, + "loss": 3.7928, + "step": 3002 + }, + { + "epoch": 0.12644210526315788, + "grad_norm": 0.39453125, + "learning_rate": 0.0004855865951073516, + "loss": 3.5254, + "step": 3003 + }, + { + "epoch": 0.12648421052631578, + "grad_norm": 0.4140625, + "learning_rate": 0.00048557530080658355, + "loss": 3.3097, + "step": 3004 + }, + { + "epoch": 0.12652631578947368, + "grad_norm": 0.404296875, + "learning_rate": 0.0004855640022139103, + "loss": 3.6043, + "step": 3005 + }, + { + "epoch": 0.12656842105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.00048555269932953774, + "loss": 3.6455, + "step": 3006 + }, + { + "epoch": 0.12661052631578948, + "grad_norm": 0.412109375, + "learning_rate": 0.00048554139215367176, + "loss": 3.27, + "step": 3007 + }, + { + "epoch": 0.12665263157894738, + "grad_norm": 0.451171875, + "learning_rate": 0.00048553008068651825, + "loss": 3.0601, + "step": 3008 + }, + { + "epoch": 0.12669473684210528, + "grad_norm": 0.40234375, + "learning_rate": 0.0004855187649282836, + "loss": 3.8757, + "step": 3009 + }, + { + "epoch": 0.12673684210526315, + "grad_norm": 0.4140625, + "learning_rate": 0.00048550744487917363, + "loss": 3.4292, + "step": 3010 + }, + { + "epoch": 0.12677894736842105, + "grad_norm": 0.396484375, + "learning_rate": 0.00048549612053939474, + "loss": 3.6556, + "step": 3011 + }, + { + "epoch": 0.12682105263157895, + "grad_norm": 0.39453125, + "learning_rate": 0.00048548479190915327, + "loss": 3.515, + "step": 3012 + }, + { + "epoch": 0.12686315789473684, + "grad_norm": 0.455078125, + "learning_rate": 0.00048547345898865556, + "loss": 3.6684, + "step": 3013 + }, + { + "epoch": 0.12690526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.0004854621217781081, + "loss": 3.6245, + "step": 3014 + }, + { + "epoch": 0.12694736842105264, + "grad_norm": 0.419921875, + "learning_rate": 0.00048545078027771745, + "loss": 3.3477, + "step": 3015 + }, + { + "epoch": 0.1269894736842105, + "grad_norm": 0.4453125, + "learning_rate": 0.00048543943448769024, + "loss": 3.7547, + "step": 3016 + }, + { + "epoch": 0.1270315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.0004854280844082331, + "loss": 3.2349, + "step": 3017 + }, + { + "epoch": 0.1270736842105263, + "grad_norm": 0.494140625, + "learning_rate": 0.0004854167300395529, + "loss": 3.1902, + "step": 3018 + }, + { + "epoch": 0.1271157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.0004854053713818565, + "loss": 3.125, + "step": 3019 + }, + { + "epoch": 0.1271578947368421, + "grad_norm": 0.498046875, + "learning_rate": 0.0004853940084353509, + "loss": 3.5249, + "step": 3020 + }, + { + "epoch": 0.1272, + "grad_norm": 0.41796875, + "learning_rate": 0.0004853826412002429, + "loss": 3.5504, + "step": 3021 + }, + { + "epoch": 0.1272421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00048537126967673977, + "loss": 3.4727, + "step": 3022 + }, + { + "epoch": 0.12728421052631578, + "grad_norm": 0.40625, + "learning_rate": 0.0004853598938650487, + "loss": 3.4245, + "step": 3023 + }, + { + "epoch": 0.12732631578947368, + "grad_norm": 0.404296875, + "learning_rate": 0.0004853485137653769, + "loss": 3.3179, + "step": 3024 + }, + { + "epoch": 0.12736842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.0004853371293779317, + "loss": 3.5749, + "step": 3025 + }, + { + "epoch": 0.12741052631578947, + "grad_norm": 0.47265625, + "learning_rate": 0.0004853257407029205, + "loss": 3.7831, + "step": 3026 + }, + { + "epoch": 0.12745263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00048531434774055085, + "loss": 3.2463, + "step": 3027 + }, + { + "epoch": 0.12749473684210527, + "grad_norm": 0.423828125, + "learning_rate": 0.0004853029504910302, + "loss": 3.6175, + "step": 3028 + }, + { + "epoch": 0.12753684210526317, + "grad_norm": 0.4296875, + "learning_rate": 0.00048529154895456627, + "loss": 3.7884, + "step": 3029 + }, + { + "epoch": 0.12757894736842104, + "grad_norm": 0.439453125, + "learning_rate": 0.0004852801431313668, + "loss": 3.0851, + "step": 3030 + }, + { + "epoch": 0.12762105263157894, + "grad_norm": 0.435546875, + "learning_rate": 0.00048526873302163955, + "loss": 3.7011, + "step": 3031 + }, + { + "epoch": 0.12766315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.0004852573186255924, + "loss": 3.18, + "step": 3032 + }, + { + "epoch": 0.12770526315789474, + "grad_norm": 0.4375, + "learning_rate": 0.00048524589994343334, + "loss": 3.4369, + "step": 3033 + }, + { + "epoch": 0.12774736842105264, + "grad_norm": 0.41796875, + "learning_rate": 0.00048523447697537036, + "loss": 3.3593, + "step": 3034 + }, + { + "epoch": 0.12778947368421054, + "grad_norm": 0.4296875, + "learning_rate": 0.0004852230497216116, + "loss": 3.5531, + "step": 3035 + }, + { + "epoch": 0.1278315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.0004852116181823653, + "loss": 3.7948, + "step": 3036 + }, + { + "epoch": 0.1278736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.0004852001823578397, + "loss": 3.771, + "step": 3037 + }, + { + "epoch": 0.1279157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00048518874224824307, + "loss": 3.5598, + "step": 3038 + }, + { + "epoch": 0.1279578947368421, + "grad_norm": 0.55859375, + "learning_rate": 0.00048517729785378384, + "loss": 3.2631, + "step": 3039 + }, + { + "epoch": 0.128, + "grad_norm": 0.423828125, + "learning_rate": 0.00048516584917467065, + "loss": 3.7522, + "step": 3040 + }, + { + "epoch": 0.1280421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.00048515439621111194, + "loss": 3.7963, + "step": 3041 + }, + { + "epoch": 0.1280842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.00048514293896331653, + "loss": 3.4834, + "step": 3042 + }, + { + "epoch": 0.12812631578947367, + "grad_norm": 0.451171875, + "learning_rate": 0.00048513147743149286, + "loss": 3.4996, + "step": 3043 + }, + { + "epoch": 0.12816842105263157, + "grad_norm": 0.43359375, + "learning_rate": 0.0004851200116158501, + "loss": 3.6002, + "step": 3044 + }, + { + "epoch": 0.12821052631578947, + "grad_norm": 0.4609375, + "learning_rate": 0.000485108541516597, + "loss": 3.4594, + "step": 3045 + }, + { + "epoch": 0.12825263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.00048509706713394243, + "loss": 3.5628, + "step": 3046 + }, + { + "epoch": 0.12829473684210527, + "grad_norm": 0.40234375, + "learning_rate": 0.0004850855884680956, + "loss": 3.5081, + "step": 3047 + }, + { + "epoch": 0.12833684210526317, + "grad_norm": 0.443359375, + "learning_rate": 0.00048507410551926547, + "loss": 3.8763, + "step": 3048 + }, + { + "epoch": 0.12837894736842106, + "grad_norm": 0.443359375, + "learning_rate": 0.0004850626182876614, + "loss": 3.5405, + "step": 3049 + }, + { + "epoch": 0.12842105263157894, + "grad_norm": 0.5625, + "learning_rate": 0.00048505112677349254, + "loss": 3.4144, + "step": 3050 + }, + { + "epoch": 0.12846315789473683, + "grad_norm": 0.4296875, + "learning_rate": 0.0004850396309769684, + "loss": 3.5738, + "step": 3051 + }, + { + "epoch": 0.12850526315789473, + "grad_norm": 0.416015625, + "learning_rate": 0.0004850281308982983, + "loss": 3.5143, + "step": 3052 + }, + { + "epoch": 0.12854736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.00048501662653769185, + "loss": 3.697, + "step": 3053 + }, + { + "epoch": 0.12858947368421053, + "grad_norm": 0.392578125, + "learning_rate": 0.00048500511789535856, + "loss": 3.5978, + "step": 3054 + }, + { + "epoch": 0.12863157894736843, + "grad_norm": 0.427734375, + "learning_rate": 0.00048499360497150813, + "loss": 3.4588, + "step": 3055 + }, + { + "epoch": 0.12867368421052633, + "grad_norm": 0.42578125, + "learning_rate": 0.0004849820877663504, + "loss": 3.4352, + "step": 3056 + }, + { + "epoch": 0.1287157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.00048497056628009507, + "loss": 3.4422, + "step": 3057 + }, + { + "epoch": 0.1287578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00048495904051295207, + "loss": 3.2563, + "step": 3058 + }, + { + "epoch": 0.1288, + "grad_norm": 0.447265625, + "learning_rate": 0.00048494751046513143, + "loss": 3.4515, + "step": 3059 + }, + { + "epoch": 0.1288421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00048493597613684326, + "loss": 3.4291, + "step": 3060 + }, + { + "epoch": 0.1288842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0004849244375282976, + "loss": 3.293, + "step": 3061 + }, + { + "epoch": 0.1289263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.00048491289463970465, + "loss": 3.4308, + "step": 3062 + }, + { + "epoch": 0.12896842105263157, + "grad_norm": 0.4453125, + "learning_rate": 0.0004849013474712749, + "loss": 3.5792, + "step": 3063 + }, + { + "epoch": 0.12901052631578946, + "grad_norm": 0.416015625, + "learning_rate": 0.0004848897960232185, + "loss": 3.3259, + "step": 3064 + }, + { + "epoch": 0.12905263157894736, + "grad_norm": 0.43359375, + "learning_rate": 0.0004848782402957461, + "loss": 3.366, + "step": 3065 + }, + { + "epoch": 0.12909473684210526, + "grad_norm": 0.515625, + "learning_rate": 0.000484866680289068, + "loss": 3.274, + "step": 3066 + }, + { + "epoch": 0.12913684210526316, + "grad_norm": 0.4609375, + "learning_rate": 0.00048485511600339506, + "loss": 3.1575, + "step": 3067 + }, + { + "epoch": 0.12917894736842106, + "grad_norm": 0.41796875, + "learning_rate": 0.00048484354743893776, + "loss": 3.4722, + "step": 3068 + }, + { + "epoch": 0.12922105263157896, + "grad_norm": 0.41015625, + "learning_rate": 0.000484831974595907, + "loss": 3.0451, + "step": 3069 + }, + { + "epoch": 0.12926315789473683, + "grad_norm": 0.466796875, + "learning_rate": 0.00048482039747451354, + "loss": 3.2021, + "step": 3070 + }, + { + "epoch": 0.12930526315789473, + "grad_norm": 0.41796875, + "learning_rate": 0.0004848088160749684, + "loss": 3.6545, + "step": 3071 + }, + { + "epoch": 0.12934736842105263, + "grad_norm": 0.515625, + "learning_rate": 0.0004847972303974825, + "loss": 3.2949, + "step": 3072 + }, + { + "epoch": 0.12938947368421053, + "grad_norm": 0.451171875, + "learning_rate": 0.00048478564044226693, + "loss": 3.1198, + "step": 3073 + }, + { + "epoch": 0.12943157894736843, + "grad_norm": 0.412109375, + "learning_rate": 0.0004847740462095329, + "loss": 3.5714, + "step": 3074 + }, + { + "epoch": 0.12947368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.00048476244769949154, + "loss": 3.0656, + "step": 3075 + }, + { + "epoch": 0.12951578947368422, + "grad_norm": 0.41796875, + "learning_rate": 0.0004847508449123542, + "loss": 3.8121, + "step": 3076 + }, + { + "epoch": 0.1295578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0004847392378483323, + "loss": 3.7964, + "step": 3077 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4609375, + "learning_rate": 0.0004847276265076373, + "loss": 3.32, + "step": 3078 + }, + { + "epoch": 0.1296421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.0004847160108904808, + "loss": 3.8884, + "step": 3079 + }, + { + "epoch": 0.1296842105263158, + "grad_norm": 0.439453125, + "learning_rate": 0.00048470439099707426, + "loss": 3.9961, + "step": 3080 + }, + { + "epoch": 0.1297263157894737, + "grad_norm": 0.39453125, + "learning_rate": 0.00048469276682762955, + "loss": 3.2673, + "step": 3081 + }, + { + "epoch": 0.1297684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.0004846811383823584, + "loss": 3.7614, + "step": 3082 + }, + { + "epoch": 0.1298105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.00048466950566147256, + "loss": 3.625, + "step": 3083 + }, + { + "epoch": 0.12985263157894736, + "grad_norm": 0.45703125, + "learning_rate": 0.00048465786866518415, + "loss": 3.2057, + "step": 3084 + }, + { + "epoch": 0.12989473684210526, + "grad_norm": 0.412109375, + "learning_rate": 0.00048464622739370503, + "loss": 3.5994, + "step": 3085 + }, + { + "epoch": 0.12993684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0004846345818472473, + "loss": 3.3678, + "step": 3086 + }, + { + "epoch": 0.12997894736842106, + "grad_norm": 0.44140625, + "learning_rate": 0.00048462293202602324, + "loss": 3.6239, + "step": 3087 + }, + { + "epoch": 0.13002105263157895, + "grad_norm": 0.4140625, + "learning_rate": 0.00048461127793024503, + "loss": 3.7816, + "step": 3088 + }, + { + "epoch": 0.13006315789473685, + "grad_norm": 0.4375, + "learning_rate": 0.00048459961956012494, + "loss": 3.4049, + "step": 3089 + }, + { + "epoch": 0.13010526315789472, + "grad_norm": 0.41796875, + "learning_rate": 0.00048458795691587545, + "loss": 3.3318, + "step": 3090 + }, + { + "epoch": 0.13014736842105262, + "grad_norm": 0.4140625, + "learning_rate": 0.000484576289997709, + "loss": 3.1791, + "step": 3091 + }, + { + "epoch": 0.13018947368421052, + "grad_norm": 0.404296875, + "learning_rate": 0.00048456461880583827, + "loss": 3.7217, + "step": 3092 + }, + { + "epoch": 0.13023157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.0004845529433404757, + "loss": 3.5749, + "step": 3093 + }, + { + "epoch": 0.13027368421052632, + "grad_norm": 0.83203125, + "learning_rate": 0.00048454126360183405, + "loss": 3.7437, + "step": 3094 + }, + { + "epoch": 0.13031578947368422, + "grad_norm": 0.408203125, + "learning_rate": 0.0004845295795901262, + "loss": 3.2497, + "step": 3095 + }, + { + "epoch": 0.13035789473684212, + "grad_norm": 0.388671875, + "learning_rate": 0.000484517891305565, + "loss": 3.5373, + "step": 3096 + }, + { + "epoch": 0.1304, + "grad_norm": 0.4140625, + "learning_rate": 0.0004845061987483633, + "loss": 3.5362, + "step": 3097 + }, + { + "epoch": 0.1304421052631579, + "grad_norm": 0.466796875, + "learning_rate": 0.00048449450191873425, + "loss": 3.3323, + "step": 3098 + }, + { + "epoch": 0.13048421052631579, + "grad_norm": 0.5703125, + "learning_rate": 0.0004844828008168909, + "loss": 3.4124, + "step": 3099 + }, + { + "epoch": 0.13052631578947368, + "grad_norm": 0.416015625, + "learning_rate": 0.00048447109544304636, + "loss": 3.486, + "step": 3100 + }, + { + "epoch": 0.13056842105263158, + "grad_norm": 0.466796875, + "learning_rate": 0.000484459385797414, + "loss": 3.8117, + "step": 3101 + }, + { + "epoch": 0.13061052631578948, + "grad_norm": 0.4609375, + "learning_rate": 0.0004844476718802071, + "loss": 3.6356, + "step": 3102 + }, + { + "epoch": 0.13065263157894738, + "grad_norm": 0.443359375, + "learning_rate": 0.0004844359536916391, + "loss": 3.1973, + "step": 3103 + }, + { + "epoch": 0.13069473684210525, + "grad_norm": 0.498046875, + "learning_rate": 0.00048442423123192343, + "loss": 3.3644, + "step": 3104 + }, + { + "epoch": 0.13073684210526315, + "grad_norm": 0.453125, + "learning_rate": 0.00048441250450127375, + "loss": 3.1139, + "step": 3105 + }, + { + "epoch": 0.13077894736842105, + "grad_norm": 0.453125, + "learning_rate": 0.0004844007734999036, + "loss": 3.6364, + "step": 3106 + }, + { + "epoch": 0.13082105263157895, + "grad_norm": 0.486328125, + "learning_rate": 0.0004843890382280268, + "loss": 3.7265, + "step": 3107 + }, + { + "epoch": 0.13086315789473685, + "grad_norm": 0.44140625, + "learning_rate": 0.0004843772986858571, + "loss": 3.3619, + "step": 3108 + }, + { + "epoch": 0.13090526315789475, + "grad_norm": 0.4140625, + "learning_rate": 0.00048436555487360843, + "loss": 3.5016, + "step": 3109 + }, + { + "epoch": 0.13094736842105262, + "grad_norm": 0.412109375, + "learning_rate": 0.00048435380679149475, + "loss": 3.5004, + "step": 3110 + }, + { + "epoch": 0.13098947368421052, + "grad_norm": 0.419921875, + "learning_rate": 0.0004843420544397301, + "loss": 3.8717, + "step": 3111 + }, + { + "epoch": 0.13103157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00048433029781852846, + "loss": 3.8099, + "step": 3112 + }, + { + "epoch": 0.13107368421052631, + "grad_norm": 0.3984375, + "learning_rate": 0.0004843185369281041, + "loss": 3.6289, + "step": 3113 + }, + { + "epoch": 0.1311157894736842, + "grad_norm": 0.462890625, + "learning_rate": 0.00048430677176867137, + "loss": 3.7775, + "step": 3114 + }, + { + "epoch": 0.1311578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00048429500234044455, + "loss": 3.4422, + "step": 3115 + }, + { + "epoch": 0.1312, + "grad_norm": 0.484375, + "learning_rate": 0.00048428322864363805, + "loss": 3.0621, + "step": 3116 + }, + { + "epoch": 0.13124210526315788, + "grad_norm": 0.478515625, + "learning_rate": 0.00048427145067846643, + "loss": 3.2286, + "step": 3117 + }, + { + "epoch": 0.13128421052631578, + "grad_norm": 0.4453125, + "learning_rate": 0.00048425966844514425, + "loss": 3.6919, + "step": 3118 + }, + { + "epoch": 0.13132631578947368, + "grad_norm": 0.55859375, + "learning_rate": 0.0004842478819438861, + "loss": 3.1122, + "step": 3119 + }, + { + "epoch": 0.13136842105263158, + "grad_norm": 0.66796875, + "learning_rate": 0.00048423609117490685, + "loss": 3.1712, + "step": 3120 + }, + { + "epoch": 0.13141052631578948, + "grad_norm": 0.44140625, + "learning_rate": 0.00048422429613842114, + "loss": 3.373, + "step": 3121 + }, + { + "epoch": 0.13145263157894738, + "grad_norm": 0.416015625, + "learning_rate": 0.00048421249683464404, + "loss": 3.466, + "step": 3122 + }, + { + "epoch": 0.13149473684210528, + "grad_norm": 0.431640625, + "learning_rate": 0.00048420069326379035, + "loss": 3.3927, + "step": 3123 + }, + { + "epoch": 0.13153684210526315, + "grad_norm": 0.447265625, + "learning_rate": 0.0004841888854260753, + "loss": 3.65, + "step": 3124 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 0.498046875, + "learning_rate": 0.00048417707332171385, + "loss": 3.5691, + "step": 3125 + }, + { + "epoch": 0.13162105263157894, + "grad_norm": 0.455078125, + "learning_rate": 0.00048416525695092126, + "loss": 3.3102, + "step": 3126 + }, + { + "epoch": 0.13166315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00048415343631391287, + "loss": 3.4244, + "step": 3127 + }, + { + "epoch": 0.13170526315789474, + "grad_norm": 0.5546875, + "learning_rate": 0.00048414161141090395, + "loss": 3.105, + "step": 3128 + }, + { + "epoch": 0.13174736842105264, + "grad_norm": 0.4140625, + "learning_rate": 0.00048412978224211003, + "loss": 3.538, + "step": 3129 + }, + { + "epoch": 0.13178947368421054, + "grad_norm": 0.494140625, + "learning_rate": 0.0004841179488077465, + "loss": 3.6223, + "step": 3130 + }, + { + "epoch": 0.1318315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00048410611110802907, + "loss": 3.293, + "step": 3131 + }, + { + "epoch": 0.1318736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.0004840942691431733, + "loss": 3.8934, + "step": 3132 + }, + { + "epoch": 0.1319157894736842, + "grad_norm": 0.490234375, + "learning_rate": 0.0004840824229133951, + "loss": 3.3952, + "step": 3133 + }, + { + "epoch": 0.1319578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.00048407057241891003, + "loss": 3.6741, + "step": 3134 + }, + { + "epoch": 0.132, + "grad_norm": 0.4296875, + "learning_rate": 0.0004840587176599343, + "loss": 3.6919, + "step": 3135 + }, + { + "epoch": 0.1320421052631579, + "grad_norm": 0.515625, + "learning_rate": 0.00048404685863668375, + "loss": 3.3624, + "step": 3136 + }, + { + "epoch": 0.13208421052631578, + "grad_norm": 0.4453125, + "learning_rate": 0.0004840349953493743, + "loss": 3.5521, + "step": 3137 + }, + { + "epoch": 0.13212631578947368, + "grad_norm": 0.455078125, + "learning_rate": 0.00048402312779822235, + "loss": 3.434, + "step": 3138 + }, + { + "epoch": 0.13216842105263157, + "grad_norm": 0.4296875, + "learning_rate": 0.00048401125598344387, + "loss": 3.7669, + "step": 3139 + }, + { + "epoch": 0.13221052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.00048399937990525535, + "loss": 3.3957, + "step": 3140 + }, + { + "epoch": 0.13225263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.000483987499563873, + "loss": 3.6902, + "step": 3141 + }, + { + "epoch": 0.13229473684210527, + "grad_norm": 0.44921875, + "learning_rate": 0.0004839756149595133, + "loss": 3.4689, + "step": 3142 + }, + { + "epoch": 0.13233684210526317, + "grad_norm": 0.431640625, + "learning_rate": 0.0004839637260923929, + "loss": 3.8819, + "step": 3143 + }, + { + "epoch": 0.13237894736842104, + "grad_norm": 0.421875, + "learning_rate": 0.00048395183296272824, + "loss": 3.9212, + "step": 3144 + }, + { + "epoch": 0.13242105263157894, + "grad_norm": 0.44921875, + "learning_rate": 0.0004839399355707361, + "loss": 3.294, + "step": 3145 + }, + { + "epoch": 0.13246315789473684, + "grad_norm": 0.45703125, + "learning_rate": 0.0004839280339166331, + "loss": 3.4012, + "step": 3146 + }, + { + "epoch": 0.13250526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00048391612800063633, + "loss": 3.4675, + "step": 3147 + }, + { + "epoch": 0.13254736842105264, + "grad_norm": 0.4375, + "learning_rate": 0.0004839042178229625, + "loss": 3.4778, + "step": 3148 + }, + { + "epoch": 0.13258947368421053, + "grad_norm": 0.486328125, + "learning_rate": 0.0004838923033838286, + "loss": 3.4909, + "step": 3149 + }, + { + "epoch": 0.13263157894736843, + "grad_norm": 0.5703125, + "learning_rate": 0.00048388038468345177, + "loss": 3.248, + "step": 3150 + }, + { + "epoch": 0.1326736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00048386846172204914, + "loss": 3.7677, + "step": 3151 + }, + { + "epoch": 0.1327157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0004838565344998379, + "loss": 3.4439, + "step": 3152 + }, + { + "epoch": 0.1327578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0004838446030170354, + "loss": 3.6969, + "step": 3153 + }, + { + "epoch": 0.1328, + "grad_norm": 0.47265625, + "learning_rate": 0.000483832667273859, + "loss": 2.9953, + "step": 3154 + }, + { + "epoch": 0.1328421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00048382072727052607, + "loss": 3.5185, + "step": 3155 + }, + { + "epoch": 0.1328842105263158, + "grad_norm": 0.48046875, + "learning_rate": 0.00048380878300725426, + "loss": 2.7888, + "step": 3156 + }, + { + "epoch": 0.1329263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.00048379683448426114, + "loss": 3.1143, + "step": 3157 + }, + { + "epoch": 0.13296842105263157, + "grad_norm": 0.44140625, + "learning_rate": 0.00048378488170176446, + "loss": 3.5351, + "step": 3158 + }, + { + "epoch": 0.13301052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.00048377292465998187, + "loss": 3.7393, + "step": 3159 + }, + { + "epoch": 0.13305263157894737, + "grad_norm": 0.453125, + "learning_rate": 0.0004837609633591312, + "loss": 3.1264, + "step": 3160 + }, + { + "epoch": 0.13309473684210527, + "grad_norm": 0.416015625, + "learning_rate": 0.00048374899779943047, + "loss": 3.4538, + "step": 3161 + }, + { + "epoch": 0.13313684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.0004837370279810977, + "loss": 3.3446, + "step": 3162 + }, + { + "epoch": 0.13317894736842106, + "grad_norm": 0.59375, + "learning_rate": 0.0004837250539043509, + "loss": 3.0745, + "step": 3163 + }, + { + "epoch": 0.13322105263157893, + "grad_norm": 0.474609375, + "learning_rate": 0.0004837130755694082, + "loss": 3.2596, + "step": 3164 + }, + { + "epoch": 0.13326315789473683, + "grad_norm": 0.431640625, + "learning_rate": 0.0004837010929764879, + "loss": 3.3567, + "step": 3165 + }, + { + "epoch": 0.13330526315789473, + "grad_norm": 0.43359375, + "learning_rate": 0.0004836891061258082, + "loss": 3.7764, + "step": 3166 + }, + { + "epoch": 0.13334736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.0004836771150175876, + "loss": 3.1345, + "step": 3167 + }, + { + "epoch": 0.13338947368421053, + "grad_norm": 1.125, + "learning_rate": 0.0004836651196520445, + "loss": 3.6898, + "step": 3168 + }, + { + "epoch": 0.13343157894736843, + "grad_norm": 0.470703125, + "learning_rate": 0.00048365312002939757, + "loss": 3.3996, + "step": 3169 + }, + { + "epoch": 0.13347368421052633, + "grad_norm": 0.51953125, + "learning_rate": 0.0004836411161498652, + "loss": 3.7285, + "step": 3170 + }, + { + "epoch": 0.1335157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.0004836291080136663, + "loss": 3.2869, + "step": 3171 + }, + { + "epoch": 0.1335578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0004836170956210195, + "loss": 3.6923, + "step": 3172 + }, + { + "epoch": 0.1336, + "grad_norm": 0.48046875, + "learning_rate": 0.00048360507897214373, + "loss": 3.4182, + "step": 3173 + }, + { + "epoch": 0.1336421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.0004835930580672579, + "loss": 3.4935, + "step": 3174 + }, + { + "epoch": 0.1336842105263158, + "grad_norm": 0.484375, + "learning_rate": 0.00048358103290658096, + "loss": 3.4496, + "step": 3175 + }, + { + "epoch": 0.1337263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.0004835690034903321, + "loss": 3.7415, + "step": 3176 + }, + { + "epoch": 0.1337684210526316, + "grad_norm": 0.51171875, + "learning_rate": 0.0004835569698187304, + "loss": 3.9538, + "step": 3177 + }, + { + "epoch": 0.13381052631578946, + "grad_norm": 0.416015625, + "learning_rate": 0.0004835449318919952, + "loss": 3.4964, + "step": 3178 + }, + { + "epoch": 0.13385263157894736, + "grad_norm": 0.431640625, + "learning_rate": 0.00048353288971034566, + "loss": 3.6151, + "step": 3179 + }, + { + "epoch": 0.13389473684210526, + "grad_norm": 0.404296875, + "learning_rate": 0.0004835208432740013, + "loss": 4.0144, + "step": 3180 + }, + { + "epoch": 0.13393684210526316, + "grad_norm": 0.470703125, + "learning_rate": 0.0004835087925831815, + "loss": 3.6099, + "step": 3181 + }, + { + "epoch": 0.13397894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.0004834967376381059, + "loss": 3.0776, + "step": 3182 + }, + { + "epoch": 0.13402105263157896, + "grad_norm": 0.408203125, + "learning_rate": 0.0004834846784389941, + "loss": 3.4687, + "step": 3183 + }, + { + "epoch": 0.13406315789473683, + "grad_norm": 0.4921875, + "learning_rate": 0.00048347261498606574, + "loss": 3.3375, + "step": 3184 + }, + { + "epoch": 0.13410526315789473, + "grad_norm": 0.515625, + "learning_rate": 0.0004834605472795407, + "loss": 3.7815, + "step": 3185 + }, + { + "epoch": 0.13414736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00048344847531963883, + "loss": 3.3755, + "step": 3186 + }, + { + "epoch": 0.13418947368421053, + "grad_norm": 0.396484375, + "learning_rate": 0.00048343639910658, + "loss": 3.6461, + "step": 3187 + }, + { + "epoch": 0.13423157894736842, + "grad_norm": 0.470703125, + "learning_rate": 0.0004834243186405842, + "loss": 3.3575, + "step": 3188 + }, + { + "epoch": 0.13427368421052632, + "grad_norm": 0.4140625, + "learning_rate": 0.00048341223392187163, + "loss": 3.2938, + "step": 3189 + }, + { + "epoch": 0.13431578947368422, + "grad_norm": 0.400390625, + "learning_rate": 0.0004834001449506624, + "loss": 3.2255, + "step": 3190 + }, + { + "epoch": 0.1343578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00048338805172717676, + "loss": 3.5113, + "step": 3191 + }, + { + "epoch": 0.1344, + "grad_norm": 0.443359375, + "learning_rate": 0.00048337595425163504, + "loss": 3.8051, + "step": 3192 + }, + { + "epoch": 0.1344421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.0004833638525242576, + "loss": 3.8248, + "step": 3193 + }, + { + "epoch": 0.1344842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.00048335174654526503, + "loss": 3.2356, + "step": 3194 + }, + { + "epoch": 0.1345263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.00048333963631487774, + "loss": 3.6372, + "step": 3195 + }, + { + "epoch": 0.1345684210526316, + "grad_norm": 0.46484375, + "learning_rate": 0.0004833275218333165, + "loss": 3.3769, + "step": 3196 + }, + { + "epoch": 0.13461052631578949, + "grad_norm": 0.48828125, + "learning_rate": 0.00048331540310080193, + "loss": 3.3041, + "step": 3197 + }, + { + "epoch": 0.13465263157894736, + "grad_norm": 0.416015625, + "learning_rate": 0.0004833032801175549, + "loss": 3.7184, + "step": 3198 + }, + { + "epoch": 0.13469473684210526, + "grad_norm": 0.45703125, + "learning_rate": 0.0004832911528837962, + "loss": 3.6978, + "step": 3199 + }, + { + "epoch": 0.13473684210526315, + "grad_norm": 0.44140625, + "learning_rate": 0.0004832790213997467, + "loss": 3.1034, + "step": 3200 + }, + { + "epoch": 0.13477894736842105, + "grad_norm": 0.4140625, + "learning_rate": 0.00048326688566562763, + "loss": 3.2976, + "step": 3201 + }, + { + "epoch": 0.13482105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.00048325474568165994, + "loss": 3.6054, + "step": 3202 + }, + { + "epoch": 0.13486315789473685, + "grad_norm": 0.396484375, + "learning_rate": 0.00048324260144806485, + "loss": 3.5848, + "step": 3203 + }, + { + "epoch": 0.13490526315789475, + "grad_norm": 0.4296875, + "learning_rate": 0.00048323045296506364, + "loss": 3.325, + "step": 3204 + }, + { + "epoch": 0.13494736842105262, + "grad_norm": 0.486328125, + "learning_rate": 0.0004832183002328776, + "loss": 3.447, + "step": 3205 + }, + { + "epoch": 0.13498947368421052, + "grad_norm": 0.4375, + "learning_rate": 0.00048320614325172816, + "loss": 3.6158, + "step": 3206 + }, + { + "epoch": 0.13503157894736842, + "grad_norm": 0.462890625, + "learning_rate": 0.00048319398202183677, + "loss": 3.7218, + "step": 3207 + }, + { + "epoch": 0.13507368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.000483181816543425, + "loss": 3.6022, + "step": 3208 + }, + { + "epoch": 0.13511578947368422, + "grad_norm": 0.4296875, + "learning_rate": 0.0004831696468167146, + "loss": 3.3156, + "step": 3209 + }, + { + "epoch": 0.13515789473684212, + "grad_norm": 0.427734375, + "learning_rate": 0.00048315747284192713, + "loss": 3.5495, + "step": 3210 + }, + { + "epoch": 0.1352, + "grad_norm": 0.42578125, + "learning_rate": 0.00048314529461928446, + "loss": 3.1416, + "step": 3211 + }, + { + "epoch": 0.13524210526315789, + "grad_norm": 0.3984375, + "learning_rate": 0.00048313311214900846, + "loss": 3.738, + "step": 3212 + }, + { + "epoch": 0.13528421052631578, + "grad_norm": 0.40234375, + "learning_rate": 0.000483120925431321, + "loss": 3.6982, + "step": 3213 + }, + { + "epoch": 0.13532631578947368, + "grad_norm": 0.419921875, + "learning_rate": 0.0004831087344664443, + "loss": 2.9038, + "step": 3214 + }, + { + "epoch": 0.13536842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.0004830965392546003, + "loss": 3.2407, + "step": 3215 + }, + { + "epoch": 0.13541052631578948, + "grad_norm": 0.421875, + "learning_rate": 0.0004830843397960113, + "loss": 3.7073, + "step": 3216 + }, + { + "epoch": 0.13545263157894738, + "grad_norm": 0.41796875, + "learning_rate": 0.00048307213609089937, + "loss": 3.3639, + "step": 3217 + }, + { + "epoch": 0.13549473684210525, + "grad_norm": 0.431640625, + "learning_rate": 0.00048305992813948707, + "loss": 3.1163, + "step": 3218 + }, + { + "epoch": 0.13553684210526315, + "grad_norm": 0.41796875, + "learning_rate": 0.00048304771594199667, + "loss": 3.4708, + "step": 3219 + }, + { + "epoch": 0.13557894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.00048303549949865065, + "loss": 3.644, + "step": 3220 + }, + { + "epoch": 0.13562105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.0004830232788096717, + "loss": 3.6264, + "step": 3221 + }, + { + "epoch": 0.13566315789473685, + "grad_norm": 0.423828125, + "learning_rate": 0.0004830110538752824, + "loss": 3.1388, + "step": 3222 + }, + { + "epoch": 0.13570526315789475, + "grad_norm": 0.416015625, + "learning_rate": 0.0004829988246957055, + "loss": 3.5765, + "step": 3223 + }, + { + "epoch": 0.13574736842105264, + "grad_norm": 0.455078125, + "learning_rate": 0.00048298659127116373, + "loss": 2.7384, + "step": 3224 + }, + { + "epoch": 0.13578947368421052, + "grad_norm": 0.4296875, + "learning_rate": 0.00048297435360188004, + "loss": 3.4258, + "step": 3225 + }, + { + "epoch": 0.13583157894736841, + "grad_norm": 0.490234375, + "learning_rate": 0.0004829621116880774, + "loss": 3.2441, + "step": 3226 + }, + { + "epoch": 0.1358736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00048294986552997873, + "loss": 3.0285, + "step": 3227 + }, + { + "epoch": 0.1359157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0004829376151278072, + "loss": 3.6065, + "step": 3228 + }, + { + "epoch": 0.1359578947368421, + "grad_norm": 0.39453125, + "learning_rate": 0.000482925360481786, + "loss": 3.0228, + "step": 3229 + }, + { + "epoch": 0.136, + "grad_norm": 0.4609375, + "learning_rate": 0.0004829131015921385, + "loss": 3.9504, + "step": 3230 + }, + { + "epoch": 0.13604210526315788, + "grad_norm": 0.453125, + "learning_rate": 0.00048290083845908794, + "loss": 3.0421, + "step": 3231 + }, + { + "epoch": 0.13608421052631578, + "grad_norm": 0.48828125, + "learning_rate": 0.0004828885710828577, + "loss": 2.6915, + "step": 3232 + }, + { + "epoch": 0.13612631578947368, + "grad_norm": 0.447265625, + "learning_rate": 0.0004828762994636714, + "loss": 3.3249, + "step": 3233 + }, + { + "epoch": 0.13616842105263158, + "grad_norm": 0.462890625, + "learning_rate": 0.00048286402360175245, + "loss": 3.8109, + "step": 3234 + }, + { + "epoch": 0.13621052631578948, + "grad_norm": 0.408203125, + "learning_rate": 0.00048285174349732465, + "loss": 3.4175, + "step": 3235 + }, + { + "epoch": 0.13625263157894738, + "grad_norm": 0.44140625, + "learning_rate": 0.00048283945915061167, + "loss": 3.5594, + "step": 3236 + }, + { + "epoch": 0.13629473684210527, + "grad_norm": 0.443359375, + "learning_rate": 0.00048282717056183735, + "loss": 3.5825, + "step": 3237 + }, + { + "epoch": 0.13633684210526315, + "grad_norm": 0.3984375, + "learning_rate": 0.00048281487773122546, + "loss": 3.6342, + "step": 3238 + }, + { + "epoch": 0.13637894736842104, + "grad_norm": 1.984375, + "learning_rate": 0.00048280258065900005, + "loss": 3.2347, + "step": 3239 + }, + { + "epoch": 0.13642105263157894, + "grad_norm": 0.53125, + "learning_rate": 0.00048279027934538523, + "loss": 3.7541, + "step": 3240 + }, + { + "epoch": 0.13646315789473684, + "grad_norm": 0.439453125, + "learning_rate": 0.000482777973790605, + "loss": 3.2116, + "step": 3241 + }, + { + "epoch": 0.13650526315789474, + "grad_norm": 0.625, + "learning_rate": 0.00048276566399488356, + "loss": 3.4717, + "step": 3242 + }, + { + "epoch": 0.13654736842105264, + "grad_norm": 0.3984375, + "learning_rate": 0.00048275334995844524, + "loss": 3.5744, + "step": 3243 + }, + { + "epoch": 0.13658947368421054, + "grad_norm": 0.427734375, + "learning_rate": 0.0004827410316815144, + "loss": 3.4858, + "step": 3244 + }, + { + "epoch": 0.1366315789473684, + "grad_norm": 0.48828125, + "learning_rate": 0.0004827287091643153, + "loss": 3.1273, + "step": 3245 + }, + { + "epoch": 0.1366736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.0004827163824070727, + "loss": 3.9362, + "step": 3246 + }, + { + "epoch": 0.1367157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00048270405141001094, + "loss": 3.6746, + "step": 3247 + }, + { + "epoch": 0.1367578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0004826917161733548, + "loss": 3.5528, + "step": 3248 + }, + { + "epoch": 0.1368, + "grad_norm": 0.484375, + "learning_rate": 0.0004826793766973291, + "loss": 3.4151, + "step": 3249 + }, + { + "epoch": 0.1368421052631579, + "grad_norm": 0.58984375, + "learning_rate": 0.0004826670329821584, + "loss": 3.0746, + "step": 3250 + }, + { + "epoch": 0.1368842105263158, + "grad_norm": 0.458984375, + "learning_rate": 0.0004826546850280678, + "loss": 3.5138, + "step": 3251 + }, + { + "epoch": 0.13692631578947367, + "grad_norm": 0.47265625, + "learning_rate": 0.00048264233283528226, + "loss": 3.0448, + "step": 3252 + }, + { + "epoch": 0.13696842105263157, + "grad_norm": 0.458984375, + "learning_rate": 0.00048262997640402663, + "loss": 3.4632, + "step": 3253 + }, + { + "epoch": 0.13701052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.0004826176157345262, + "loss": 3.5807, + "step": 3254 + }, + { + "epoch": 0.13705263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00048260525082700613, + "loss": 3.4383, + "step": 3255 + }, + { + "epoch": 0.13709473684210527, + "grad_norm": 0.4765625, + "learning_rate": 0.00048259288168169174, + "loss": 3.4302, + "step": 3256 + }, + { + "epoch": 0.13713684210526317, + "grad_norm": 0.4375, + "learning_rate": 0.0004825805082988083, + "loss": 3.1451, + "step": 3257 + }, + { + "epoch": 0.13717894736842104, + "grad_norm": 0.421875, + "learning_rate": 0.00048256813067858124, + "loss": 3.2636, + "step": 3258 + }, + { + "epoch": 0.13722105263157894, + "grad_norm": 0.39453125, + "learning_rate": 0.0004825557488212361, + "loss": 3.311, + "step": 3259 + }, + { + "epoch": 0.13726315789473684, + "grad_norm": 0.4765625, + "learning_rate": 0.0004825433627269985, + "loss": 3.2212, + "step": 3260 + }, + { + "epoch": 0.13730526315789474, + "grad_norm": 0.40234375, + "learning_rate": 0.000482530972396094, + "loss": 3.6675, + "step": 3261 + }, + { + "epoch": 0.13734736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.0004825185778287485, + "loss": 3.5987, + "step": 3262 + }, + { + "epoch": 0.13738947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.0004825061790251876, + "loss": 3.5565, + "step": 3263 + }, + { + "epoch": 0.13743157894736843, + "grad_norm": 0.451171875, + "learning_rate": 0.00048249377598563736, + "loss": 3.4649, + "step": 3264 + }, + { + "epoch": 0.1374736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.00048248136871032366, + "loss": 3.4036, + "step": 3265 + }, + { + "epoch": 0.1375157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.0004824689571994726, + "loss": 3.6233, + "step": 3266 + }, + { + "epoch": 0.1375578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0004824565414533102, + "loss": 4.049, + "step": 3267 + }, + { + "epoch": 0.1376, + "grad_norm": 0.3828125, + "learning_rate": 0.00048244412147206283, + "loss": 3.5397, + "step": 3268 + }, + { + "epoch": 0.1376421052631579, + "grad_norm": 0.5859375, + "learning_rate": 0.0004824316972559567, + "loss": 3.2536, + "step": 3269 + }, + { + "epoch": 0.1376842105263158, + "grad_norm": 0.39453125, + "learning_rate": 0.0004824192688052181, + "loss": 3.5472, + "step": 3270 + }, + { + "epoch": 0.1377263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.0004824068361200735, + "loss": 3.6646, + "step": 3271 + }, + { + "epoch": 0.13776842105263157, + "grad_norm": 0.431640625, + "learning_rate": 0.0004823943992007494, + "loss": 3.7678, + "step": 3272 + }, + { + "epoch": 0.13781052631578947, + "grad_norm": 0.40625, + "learning_rate": 0.0004823819580474724, + "loss": 3.3231, + "step": 3273 + }, + { + "epoch": 0.13785263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.0004823695126604692, + "loss": 2.7148, + "step": 3274 + }, + { + "epoch": 0.13789473684210526, + "grad_norm": 0.84375, + "learning_rate": 0.00048235706303996647, + "loss": 3.9458, + "step": 3275 + }, + { + "epoch": 0.13793684210526316, + "grad_norm": 0.50390625, + "learning_rate": 0.0004823446091861911, + "loss": 3.4602, + "step": 3276 + }, + { + "epoch": 0.13797894736842106, + "grad_norm": 0.52734375, + "learning_rate": 0.0004823321510993699, + "loss": 3.2659, + "step": 3277 + }, + { + "epoch": 0.13802105263157896, + "grad_norm": 0.4375, + "learning_rate": 0.00048231968877972985, + "loss": 3.678, + "step": 3278 + }, + { + "epoch": 0.13806315789473683, + "grad_norm": 0.53515625, + "learning_rate": 0.00048230722222749814, + "loss": 3.2327, + "step": 3279 + }, + { + "epoch": 0.13810526315789473, + "grad_norm": 0.4140625, + "learning_rate": 0.0004822947514429017, + "loss": 3.7413, + "step": 3280 + }, + { + "epoch": 0.13814736842105263, + "grad_norm": 0.58203125, + "learning_rate": 0.0004822822764261678, + "loss": 2.7387, + "step": 3281 + }, + { + "epoch": 0.13818947368421053, + "grad_norm": 0.453125, + "learning_rate": 0.0004822697971775238, + "loss": 4.0238, + "step": 3282 + }, + { + "epoch": 0.13823157894736843, + "grad_norm": 0.439453125, + "learning_rate": 0.00048225731369719706, + "loss": 3.5222, + "step": 3283 + }, + { + "epoch": 0.13827368421052633, + "grad_norm": 0.423828125, + "learning_rate": 0.0004822448259854149, + "loss": 3.8242, + "step": 3284 + }, + { + "epoch": 0.1383157894736842, + "grad_norm": 0.47265625, + "learning_rate": 0.0004822323340424049, + "loss": 3.5235, + "step": 3285 + }, + { + "epoch": 0.1383578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.00048221983786839455, + "loss": 3.6023, + "step": 3286 + }, + { + "epoch": 0.1384, + "grad_norm": 0.435546875, + "learning_rate": 0.00048220733746361165, + "loss": 3.0167, + "step": 3287 + }, + { + "epoch": 0.1384421052631579, + "grad_norm": 0.462890625, + "learning_rate": 0.00048219483282828396, + "loss": 3.7563, + "step": 3288 + }, + { + "epoch": 0.1384842105263158, + "grad_norm": 0.50390625, + "learning_rate": 0.00048218232396263916, + "loss": 3.2722, + "step": 3289 + }, + { + "epoch": 0.1385263157894737, + "grad_norm": 0.400390625, + "learning_rate": 0.00048216981086690526, + "loss": 3.6911, + "step": 3290 + }, + { + "epoch": 0.1385684210526316, + "grad_norm": 0.546875, + "learning_rate": 0.0004821572935413102, + "loss": 3.5001, + "step": 3291 + }, + { + "epoch": 0.13861052631578946, + "grad_norm": 0.435546875, + "learning_rate": 0.000482144771986082, + "loss": 3.7663, + "step": 3292 + }, + { + "epoch": 0.13865263157894736, + "grad_norm": 0.416015625, + "learning_rate": 0.0004821322462014488, + "loss": 3.5049, + "step": 3293 + }, + { + "epoch": 0.13869473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.0004821197161876389, + "loss": 3.1934, + "step": 3294 + }, + { + "epoch": 0.13873684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.00048210718194488045, + "loss": 3.5232, + "step": 3295 + }, + { + "epoch": 0.13877894736842106, + "grad_norm": 0.435546875, + "learning_rate": 0.0004820946434734018, + "loss": 3.6145, + "step": 3296 + }, + { + "epoch": 0.13882105263157896, + "grad_norm": 0.4453125, + "learning_rate": 0.00048208210077343153, + "loss": 3.9118, + "step": 3297 + }, + { + "epoch": 0.13886315789473686, + "grad_norm": 0.490234375, + "learning_rate": 0.000482069553845198, + "loss": 3.6828, + "step": 3298 + }, + { + "epoch": 0.13890526315789473, + "grad_norm": 0.5390625, + "learning_rate": 0.00048205700268892995, + "loss": 3.4494, + "step": 3299 + }, + { + "epoch": 0.13894736842105262, + "grad_norm": 0.4296875, + "learning_rate": 0.00048204444730485586, + "loss": 3.5507, + "step": 3300 + }, + { + "epoch": 0.13898947368421052, + "grad_norm": 0.58203125, + "learning_rate": 0.00048203188769320465, + "loss": 3.5164, + "step": 3301 + }, + { + "epoch": 0.13903157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.0004820193238542051, + "loss": 3.5431, + "step": 3302 + }, + { + "epoch": 0.13907368421052632, + "grad_norm": 0.4140625, + "learning_rate": 0.0004820067557880861, + "loss": 3.6186, + "step": 3303 + }, + { + "epoch": 0.13911578947368422, + "grad_norm": 0.4609375, + "learning_rate": 0.0004819941834950765, + "loss": 3.3359, + "step": 3304 + }, + { + "epoch": 0.1391578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.0004819816069754055, + "loss": 3.5231, + "step": 3305 + }, + { + "epoch": 0.1392, + "grad_norm": 0.458984375, + "learning_rate": 0.00048196902622930216, + "loss": 2.6678, + "step": 3306 + }, + { + "epoch": 0.1392421052631579, + "grad_norm": 0.453125, + "learning_rate": 0.0004819564412569958, + "loss": 3.8929, + "step": 3307 + }, + { + "epoch": 0.1392842105263158, + "grad_norm": 0.4140625, + "learning_rate": 0.00048194385205871555, + "loss": 3.952, + "step": 3308 + }, + { + "epoch": 0.1393263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.00048193125863469087, + "loss": 3.4279, + "step": 3309 + }, + { + "epoch": 0.13936842105263159, + "grad_norm": 0.455078125, + "learning_rate": 0.0004819186609851511, + "loss": 3.6198, + "step": 3310 + }, + { + "epoch": 0.13941052631578948, + "grad_norm": 0.48046875, + "learning_rate": 0.00048190605911032594, + "loss": 3.4354, + "step": 3311 + }, + { + "epoch": 0.13945263157894736, + "grad_norm": 0.447265625, + "learning_rate": 0.00048189345301044475, + "loss": 3.2515, + "step": 3312 + }, + { + "epoch": 0.13949473684210525, + "grad_norm": 0.435546875, + "learning_rate": 0.0004818808426857374, + "loss": 3.341, + "step": 3313 + }, + { + "epoch": 0.13953684210526315, + "grad_norm": 0.455078125, + "learning_rate": 0.0004818682281364335, + "loss": 3.3847, + "step": 3314 + }, + { + "epoch": 0.13957894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.0004818556093627629, + "loss": 3.8515, + "step": 3315 + }, + { + "epoch": 0.13962105263157895, + "grad_norm": 0.44140625, + "learning_rate": 0.0004818429863649555, + "loss": 3.6405, + "step": 3316 + }, + { + "epoch": 0.13966315789473685, + "grad_norm": 0.42578125, + "learning_rate": 0.00048183035914324134, + "loss": 3.5672, + "step": 3317 + }, + { + "epoch": 0.13970526315789475, + "grad_norm": 0.4375, + "learning_rate": 0.00048181772769785044, + "loss": 3.0749, + "step": 3318 + }, + { + "epoch": 0.13974736842105262, + "grad_norm": 0.388671875, + "learning_rate": 0.0004818050920290129, + "loss": 3.253, + "step": 3319 + }, + { + "epoch": 0.13978947368421052, + "grad_norm": 0.423828125, + "learning_rate": 0.0004817924521369589, + "loss": 3.264, + "step": 3320 + }, + { + "epoch": 0.13983157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0004817798080219188, + "loss": 3.7743, + "step": 3321 + }, + { + "epoch": 0.13987368421052632, + "grad_norm": 0.62890625, + "learning_rate": 0.00048176715968412294, + "loss": 3.5263, + "step": 3322 + }, + { + "epoch": 0.13991578947368422, + "grad_norm": 1.09375, + "learning_rate": 0.00048175450712380176, + "loss": 3.5551, + "step": 3323 + }, + { + "epoch": 0.13995789473684211, + "grad_norm": 0.419921875, + "learning_rate": 0.0004817418503411857, + "loss": 3.5418, + "step": 3324 + }, + { + "epoch": 0.14, + "grad_norm": 0.396484375, + "learning_rate": 0.0004817291893365054, + "loss": 3.8286, + "step": 3325 + }, + { + "epoch": 0.14004210526315788, + "grad_norm": 0.3984375, + "learning_rate": 0.00048171652410999166, + "loss": 3.2367, + "step": 3326 + }, + { + "epoch": 0.14008421052631578, + "grad_norm": 0.466796875, + "learning_rate": 0.00048170385466187506, + "loss": 3.7586, + "step": 3327 + }, + { + "epoch": 0.14012631578947368, + "grad_norm": 0.443359375, + "learning_rate": 0.0004816911809923864, + "loss": 3.8166, + "step": 3328 + }, + { + "epoch": 0.14016842105263158, + "grad_norm": 0.4921875, + "learning_rate": 0.00048167850310175663, + "loss": 3.5673, + "step": 3329 + }, + { + "epoch": 0.14021052631578948, + "grad_norm": 0.390625, + "learning_rate": 0.0004816658209902168, + "loss": 3.3051, + "step": 3330 + }, + { + "epoch": 0.14025263157894738, + "grad_norm": 0.3984375, + "learning_rate": 0.000481653134657998, + "loss": 3.8526, + "step": 3331 + }, + { + "epoch": 0.14029473684210525, + "grad_norm": 0.875, + "learning_rate": 0.00048164044410533115, + "loss": 3.2402, + "step": 3332 + }, + { + "epoch": 0.14033684210526315, + "grad_norm": 0.5234375, + "learning_rate": 0.0004816277493324476, + "loss": 3.9009, + "step": 3333 + }, + { + "epoch": 0.14037894736842105, + "grad_norm": 0.4453125, + "learning_rate": 0.00048161505033957866, + "loss": 3.3513, + "step": 3334 + }, + { + "epoch": 0.14042105263157895, + "grad_norm": 0.451171875, + "learning_rate": 0.00048160234712695557, + "loss": 3.0041, + "step": 3335 + }, + { + "epoch": 0.14046315789473685, + "grad_norm": 0.4453125, + "learning_rate": 0.00048158963969480986, + "loss": 3.3732, + "step": 3336 + }, + { + "epoch": 0.14050526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.000481576928043373, + "loss": 3.0949, + "step": 3337 + }, + { + "epoch": 0.14054736842105264, + "grad_norm": 0.40625, + "learning_rate": 0.0004815642121728766, + "loss": 3.7713, + "step": 3338 + }, + { + "epoch": 0.14058947368421051, + "grad_norm": 0.435546875, + "learning_rate": 0.0004815514920835524, + "loss": 3.4124, + "step": 3339 + }, + { + "epoch": 0.1406315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.00048153876777563197, + "loss": 3.7876, + "step": 3340 + }, + { + "epoch": 0.1406736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00048152603924934734, + "loss": 3.3635, + "step": 3341 + }, + { + "epoch": 0.1407157894736842, + "grad_norm": 0.48828125, + "learning_rate": 0.0004815133065049303, + "loss": 3.5514, + "step": 3342 + }, + { + "epoch": 0.1407578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.0004815005695426128, + "loss": 3.1351, + "step": 3343 + }, + { + "epoch": 0.1408, + "grad_norm": 0.41015625, + "learning_rate": 0.00048148782836262687, + "loss": 3.4575, + "step": 3344 + }, + { + "epoch": 0.1408421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.0004814750829652048, + "loss": 3.3083, + "step": 3345 + }, + { + "epoch": 0.14088421052631578, + "grad_norm": 0.40234375, + "learning_rate": 0.0004814623333505786, + "loss": 3.274, + "step": 3346 + }, + { + "epoch": 0.14092631578947368, + "grad_norm": 0.400390625, + "learning_rate": 0.00048144957951898066, + "loss": 3.2509, + "step": 3347 + }, + { + "epoch": 0.14096842105263158, + "grad_norm": 0.5703125, + "learning_rate": 0.00048143682147064337, + "loss": 3.0702, + "step": 3348 + }, + { + "epoch": 0.14101052631578948, + "grad_norm": 0.51171875, + "learning_rate": 0.00048142405920579905, + "loss": 3.2826, + "step": 3349 + }, + { + "epoch": 0.14105263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.0004814112927246804, + "loss": 3.1565, + "step": 3350 + }, + { + "epoch": 0.14109473684210527, + "grad_norm": 0.400390625, + "learning_rate": 0.00048139852202751975, + "loss": 3.6337, + "step": 3351 + }, + { + "epoch": 0.14113684210526317, + "grad_norm": 0.427734375, + "learning_rate": 0.00048138574711455, + "loss": 3.4784, + "step": 3352 + }, + { + "epoch": 0.14117894736842104, + "grad_norm": 0.44140625, + "learning_rate": 0.00048137296798600384, + "loss": 3.7547, + "step": 3353 + }, + { + "epoch": 0.14122105263157894, + "grad_norm": 0.43359375, + "learning_rate": 0.00048136018464211395, + "loss": 3.458, + "step": 3354 + }, + { + "epoch": 0.14126315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.0004813473970831134, + "loss": 3.8346, + "step": 3355 + }, + { + "epoch": 0.14130526315789474, + "grad_norm": 0.439453125, + "learning_rate": 0.00048133460530923506, + "loss": 3.2314, + "step": 3356 + }, + { + "epoch": 0.14134736842105264, + "grad_norm": 0.41015625, + "learning_rate": 0.000481321809320712, + "loss": 3.7825, + "step": 3357 + }, + { + "epoch": 0.14138947368421054, + "grad_norm": 0.412109375, + "learning_rate": 0.0004813090091177774, + "loss": 3.6057, + "step": 3358 + }, + { + "epoch": 0.1414315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.0004812962047006645, + "loss": 3.528, + "step": 3359 + }, + { + "epoch": 0.1414736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00048128339606960647, + "loss": 3.3825, + "step": 3360 + }, + { + "epoch": 0.1415157894736842, + "grad_norm": 0.451171875, + "learning_rate": 0.00048127058322483665, + "loss": 3.257, + "step": 3361 + }, + { + "epoch": 0.1415578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.0004812577661665886, + "loss": 3.5798, + "step": 3362 + }, + { + "epoch": 0.1416, + "grad_norm": 0.421875, + "learning_rate": 0.00048124494489509574, + "loss": 3.4369, + "step": 3363 + }, + { + "epoch": 0.1416421052631579, + "grad_norm": 0.462890625, + "learning_rate": 0.00048123211941059166, + "loss": 3.6946, + "step": 3364 + }, + { + "epoch": 0.1416842105263158, + "grad_norm": 0.400390625, + "learning_rate": 0.0004812192897133101, + "loss": 3.1618, + "step": 3365 + }, + { + "epoch": 0.14172631578947367, + "grad_norm": 0.4375, + "learning_rate": 0.0004812064558034847, + "loss": 3.3881, + "step": 3366 + }, + { + "epoch": 0.14176842105263157, + "grad_norm": 0.419921875, + "learning_rate": 0.0004811936176813494, + "loss": 3.5791, + "step": 3367 + }, + { + "epoch": 0.14181052631578947, + "grad_norm": 0.404296875, + "learning_rate": 0.000481180775347138, + "loss": 3.5384, + "step": 3368 + }, + { + "epoch": 0.14185263157894737, + "grad_norm": 0.388671875, + "learning_rate": 0.00048116792880108445, + "loss": 3.3532, + "step": 3369 + }, + { + "epoch": 0.14189473684210527, + "grad_norm": 0.45703125, + "learning_rate": 0.0004811550780434229, + "loss": 3.7962, + "step": 3370 + }, + { + "epoch": 0.14193684210526317, + "grad_norm": 0.435546875, + "learning_rate": 0.00048114222307438735, + "loss": 4.0457, + "step": 3371 + }, + { + "epoch": 0.14197894736842107, + "grad_norm": 0.427734375, + "learning_rate": 0.0004811293638942122, + "loss": 3.8407, + "step": 3372 + }, + { + "epoch": 0.14202105263157894, + "grad_norm": 0.42578125, + "learning_rate": 0.0004811165005031315, + "loss": 3.4283, + "step": 3373 + }, + { + "epoch": 0.14206315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.00048110363290137983, + "loss": 3.4009, + "step": 3374 + }, + { + "epoch": 0.14210526315789473, + "grad_norm": 0.44921875, + "learning_rate": 0.0004810907610891914, + "loss": 3.3084, + "step": 3375 + }, + { + "epoch": 0.14214736842105263, + "grad_norm": 0.46484375, + "learning_rate": 0.00048107788506680083, + "loss": 3.3244, + "step": 3376 + }, + { + "epoch": 0.14218947368421053, + "grad_norm": 0.40234375, + "learning_rate": 0.0004810650048344427, + "loss": 3.7103, + "step": 3377 + }, + { + "epoch": 0.14223157894736843, + "grad_norm": 0.419921875, + "learning_rate": 0.00048105212039235173, + "loss": 3.5199, + "step": 3378 + }, + { + "epoch": 0.1422736842105263, + "grad_norm": 0.4453125, + "learning_rate": 0.0004810392317407626, + "loss": 3.365, + "step": 3379 + }, + { + "epoch": 0.1423157894736842, + "grad_norm": 0.72265625, + "learning_rate": 0.0004810263388799101, + "loss": 3.1618, + "step": 3380 + }, + { + "epoch": 0.1423578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0004810134418100291, + "loss": 3.895, + "step": 3381 + }, + { + "epoch": 0.1424, + "grad_norm": 0.435546875, + "learning_rate": 0.0004810005405313547, + "loss": 3.4511, + "step": 3382 + }, + { + "epoch": 0.1424421052631579, + "grad_norm": 0.4765625, + "learning_rate": 0.0004809876350441218, + "loss": 2.8591, + "step": 3383 + }, + { + "epoch": 0.1424842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.0004809747253485656, + "loss": 3.61, + "step": 3384 + }, + { + "epoch": 0.1425263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00048096181144492137, + "loss": 3.213, + "step": 3385 + }, + { + "epoch": 0.14256842105263157, + "grad_norm": 0.3984375, + "learning_rate": 0.0004809488933334242, + "loss": 3.751, + "step": 3386 + }, + { + "epoch": 0.14261052631578947, + "grad_norm": 0.392578125, + "learning_rate": 0.0004809359710143096, + "loss": 3.4314, + "step": 3387 + }, + { + "epoch": 0.14265263157894736, + "grad_norm": 0.455078125, + "learning_rate": 0.0004809230444878129, + "loss": 3.7577, + "step": 3388 + }, + { + "epoch": 0.14269473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.0004809101137541697, + "loss": 3.6778, + "step": 3389 + }, + { + "epoch": 0.14273684210526316, + "grad_norm": 0.4609375, + "learning_rate": 0.0004808971788136155, + "loss": 2.7928, + "step": 3390 + }, + { + "epoch": 0.14277894736842106, + "grad_norm": 0.439453125, + "learning_rate": 0.0004808842396663861, + "loss": 3.4381, + "step": 3391 + }, + { + "epoch": 0.14282105263157896, + "grad_norm": 0.39453125, + "learning_rate": 0.000480871296312717, + "loss": 3.5115, + "step": 3392 + }, + { + "epoch": 0.14286315789473683, + "grad_norm": 0.4140625, + "learning_rate": 0.00048085834875284417, + "loss": 3.7029, + "step": 3393 + }, + { + "epoch": 0.14290526315789473, + "grad_norm": 0.466796875, + "learning_rate": 0.0004808453969870035, + "loss": 3.1848, + "step": 3394 + }, + { + "epoch": 0.14294736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.00048083244101543093, + "loss": 3.6566, + "step": 3395 + }, + { + "epoch": 0.14298947368421053, + "grad_norm": 0.447265625, + "learning_rate": 0.0004808194808383625, + "loss": 3.3442, + "step": 3396 + }, + { + "epoch": 0.14303157894736843, + "grad_norm": 0.404296875, + "learning_rate": 0.00048080651645603437, + "loss": 3.4952, + "step": 3397 + }, + { + "epoch": 0.14307368421052633, + "grad_norm": 0.421875, + "learning_rate": 0.00048079354786868263, + "loss": 3.5902, + "step": 3398 + }, + { + "epoch": 0.14311578947368422, + "grad_norm": 0.447265625, + "learning_rate": 0.0004807805750765436, + "loss": 3.2667, + "step": 3399 + }, + { + "epoch": 0.1431578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.0004807675980798537, + "loss": 3.2706, + "step": 3400 + }, + { + "epoch": 0.1432, + "grad_norm": 0.431640625, + "learning_rate": 0.00048075461687884934, + "loss": 3.3424, + "step": 3401 + }, + { + "epoch": 0.1432421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.0004807416314737669, + "loss": 3.6431, + "step": 3402 + }, + { + "epoch": 0.1432842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.0004807286418648431, + "loss": 3.3001, + "step": 3403 + }, + { + "epoch": 0.1433263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00048071564805231455, + "loss": 3.4262, + "step": 3404 + }, + { + "epoch": 0.1433684210526316, + "grad_norm": 0.53515625, + "learning_rate": 0.00048070265003641793, + "loss": 3.517, + "step": 3405 + }, + { + "epoch": 0.14341052631578946, + "grad_norm": 0.4296875, + "learning_rate": 0.0004806896478173901, + "loss": 3.4288, + "step": 3406 + }, + { + "epoch": 0.14345263157894736, + "grad_norm": 0.412109375, + "learning_rate": 0.00048067664139546796, + "loss": 3.4006, + "step": 3407 + }, + { + "epoch": 0.14349473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.0004806636307708884, + "loss": 3.5285, + "step": 3408 + }, + { + "epoch": 0.14353684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.0004806506159438886, + "loss": 3.0659, + "step": 3409 + }, + { + "epoch": 0.14357894736842106, + "grad_norm": 0.453125, + "learning_rate": 0.0004806375969147055, + "loss": 3.4186, + "step": 3410 + }, + { + "epoch": 0.14362105263157895, + "grad_norm": 0.396484375, + "learning_rate": 0.00048062457368357636, + "loss": 3.5544, + "step": 3411 + }, + { + "epoch": 0.14366315789473685, + "grad_norm": 0.5546875, + "learning_rate": 0.00048061154625073853, + "loss": 3.2955, + "step": 3412 + }, + { + "epoch": 0.14370526315789472, + "grad_norm": 0.45703125, + "learning_rate": 0.0004805985146164292, + "loss": 3.7367, + "step": 3413 + }, + { + "epoch": 0.14374736842105262, + "grad_norm": 0.431640625, + "learning_rate": 0.000480585478780886, + "loss": 3.4163, + "step": 3414 + }, + { + "epoch": 0.14378947368421052, + "grad_norm": 0.423828125, + "learning_rate": 0.0004805724387443462, + "loss": 3.2796, + "step": 3415 + }, + { + "epoch": 0.14383157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0004805593945070475, + "loss": 3.2515, + "step": 3416 + }, + { + "epoch": 0.14387368421052632, + "grad_norm": 0.40625, + "learning_rate": 0.0004805463460692275, + "loss": 3.6293, + "step": 3417 + }, + { + "epoch": 0.14391578947368422, + "grad_norm": 0.435546875, + "learning_rate": 0.000480533293431124, + "loss": 3.5968, + "step": 3418 + }, + { + "epoch": 0.14395789473684212, + "grad_norm": 0.455078125, + "learning_rate": 0.0004805202365929747, + "loss": 3.0503, + "step": 3419 + }, + { + "epoch": 0.144, + "grad_norm": 0.44140625, + "learning_rate": 0.00048050717555501765, + "loss": 3.338, + "step": 3420 + }, + { + "epoch": 0.1440421052631579, + "grad_norm": 0.455078125, + "learning_rate": 0.00048049411031749064, + "loss": 3.5539, + "step": 3421 + }, + { + "epoch": 0.1440842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.0004804810408806317, + "loss": 3.2507, + "step": 3422 + }, + { + "epoch": 0.14412631578947369, + "grad_norm": 0.50390625, + "learning_rate": 0.0004804679672446791, + "loss": 3.4739, + "step": 3423 + }, + { + "epoch": 0.14416842105263158, + "grad_norm": 0.451171875, + "learning_rate": 0.00048045488940987084, + "loss": 3.807, + "step": 3424 + }, + { + "epoch": 0.14421052631578948, + "grad_norm": 0.423828125, + "learning_rate": 0.00048044180737644534, + "loss": 3.4709, + "step": 3425 + }, + { + "epoch": 0.14425263157894735, + "grad_norm": 0.46875, + "learning_rate": 0.00048042872114464083, + "loss": 3.5264, + "step": 3426 + }, + { + "epoch": 0.14429473684210525, + "grad_norm": 0.46484375, + "learning_rate": 0.00048041563071469574, + "loss": 3.4984, + "step": 3427 + }, + { + "epoch": 0.14433684210526315, + "grad_norm": 0.55859375, + "learning_rate": 0.00048040253608684863, + "loss": 3.1885, + "step": 3428 + }, + { + "epoch": 0.14437894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.000480389437261338, + "loss": 3.532, + "step": 3429 + }, + { + "epoch": 0.14442105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.00048037633423840255, + "loss": 3.5646, + "step": 3430 + }, + { + "epoch": 0.14446315789473685, + "grad_norm": 0.43359375, + "learning_rate": 0.00048036322701828095, + "loss": 3.235, + "step": 3431 + }, + { + "epoch": 0.14450526315789475, + "grad_norm": 0.466796875, + "learning_rate": 0.00048035011560121203, + "loss": 3.3639, + "step": 3432 + }, + { + "epoch": 0.14454736842105262, + "grad_norm": 0.396484375, + "learning_rate": 0.00048033699998743464, + "loss": 3.6266, + "step": 3433 + }, + { + "epoch": 0.14458947368421052, + "grad_norm": 0.515625, + "learning_rate": 0.0004803238801771877, + "loss": 3.1089, + "step": 3434 + }, + { + "epoch": 0.14463157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0004803107561707103, + "loss": 3.3758, + "step": 3435 + }, + { + "epoch": 0.14467368421052632, + "grad_norm": 0.40234375, + "learning_rate": 0.0004802976279682416, + "loss": 3.4435, + "step": 3436 + }, + { + "epoch": 0.14471578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0004802844955700206, + "loss": 3.4905, + "step": 3437 + }, + { + "epoch": 0.1447578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0004802713589762868, + "loss": 3.4209, + "step": 3438 + }, + { + "epoch": 0.1448, + "grad_norm": 0.421875, + "learning_rate": 0.00048025821818727934, + "loss": 3.3824, + "step": 3439 + }, + { + "epoch": 0.14484210526315788, + "grad_norm": 0.427734375, + "learning_rate": 0.0004802450732032376, + "loss": 4.0705, + "step": 3440 + }, + { + "epoch": 0.14488421052631578, + "grad_norm": 0.43359375, + "learning_rate": 0.00048023192402440123, + "loss": 3.7318, + "step": 3441 + }, + { + "epoch": 0.14492631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.0004802187706510097, + "loss": 3.4073, + "step": 3442 + }, + { + "epoch": 0.14496842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.0004802056130833027, + "loss": 3.1488, + "step": 3443 + }, + { + "epoch": 0.14501052631578948, + "grad_norm": 0.4140625, + "learning_rate": 0.00048019245132151987, + "loss": 3.2328, + "step": 3444 + }, + { + "epoch": 0.14505263157894738, + "grad_norm": 0.69921875, + "learning_rate": 0.00048017928536590106, + "loss": 3.7851, + "step": 3445 + }, + { + "epoch": 0.14509473684210528, + "grad_norm": 0.419921875, + "learning_rate": 0.0004801661152166861, + "loss": 3.3842, + "step": 3446 + }, + { + "epoch": 0.14513684210526315, + "grad_norm": 0.443359375, + "learning_rate": 0.00048015294087411497, + "loss": 3.3484, + "step": 3447 + }, + { + "epoch": 0.14517894736842105, + "grad_norm": 0.46484375, + "learning_rate": 0.0004801397623384277, + "loss": 3.6755, + "step": 3448 + }, + { + "epoch": 0.14522105263157895, + "grad_norm": 0.46875, + "learning_rate": 0.0004801265796098643, + "loss": 3.3168, + "step": 3449 + }, + { + "epoch": 0.14526315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.00048011339268866506, + "loss": 3.2477, + "step": 3450 + }, + { + "epoch": 0.14530526315789474, + "grad_norm": 0.455078125, + "learning_rate": 0.0004801002015750702, + "loss": 2.9345, + "step": 3451 + }, + { + "epoch": 0.14534736842105264, + "grad_norm": 0.42578125, + "learning_rate": 0.00048008700626931994, + "loss": 3.5804, + "step": 3452 + }, + { + "epoch": 0.1453894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.0004800738067716548, + "loss": 3.8735, + "step": 3453 + }, + { + "epoch": 0.1454315789473684, + "grad_norm": 0.470703125, + "learning_rate": 0.00048006060308231524, + "loss": 3.658, + "step": 3454 + }, + { + "epoch": 0.1454736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00048004739520154184, + "loss": 3.5298, + "step": 3455 + }, + { + "epoch": 0.1455157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00048003418312957515, + "loss": 2.9864, + "step": 3456 + }, + { + "epoch": 0.1455578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00048002096686665597, + "loss": 3.5737, + "step": 3457 + }, + { + "epoch": 0.1456, + "grad_norm": 0.43359375, + "learning_rate": 0.00048000774641302493, + "loss": 3.3253, + "step": 3458 + }, + { + "epoch": 0.1456421052631579, + "grad_norm": 0.392578125, + "learning_rate": 0.0004799945217689231, + "loss": 3.3111, + "step": 3459 + }, + { + "epoch": 0.14568421052631578, + "grad_norm": 0.4609375, + "learning_rate": 0.00047998129293459123, + "loss": 3.4269, + "step": 3460 + }, + { + "epoch": 0.14572631578947368, + "grad_norm": 0.43359375, + "learning_rate": 0.0004799680599102705, + "loss": 3.7297, + "step": 3461 + }, + { + "epoch": 0.14576842105263157, + "grad_norm": 0.400390625, + "learning_rate": 0.0004799548226962019, + "loss": 3.2455, + "step": 3462 + }, + { + "epoch": 0.14581052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0004799415812926266, + "loss": 3.1751, + "step": 3463 + }, + { + "epoch": 0.14585263157894737, + "grad_norm": 0.38671875, + "learning_rate": 0.0004799283356997859, + "loss": 3.5139, + "step": 3464 + }, + { + "epoch": 0.14589473684210527, + "grad_norm": 0.42578125, + "learning_rate": 0.000479915085917921, + "loss": 3.6596, + "step": 3465 + }, + { + "epoch": 0.14593684210526317, + "grad_norm": 0.41796875, + "learning_rate": 0.0004799018319472734, + "loss": 3.5211, + "step": 3466 + }, + { + "epoch": 0.14597894736842104, + "grad_norm": 0.4375, + "learning_rate": 0.0004798885737880846, + "loss": 4.0496, + "step": 3467 + }, + { + "epoch": 0.14602105263157894, + "grad_norm": 0.4375, + "learning_rate": 0.000479875311440596, + "loss": 3.496, + "step": 3468 + }, + { + "epoch": 0.14606315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00047986204490504936, + "loss": 3.4902, + "step": 3469 + }, + { + "epoch": 0.14610526315789474, + "grad_norm": 0.49609375, + "learning_rate": 0.0004798487741816864, + "loss": 3.5114, + "step": 3470 + }, + { + "epoch": 0.14614736842105264, + "grad_norm": 0.453125, + "learning_rate": 0.0004798354992707488, + "loss": 3.246, + "step": 3471 + }, + { + "epoch": 0.14618947368421054, + "grad_norm": 0.40625, + "learning_rate": 0.0004798222201724785, + "loss": 3.6993, + "step": 3472 + }, + { + "epoch": 0.14623157894736843, + "grad_norm": 0.431640625, + "learning_rate": 0.00047980893688711723, + "loss": 3.5203, + "step": 3473 + }, + { + "epoch": 0.1462736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.00047979564941490725, + "loss": 3.2323, + "step": 3474 + }, + { + "epoch": 0.1463157894736842, + "grad_norm": 0.458984375, + "learning_rate": 0.0004797823577560905, + "loss": 3.3565, + "step": 3475 + }, + { + "epoch": 0.1463578947368421, + "grad_norm": 0.52734375, + "learning_rate": 0.0004797690619109092, + "loss": 3.4393, + "step": 3476 + }, + { + "epoch": 0.1464, + "grad_norm": 0.47265625, + "learning_rate": 0.0004797557618796056, + "loss": 3.1628, + "step": 3477 + }, + { + "epoch": 0.1464421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00047974245766242186, + "loss": 3.8626, + "step": 3478 + }, + { + "epoch": 0.1464842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00047972914925960053, + "loss": 3.2123, + "step": 3479 + }, + { + "epoch": 0.14652631578947367, + "grad_norm": 0.40234375, + "learning_rate": 0.00047971583667138405, + "loss": 3.5529, + "step": 3480 + }, + { + "epoch": 0.14656842105263157, + "grad_norm": 0.439453125, + "learning_rate": 0.00047970251989801487, + "loss": 3.9338, + "step": 3481 + }, + { + "epoch": 0.14661052631578947, + "grad_norm": 0.46875, + "learning_rate": 0.0004796891989397357, + "loss": 3.3719, + "step": 3482 + }, + { + "epoch": 0.14665263157894737, + "grad_norm": 0.478515625, + "learning_rate": 0.00047967587379678926, + "loss": 3.3891, + "step": 3483 + }, + { + "epoch": 0.14669473684210527, + "grad_norm": 0.435546875, + "learning_rate": 0.0004796625444694182, + "loss": 3.6581, + "step": 3484 + }, + { + "epoch": 0.14673684210526317, + "grad_norm": 0.453125, + "learning_rate": 0.0004796492109578654, + "loss": 3.1821, + "step": 3485 + }, + { + "epoch": 0.14677894736842106, + "grad_norm": 0.462890625, + "learning_rate": 0.0004796358732623738, + "loss": 3.3633, + "step": 3486 + }, + { + "epoch": 0.14682105263157894, + "grad_norm": 0.40625, + "learning_rate": 0.00047962253138318645, + "loss": 3.3571, + "step": 3487 + }, + { + "epoch": 0.14686315789473683, + "grad_norm": 0.4765625, + "learning_rate": 0.0004796091853205463, + "loss": 3.294, + "step": 3488 + }, + { + "epoch": 0.14690526315789473, + "grad_norm": 0.423828125, + "learning_rate": 0.00047959583507469665, + "loss": 3.5613, + "step": 3489 + }, + { + "epoch": 0.14694736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.00047958248064588067, + "loss": 3.4197, + "step": 3490 + }, + { + "epoch": 0.14698947368421053, + "grad_norm": 0.40625, + "learning_rate": 0.00047956912203434156, + "loss": 3.3872, + "step": 3491 + }, + { + "epoch": 0.14703157894736843, + "grad_norm": 0.458984375, + "learning_rate": 0.0004795557592403228, + "loss": 3.6493, + "step": 3492 + }, + { + "epoch": 0.14707368421052633, + "grad_norm": 0.400390625, + "learning_rate": 0.00047954239226406786, + "loss": 3.1529, + "step": 3493 + }, + { + "epoch": 0.1471157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0004795290211058202, + "loss": 3.3958, + "step": 3494 + }, + { + "epoch": 0.1471578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00047951564576582354, + "loss": 3.5002, + "step": 3495 + }, + { + "epoch": 0.1472, + "grad_norm": 0.451171875, + "learning_rate": 0.0004795022662443214, + "loss": 3.2861, + "step": 3496 + }, + { + "epoch": 0.1472421052631579, + "grad_norm": 0.484375, + "learning_rate": 0.00047948888254155766, + "loss": 3.1008, + "step": 3497 + }, + { + "epoch": 0.1472842105263158, + "grad_norm": 0.578125, + "learning_rate": 0.00047947549465777604, + "loss": 3.5641, + "step": 3498 + }, + { + "epoch": 0.1473263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.0004794621025932206, + "loss": 3.5222, + "step": 3499 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 0.37890625, + "learning_rate": 0.00047944870634813525, + "loss": 3.2415, + "step": 3500 + }, + { + "epoch": 0.14741052631578946, + "grad_norm": 0.42578125, + "learning_rate": 0.00047943530592276407, + "loss": 3.731, + "step": 3501 + }, + { + "epoch": 0.14745263157894736, + "grad_norm": 0.41796875, + "learning_rate": 0.0004794219013173512, + "loss": 3.7334, + "step": 3502 + }, + { + "epoch": 0.14749473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.0004794084925321408, + "loss": 3.4148, + "step": 3503 + }, + { + "epoch": 0.14753684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00047939507956737724, + "loss": 3.5669, + "step": 3504 + }, + { + "epoch": 0.14757894736842106, + "grad_norm": 0.458984375, + "learning_rate": 0.0004793816624233048, + "loss": 3.0733, + "step": 3505 + }, + { + "epoch": 0.14762105263157896, + "grad_norm": 0.41015625, + "learning_rate": 0.0004793682411001681, + "loss": 3.6561, + "step": 3506 + }, + { + "epoch": 0.14766315789473683, + "grad_norm": 0.43359375, + "learning_rate": 0.0004793548155982115, + "loss": 3.5954, + "step": 3507 + }, + { + "epoch": 0.14770526315789473, + "grad_norm": 0.416015625, + "learning_rate": 0.00047934138591767964, + "loss": 3.4994, + "step": 3508 + }, + { + "epoch": 0.14774736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.0004793279520588171, + "loss": 3.0289, + "step": 3509 + }, + { + "epoch": 0.14778947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.0004793145140218688, + "loss": 3.6115, + "step": 3510 + }, + { + "epoch": 0.14783157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0004793010718070795, + "loss": 3.8799, + "step": 3511 + }, + { + "epoch": 0.14787368421052632, + "grad_norm": 0.44921875, + "learning_rate": 0.00047928762541469405, + "loss": 3.542, + "step": 3512 + }, + { + "epoch": 0.14791578947368422, + "grad_norm": 0.421875, + "learning_rate": 0.0004792741748449575, + "loss": 3.0782, + "step": 3513 + }, + { + "epoch": 0.1479578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.0004792607200981149, + "loss": 3.4163, + "step": 3514 + }, + { + "epoch": 0.148, + "grad_norm": 0.40625, + "learning_rate": 0.0004792472611744113, + "loss": 3.6587, + "step": 3515 + }, + { + "epoch": 0.1480421052631579, + "grad_norm": 0.490234375, + "learning_rate": 0.00047923379807409197, + "loss": 3.3142, + "step": 3516 + }, + { + "epoch": 0.1480842105263158, + "grad_norm": 0.65625, + "learning_rate": 0.00047922033079740215, + "loss": 3.4841, + "step": 3517 + }, + { + "epoch": 0.1481263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.0004792068593445873, + "loss": 3.3279, + "step": 3518 + }, + { + "epoch": 0.1481684210526316, + "grad_norm": 0.470703125, + "learning_rate": 0.0004791933837158927, + "loss": 3.4343, + "step": 3519 + }, + { + "epoch": 0.1482105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.00047917990391156397, + "loss": 3.4753, + "step": 3520 + }, + { + "epoch": 0.14825263157894736, + "grad_norm": 0.65234375, + "learning_rate": 0.0004791664199318467, + "loss": 3.5413, + "step": 3521 + }, + { + "epoch": 0.14829473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.00047915293177698646, + "loss": 3.775, + "step": 3522 + }, + { + "epoch": 0.14833684210526316, + "grad_norm": 0.58203125, + "learning_rate": 0.00047913943944722905, + "loss": 3.553, + "step": 3523 + }, + { + "epoch": 0.14837894736842105, + "grad_norm": 0.65234375, + "learning_rate": 0.0004791259429428203, + "loss": 3.7543, + "step": 3524 + }, + { + "epoch": 0.14842105263157895, + "grad_norm": 0.5078125, + "learning_rate": 0.0004791124422640061, + "loss": 4.0371, + "step": 3525 + }, + { + "epoch": 0.14846315789473685, + "grad_norm": 0.40234375, + "learning_rate": 0.0004790989374110324, + "loss": 2.9852, + "step": 3526 + }, + { + "epoch": 0.14850526315789472, + "grad_norm": 0.49609375, + "learning_rate": 0.0004790854283841452, + "loss": 2.8469, + "step": 3527 + }, + { + "epoch": 0.14854736842105262, + "grad_norm": 0.44921875, + "learning_rate": 0.0004790719151835907, + "loss": 3.8494, + "step": 3528 + }, + { + "epoch": 0.14858947368421052, + "grad_norm": 0.44921875, + "learning_rate": 0.00047905839780961504, + "loss": 3.2385, + "step": 3529 + }, + { + "epoch": 0.14863157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00047904487626246453, + "loss": 3.5745, + "step": 3530 + }, + { + "epoch": 0.14867368421052632, + "grad_norm": 0.451171875, + "learning_rate": 0.00047903135054238547, + "loss": 3.3895, + "step": 3531 + }, + { + "epoch": 0.14871578947368422, + "grad_norm": 0.419921875, + "learning_rate": 0.00047901782064962425, + "loss": 3.3836, + "step": 3532 + }, + { + "epoch": 0.14875789473684212, + "grad_norm": 0.421875, + "learning_rate": 0.00047900428658442746, + "loss": 3.6977, + "step": 3533 + }, + { + "epoch": 0.1488, + "grad_norm": 0.458984375, + "learning_rate": 0.00047899074834704166, + "loss": 3.0294, + "step": 3534 + }, + { + "epoch": 0.1488421052631579, + "grad_norm": 0.484375, + "learning_rate": 0.00047897720593771345, + "loss": 3.279, + "step": 3535 + }, + { + "epoch": 0.14888421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00047896365935668964, + "loss": 3.8636, + "step": 3536 + }, + { + "epoch": 0.14892631578947368, + "grad_norm": 0.416015625, + "learning_rate": 0.0004789501086042169, + "loss": 3.4007, + "step": 3537 + }, + { + "epoch": 0.14896842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00047893655368054223, + "loss": 3.2157, + "step": 3538 + }, + { + "epoch": 0.14901052631578948, + "grad_norm": 0.4609375, + "learning_rate": 0.0004789229945859125, + "loss": 3.2495, + "step": 3539 + }, + { + "epoch": 0.14905263157894738, + "grad_norm": 0.431640625, + "learning_rate": 0.00047890943132057487, + "loss": 3.777, + "step": 3540 + }, + { + "epoch": 0.14909473684210525, + "grad_norm": 0.443359375, + "learning_rate": 0.00047889586388477623, + "loss": 3.8531, + "step": 3541 + }, + { + "epoch": 0.14913684210526315, + "grad_norm": 0.42578125, + "learning_rate": 0.000478882292278764, + "loss": 3.6161, + "step": 3542 + }, + { + "epoch": 0.14917894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.0004788687165027853, + "loss": 3.3703, + "step": 3543 + }, + { + "epoch": 0.14922105263157895, + "grad_norm": 0.4765625, + "learning_rate": 0.0004788551365570875, + "loss": 3.7496, + "step": 3544 + }, + { + "epoch": 0.14926315789473685, + "grad_norm": 0.470703125, + "learning_rate": 0.00047884155244191804, + "loss": 3.7035, + "step": 3545 + }, + { + "epoch": 0.14930526315789475, + "grad_norm": 0.416015625, + "learning_rate": 0.0004788279641575243, + "loss": 3.6786, + "step": 3546 + }, + { + "epoch": 0.14934736842105265, + "grad_norm": 0.439453125, + "learning_rate": 0.00047881437170415393, + "loss": 3.7735, + "step": 3547 + }, + { + "epoch": 0.14938947368421052, + "grad_norm": 0.41796875, + "learning_rate": 0.00047880077508205457, + "loss": 3.521, + "step": 3548 + }, + { + "epoch": 0.14943157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0004787871742914739, + "loss": 3.4225, + "step": 3549 + }, + { + "epoch": 0.14947368421052631, + "grad_norm": 0.431640625, + "learning_rate": 0.00047877356933265973, + "loss": 3.6627, + "step": 3550 + }, + { + "epoch": 0.1495157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00047875996020586, + "loss": 3.2554, + "step": 3551 + }, + { + "epoch": 0.1495578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.0004787463469113225, + "loss": 3.639, + "step": 3552 + }, + { + "epoch": 0.1496, + "grad_norm": 0.412109375, + "learning_rate": 0.00047873272944929534, + "loss": 3.5686, + "step": 3553 + }, + { + "epoch": 0.14964210526315788, + "grad_norm": 0.419921875, + "learning_rate": 0.00047871910782002657, + "loss": 3.3285, + "step": 3554 + }, + { + "epoch": 0.14968421052631578, + "grad_norm": 0.443359375, + "learning_rate": 0.0004787054820237644, + "loss": 3.3723, + "step": 3555 + }, + { + "epoch": 0.14972631578947368, + "grad_norm": 0.400390625, + "learning_rate": 0.0004786918520607572, + "loss": 3.0776, + "step": 3556 + }, + { + "epoch": 0.14976842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.00047867821793125297, + "loss": 3.2772, + "step": 3557 + }, + { + "epoch": 0.14981052631578948, + "grad_norm": 0.435546875, + "learning_rate": 0.00047866457963550036, + "loss": 3.5471, + "step": 3558 + }, + { + "epoch": 0.14985263157894738, + "grad_norm": 0.408203125, + "learning_rate": 0.00047865093717374774, + "loss": 3.4563, + "step": 3559 + }, + { + "epoch": 0.14989473684210528, + "grad_norm": 0.400390625, + "learning_rate": 0.0004786372905462438, + "loss": 3.5108, + "step": 3560 + }, + { + "epoch": 0.14993684210526315, + "grad_norm": 0.421875, + "learning_rate": 0.00047862363975323695, + "loss": 3.7157, + "step": 3561 + }, + { + "epoch": 0.14997894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.0004786099847949761, + "loss": 3.3045, + "step": 3562 + }, + { + "epoch": 0.15002105263157894, + "grad_norm": 0.48828125, + "learning_rate": 0.0004785963256717099, + "loss": 3.3637, + "step": 3563 + }, + { + "epoch": 0.15006315789473684, + "grad_norm": 0.52734375, + "learning_rate": 0.00047858266238368725, + "loss": 2.8851, + "step": 3564 + }, + { + "epoch": 0.15010526315789474, + "grad_norm": 0.455078125, + "learning_rate": 0.000478568994931157, + "loss": 3.5491, + "step": 3565 + }, + { + "epoch": 0.15014736842105264, + "grad_norm": 0.45703125, + "learning_rate": 0.00047855532331436836, + "loss": 3.09, + "step": 3566 + }, + { + "epoch": 0.15018947368421054, + "grad_norm": 0.4453125, + "learning_rate": 0.0004785416475335701, + "loss": 3.1478, + "step": 3567 + }, + { + "epoch": 0.1502315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.00047852796758901165, + "loss": 3.9951, + "step": 3568 + }, + { + "epoch": 0.1502736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.0004785142834809421, + "loss": 2.8634, + "step": 3569 + }, + { + "epoch": 0.1503157894736842, + "grad_norm": 0.39453125, + "learning_rate": 0.00047850059520961085, + "loss": 3.5533, + "step": 3570 + }, + { + "epoch": 0.1503578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.00047848690277526717, + "loss": 3.6855, + "step": 3571 + }, + { + "epoch": 0.1504, + "grad_norm": 0.4296875, + "learning_rate": 0.0004784732061781606, + "loss": 3.2511, + "step": 3572 + }, + { + "epoch": 0.1504421052631579, + "grad_norm": 0.474609375, + "learning_rate": 0.00047845950541854066, + "loss": 3.2379, + "step": 3573 + }, + { + "epoch": 0.15048421052631578, + "grad_norm": 0.43359375, + "learning_rate": 0.000478445800496657, + "loss": 3.2198, + "step": 3574 + }, + { + "epoch": 0.15052631578947367, + "grad_norm": 0.47265625, + "learning_rate": 0.00047843209141275924, + "loss": 3.1124, + "step": 3575 + }, + { + "epoch": 0.15056842105263157, + "grad_norm": 0.427734375, + "learning_rate": 0.0004784183781670972, + "loss": 3.7152, + "step": 3576 + }, + { + "epoch": 0.15061052631578947, + "grad_norm": 0.478515625, + "learning_rate": 0.0004784046607599206, + "loss": 3.2085, + "step": 3577 + }, + { + "epoch": 0.15065263157894737, + "grad_norm": 0.51171875, + "learning_rate": 0.0004783909391914795, + "loss": 3.5067, + "step": 3578 + }, + { + "epoch": 0.15069473684210527, + "grad_norm": 0.51953125, + "learning_rate": 0.0004783772134620238, + "loss": 3.93, + "step": 3579 + }, + { + "epoch": 0.15073684210526317, + "grad_norm": 0.392578125, + "learning_rate": 0.0004783634835718037, + "loss": 3.4735, + "step": 3580 + }, + { + "epoch": 0.15077894736842104, + "grad_norm": 0.41796875, + "learning_rate": 0.00047834974952106916, + "loss": 3.7857, + "step": 3581 + }, + { + "epoch": 0.15082105263157894, + "grad_norm": 0.412109375, + "learning_rate": 0.00047833601131007055, + "loss": 3.4935, + "step": 3582 + }, + { + "epoch": 0.15086315789473684, + "grad_norm": 0.58203125, + "learning_rate": 0.0004783222689390581, + "loss": 3.5528, + "step": 3583 + }, + { + "epoch": 0.15090526315789474, + "grad_norm": 0.458984375, + "learning_rate": 0.00047830852240828204, + "loss": 3.3636, + "step": 3584 + }, + { + "epoch": 0.15094736842105264, + "grad_norm": 0.408203125, + "learning_rate": 0.0004782947717179931, + "loss": 3.7314, + "step": 3585 + }, + { + "epoch": 0.15098947368421053, + "grad_norm": 0.51953125, + "learning_rate": 0.0004782810168684416, + "loss": 3.4479, + "step": 3586 + }, + { + "epoch": 0.15103157894736843, + "grad_norm": 0.41796875, + "learning_rate": 0.0004782672578598782, + "loss": 3.0838, + "step": 3587 + }, + { + "epoch": 0.1510736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.0004782534946925535, + "loss": 2.8466, + "step": 3588 + }, + { + "epoch": 0.1511157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0004782397273667184, + "loss": 3.5797, + "step": 3589 + }, + { + "epoch": 0.1511578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.00047822595588262363, + "loss": 3.3296, + "step": 3590 + }, + { + "epoch": 0.1512, + "grad_norm": 0.48046875, + "learning_rate": 0.00047821218024052004, + "loss": 3.4251, + "step": 3591 + }, + { + "epoch": 0.1512421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00047819840044065876, + "loss": 3.3906, + "step": 3592 + }, + { + "epoch": 0.1512842105263158, + "grad_norm": 0.396484375, + "learning_rate": 0.00047818461648329073, + "loss": 3.8013, + "step": 3593 + }, + { + "epoch": 0.1513263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.000478170828368667, + "loss": 3.228, + "step": 3594 + }, + { + "epoch": 0.15136842105263157, + "grad_norm": 0.3984375, + "learning_rate": 0.00047815703609703893, + "loss": 3.4233, + "step": 3595 + }, + { + "epoch": 0.15141052631578947, + "grad_norm": 0.416015625, + "learning_rate": 0.0004781432396686577, + "loss": 3.0338, + "step": 3596 + }, + { + "epoch": 0.15145263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.0004781294390837747, + "loss": 3.4687, + "step": 3597 + }, + { + "epoch": 0.15149473684210527, + "grad_norm": 0.435546875, + "learning_rate": 0.00047811563434264144, + "loss": 3.3073, + "step": 3598 + }, + { + "epoch": 0.15153684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0004781018254455093, + "loss": 3.5941, + "step": 3599 + }, + { + "epoch": 0.15157894736842106, + "grad_norm": 0.408203125, + "learning_rate": 0.0004780880123926299, + "loss": 3.4236, + "step": 3600 + }, + { + "epoch": 0.15162105263157893, + "grad_norm": 0.73828125, + "learning_rate": 0.00047807419518425487, + "loss": 3.2942, + "step": 3601 + }, + { + "epoch": 0.15166315789473683, + "grad_norm": 0.44140625, + "learning_rate": 0.00047806037382063605, + "loss": 3.4013, + "step": 3602 + }, + { + "epoch": 0.15170526315789473, + "grad_norm": 0.44921875, + "learning_rate": 0.00047804654830202513, + "loss": 3.45, + "step": 3603 + }, + { + "epoch": 0.15174736842105263, + "grad_norm": 0.49609375, + "learning_rate": 0.000478032718628674, + "loss": 3.314, + "step": 3604 + }, + { + "epoch": 0.15178947368421053, + "grad_norm": 0.40625, + "learning_rate": 0.0004780188848008348, + "loss": 3.3036, + "step": 3605 + }, + { + "epoch": 0.15183157894736843, + "grad_norm": 0.404296875, + "learning_rate": 0.00047800504681875936, + "loss": 3.2694, + "step": 3606 + }, + { + "epoch": 0.15187368421052633, + "grad_norm": 0.43359375, + "learning_rate": 0.00047799120468269986, + "loss": 3.479, + "step": 3607 + }, + { + "epoch": 0.1519157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.00047797735839290843, + "loss": 3.7281, + "step": 3608 + }, + { + "epoch": 0.1519578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0004779635079496375, + "loss": 2.9445, + "step": 3609 + }, + { + "epoch": 0.152, + "grad_norm": 0.416015625, + "learning_rate": 0.00047794965335313925, + "loss": 3.4902, + "step": 3610 + }, + { + "epoch": 0.1520421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00047793579460366615, + "loss": 3.0122, + "step": 3611 + }, + { + "epoch": 0.1520842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.0004779219317014707, + "loss": 3.3085, + "step": 3612 + }, + { + "epoch": 0.1521263157894737, + "grad_norm": 0.455078125, + "learning_rate": 0.0004779080646468054, + "loss": 3.3256, + "step": 3613 + }, + { + "epoch": 0.1521684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.000477894193439923, + "loss": 3.2456, + "step": 3614 + }, + { + "epoch": 0.15221052631578946, + "grad_norm": 0.51171875, + "learning_rate": 0.00047788031808107615, + "loss": 3.4174, + "step": 3615 + }, + { + "epoch": 0.15225263157894736, + "grad_norm": 0.490234375, + "learning_rate": 0.0004778664385705176, + "loss": 3.0583, + "step": 3616 + }, + { + "epoch": 0.15229473684210526, + "grad_norm": 0.439453125, + "learning_rate": 0.0004778525549085003, + "loss": 3.5168, + "step": 3617 + }, + { + "epoch": 0.15233684210526316, + "grad_norm": 0.447265625, + "learning_rate": 0.00047783866709527725, + "loss": 3.3866, + "step": 3618 + }, + { + "epoch": 0.15237894736842106, + "grad_norm": 0.4609375, + "learning_rate": 0.0004778247751311013, + "loss": 2.9958, + "step": 3619 + }, + { + "epoch": 0.15242105263157896, + "grad_norm": 0.46484375, + "learning_rate": 0.0004778108790162256, + "loss": 3.2339, + "step": 3620 + }, + { + "epoch": 0.15246315789473686, + "grad_norm": 0.423828125, + "learning_rate": 0.00047779697875090346, + "loss": 3.7221, + "step": 3621 + }, + { + "epoch": 0.15250526315789473, + "grad_norm": 0.490234375, + "learning_rate": 0.0004777830743353879, + "loss": 3.3945, + "step": 3622 + }, + { + "epoch": 0.15254736842105263, + "grad_norm": 0.458984375, + "learning_rate": 0.0004777691657699324, + "loss": 3.793, + "step": 3623 + }, + { + "epoch": 0.15258947368421052, + "grad_norm": 0.55078125, + "learning_rate": 0.00047775525305479035, + "loss": 3.2065, + "step": 3624 + }, + { + "epoch": 0.15263157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.00047774133619021516, + "loss": 3.7421, + "step": 3625 + }, + { + "epoch": 0.15267368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.0004777274151764604, + "loss": 3.5376, + "step": 3626 + }, + { + "epoch": 0.15271578947368422, + "grad_norm": 0.40625, + "learning_rate": 0.0004777134900137797, + "loss": 3.6357, + "step": 3627 + }, + { + "epoch": 0.1527578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.0004776995607024268, + "loss": 3.7376, + "step": 3628 + }, + { + "epoch": 0.1528, + "grad_norm": 0.69140625, + "learning_rate": 0.0004776856272426555, + "loss": 2.9966, + "step": 3629 + }, + { + "epoch": 0.1528421052631579, + "grad_norm": 0.5234375, + "learning_rate": 0.0004776716896347195, + "loss": 3.1757, + "step": 3630 + }, + { + "epoch": 0.1528842105263158, + "grad_norm": 0.451171875, + "learning_rate": 0.00047765774787887286, + "loss": 3.6047, + "step": 3631 + }, + { + "epoch": 0.1529263157894737, + "grad_norm": 0.474609375, + "learning_rate": 0.0004776438019753695, + "loss": 3.292, + "step": 3632 + }, + { + "epoch": 0.1529684210526316, + "grad_norm": 0.44140625, + "learning_rate": 0.0004776298519244635, + "loss": 3.5381, + "step": 3633 + }, + { + "epoch": 0.15301052631578949, + "grad_norm": 0.443359375, + "learning_rate": 0.00047761589772640913, + "loss": 3.1568, + "step": 3634 + }, + { + "epoch": 0.15305263157894736, + "grad_norm": 0.4296875, + "learning_rate": 0.0004776019393814605, + "loss": 3.3782, + "step": 3635 + }, + { + "epoch": 0.15309473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.00047758797688987197, + "loss": 3.7427, + "step": 3636 + }, + { + "epoch": 0.15313684210526315, + "grad_norm": 0.44140625, + "learning_rate": 0.00047757401025189794, + "loss": 3.1935, + "step": 3637 + }, + { + "epoch": 0.15317894736842105, + "grad_norm": 0.51171875, + "learning_rate": 0.00047756003946779283, + "loss": 3.6259, + "step": 3638 + }, + { + "epoch": 0.15322105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00047754606453781116, + "loss": 3.1283, + "step": 3639 + }, + { + "epoch": 0.15326315789473685, + "grad_norm": 0.419921875, + "learning_rate": 0.0004775320854622075, + "loss": 3.7776, + "step": 3640 + }, + { + "epoch": 0.15330526315789475, + "grad_norm": 0.7265625, + "learning_rate": 0.0004775181022412366, + "loss": 3.5322, + "step": 3641 + }, + { + "epoch": 0.15334736842105262, + "grad_norm": 0.453125, + "learning_rate": 0.00047750411487515324, + "loss": 3.6808, + "step": 3642 + }, + { + "epoch": 0.15338947368421052, + "grad_norm": 0.40625, + "learning_rate": 0.00047749012336421214, + "loss": 3.767, + "step": 3643 + }, + { + "epoch": 0.15343157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00047747612770866835, + "loss": 3.5188, + "step": 3644 + }, + { + "epoch": 0.15347368421052632, + "grad_norm": 0.47265625, + "learning_rate": 0.00047746212790877673, + "loss": 3.615, + "step": 3645 + }, + { + "epoch": 0.15351578947368422, + "grad_norm": 0.4453125, + "learning_rate": 0.0004774481239647925, + "loss": 3.8009, + "step": 3646 + }, + { + "epoch": 0.15355789473684212, + "grad_norm": 0.439453125, + "learning_rate": 0.0004774341158769706, + "loss": 3.3402, + "step": 3647 + }, + { + "epoch": 0.1536, + "grad_norm": 0.42578125, + "learning_rate": 0.00047742010364556635, + "loss": 3.5347, + "step": 3648 + }, + { + "epoch": 0.15364210526315789, + "grad_norm": 0.48828125, + "learning_rate": 0.000477406087270835, + "loss": 3.5282, + "step": 3649 + }, + { + "epoch": 0.15368421052631578, + "grad_norm": 0.494140625, + "learning_rate": 0.000477392066753032, + "loss": 3.3169, + "step": 3650 + }, + { + "epoch": 0.15372631578947368, + "grad_norm": 0.44140625, + "learning_rate": 0.00047737804209241265, + "loss": 3.406, + "step": 3651 + }, + { + "epoch": 0.15376842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.00047736401328923265, + "loss": 3.3344, + "step": 3652 + }, + { + "epoch": 0.15381052631578948, + "grad_norm": 0.46484375, + "learning_rate": 0.00047734998034374733, + "loss": 2.9495, + "step": 3653 + }, + { + "epoch": 0.15385263157894738, + "grad_norm": 0.4453125, + "learning_rate": 0.00047733594325621257, + "loss": 3.7441, + "step": 3654 + }, + { + "epoch": 0.15389473684210525, + "grad_norm": 0.41015625, + "learning_rate": 0.00047732190202688397, + "loss": 3.4942, + "step": 3655 + }, + { + "epoch": 0.15393684210526315, + "grad_norm": 0.48046875, + "learning_rate": 0.00047730785665601745, + "loss": 3.6626, + "step": 3656 + }, + { + "epoch": 0.15397894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00047729380714386883, + "loss": 3.3828, + "step": 3657 + }, + { + "epoch": 0.15402105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.00047727975349069415, + "loss": 3.2138, + "step": 3658 + }, + { + "epoch": 0.15406315789473685, + "grad_norm": 0.431640625, + "learning_rate": 0.0004772656956967494, + "loss": 3.0591, + "step": 3659 + }, + { + "epoch": 0.15410526315789475, + "grad_norm": 0.419921875, + "learning_rate": 0.00047725163376229063, + "loss": 3.4689, + "step": 3660 + }, + { + "epoch": 0.15414736842105264, + "grad_norm": 0.40234375, + "learning_rate": 0.0004772375676875741, + "loss": 3.5733, + "step": 3661 + }, + { + "epoch": 0.15418947368421052, + "grad_norm": 0.419921875, + "learning_rate": 0.0004772234974728562, + "loss": 3.3081, + "step": 3662 + }, + { + "epoch": 0.15423157894736841, + "grad_norm": 0.45703125, + "learning_rate": 0.0004772094231183931, + "loss": 3.3391, + "step": 3663 + }, + { + "epoch": 0.1542736842105263, + "grad_norm": 0.45703125, + "learning_rate": 0.00047719534462444126, + "loss": 3.4436, + "step": 3664 + }, + { + "epoch": 0.1543157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.0004771812619912571, + "loss": 3.4334, + "step": 3665 + }, + { + "epoch": 0.1543578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.00047716717521909734, + "loss": 3.4928, + "step": 3666 + }, + { + "epoch": 0.1544, + "grad_norm": 0.435546875, + "learning_rate": 0.0004771530843082186, + "loss": 3.8065, + "step": 3667 + }, + { + "epoch": 0.1544421052631579, + "grad_norm": 0.57421875, + "learning_rate": 0.00047713898925887756, + "loss": 3.7351, + "step": 3668 + }, + { + "epoch": 0.15448421052631578, + "grad_norm": 0.447265625, + "learning_rate": 0.000477124890071331, + "loss": 3.0596, + "step": 3669 + }, + { + "epoch": 0.15452631578947368, + "grad_norm": 0.45703125, + "learning_rate": 0.00047711078674583573, + "loss": 3.7457, + "step": 3670 + }, + { + "epoch": 0.15456842105263158, + "grad_norm": 0.515625, + "learning_rate": 0.0004770966792826489, + "loss": 3.5734, + "step": 3671 + }, + { + "epoch": 0.15461052631578948, + "grad_norm": 0.486328125, + "learning_rate": 0.0004770825676820273, + "loss": 3.1431, + "step": 3672 + }, + { + "epoch": 0.15465263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.0004770684519442281, + "loss": 3.5353, + "step": 3673 + }, + { + "epoch": 0.15469473684210527, + "grad_norm": 0.435546875, + "learning_rate": 0.00047705433206950853, + "loss": 3.8456, + "step": 3674 + }, + { + "epoch": 0.15473684210526314, + "grad_norm": 0.462890625, + "learning_rate": 0.0004770402080581259, + "loss": 3.717, + "step": 3675 + }, + { + "epoch": 0.15477894736842104, + "grad_norm": 0.4296875, + "learning_rate": 0.0004770260799103374, + "loss": 3.7989, + "step": 3676 + }, + { + "epoch": 0.15482105263157894, + "grad_norm": 0.458984375, + "learning_rate": 0.00047701194762640037, + "loss": 3.2728, + "step": 3677 + }, + { + "epoch": 0.15486315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.0004769978112065725, + "loss": 3.4157, + "step": 3678 + }, + { + "epoch": 0.15490526315789474, + "grad_norm": 0.478515625, + "learning_rate": 0.00047698367065111114, + "loss": 3.5842, + "step": 3679 + }, + { + "epoch": 0.15494736842105264, + "grad_norm": 0.421875, + "learning_rate": 0.00047696952596027397, + "loss": 3.2744, + "step": 3680 + }, + { + "epoch": 0.15498947368421054, + "grad_norm": 0.400390625, + "learning_rate": 0.0004769553771343188, + "loss": 3.4852, + "step": 3681 + }, + { + "epoch": 0.1550315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.0004769412241735033, + "loss": 3.3286, + "step": 3682 + }, + { + "epoch": 0.1550736842105263, + "grad_norm": 0.4453125, + "learning_rate": 0.00047692706707808536, + "loss": 3.5173, + "step": 3683 + }, + { + "epoch": 0.1551157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00047691290584832286, + "loss": 3.1805, + "step": 3684 + }, + { + "epoch": 0.1551578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.0004768987404844738, + "loss": 3.1129, + "step": 3685 + }, + { + "epoch": 0.1552, + "grad_norm": 0.42578125, + "learning_rate": 0.0004768845709867964, + "loss": 3.4441, + "step": 3686 + }, + { + "epoch": 0.1552421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.00047687039735554854, + "loss": 3.3472, + "step": 3687 + }, + { + "epoch": 0.1552842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.0004768562195909887, + "loss": 3.4257, + "step": 3688 + }, + { + "epoch": 0.15532631578947367, + "grad_norm": 0.4296875, + "learning_rate": 0.000476842037693375, + "loss": 3.6577, + "step": 3689 + }, + { + "epoch": 0.15536842105263157, + "grad_norm": 0.421875, + "learning_rate": 0.00047682785166296593, + "loss": 3.8201, + "step": 3690 + }, + { + "epoch": 0.15541052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.00047681366150002, + "loss": 3.284, + "step": 3691 + }, + { + "epoch": 0.15545263157894737, + "grad_norm": 0.392578125, + "learning_rate": 0.0004767994672047956, + "loss": 3.3328, + "step": 3692 + }, + { + "epoch": 0.15549473684210527, + "grad_norm": 0.39453125, + "learning_rate": 0.00047678526877755135, + "loss": 3.1374, + "step": 3693 + }, + { + "epoch": 0.15553684210526317, + "grad_norm": 0.3984375, + "learning_rate": 0.00047677106621854597, + "loss": 3.5267, + "step": 3694 + }, + { + "epoch": 0.15557894736842104, + "grad_norm": 0.423828125, + "learning_rate": 0.00047675685952803826, + "loss": 3.0731, + "step": 3695 + }, + { + "epoch": 0.15562105263157894, + "grad_norm": 0.52734375, + "learning_rate": 0.000476742648706287, + "loss": 3.0489, + "step": 3696 + }, + { + "epoch": 0.15566315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00047672843375355103, + "loss": 3.9336, + "step": 3697 + }, + { + "epoch": 0.15570526315789474, + "grad_norm": 0.478515625, + "learning_rate": 0.0004767142146700894, + "loss": 3.2461, + "step": 3698 + }, + { + "epoch": 0.15574736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00047669999145616117, + "loss": 3.6933, + "step": 3699 + }, + { + "epoch": 0.15578947368421053, + "grad_norm": 0.396484375, + "learning_rate": 0.00047668576411202546, + "loss": 3.2744, + "step": 3700 + }, + { + "epoch": 0.15583157894736843, + "grad_norm": 0.4453125, + "learning_rate": 0.0004766715326379415, + "loss": 3.7561, + "step": 3701 + }, + { + "epoch": 0.1558736842105263, + "grad_norm": 0.46875, + "learning_rate": 0.0004766572970341685, + "loss": 3.5275, + "step": 3702 + }, + { + "epoch": 0.1559157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.0004766430573009659, + "loss": 3.4197, + "step": 3703 + }, + { + "epoch": 0.1559578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0004766288134385931, + "loss": 3.5674, + "step": 3704 + }, + { + "epoch": 0.156, + "grad_norm": 0.4375, + "learning_rate": 0.0004766145654473095, + "loss": 3.2447, + "step": 3705 + }, + { + "epoch": 0.1560421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00047660031332737493, + "loss": 3.7222, + "step": 3706 + }, + { + "epoch": 0.1560842105263158, + "grad_norm": 0.46484375, + "learning_rate": 0.00047658605707904877, + "loss": 3.4923, + "step": 3707 + }, + { + "epoch": 0.1561263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.00047657179670259096, + "loss": 3.2968, + "step": 3708 + }, + { + "epoch": 0.15616842105263157, + "grad_norm": 0.46484375, + "learning_rate": 0.00047655753219826117, + "loss": 3.9744, + "step": 3709 + }, + { + "epoch": 0.15621052631578947, + "grad_norm": 0.404296875, + "learning_rate": 0.00047654326356631933, + "loss": 3.9199, + "step": 3710 + }, + { + "epoch": 0.15625263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.00047652899080702544, + "loss": 3.4477, + "step": 3711 + }, + { + "epoch": 0.15629473684210526, + "grad_norm": 0.458984375, + "learning_rate": 0.0004765147139206395, + "loss": 3.721, + "step": 3712 + }, + { + "epoch": 0.15633684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.00047650043290742164, + "loss": 3.2584, + "step": 3713 + }, + { + "epoch": 0.15637894736842106, + "grad_norm": 0.421875, + "learning_rate": 0.00047648614776763193, + "loss": 3.4564, + "step": 3714 + }, + { + "epoch": 0.15642105263157896, + "grad_norm": 0.42578125, + "learning_rate": 0.00047647185850153073, + "loss": 3.3904, + "step": 3715 + }, + { + "epoch": 0.15646315789473683, + "grad_norm": 0.44921875, + "learning_rate": 0.0004764575651093784, + "loss": 3.2006, + "step": 3716 + }, + { + "epoch": 0.15650526315789473, + "grad_norm": 0.44140625, + "learning_rate": 0.00047644326759143534, + "loss": 3.6703, + "step": 3717 + }, + { + "epoch": 0.15654736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.000476428965947962, + "loss": 3.3534, + "step": 3718 + }, + { + "epoch": 0.15658947368421053, + "grad_norm": 0.447265625, + "learning_rate": 0.0004764146601792189, + "loss": 3.8089, + "step": 3719 + }, + { + "epoch": 0.15663157894736843, + "grad_norm": 0.41796875, + "learning_rate": 0.00047640035028546674, + "loss": 3.2193, + "step": 3720 + }, + { + "epoch": 0.15667368421052633, + "grad_norm": 0.4453125, + "learning_rate": 0.0004763860362669662, + "loss": 3.7856, + "step": 3721 + }, + { + "epoch": 0.1567157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.00047637171812397804, + "loss": 3.4054, + "step": 3722 + }, + { + "epoch": 0.1567578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.00047635739585676323, + "loss": 3.1605, + "step": 3723 + }, + { + "epoch": 0.1568, + "grad_norm": 0.419921875, + "learning_rate": 0.0004763430694655826, + "loss": 3.3979, + "step": 3724 + }, + { + "epoch": 0.1568421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00047632873895069717, + "loss": 3.9123, + "step": 3725 + }, + { + "epoch": 0.1568842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.00047631440431236807, + "loss": 3.3996, + "step": 3726 + }, + { + "epoch": 0.1569263157894737, + "grad_norm": 0.39453125, + "learning_rate": 0.0004763000655508564, + "loss": 3.3632, + "step": 3727 + }, + { + "epoch": 0.1569684210526316, + "grad_norm": 0.46484375, + "learning_rate": 0.00047628572266642346, + "loss": 3.5738, + "step": 3728 + }, + { + "epoch": 0.15701052631578946, + "grad_norm": 0.44140625, + "learning_rate": 0.0004762713756593304, + "loss": 3.3733, + "step": 3729 + }, + { + "epoch": 0.15705263157894736, + "grad_norm": 0.3984375, + "learning_rate": 0.0004762570245298389, + "loss": 3.9693, + "step": 3730 + }, + { + "epoch": 0.15709473684210526, + "grad_norm": 0.478515625, + "learning_rate": 0.00047624266927821015, + "loss": 3.5044, + "step": 3731 + }, + { + "epoch": 0.15713684210526316, + "grad_norm": 0.451171875, + "learning_rate": 0.0004762283099047059, + "loss": 3.719, + "step": 3732 + }, + { + "epoch": 0.15717894736842106, + "grad_norm": 0.45703125, + "learning_rate": 0.00047621394640958754, + "loss": 3.3341, + "step": 3733 + }, + { + "epoch": 0.15722105263157896, + "grad_norm": 0.4140625, + "learning_rate": 0.0004761995787931169, + "loss": 3.5223, + "step": 3734 + }, + { + "epoch": 0.15726315789473685, + "grad_norm": 0.462890625, + "learning_rate": 0.0004761852070555557, + "loss": 3.2013, + "step": 3735 + }, + { + "epoch": 0.15730526315789473, + "grad_norm": 0.46875, + "learning_rate": 0.00047617083119716583, + "loss": 3.6286, + "step": 3736 + }, + { + "epoch": 0.15734736842105262, + "grad_norm": 0.447265625, + "learning_rate": 0.0004761564512182091, + "loss": 3.1929, + "step": 3737 + }, + { + "epoch": 0.15738947368421052, + "grad_norm": 0.408203125, + "learning_rate": 0.0004761420671189476, + "loss": 3.4403, + "step": 3738 + }, + { + "epoch": 0.15743157894736842, + "grad_norm": 0.451171875, + "learning_rate": 0.0004761276788996433, + "loss": 3.5543, + "step": 3739 + }, + { + "epoch": 0.15747368421052632, + "grad_norm": 0.474609375, + "learning_rate": 0.0004761132865605584, + "loss": 3.4269, + "step": 3740 + }, + { + "epoch": 0.15751578947368422, + "grad_norm": 0.423828125, + "learning_rate": 0.0004760988901019551, + "loss": 3.7293, + "step": 3741 + }, + { + "epoch": 0.15755789473684212, + "grad_norm": 0.482421875, + "learning_rate": 0.0004760844895240957, + "loss": 3.4919, + "step": 3742 + }, + { + "epoch": 0.1576, + "grad_norm": 0.431640625, + "learning_rate": 0.00047607008482724244, + "loss": 3.3613, + "step": 3743 + }, + { + "epoch": 0.1576421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.0004760556760116579, + "loss": 3.3115, + "step": 3744 + }, + { + "epoch": 0.1576842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.0004760412630776046, + "loss": 3.8654, + "step": 3745 + }, + { + "epoch": 0.1577263157894737, + "grad_norm": 0.470703125, + "learning_rate": 0.00047602684602534505, + "loss": 3.4589, + "step": 3746 + }, + { + "epoch": 0.15776842105263159, + "grad_norm": 0.41015625, + "learning_rate": 0.0004760124248551419, + "loss": 3.1835, + "step": 3747 + }, + { + "epoch": 0.15781052631578948, + "grad_norm": 0.416015625, + "learning_rate": 0.000475997999567258, + "loss": 3.7373, + "step": 3748 + }, + { + "epoch": 0.15785263157894736, + "grad_norm": 0.4140625, + "learning_rate": 0.00047598357016195603, + "loss": 3.8226, + "step": 3749 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 0.400390625, + "learning_rate": 0.00047596913663949895, + "loss": 3.135, + "step": 3750 + }, + { + "epoch": 0.15793684210526315, + "grad_norm": 0.6796875, + "learning_rate": 0.00047595469900014965, + "loss": 3.3915, + "step": 3751 + }, + { + "epoch": 0.15797894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.0004759402572441712, + "loss": 3.1679, + "step": 3752 + }, + { + "epoch": 0.15802105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.0004759258113718269, + "loss": 3.5198, + "step": 3753 + }, + { + "epoch": 0.15806315789473685, + "grad_norm": 0.392578125, + "learning_rate": 0.00047591136138337963, + "loss": 3.5471, + "step": 3754 + }, + { + "epoch": 0.15810526315789475, + "grad_norm": 0.466796875, + "learning_rate": 0.00047589690727909285, + "loss": 3.4873, + "step": 3755 + }, + { + "epoch": 0.15814736842105262, + "grad_norm": 0.66015625, + "learning_rate": 0.00047588244905922975, + "loss": 3.2597, + "step": 3756 + }, + { + "epoch": 0.15818947368421052, + "grad_norm": 0.404296875, + "learning_rate": 0.0004758679867240539, + "loss": 3.579, + "step": 3757 + }, + { + "epoch": 0.15823157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0004758535202738287, + "loss": 3.6976, + "step": 3758 + }, + { + "epoch": 0.15827368421052632, + "grad_norm": 0.46875, + "learning_rate": 0.0004758390497088178, + "loss": 2.9278, + "step": 3759 + }, + { + "epoch": 0.15831578947368422, + "grad_norm": 0.42578125, + "learning_rate": 0.0004758245750292847, + "loss": 3.2774, + "step": 3760 + }, + { + "epoch": 0.15835789473684211, + "grad_norm": 0.52734375, + "learning_rate": 0.0004758100962354931, + "loss": 3.2356, + "step": 3761 + }, + { + "epoch": 0.1584, + "grad_norm": 0.59375, + "learning_rate": 0.0004757956133277069, + "loss": 3.2109, + "step": 3762 + }, + { + "epoch": 0.15844210526315788, + "grad_norm": 0.470703125, + "learning_rate": 0.00047578112630618994, + "loss": 3.089, + "step": 3763 + }, + { + "epoch": 0.15848421052631578, + "grad_norm": 0.416015625, + "learning_rate": 0.00047576663517120624, + "loss": 3.2914, + "step": 3764 + }, + { + "epoch": 0.15852631578947368, + "grad_norm": 0.45703125, + "learning_rate": 0.0004757521399230196, + "loss": 3.529, + "step": 3765 + }, + { + "epoch": 0.15856842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.0004757376405618943, + "loss": 3.5075, + "step": 3766 + }, + { + "epoch": 0.15861052631578948, + "grad_norm": 0.470703125, + "learning_rate": 0.00047572313708809435, + "loss": 3.6595, + "step": 3767 + }, + { + "epoch": 0.15865263157894738, + "grad_norm": 0.41796875, + "learning_rate": 0.0004757086295018841, + "loss": 3.4359, + "step": 3768 + }, + { + "epoch": 0.15869473684210525, + "grad_norm": 0.431640625, + "learning_rate": 0.00047569411780352776, + "loss": 3.3874, + "step": 3769 + }, + { + "epoch": 0.15873684210526315, + "grad_norm": 0.435546875, + "learning_rate": 0.0004756796019932898, + "loss": 3.4118, + "step": 3770 + }, + { + "epoch": 0.15877894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.00047566508207143467, + "loss": 3.8719, + "step": 3771 + }, + { + "epoch": 0.15882105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.0004756505580382269, + "loss": 3.8064, + "step": 3772 + }, + { + "epoch": 0.15886315789473685, + "grad_norm": 0.4375, + "learning_rate": 0.00047563602989393106, + "loss": 3.4762, + "step": 3773 + }, + { + "epoch": 0.15890526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.00047562149763881185, + "loss": 3.2884, + "step": 3774 + }, + { + "epoch": 0.15894736842105264, + "grad_norm": 0.427734375, + "learning_rate": 0.00047560696127313406, + "loss": 3.764, + "step": 3775 + }, + { + "epoch": 0.1589894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.0004755924207971626, + "loss": 3.3026, + "step": 3776 + }, + { + "epoch": 0.1590315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.00047557787621116214, + "loss": 3.6832, + "step": 3777 + }, + { + "epoch": 0.1590736842105263, + "grad_norm": 0.478515625, + "learning_rate": 0.0004755633275153979, + "loss": 3.3365, + "step": 3778 + }, + { + "epoch": 0.1591157894736842, + "grad_norm": 0.45703125, + "learning_rate": 0.0004755487747101349, + "loss": 3.6425, + "step": 3779 + }, + { + "epoch": 0.1591578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.0004755342177956382, + "loss": 3.5729, + "step": 3780 + }, + { + "epoch": 0.1592, + "grad_norm": 0.41796875, + "learning_rate": 0.00047551965677217304, + "loss": 3.4463, + "step": 3781 + }, + { + "epoch": 0.1592421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.00047550509164000465, + "loss": 3.0716, + "step": 3782 + }, + { + "epoch": 0.15928421052631578, + "grad_norm": 0.427734375, + "learning_rate": 0.00047549052239939855, + "loss": 3.2612, + "step": 3783 + }, + { + "epoch": 0.15932631578947368, + "grad_norm": 0.416015625, + "learning_rate": 0.00047547594905062, + "loss": 3.5127, + "step": 3784 + }, + { + "epoch": 0.15936842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00047546137159393465, + "loss": 3.3692, + "step": 3785 + }, + { + "epoch": 0.15941052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.00047544679002960795, + "loss": 3.2505, + "step": 3786 + }, + { + "epoch": 0.15945263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.0004754322043579056, + "loss": 3.6312, + "step": 3787 + }, + { + "epoch": 0.15949473684210527, + "grad_norm": 0.41796875, + "learning_rate": 0.00047541761457909346, + "loss": 3.9617, + "step": 3788 + }, + { + "epoch": 0.15953684210526317, + "grad_norm": 0.4375, + "learning_rate": 0.00047540302069343715, + "loss": 3.4276, + "step": 3789 + }, + { + "epoch": 0.15957894736842104, + "grad_norm": 0.40234375, + "learning_rate": 0.0004753884227012027, + "loss": 3.1666, + "step": 3790 + }, + { + "epoch": 0.15962105263157894, + "grad_norm": 0.38671875, + "learning_rate": 0.000475373820602656, + "loss": 3.4347, + "step": 3791 + }, + { + "epoch": 0.15966315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.00047535921439806306, + "loss": 3.2315, + "step": 3792 + }, + { + "epoch": 0.15970526315789474, + "grad_norm": 0.404296875, + "learning_rate": 0.00047534460408768997, + "loss": 3.122, + "step": 3793 + }, + { + "epoch": 0.15974736842105264, + "grad_norm": 0.408203125, + "learning_rate": 0.0004753299896718031, + "loss": 3.6026, + "step": 3794 + }, + { + "epoch": 0.15978947368421054, + "grad_norm": 0.416015625, + "learning_rate": 0.0004753153711506685, + "loss": 2.9973, + "step": 3795 + }, + { + "epoch": 0.1598315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.0004753007485245525, + "loss": 3.1226, + "step": 3796 + }, + { + "epoch": 0.1598736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.00047528612179372165, + "loss": 3.3758, + "step": 3797 + }, + { + "epoch": 0.1599157894736842, + "grad_norm": 0.396484375, + "learning_rate": 0.00047527149095844234, + "loss": 3.4076, + "step": 3798 + }, + { + "epoch": 0.1599578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0004752568560189812, + "loss": 3.3045, + "step": 3799 + }, + { + "epoch": 0.16, + "grad_norm": 0.4375, + "learning_rate": 0.00047524221697560476, + "loss": 3.2325, + "step": 3800 + }, + { + "epoch": 0.1600421052631579, + "grad_norm": 0.37890625, + "learning_rate": 0.00047522757382857986, + "loss": 3.5248, + "step": 3801 + }, + { + "epoch": 0.1600842105263158, + "grad_norm": 0.458984375, + "learning_rate": 0.00047521292657817305, + "loss": 2.9584, + "step": 3802 + }, + { + "epoch": 0.16012631578947367, + "grad_norm": 0.439453125, + "learning_rate": 0.0004751982752246514, + "loss": 3.4668, + "step": 3803 + }, + { + "epoch": 0.16016842105263157, + "grad_norm": 0.443359375, + "learning_rate": 0.00047518361976828184, + "loss": 3.263, + "step": 3804 + }, + { + "epoch": 0.16021052631578947, + "grad_norm": 0.390625, + "learning_rate": 0.0004751689602093312, + "loss": 3.3976, + "step": 3805 + }, + { + "epoch": 0.16025263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.00047515429654806674, + "loss": 3.3254, + "step": 3806 + }, + { + "epoch": 0.16029473684210527, + "grad_norm": 0.421875, + "learning_rate": 0.00047513962878475556, + "loss": 3.0458, + "step": 3807 + }, + { + "epoch": 0.16033684210526317, + "grad_norm": 0.44140625, + "learning_rate": 0.0004751249569196648, + "loss": 3.6979, + "step": 3808 + }, + { + "epoch": 0.16037894736842107, + "grad_norm": 0.443359375, + "learning_rate": 0.00047511028095306186, + "loss": 3.3904, + "step": 3809 + }, + { + "epoch": 0.16042105263157894, + "grad_norm": 0.4453125, + "learning_rate": 0.0004750956008852141, + "loss": 4.0054, + "step": 3810 + }, + { + "epoch": 0.16046315789473684, + "grad_norm": 0.93359375, + "learning_rate": 0.00047508091671638895, + "loss": 3.0281, + "step": 3811 + }, + { + "epoch": 0.16050526315789473, + "grad_norm": 0.408203125, + "learning_rate": 0.000475066228446854, + "loss": 3.5083, + "step": 3812 + }, + { + "epoch": 0.16054736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.0004750515360768768, + "loss": 2.986, + "step": 3813 + }, + { + "epoch": 0.16058947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.00047503683960672504, + "loss": 3.5373, + "step": 3814 + }, + { + "epoch": 0.16063157894736843, + "grad_norm": 0.421875, + "learning_rate": 0.0004750221390366665, + "loss": 3.4918, + "step": 3815 + }, + { + "epoch": 0.16067368421052633, + "grad_norm": 0.3984375, + "learning_rate": 0.00047500743436696893, + "loss": 3.5649, + "step": 3816 + }, + { + "epoch": 0.1607157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.0004749927255979002, + "loss": 3.7729, + "step": 3817 + }, + { + "epoch": 0.1607578947368421, + "grad_norm": 0.498046875, + "learning_rate": 0.00047497801272972844, + "loss": 3.7871, + "step": 3818 + }, + { + "epoch": 0.1608, + "grad_norm": 0.4296875, + "learning_rate": 0.00047496329576272167, + "loss": 3.1167, + "step": 3819 + }, + { + "epoch": 0.1608421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00047494857469714794, + "loss": 3.0299, + "step": 3820 + }, + { + "epoch": 0.1608842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.00047493384953327546, + "loss": 3.4207, + "step": 3821 + }, + { + "epoch": 0.1609263157894737, + "grad_norm": 0.404296875, + "learning_rate": 0.00047491912027137253, + "loss": 3.7361, + "step": 3822 + }, + { + "epoch": 0.16096842105263157, + "grad_norm": 0.4296875, + "learning_rate": 0.00047490438691170754, + "loss": 3.4605, + "step": 3823 + }, + { + "epoch": 0.16101052631578947, + "grad_norm": 0.404296875, + "learning_rate": 0.00047488964945454883, + "loss": 3.5144, + "step": 3824 + }, + { + "epoch": 0.16105263157894736, + "grad_norm": 0.390625, + "learning_rate": 0.00047487490790016495, + "loss": 3.9343, + "step": 3825 + }, + { + "epoch": 0.16109473684210526, + "grad_norm": 0.474609375, + "learning_rate": 0.0004748601622488244, + "loss": 3.5133, + "step": 3826 + }, + { + "epoch": 0.16113684210526316, + "grad_norm": 0.45703125, + "learning_rate": 0.00047484541250079605, + "loss": 3.2526, + "step": 3827 + }, + { + "epoch": 0.16117894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.00047483065865634835, + "loss": 3.6039, + "step": 3828 + }, + { + "epoch": 0.16122105263157896, + "grad_norm": 0.3984375, + "learning_rate": 0.00047481590071575026, + "loss": 3.2024, + "step": 3829 + }, + { + "epoch": 0.16126315789473683, + "grad_norm": 0.43359375, + "learning_rate": 0.0004748011386792706, + "loss": 3.0853, + "step": 3830 + }, + { + "epoch": 0.16130526315789473, + "grad_norm": 0.50390625, + "learning_rate": 0.0004747863725471783, + "loss": 3.3361, + "step": 3831 + }, + { + "epoch": 0.16134736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.00047477160231974245, + "loss": 3.5184, + "step": 3832 + }, + { + "epoch": 0.16138947368421053, + "grad_norm": 0.439453125, + "learning_rate": 0.0004747568279972321, + "loss": 3.8712, + "step": 3833 + }, + { + "epoch": 0.16143157894736843, + "grad_norm": 0.46484375, + "learning_rate": 0.00047474204957991635, + "loss": 3.1647, + "step": 3834 + }, + { + "epoch": 0.16147368421052632, + "grad_norm": 0.478515625, + "learning_rate": 0.0004747272670680646, + "loss": 3.6008, + "step": 3835 + }, + { + "epoch": 0.16151578947368422, + "grad_norm": 0.435546875, + "learning_rate": 0.000474712480461946, + "loss": 3.2742, + "step": 3836 + }, + { + "epoch": 0.1615578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00047469768976183006, + "loss": 3.6445, + "step": 3837 + }, + { + "epoch": 0.1616, + "grad_norm": 0.416015625, + "learning_rate": 0.00047468289496798626, + "loss": 3.7055, + "step": 3838 + }, + { + "epoch": 0.1616421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0004746680960806841, + "loss": 3.2288, + "step": 3839 + }, + { + "epoch": 0.1616842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.0004746532931001932, + "loss": 3.508, + "step": 3840 + }, + { + "epoch": 0.1617263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00047463848602678326, + "loss": 3.2557, + "step": 3841 + }, + { + "epoch": 0.1617684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00047462367486072397, + "loss": 3.515, + "step": 3842 + }, + { + "epoch": 0.16181052631578946, + "grad_norm": 0.80078125, + "learning_rate": 0.0004746088596022854, + "loss": 3.4366, + "step": 3843 + }, + { + "epoch": 0.16185263157894736, + "grad_norm": 0.443359375, + "learning_rate": 0.00047459404025173717, + "loss": 3.4384, + "step": 3844 + }, + { + "epoch": 0.16189473684210526, + "grad_norm": 0.419921875, + "learning_rate": 0.0004745792168093495, + "loss": 3.5839, + "step": 3845 + }, + { + "epoch": 0.16193684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.0004745643892753923, + "loss": 3.6662, + "step": 3846 + }, + { + "epoch": 0.16197894736842106, + "grad_norm": 0.494140625, + "learning_rate": 0.00047454955765013587, + "loss": 3.2906, + "step": 3847 + }, + { + "epoch": 0.16202105263157895, + "grad_norm": 0.5234375, + "learning_rate": 0.00047453472193385025, + "loss": 3.3933, + "step": 3848 + }, + { + "epoch": 0.16206315789473685, + "grad_norm": 0.431640625, + "learning_rate": 0.00047451988212680586, + "loss": 3.6889, + "step": 3849 + }, + { + "epoch": 0.16210526315789472, + "grad_norm": 0.421875, + "learning_rate": 0.00047450503822927295, + "loss": 3.2869, + "step": 3850 + }, + { + "epoch": 0.16214736842105262, + "grad_norm": 0.5078125, + "learning_rate": 0.00047449019024152207, + "loss": 3.1754, + "step": 3851 + }, + { + "epoch": 0.16218947368421052, + "grad_norm": 0.3984375, + "learning_rate": 0.0004744753381638237, + "loss": 3.2455, + "step": 3852 + }, + { + "epoch": 0.16223157894736842, + "grad_norm": 0.462890625, + "learning_rate": 0.0004744604819964484, + "loss": 3.0194, + "step": 3853 + }, + { + "epoch": 0.16227368421052632, + "grad_norm": 0.453125, + "learning_rate": 0.0004744456217396669, + "loss": 3.1745, + "step": 3854 + }, + { + "epoch": 0.16231578947368422, + "grad_norm": 0.41796875, + "learning_rate": 0.00047443075739374985, + "loss": 3.6739, + "step": 3855 + }, + { + "epoch": 0.16235789473684212, + "grad_norm": 0.396484375, + "learning_rate": 0.000474415888958968, + "loss": 3.1853, + "step": 3856 + }, + { + "epoch": 0.1624, + "grad_norm": 0.392578125, + "learning_rate": 0.0004744010164355925, + "loss": 3.4155, + "step": 3857 + }, + { + "epoch": 0.1624421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.00047438613982389403, + "loss": 3.6003, + "step": 3858 + }, + { + "epoch": 0.1624842105263158, + "grad_norm": 0.396484375, + "learning_rate": 0.0004743712591241438, + "loss": 3.3342, + "step": 3859 + }, + { + "epoch": 0.16252631578947369, + "grad_norm": 0.455078125, + "learning_rate": 0.0004743563743366128, + "loss": 3.5112, + "step": 3860 + }, + { + "epoch": 0.16256842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.0004743414854615723, + "loss": 3.3818, + "step": 3861 + }, + { + "epoch": 0.16261052631578948, + "grad_norm": 0.447265625, + "learning_rate": 0.00047432659249929356, + "loss": 3.4501, + "step": 3862 + }, + { + "epoch": 0.16265263157894738, + "grad_norm": 0.4453125, + "learning_rate": 0.0004743116954500478, + "loss": 4.0091, + "step": 3863 + }, + { + "epoch": 0.16269473684210525, + "grad_norm": 0.46484375, + "learning_rate": 0.00047429679431410654, + "loss": 3.5009, + "step": 3864 + }, + { + "epoch": 0.16273684210526315, + "grad_norm": 0.46875, + "learning_rate": 0.0004742818890917412, + "loss": 3.1644, + "step": 3865 + }, + { + "epoch": 0.16277894736842105, + "grad_norm": 1.1640625, + "learning_rate": 0.0004742669797832234, + "loss": 3.2342, + "step": 3866 + }, + { + "epoch": 0.16282105263157895, + "grad_norm": 0.5078125, + "learning_rate": 0.00047425206638882465, + "loss": 3.1711, + "step": 3867 + }, + { + "epoch": 0.16286315789473685, + "grad_norm": 0.81640625, + "learning_rate": 0.0004742371489088169, + "loss": 3.1086, + "step": 3868 + }, + { + "epoch": 0.16290526315789475, + "grad_norm": 0.42578125, + "learning_rate": 0.00047422222734347163, + "loss": 3.2091, + "step": 3869 + }, + { + "epoch": 0.16294736842105262, + "grad_norm": 0.4453125, + "learning_rate": 0.0004742073016930608, + "loss": 3.4166, + "step": 3870 + }, + { + "epoch": 0.16298947368421052, + "grad_norm": 0.5078125, + "learning_rate": 0.00047419237195785646, + "loss": 3.3303, + "step": 3871 + }, + { + "epoch": 0.16303157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00047417743813813054, + "loss": 3.4595, + "step": 3872 + }, + { + "epoch": 0.16307368421052632, + "grad_norm": 0.5, + "learning_rate": 0.00047416250023415497, + "loss": 3.305, + "step": 3873 + }, + { + "epoch": 0.16311578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00047414755824620215, + "loss": 3.7613, + "step": 3874 + }, + { + "epoch": 0.1631578947368421, + "grad_norm": 0.60546875, + "learning_rate": 0.00047413261217454407, + "loss": 3.4918, + "step": 3875 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5546875, + "learning_rate": 0.00047411766201945323, + "loss": 3.0577, + "step": 3876 + }, + { + "epoch": 0.16324210526315788, + "grad_norm": 0.4609375, + "learning_rate": 0.00047410270778120193, + "loss": 3.3608, + "step": 3877 + }, + { + "epoch": 0.16328421052631578, + "grad_norm": 0.5078125, + "learning_rate": 0.0004740877494600625, + "loss": 3.3662, + "step": 3878 + }, + { + "epoch": 0.16332631578947368, + "grad_norm": 0.458984375, + "learning_rate": 0.00047407278705630763, + "loss": 3.5284, + "step": 3879 + }, + { + "epoch": 0.16336842105263158, + "grad_norm": 0.484375, + "learning_rate": 0.0004740578205702099, + "loss": 3.7682, + "step": 3880 + }, + { + "epoch": 0.16341052631578948, + "grad_norm": 0.41015625, + "learning_rate": 0.00047404285000204183, + "loss": 3.6055, + "step": 3881 + }, + { + "epoch": 0.16345263157894738, + "grad_norm": 0.46875, + "learning_rate": 0.0004740278753520764, + "loss": 3.4189, + "step": 3882 + }, + { + "epoch": 0.16349473684210528, + "grad_norm": 0.484375, + "learning_rate": 0.0004740128966205862, + "loss": 3.3415, + "step": 3883 + }, + { + "epoch": 0.16353684210526315, + "grad_norm": 0.431640625, + "learning_rate": 0.00047399791380784426, + "loss": 3.6714, + "step": 3884 + }, + { + "epoch": 0.16357894736842105, + "grad_norm": 0.44921875, + "learning_rate": 0.0004739829269141235, + "loss": 3.5951, + "step": 3885 + }, + { + "epoch": 0.16362105263157894, + "grad_norm": 0.466796875, + "learning_rate": 0.000473967935939697, + "loss": 3.6779, + "step": 3886 + }, + { + "epoch": 0.16366315789473684, + "grad_norm": 0.54296875, + "learning_rate": 0.00047395294088483785, + "loss": 3.3035, + "step": 3887 + }, + { + "epoch": 0.16370526315789474, + "grad_norm": 0.458984375, + "learning_rate": 0.0004739379417498193, + "loss": 3.0263, + "step": 3888 + }, + { + "epoch": 0.16374736842105264, + "grad_norm": 0.4765625, + "learning_rate": 0.0004739229385349145, + "loss": 3.5854, + "step": 3889 + }, + { + "epoch": 0.1637894736842105, + "grad_norm": 0.490234375, + "learning_rate": 0.0004739079312403969, + "loss": 3.5468, + "step": 3890 + }, + { + "epoch": 0.1638315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.00047389291986653983, + "loss": 3.2046, + "step": 3891 + }, + { + "epoch": 0.1638736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00047387790441361685, + "loss": 2.9892, + "step": 3892 + }, + { + "epoch": 0.1639157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.0004738628848819015, + "loss": 3.5071, + "step": 3893 + }, + { + "epoch": 0.1639578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.0004738478612716674, + "loss": 3.605, + "step": 3894 + }, + { + "epoch": 0.164, + "grad_norm": 0.4453125, + "learning_rate": 0.0004738328335831883, + "loss": 3.847, + "step": 3895 + }, + { + "epoch": 0.1640421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00047381780181673797, + "loss": 3.5521, + "step": 3896 + }, + { + "epoch": 0.16408421052631578, + "grad_norm": 0.421875, + "learning_rate": 0.0004738027659725902, + "loss": 3.8377, + "step": 3897 + }, + { + "epoch": 0.16412631578947368, + "grad_norm": 0.484375, + "learning_rate": 0.00047378772605101904, + "loss": 3.6925, + "step": 3898 + }, + { + "epoch": 0.16416842105263157, + "grad_norm": 0.58203125, + "learning_rate": 0.0004737726820522985, + "loss": 3.4188, + "step": 3899 + }, + { + "epoch": 0.16421052631578947, + "grad_norm": 0.546875, + "learning_rate": 0.00047375763397670257, + "loss": 3.0865, + "step": 3900 + }, + { + "epoch": 0.16425263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.0004737425818245055, + "loss": 3.2189, + "step": 3901 + }, + { + "epoch": 0.16429473684210527, + "grad_norm": 0.447265625, + "learning_rate": 0.00047372752559598136, + "loss": 4.0025, + "step": 3902 + }, + { + "epoch": 0.16433684210526317, + "grad_norm": 0.453125, + "learning_rate": 0.0004737124652914047, + "loss": 3.2244, + "step": 3903 + }, + { + "epoch": 0.16437894736842104, + "grad_norm": 0.419921875, + "learning_rate": 0.00047369740091104967, + "loss": 3.7539, + "step": 3904 + }, + { + "epoch": 0.16442105263157894, + "grad_norm": 0.435546875, + "learning_rate": 0.0004736823324551909, + "loss": 3.6803, + "step": 3905 + }, + { + "epoch": 0.16446315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.0004736672599241028, + "loss": 2.9834, + "step": 3906 + }, + { + "epoch": 0.16450526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00047365218331806003, + "loss": 3.1287, + "step": 3907 + }, + { + "epoch": 0.16454736842105264, + "grad_norm": 0.396484375, + "learning_rate": 0.0004736371026373373, + "loss": 2.9637, + "step": 3908 + }, + { + "epoch": 0.16458947368421054, + "grad_norm": 0.41796875, + "learning_rate": 0.00047362201788220926, + "loss": 3.3041, + "step": 3909 + }, + { + "epoch": 0.16463157894736843, + "grad_norm": 0.42578125, + "learning_rate": 0.0004736069290529509, + "loss": 3.5508, + "step": 3910 + }, + { + "epoch": 0.1646736842105263, + "grad_norm": 0.462890625, + "learning_rate": 0.0004735918361498369, + "loss": 3.7763, + "step": 3911 + }, + { + "epoch": 0.1647157894736842, + "grad_norm": 0.474609375, + "learning_rate": 0.0004735767391731424, + "loss": 3.4247, + "step": 3912 + }, + { + "epoch": 0.1647578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00047356163812314244, + "loss": 2.9776, + "step": 3913 + }, + { + "epoch": 0.1648, + "grad_norm": 0.423828125, + "learning_rate": 0.0004735465330001121, + "loss": 3.7389, + "step": 3914 + }, + { + "epoch": 0.1648421052631579, + "grad_norm": 0.455078125, + "learning_rate": 0.00047353142380432656, + "loss": 3.4266, + "step": 3915 + }, + { + "epoch": 0.1648842105263158, + "grad_norm": 0.50390625, + "learning_rate": 0.00047351631053606116, + "loss": 3.3735, + "step": 3916 + }, + { + "epoch": 0.16492631578947367, + "grad_norm": 0.40234375, + "learning_rate": 0.0004735011931955912, + "loss": 3.7591, + "step": 3917 + }, + { + "epoch": 0.16496842105263157, + "grad_norm": 0.482421875, + "learning_rate": 0.00047348607178319203, + "loss": 2.6511, + "step": 3918 + }, + { + "epoch": 0.16501052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00047347094629913923, + "loss": 3.7593, + "step": 3919 + }, + { + "epoch": 0.16505263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.00047345581674370843, + "loss": 3.5246, + "step": 3920 + }, + { + "epoch": 0.16509473684210527, + "grad_norm": 0.423828125, + "learning_rate": 0.0004734406831171751, + "loss": 3.4634, + "step": 3921 + }, + { + "epoch": 0.16513684210526317, + "grad_norm": 0.4140625, + "learning_rate": 0.00047342554541981514, + "loss": 3.5734, + "step": 3922 + }, + { + "epoch": 0.16517894736842106, + "grad_norm": 0.458984375, + "learning_rate": 0.00047341040365190423, + "loss": 3.3072, + "step": 3923 + }, + { + "epoch": 0.16522105263157894, + "grad_norm": 0.39453125, + "learning_rate": 0.0004733952578137183, + "loss": 3.6486, + "step": 3924 + }, + { + "epoch": 0.16526315789473683, + "grad_norm": 0.5625, + "learning_rate": 0.0004733801079055332, + "loss": 3.0872, + "step": 3925 + }, + { + "epoch": 0.16530526315789473, + "grad_norm": 0.39453125, + "learning_rate": 0.000473364953927625, + "loss": 3.6241, + "step": 3926 + }, + { + "epoch": 0.16534736842105263, + "grad_norm": 0.412109375, + "learning_rate": 0.0004733497958802697, + "loss": 3.3633, + "step": 3927 + }, + { + "epoch": 0.16538947368421053, + "grad_norm": 0.494140625, + "learning_rate": 0.0004733346337637437, + "loss": 3.5949, + "step": 3928 + }, + { + "epoch": 0.16543157894736843, + "grad_norm": 0.423828125, + "learning_rate": 0.000473319467578323, + "loss": 3.2528, + "step": 3929 + }, + { + "epoch": 0.16547368421052633, + "grad_norm": 0.396484375, + "learning_rate": 0.00047330429732428403, + "loss": 3.6965, + "step": 3930 + }, + { + "epoch": 0.1655157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.0004732891230019031, + "loss": 3.3838, + "step": 3931 + }, + { + "epoch": 0.1655578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.0004732739446114567, + "loss": 3.3009, + "step": 3932 + }, + { + "epoch": 0.1656, + "grad_norm": 0.40625, + "learning_rate": 0.0004732587621532214, + "loss": 3.4809, + "step": 3933 + }, + { + "epoch": 0.1656421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.00047324357562747373, + "loss": 3.422, + "step": 3934 + }, + { + "epoch": 0.1656842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.0004732283850344904, + "loss": 3.6986, + "step": 3935 + }, + { + "epoch": 0.1657263157894737, + "grad_norm": 0.45703125, + "learning_rate": 0.00047321319037454826, + "loss": 3.7558, + "step": 3936 + }, + { + "epoch": 0.1657684210526316, + "grad_norm": 0.392578125, + "learning_rate": 0.000473197991647924, + "loss": 3.4695, + "step": 3937 + }, + { + "epoch": 0.16581052631578946, + "grad_norm": 0.40625, + "learning_rate": 0.00047318278885489454, + "loss": 3.7542, + "step": 3938 + }, + { + "epoch": 0.16585263157894736, + "grad_norm": 0.486328125, + "learning_rate": 0.00047316758199573695, + "loss": 3.1024, + "step": 3939 + }, + { + "epoch": 0.16589473684210526, + "grad_norm": 0.435546875, + "learning_rate": 0.0004731523710707282, + "loss": 3.3538, + "step": 3940 + }, + { + "epoch": 0.16593684210526316, + "grad_norm": 0.3984375, + "learning_rate": 0.00047313715608014544, + "loss": 3.2272, + "step": 3941 + }, + { + "epoch": 0.16597894736842106, + "grad_norm": 0.482421875, + "learning_rate": 0.00047312193702426586, + "loss": 3.2253, + "step": 3942 + }, + { + "epoch": 0.16602105263157896, + "grad_norm": 0.3984375, + "learning_rate": 0.0004731067139033668, + "loss": 3.8651, + "step": 3943 + }, + { + "epoch": 0.16606315789473683, + "grad_norm": 0.40234375, + "learning_rate": 0.0004730914867177255, + "loss": 3.5505, + "step": 3944 + }, + { + "epoch": 0.16610526315789473, + "grad_norm": 0.3984375, + "learning_rate": 0.0004730762554676195, + "loss": 3.5339, + "step": 3945 + }, + { + "epoch": 0.16614736842105263, + "grad_norm": 0.59765625, + "learning_rate": 0.0004730610201533262, + "loss": 3.5887, + "step": 3946 + }, + { + "epoch": 0.16618947368421053, + "grad_norm": 0.3984375, + "learning_rate": 0.00047304578077512317, + "loss": 3.4064, + "step": 3947 + }, + { + "epoch": 0.16623157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0004730305373332881, + "loss": 3.2213, + "step": 3948 + }, + { + "epoch": 0.16627368421052632, + "grad_norm": 0.40625, + "learning_rate": 0.0004730152898280987, + "loss": 3.8282, + "step": 3949 + }, + { + "epoch": 0.16631578947368422, + "grad_norm": 0.408203125, + "learning_rate": 0.00047300003825983275, + "loss": 3.599, + "step": 3950 + }, + { + "epoch": 0.1663578947368421, + "grad_norm": 0.470703125, + "learning_rate": 0.0004729847826287681, + "loss": 3.0654, + "step": 3951 + }, + { + "epoch": 0.1664, + "grad_norm": 0.400390625, + "learning_rate": 0.0004729695229351827, + "loss": 3.5216, + "step": 3952 + }, + { + "epoch": 0.1664421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.00047295425917935456, + "loss": 3.3799, + "step": 3953 + }, + { + "epoch": 0.1664842105263158, + "grad_norm": 0.392578125, + "learning_rate": 0.0004729389913615618, + "loss": 3.5818, + "step": 3954 + }, + { + "epoch": 0.1665263157894737, + "grad_norm": 0.45703125, + "learning_rate": 0.00047292371948208257, + "loss": 2.9763, + "step": 3955 + }, + { + "epoch": 0.1665684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.0004729084435411951, + "loss": 3.4696, + "step": 3956 + }, + { + "epoch": 0.1666105263157895, + "grad_norm": 0.482421875, + "learning_rate": 0.00047289316353917764, + "loss": 3.1098, + "step": 3957 + }, + { + "epoch": 0.16665263157894736, + "grad_norm": 0.400390625, + "learning_rate": 0.00047287787947630875, + "loss": 3.5092, + "step": 3958 + }, + { + "epoch": 0.16669473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.0004728625913528667, + "loss": 3.5431, + "step": 3959 + }, + { + "epoch": 0.16673684210526316, + "grad_norm": 0.51953125, + "learning_rate": 0.00047284729916913004, + "loss": 3.5243, + "step": 3960 + }, + { + "epoch": 0.16677894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00047283200292537746, + "loss": 3.5311, + "step": 3961 + }, + { + "epoch": 0.16682105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.00047281670262188757, + "loss": 3.3587, + "step": 3962 + }, + { + "epoch": 0.16686315789473685, + "grad_norm": 0.41796875, + "learning_rate": 0.0004728013982589392, + "loss": 3.3555, + "step": 3963 + }, + { + "epoch": 0.16690526315789472, + "grad_norm": 0.41796875, + "learning_rate": 0.00047278608983681116, + "loss": 3.6365, + "step": 3964 + }, + { + "epoch": 0.16694736842105262, + "grad_norm": 0.392578125, + "learning_rate": 0.00047277077735578235, + "loss": 3.847, + "step": 3965 + }, + { + "epoch": 0.16698947368421052, + "grad_norm": 0.482421875, + "learning_rate": 0.0004727554608161316, + "loss": 3.6163, + "step": 3966 + }, + { + "epoch": 0.16703157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0004727401402181382, + "loss": 3.5799, + "step": 3967 + }, + { + "epoch": 0.16707368421052632, + "grad_norm": 0.3984375, + "learning_rate": 0.000472724815562081, + "loss": 2.9099, + "step": 3968 + }, + { + "epoch": 0.16711578947368422, + "grad_norm": 0.412109375, + "learning_rate": 0.0004727094868482395, + "loss": 3.2216, + "step": 3969 + }, + { + "epoch": 0.16715789473684212, + "grad_norm": 0.439453125, + "learning_rate": 0.00047269415407689275, + "loss": 3.39, + "step": 3970 + }, + { + "epoch": 0.1672, + "grad_norm": 0.4296875, + "learning_rate": 0.00047267881724832015, + "loss": 3.1619, + "step": 3971 + }, + { + "epoch": 0.1672421052631579, + "grad_norm": 0.53125, + "learning_rate": 0.00047266347636280127, + "loss": 2.9004, + "step": 3972 + }, + { + "epoch": 0.16728421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.0004726481314206154, + "loss": 3.7035, + "step": 3973 + }, + { + "epoch": 0.16732631578947368, + "grad_norm": 0.4140625, + "learning_rate": 0.0004726327824220421, + "loss": 3.3023, + "step": 3974 + }, + { + "epoch": 0.16736842105263158, + "grad_norm": 0.466796875, + "learning_rate": 0.0004726174293673612, + "loss": 3.5242, + "step": 3975 + }, + { + "epoch": 0.16741052631578948, + "grad_norm": 0.486328125, + "learning_rate": 0.00047260207225685217, + "loss": 3.2183, + "step": 3976 + }, + { + "epoch": 0.16745263157894738, + "grad_norm": 0.421875, + "learning_rate": 0.00047258671109079496, + "loss": 3.1849, + "step": 3977 + }, + { + "epoch": 0.16749473684210525, + "grad_norm": 0.427734375, + "learning_rate": 0.0004725713458694695, + "loss": 3.3391, + "step": 3978 + }, + { + "epoch": 0.16753684210526315, + "grad_norm": 0.421875, + "learning_rate": 0.00047255597659315556, + "loss": 3.3541, + "step": 3979 + }, + { + "epoch": 0.16757894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.0004725406032621332, + "loss": 2.9318, + "step": 3980 + }, + { + "epoch": 0.16762105263157895, + "grad_norm": 0.490234375, + "learning_rate": 0.0004725252258766826, + "loss": 3.4536, + "step": 3981 + }, + { + "epoch": 0.16766315789473685, + "grad_norm": 0.412109375, + "learning_rate": 0.00047250984443708376, + "loss": 3.83, + "step": 3982 + }, + { + "epoch": 0.16770526315789475, + "grad_norm": 0.45703125, + "learning_rate": 0.0004724944589436171, + "loss": 3.4686, + "step": 3983 + }, + { + "epoch": 0.16774736842105265, + "grad_norm": 0.43359375, + "learning_rate": 0.0004724790693965627, + "loss": 3.4259, + "step": 3984 + }, + { + "epoch": 0.16778947368421052, + "grad_norm": 0.498046875, + "learning_rate": 0.00047246367579620107, + "loss": 3.8927, + "step": 3985 + }, + { + "epoch": 0.16783157894736841, + "grad_norm": 0.42578125, + "learning_rate": 0.00047244827814281275, + "loss": 3.5099, + "step": 3986 + }, + { + "epoch": 0.1678736842105263, + "grad_norm": 0.609375, + "learning_rate": 0.0004724328764366781, + "loss": 3.8294, + "step": 3987 + }, + { + "epoch": 0.1679157894736842, + "grad_norm": 0.470703125, + "learning_rate": 0.00047241747067807777, + "loss": 3.027, + "step": 3988 + }, + { + "epoch": 0.1679578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00047240206086729256, + "loss": 3.297, + "step": 3989 + }, + { + "epoch": 0.168, + "grad_norm": 0.421875, + "learning_rate": 0.000472386647004603, + "loss": 3.7308, + "step": 3990 + }, + { + "epoch": 0.16804210526315788, + "grad_norm": 0.421875, + "learning_rate": 0.00047237122909029, + "loss": 3.5469, + "step": 3991 + }, + { + "epoch": 0.16808421052631578, + "grad_norm": 0.45703125, + "learning_rate": 0.00047235580712463453, + "loss": 3.0115, + "step": 3992 + }, + { + "epoch": 0.16812631578947368, + "grad_norm": 0.44140625, + "learning_rate": 0.00047234038110791754, + "loss": 3.5591, + "step": 3993 + }, + { + "epoch": 0.16816842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.00047232495104042007, + "loss": 3.2605, + "step": 3994 + }, + { + "epoch": 0.16821052631578948, + "grad_norm": 0.40625, + "learning_rate": 0.0004723095169224231, + "loss": 3.4928, + "step": 3995 + }, + { + "epoch": 0.16825263157894738, + "grad_norm": 0.40625, + "learning_rate": 0.00047229407875420807, + "loss": 3.5546, + "step": 3996 + }, + { + "epoch": 0.16829473684210527, + "grad_norm": 0.404296875, + "learning_rate": 0.000472278636536056, + "loss": 3.3614, + "step": 3997 + }, + { + "epoch": 0.16833684210526315, + "grad_norm": 0.40234375, + "learning_rate": 0.00047226319026824837, + "loss": 3.9368, + "step": 3998 + }, + { + "epoch": 0.16837894736842104, + "grad_norm": 0.482421875, + "learning_rate": 0.00047224773995106664, + "loss": 2.9254, + "step": 3999 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 0.40234375, + "learning_rate": 0.0004722322855847921, + "loss": 3.2352, + "step": 4000 + }, + { + "epoch": 0.16846315789473684, + "grad_norm": 0.890625, + "learning_rate": 0.0004722168271697065, + "loss": 3.6208, + "step": 4001 + }, + { + "epoch": 0.16850526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.00047220136470609134, + "loss": 3.4051, + "step": 4002 + }, + { + "epoch": 0.16854736842105264, + "grad_norm": 0.42578125, + "learning_rate": 0.00047218589819422843, + "loss": 3.5408, + "step": 4003 + }, + { + "epoch": 0.16858947368421054, + "grad_norm": 0.443359375, + "learning_rate": 0.00047217042763439957, + "loss": 3.2939, + "step": 4004 + }, + { + "epoch": 0.1686315789473684, + "grad_norm": 0.458984375, + "learning_rate": 0.0004721549530268865, + "loss": 3.4704, + "step": 4005 + }, + { + "epoch": 0.1686736842105263, + "grad_norm": 0.60546875, + "learning_rate": 0.0004721394743719711, + "loss": 3.4414, + "step": 4006 + }, + { + "epoch": 0.1687157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.0004721239916699357, + "loss": 3.266, + "step": 4007 + }, + { + "epoch": 0.1687578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.000472108504921062, + "loss": 3.3826, + "step": 4008 + }, + { + "epoch": 0.1688, + "grad_norm": 0.419921875, + "learning_rate": 0.0004720930141256324, + "loss": 3.8842, + "step": 4009 + }, + { + "epoch": 0.1688421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00047207751928392895, + "loss": 3.4885, + "step": 4010 + }, + { + "epoch": 0.1688842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.00047206202039623405, + "loss": 3.577, + "step": 4011 + }, + { + "epoch": 0.16892631578947367, + "grad_norm": 0.474609375, + "learning_rate": 0.0004720465174628301, + "loss": 3.3415, + "step": 4012 + }, + { + "epoch": 0.16896842105263157, + "grad_norm": 0.453125, + "learning_rate": 0.00047203101048399945, + "loss": 3.3593, + "step": 4013 + }, + { + "epoch": 0.16901052631578947, + "grad_norm": 0.41015625, + "learning_rate": 0.00047201549946002466, + "loss": 3.2469, + "step": 4014 + }, + { + "epoch": 0.16905263157894737, + "grad_norm": 0.515625, + "learning_rate": 0.0004719999843911884, + "loss": 3.6397, + "step": 4015 + }, + { + "epoch": 0.16909473684210527, + "grad_norm": 0.4296875, + "learning_rate": 0.0004719844652777732, + "loss": 2.9888, + "step": 4016 + }, + { + "epoch": 0.16913684210526317, + "grad_norm": 0.51953125, + "learning_rate": 0.00047196894212006183, + "loss": 3.0445, + "step": 4017 + }, + { + "epoch": 0.16917894736842104, + "grad_norm": 0.421875, + "learning_rate": 0.0004719534149183372, + "loss": 3.3861, + "step": 4018 + }, + { + "epoch": 0.16922105263157894, + "grad_norm": 0.4609375, + "learning_rate": 0.0004719378836728821, + "loss": 3.5823, + "step": 4019 + }, + { + "epoch": 0.16926315789473684, + "grad_norm": 0.56640625, + "learning_rate": 0.00047192234838397954, + "loss": 3.2576, + "step": 4020 + }, + { + "epoch": 0.16930526315789474, + "grad_norm": 0.408203125, + "learning_rate": 0.0004719068090519125, + "loss": 3.6116, + "step": 4021 + }, + { + "epoch": 0.16934736842105264, + "grad_norm": 0.404296875, + "learning_rate": 0.00047189126567696415, + "loss": 3.3864, + "step": 4022 + }, + { + "epoch": 0.16938947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.0004718757182594177, + "loss": 3.7278, + "step": 4023 + }, + { + "epoch": 0.16943157894736843, + "grad_norm": 0.466796875, + "learning_rate": 0.00047186016679955637, + "loss": 3.1557, + "step": 4024 + }, + { + "epoch": 0.1694736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00047184461129766344, + "loss": 3.4391, + "step": 4025 + }, + { + "epoch": 0.1695157894736842, + "grad_norm": 0.404296875, + "learning_rate": 0.0004718290517540223, + "loss": 3.3408, + "step": 4026 + }, + { + "epoch": 0.1695578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0004718134881689165, + "loss": 3.5917, + "step": 4027 + }, + { + "epoch": 0.1696, + "grad_norm": 0.416015625, + "learning_rate": 0.0004717979205426296, + "loss": 3.1851, + "step": 4028 + }, + { + "epoch": 0.1696421052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.00047178234887544515, + "loss": 3.6029, + "step": 4029 + }, + { + "epoch": 0.1696842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.0004717667731676469, + "loss": 3.7685, + "step": 4030 + }, + { + "epoch": 0.1697263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00047175119341951865, + "loss": 3.5113, + "step": 4031 + }, + { + "epoch": 0.16976842105263157, + "grad_norm": 0.57421875, + "learning_rate": 0.00047173560963134414, + "loss": 3.6431, + "step": 4032 + }, + { + "epoch": 0.16981052631578947, + "grad_norm": 0.453125, + "learning_rate": 0.0004717200218034074, + "loss": 3.5522, + "step": 4033 + }, + { + "epoch": 0.16985263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00047170442993599233, + "loss": 3.8096, + "step": 4034 + }, + { + "epoch": 0.16989473684210527, + "grad_norm": 0.4765625, + "learning_rate": 0.0004716888340293831, + "loss": 3.5516, + "step": 4035 + }, + { + "epoch": 0.16993684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00047167323408386377, + "loss": 3.4304, + "step": 4036 + }, + { + "epoch": 0.16997894736842106, + "grad_norm": 0.462890625, + "learning_rate": 0.0004716576300997185, + "loss": 2.9175, + "step": 4037 + }, + { + "epoch": 0.17002105263157893, + "grad_norm": 0.44921875, + "learning_rate": 0.00047164202207723173, + "loss": 3.6158, + "step": 4038 + }, + { + "epoch": 0.17006315789473683, + "grad_norm": 0.41796875, + "learning_rate": 0.0004716264100166877, + "loss": 3.4811, + "step": 4039 + }, + { + "epoch": 0.17010526315789473, + "grad_norm": 0.455078125, + "learning_rate": 0.0004716107939183708, + "loss": 3.5036, + "step": 4040 + }, + { + "epoch": 0.17014736842105263, + "grad_norm": 0.486328125, + "learning_rate": 0.00047159517378256576, + "loss": 3.512, + "step": 4041 + }, + { + "epoch": 0.17018947368421053, + "grad_norm": 0.486328125, + "learning_rate": 0.00047157954960955694, + "loss": 2.9685, + "step": 4042 + }, + { + "epoch": 0.17023157894736843, + "grad_norm": 0.416015625, + "learning_rate": 0.0004715639213996291, + "loss": 3.396, + "step": 4043 + }, + { + "epoch": 0.17027368421052633, + "grad_norm": 0.458984375, + "learning_rate": 0.0004715482891530669, + "loss": 3.2873, + "step": 4044 + }, + { + "epoch": 0.1703157894736842, + "grad_norm": 0.60546875, + "learning_rate": 0.0004715326528701552, + "loss": 3.4103, + "step": 4045 + }, + { + "epoch": 0.1703578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00047151701255117887, + "loss": 3.4841, + "step": 4046 + }, + { + "epoch": 0.1704, + "grad_norm": 0.498046875, + "learning_rate": 0.0004715013681964229, + "loss": 3.4816, + "step": 4047 + }, + { + "epoch": 0.1704421052631579, + "grad_norm": 0.453125, + "learning_rate": 0.0004714857198061722, + "loss": 3.6729, + "step": 4048 + }, + { + "epoch": 0.1704842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.00047147006738071183, + "loss": 3.3521, + "step": 4049 + }, + { + "epoch": 0.1705263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.0004714544109203272, + "loss": 3.5727, + "step": 4050 + }, + { + "epoch": 0.1705684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.00047143875042530325, + "loss": 3.8321, + "step": 4051 + }, + { + "epoch": 0.17061052631578946, + "grad_norm": 0.41796875, + "learning_rate": 0.0004714230858959255, + "loss": 3.649, + "step": 4052 + }, + { + "epoch": 0.17065263157894736, + "grad_norm": 0.443359375, + "learning_rate": 0.00047140741733247936, + "loss": 3.1874, + "step": 4053 + }, + { + "epoch": 0.17069473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.0004713917447352501, + "loss": 3.4428, + "step": 4054 + }, + { + "epoch": 0.17073684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0004713760681045235, + "loss": 3.3829, + "step": 4055 + }, + { + "epoch": 0.17077894736842106, + "grad_norm": 0.458984375, + "learning_rate": 0.000471360387440585, + "loss": 3.4407, + "step": 4056 + }, + { + "epoch": 0.17082105263157896, + "grad_norm": 0.43359375, + "learning_rate": 0.0004713447027437203, + "loss": 3.1996, + "step": 4057 + }, + { + "epoch": 0.17086315789473686, + "grad_norm": 0.419921875, + "learning_rate": 0.00047132901401421523, + "loss": 2.883, + "step": 4058 + }, + { + "epoch": 0.17090526315789473, + "grad_norm": 0.4296875, + "learning_rate": 0.0004713133212523556, + "loss": 3.6038, + "step": 4059 + }, + { + "epoch": 0.17094736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.00047129762445842716, + "loss": 3.7883, + "step": 4060 + }, + { + "epoch": 0.17098947368421052, + "grad_norm": 0.412109375, + "learning_rate": 0.00047128192363271614, + "loss": 3.4081, + "step": 4061 + }, + { + "epoch": 0.17103157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.0004712662187755085, + "loss": 3.2596, + "step": 4062 + }, + { + "epoch": 0.17107368421052632, + "grad_norm": 0.443359375, + "learning_rate": 0.00047125050988709017, + "loss": 3.6669, + "step": 4063 + }, + { + "epoch": 0.17111578947368422, + "grad_norm": 0.7265625, + "learning_rate": 0.0004712347969677476, + "loss": 3.5728, + "step": 4064 + }, + { + "epoch": 0.1711578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.000471219080017767, + "loss": 3.3157, + "step": 4065 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4296875, + "learning_rate": 0.0004712033590374346, + "loss": 3.0154, + "step": 4066 + }, + { + "epoch": 0.1712421052631579, + "grad_norm": 0.474609375, + "learning_rate": 0.00047118763402703695, + "loss": 3.3101, + "step": 4067 + }, + { + "epoch": 0.1712842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0004711719049868605, + "loss": 3.4353, + "step": 4068 + }, + { + "epoch": 0.1713263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.0004711561719171918, + "loss": 3.1108, + "step": 4069 + }, + { + "epoch": 0.1713684210526316, + "grad_norm": 0.388671875, + "learning_rate": 0.00047114043481831747, + "loss": 3.4348, + "step": 4070 + }, + { + "epoch": 0.17141052631578949, + "grad_norm": 0.419921875, + "learning_rate": 0.0004711246936905243, + "loss": 3.1932, + "step": 4071 + }, + { + "epoch": 0.17145263157894736, + "grad_norm": 0.416015625, + "learning_rate": 0.000471108948534099, + "loss": 3.4375, + "step": 4072 + }, + { + "epoch": 0.17149473684210526, + "grad_norm": 1.265625, + "learning_rate": 0.0004710931993493285, + "loss": 3.4676, + "step": 4073 + }, + { + "epoch": 0.17153684210526315, + "grad_norm": 0.423828125, + "learning_rate": 0.00047107744613649957, + "loss": 3.8069, + "step": 4074 + }, + { + "epoch": 0.17157894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.0004710616888958994, + "loss": 3.628, + "step": 4075 + }, + { + "epoch": 0.17162105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.00047104592762781496, + "loss": 3.5614, + "step": 4076 + }, + { + "epoch": 0.17166315789473685, + "grad_norm": 0.54296875, + "learning_rate": 0.0004710301623325335, + "loss": 3.4875, + "step": 4077 + }, + { + "epoch": 0.17170526315789475, + "grad_norm": 0.40625, + "learning_rate": 0.0004710143930103421, + "loss": 2.96, + "step": 4078 + }, + { + "epoch": 0.17174736842105262, + "grad_norm": 1.2890625, + "learning_rate": 0.0004709986196615281, + "loss": 3.5211, + "step": 4079 + }, + { + "epoch": 0.17178947368421052, + "grad_norm": 0.50390625, + "learning_rate": 0.00047098284228637904, + "loss": 3.5459, + "step": 4080 + }, + { + "epoch": 0.17183157894736842, + "grad_norm": 0.474609375, + "learning_rate": 0.0004709670608851822, + "loss": 3.6898, + "step": 4081 + }, + { + "epoch": 0.17187368421052632, + "grad_norm": 0.453125, + "learning_rate": 0.00047095127545822513, + "loss": 3.6728, + "step": 4082 + }, + { + "epoch": 0.17191578947368422, + "grad_norm": 0.419921875, + "learning_rate": 0.0004709354860057954, + "loss": 3.3401, + "step": 4083 + }, + { + "epoch": 0.17195789473684212, + "grad_norm": 0.384765625, + "learning_rate": 0.0004709196925281807, + "loss": 3.3327, + "step": 4084 + }, + { + "epoch": 0.172, + "grad_norm": 0.45703125, + "learning_rate": 0.0004709038950256688, + "loss": 2.6864, + "step": 4085 + }, + { + "epoch": 0.17204210526315789, + "grad_norm": 0.453125, + "learning_rate": 0.00047088809349854754, + "loss": 3.6756, + "step": 4086 + }, + { + "epoch": 0.17208421052631578, + "grad_norm": 0.447265625, + "learning_rate": 0.0004708722879471047, + "loss": 3.7004, + "step": 4087 + }, + { + "epoch": 0.17212631578947368, + "grad_norm": 0.40625, + "learning_rate": 0.00047085647837162825, + "loss": 3.4447, + "step": 4088 + }, + { + "epoch": 0.17216842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.00047084066477240627, + "loss": 3.1243, + "step": 4089 + }, + { + "epoch": 0.17221052631578948, + "grad_norm": 0.427734375, + "learning_rate": 0.00047082484714972684, + "loss": 3.8072, + "step": 4090 + }, + { + "epoch": 0.17225263157894738, + "grad_norm": 0.4296875, + "learning_rate": 0.00047080902550387824, + "loss": 3.5478, + "step": 4091 + }, + { + "epoch": 0.17229473684210525, + "grad_norm": 0.3984375, + "learning_rate": 0.00047079319983514854, + "loss": 3.5313, + "step": 4092 + }, + { + "epoch": 0.17233684210526315, + "grad_norm": 0.466796875, + "learning_rate": 0.00047077737014382626, + "loss": 3.3409, + "step": 4093 + }, + { + "epoch": 0.17237894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.00047076153643019956, + "loss": 3.6879, + "step": 4094 + }, + { + "epoch": 0.17242105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.0004707456986945572, + "loss": 3.5188, + "step": 4095 + }, + { + "epoch": 0.17246315789473685, + "grad_norm": 0.443359375, + "learning_rate": 0.0004707298569371875, + "loss": 3.3724, + "step": 4096 + }, + { + "epoch": 0.17250526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.00047071401115837913, + "loss": 2.9252, + "step": 4097 + }, + { + "epoch": 0.17254736842105264, + "grad_norm": 0.408203125, + "learning_rate": 0.00047069816135842085, + "loss": 3.7512, + "step": 4098 + }, + { + "epoch": 0.17258947368421051, + "grad_norm": 0.41015625, + "learning_rate": 0.00047068230753760133, + "loss": 3.769, + "step": 4099 + }, + { + "epoch": 0.1726315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.00047066644969620953, + "loss": 3.0994, + "step": 4100 + }, + { + "epoch": 0.1726736842105263, + "grad_norm": 0.458984375, + "learning_rate": 0.0004706505878345342, + "loss": 3.5695, + "step": 4101 + }, + { + "epoch": 0.1727157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.0004706347219528645, + "loss": 3.497, + "step": 4102 + }, + { + "epoch": 0.1727578947368421, + "grad_norm": 0.4609375, + "learning_rate": 0.0004706188520514894, + "loss": 3.8816, + "step": 4103 + }, + { + "epoch": 0.1728, + "grad_norm": 0.396484375, + "learning_rate": 0.00047060297813069794, + "loss": 3.4126, + "step": 4104 + }, + { + "epoch": 0.1728421052631579, + "grad_norm": 0.5234375, + "learning_rate": 0.0004705871001907795, + "loss": 2.8575, + "step": 4105 + }, + { + "epoch": 0.17288421052631578, + "grad_norm": 0.43359375, + "learning_rate": 0.00047057121823202325, + "loss": 3.4735, + "step": 4106 + }, + { + "epoch": 0.17292631578947368, + "grad_norm": 0.578125, + "learning_rate": 0.0004705553322547186, + "loss": 3.662, + "step": 4107 + }, + { + "epoch": 0.17296842105263158, + "grad_norm": 2.046875, + "learning_rate": 0.0004705394422591548, + "loss": 3.2555, + "step": 4108 + }, + { + "epoch": 0.17301052631578948, + "grad_norm": 0.43359375, + "learning_rate": 0.0004705235482456217, + "loss": 2.9976, + "step": 4109 + }, + { + "epoch": 0.17305263157894737, + "grad_norm": 0.447265625, + "learning_rate": 0.0004705076502144085, + "loss": 3.5981, + "step": 4110 + }, + { + "epoch": 0.17309473684210527, + "grad_norm": 0.435546875, + "learning_rate": 0.00047049174816580504, + "loss": 3.4769, + "step": 4111 + }, + { + "epoch": 0.17313684210526314, + "grad_norm": 0.419921875, + "learning_rate": 0.00047047584210010095, + "loss": 3.7224, + "step": 4112 + }, + { + "epoch": 0.17317894736842104, + "grad_norm": 0.59375, + "learning_rate": 0.00047045993201758616, + "loss": 3.6483, + "step": 4113 + }, + { + "epoch": 0.17322105263157894, + "grad_norm": 0.408203125, + "learning_rate": 0.0004704440179185504, + "loss": 3.2328, + "step": 4114 + }, + { + "epoch": 0.17326315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.0004704280998032836, + "loss": 3.5565, + "step": 4115 + }, + { + "epoch": 0.17330526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.0004704121776720759, + "loss": 3.5066, + "step": 4116 + }, + { + "epoch": 0.17334736842105264, + "grad_norm": 0.4375, + "learning_rate": 0.0004703962515252172, + "loss": 3.5862, + "step": 4117 + }, + { + "epoch": 0.17338947368421054, + "grad_norm": 0.451171875, + "learning_rate": 0.0004703803213629978, + "loss": 3.3561, + "step": 4118 + }, + { + "epoch": 0.1734315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.00047036438718570784, + "loss": 3.28, + "step": 4119 + }, + { + "epoch": 0.1734736842105263, + "grad_norm": 0.458984375, + "learning_rate": 0.0004703484489936377, + "loss": 3.61, + "step": 4120 + }, + { + "epoch": 0.1735157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00047033250678707775, + "loss": 3.604, + "step": 4121 + }, + { + "epoch": 0.1735578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00047031656056631833, + "loss": 3.5327, + "step": 4122 + }, + { + "epoch": 0.1736, + "grad_norm": 0.408203125, + "learning_rate": 0.0004703006103316501, + "loss": 3.7875, + "step": 4123 + }, + { + "epoch": 0.1736421052631579, + "grad_norm": 0.49609375, + "learning_rate": 0.0004702846560833636, + "loss": 2.7759, + "step": 4124 + }, + { + "epoch": 0.1736842105263158, + "grad_norm": 0.470703125, + "learning_rate": 0.00047026869782174947, + "loss": 3.1796, + "step": 4125 + }, + { + "epoch": 0.17372631578947367, + "grad_norm": 0.4140625, + "learning_rate": 0.0004702527355470985, + "loss": 3.5315, + "step": 4126 + }, + { + "epoch": 0.17376842105263157, + "grad_norm": 0.435546875, + "learning_rate": 0.00047023676925970136, + "loss": 3.8077, + "step": 4127 + }, + { + "epoch": 0.17381052631578947, + "grad_norm": 0.412109375, + "learning_rate": 0.00047022079895984916, + "loss": 3.5666, + "step": 4128 + }, + { + "epoch": 0.17385263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.00047020482464783277, + "loss": 3.3682, + "step": 4129 + }, + { + "epoch": 0.17389473684210527, + "grad_norm": 0.423828125, + "learning_rate": 0.0004701888463239431, + "loss": 3.7405, + "step": 4130 + }, + { + "epoch": 0.17393684210526317, + "grad_norm": 0.5078125, + "learning_rate": 0.0004701728639884714, + "loss": 3.5147, + "step": 4131 + }, + { + "epoch": 0.17397894736842107, + "grad_norm": 0.431640625, + "learning_rate": 0.0004701568776417089, + "loss": 3.1774, + "step": 4132 + }, + { + "epoch": 0.17402105263157894, + "grad_norm": 0.53125, + "learning_rate": 0.00047014088728394664, + "loss": 3.3915, + "step": 4133 + }, + { + "epoch": 0.17406315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00047012489291547614, + "loss": 3.7362, + "step": 4134 + }, + { + "epoch": 0.17410526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.0004701088945365887, + "loss": 3.1461, + "step": 4135 + }, + { + "epoch": 0.17414736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.00047009289214757584, + "loss": 3.4335, + "step": 4136 + }, + { + "epoch": 0.17418947368421053, + "grad_norm": 0.42578125, + "learning_rate": 0.00047007688574872907, + "loss": 3.2442, + "step": 4137 + }, + { + "epoch": 0.17423157894736843, + "grad_norm": 0.44140625, + "learning_rate": 0.00047006087534034004, + "loss": 3.362, + "step": 4138 + }, + { + "epoch": 0.1742736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.0004700448609227004, + "loss": 3.0886, + "step": 4139 + }, + { + "epoch": 0.1743157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0004700288424961019, + "loss": 3.3822, + "step": 4140 + }, + { + "epoch": 0.1743578947368421, + "grad_norm": 0.515625, + "learning_rate": 0.00047001282006083647, + "loss": 2.7435, + "step": 4141 + }, + { + "epoch": 0.1744, + "grad_norm": 0.388671875, + "learning_rate": 0.0004699967936171959, + "loss": 3.337, + "step": 4142 + }, + { + "epoch": 0.1744421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00046998076316547233, + "loss": 3.075, + "step": 4143 + }, + { + "epoch": 0.1744842105263158, + "grad_norm": 0.466796875, + "learning_rate": 0.0004699647287059577, + "loss": 3.0559, + "step": 4144 + }, + { + "epoch": 0.1745263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.0004699486902389441, + "loss": 3.8189, + "step": 4145 + }, + { + "epoch": 0.17456842105263157, + "grad_norm": 0.44140625, + "learning_rate": 0.0004699326477647239, + "loss": 3.336, + "step": 4146 + }, + { + "epoch": 0.17461052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.0004699166012835891, + "loss": 3.6742, + "step": 4147 + }, + { + "epoch": 0.17465263157894736, + "grad_norm": 0.447265625, + "learning_rate": 0.0004699005507958324, + "loss": 3.2918, + "step": 4148 + }, + { + "epoch": 0.17469473684210526, + "grad_norm": 0.439453125, + "learning_rate": 0.0004698844963017459, + "loss": 3.2911, + "step": 4149 + }, + { + "epoch": 0.17473684210526316, + "grad_norm": 0.37890625, + "learning_rate": 0.00046986843780162223, + "loss": 2.9933, + "step": 4150 + }, + { + "epoch": 0.17477894736842106, + "grad_norm": 0.39453125, + "learning_rate": 0.0004698523752957541, + "loss": 3.4779, + "step": 4151 + }, + { + "epoch": 0.17482105263157896, + "grad_norm": 0.4375, + "learning_rate": 0.00046983630878443385, + "loss": 2.9505, + "step": 4152 + }, + { + "epoch": 0.17486315789473683, + "grad_norm": 0.38671875, + "learning_rate": 0.00046982023826795444, + "loss": 3.1004, + "step": 4153 + }, + { + "epoch": 0.17490526315789473, + "grad_norm": 0.400390625, + "learning_rate": 0.0004698041637466085, + "loss": 3.6035, + "step": 4154 + }, + { + "epoch": 0.17494736842105263, + "grad_norm": 0.47265625, + "learning_rate": 0.0004697880852206891, + "loss": 3.3295, + "step": 4155 + }, + { + "epoch": 0.17498947368421053, + "grad_norm": 0.400390625, + "learning_rate": 0.00046977200269048886, + "loss": 3.3053, + "step": 4156 + }, + { + "epoch": 0.17503157894736843, + "grad_norm": 0.439453125, + "learning_rate": 0.00046975591615630103, + "loss": 3.6766, + "step": 4157 + }, + { + "epoch": 0.17507368421052633, + "grad_norm": 0.412109375, + "learning_rate": 0.0004697398256184186, + "loss": 3.2469, + "step": 4158 + }, + { + "epoch": 0.1751157894736842, + "grad_norm": 0.384765625, + "learning_rate": 0.0004697237310771347, + "loss": 3.5089, + "step": 4159 + }, + { + "epoch": 0.1751578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00046970763253274264, + "loss": 3.1863, + "step": 4160 + }, + { + "epoch": 0.1752, + "grad_norm": 0.451171875, + "learning_rate": 0.00046969152998553565, + "loss": 3.7891, + "step": 4161 + }, + { + "epoch": 0.1752421052631579, + "grad_norm": 0.474609375, + "learning_rate": 0.00046967542343580705, + "loss": 3.0643, + "step": 4162 + }, + { + "epoch": 0.1752842105263158, + "grad_norm": 0.39453125, + "learning_rate": 0.00046965931288385043, + "loss": 3.2654, + "step": 4163 + }, + { + "epoch": 0.1753263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.0004696431983299592, + "loss": 3.5448, + "step": 4164 + }, + { + "epoch": 0.1753684210526316, + "grad_norm": 0.396484375, + "learning_rate": 0.00046962707977442694, + "loss": 3.5483, + "step": 4165 + }, + { + "epoch": 0.17541052631578946, + "grad_norm": 0.400390625, + "learning_rate": 0.00046961095721754733, + "loss": 2.9364, + "step": 4166 + }, + { + "epoch": 0.17545263157894736, + "grad_norm": 0.4375, + "learning_rate": 0.0004695948306596141, + "loss": 2.904, + "step": 4167 + }, + { + "epoch": 0.17549473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.0004695787001009211, + "loss": 3.6715, + "step": 4168 + }, + { + "epoch": 0.17553684210526316, + "grad_norm": 0.4453125, + "learning_rate": 0.0004695625655417622, + "loss": 3.2459, + "step": 4169 + }, + { + "epoch": 0.17557894736842106, + "grad_norm": 0.396484375, + "learning_rate": 0.00046954642698243134, + "loss": 3.3887, + "step": 4170 + }, + { + "epoch": 0.17562105263157896, + "grad_norm": 0.4140625, + "learning_rate": 0.00046953028442322245, + "loss": 3.2032, + "step": 4171 + }, + { + "epoch": 0.17566315789473685, + "grad_norm": 0.4140625, + "learning_rate": 0.00046951413786442976, + "loss": 3.6078, + "step": 4172 + }, + { + "epoch": 0.17570526315789473, + "grad_norm": 0.39453125, + "learning_rate": 0.0004694979873063474, + "loss": 3.439, + "step": 4173 + }, + { + "epoch": 0.17574736842105262, + "grad_norm": 0.41796875, + "learning_rate": 0.0004694818327492696, + "loss": 3.3076, + "step": 4174 + }, + { + "epoch": 0.17578947368421052, + "grad_norm": 0.3984375, + "learning_rate": 0.0004694656741934907, + "loss": 3.4701, + "step": 4175 + }, + { + "epoch": 0.17583157894736842, + "grad_norm": 0.40234375, + "learning_rate": 0.00046944951163930507, + "loss": 3.715, + "step": 4176 + }, + { + "epoch": 0.17587368421052632, + "grad_norm": 0.396484375, + "learning_rate": 0.0004694333450870072, + "loss": 3.3777, + "step": 4177 + }, + { + "epoch": 0.17591578947368422, + "grad_norm": 0.427734375, + "learning_rate": 0.0004694171745368916, + "loss": 3.5243, + "step": 4178 + }, + { + "epoch": 0.17595789473684212, + "grad_norm": 0.455078125, + "learning_rate": 0.0004694009999892529, + "loss": 3.5761, + "step": 4179 + }, + { + "epoch": 0.176, + "grad_norm": 0.443359375, + "learning_rate": 0.00046938482144438576, + "loss": 3.5781, + "step": 4180 + }, + { + "epoch": 0.1760421052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.00046936863890258487, + "loss": 3.4956, + "step": 4181 + }, + { + "epoch": 0.1760842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.0004693524523641452, + "loss": 3.5015, + "step": 4182 + }, + { + "epoch": 0.1761263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00046933626182936157, + "loss": 3.8282, + "step": 4183 + }, + { + "epoch": 0.17616842105263159, + "grad_norm": 0.38671875, + "learning_rate": 0.00046932006729852896, + "loss": 3.7916, + "step": 4184 + }, + { + "epoch": 0.17621052631578948, + "grad_norm": 0.416015625, + "learning_rate": 0.0004693038687719424, + "loss": 3.513, + "step": 4185 + }, + { + "epoch": 0.17625263157894736, + "grad_norm": 0.453125, + "learning_rate": 0.000469287666249897, + "loss": 3.2565, + "step": 4186 + }, + { + "epoch": 0.17629473684210525, + "grad_norm": 0.42578125, + "learning_rate": 0.000469271459732688, + "loss": 3.0477, + "step": 4187 + }, + { + "epoch": 0.17633684210526315, + "grad_norm": 0.4453125, + "learning_rate": 0.00046925524922061064, + "loss": 3.2048, + "step": 4188 + }, + { + "epoch": 0.17637894736842105, + "grad_norm": 0.4453125, + "learning_rate": 0.00046923903471396026, + "loss": 3.5178, + "step": 4189 + }, + { + "epoch": 0.17642105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.0004692228162130322, + "loss": 3.6415, + "step": 4190 + }, + { + "epoch": 0.17646315789473685, + "grad_norm": 0.404296875, + "learning_rate": 0.0004692065937181221, + "loss": 3.3499, + "step": 4191 + }, + { + "epoch": 0.17650526315789475, + "grad_norm": 0.408203125, + "learning_rate": 0.00046919036722952535, + "loss": 3.6412, + "step": 4192 + }, + { + "epoch": 0.17654736842105262, + "grad_norm": 0.412109375, + "learning_rate": 0.00046917413674753765, + "loss": 3.2624, + "step": 4193 + }, + { + "epoch": 0.17658947368421052, + "grad_norm": 0.384765625, + "learning_rate": 0.00046915790227245477, + "loss": 3.6382, + "step": 4194 + }, + { + "epoch": 0.17663157894736842, + "grad_norm": 0.38671875, + "learning_rate": 0.00046914166380457236, + "loss": 3.1782, + "step": 4195 + }, + { + "epoch": 0.17667368421052632, + "grad_norm": 0.40234375, + "learning_rate": 0.0004691254213441863, + "loss": 3.3404, + "step": 4196 + }, + { + "epoch": 0.17671578947368421, + "grad_norm": 0.384765625, + "learning_rate": 0.0004691091748915925, + "loss": 3.2054, + "step": 4197 + }, + { + "epoch": 0.1767578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.0004690929244470871, + "loss": 3.6556, + "step": 4198 + }, + { + "epoch": 0.1768, + "grad_norm": 0.431640625, + "learning_rate": 0.0004690766700109659, + "loss": 3.4804, + "step": 4199 + }, + { + "epoch": 0.17684210526315788, + "grad_norm": 0.45703125, + "learning_rate": 0.0004690604115835252, + "loss": 3.5481, + "step": 4200 + }, + { + "epoch": 0.17688421052631578, + "grad_norm": 0.53515625, + "learning_rate": 0.0004690441491650613, + "loss": 3.4507, + "step": 4201 + }, + { + "epoch": 0.17692631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.00046902788275587025, + "loss": 3.4143, + "step": 4202 + }, + { + "epoch": 0.17696842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0004690116123562486, + "loss": 3.4218, + "step": 4203 + }, + { + "epoch": 0.17701052631578948, + "grad_norm": 0.48046875, + "learning_rate": 0.0004689953379664926, + "loss": 3.5022, + "step": 4204 + }, + { + "epoch": 0.17705263157894738, + "grad_norm": 0.408203125, + "learning_rate": 0.0004689790595868989, + "loss": 3.6886, + "step": 4205 + }, + { + "epoch": 0.17709473684210528, + "grad_norm": 0.412109375, + "learning_rate": 0.00046896277721776407, + "loss": 3.2538, + "step": 4206 + }, + { + "epoch": 0.17713684210526315, + "grad_norm": 0.439453125, + "learning_rate": 0.0004689464908593847, + "loss": 3.1598, + "step": 4207 + }, + { + "epoch": 0.17717894736842105, + "grad_norm": 0.51953125, + "learning_rate": 0.0004689302005120575, + "loss": 3.6936, + "step": 4208 + }, + { + "epoch": 0.17722105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00046891390617607927, + "loss": 3.2474, + "step": 4209 + }, + { + "epoch": 0.17726315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.0004688976078517468, + "loss": 3.3635, + "step": 4210 + }, + { + "epoch": 0.17730526315789474, + "grad_norm": 0.458984375, + "learning_rate": 0.00046888130553935726, + "loss": 3.2507, + "step": 4211 + }, + { + "epoch": 0.17734736842105264, + "grad_norm": 0.390625, + "learning_rate": 0.0004688649992392074, + "loss": 3.348, + "step": 4212 + }, + { + "epoch": 0.1773894736842105, + "grad_norm": 0.400390625, + "learning_rate": 0.0004688486889515945, + "loss": 3.3024, + "step": 4213 + }, + { + "epoch": 0.1774315789473684, + "grad_norm": 0.4453125, + "learning_rate": 0.0004688323746768156, + "loss": 3.0525, + "step": 4214 + }, + { + "epoch": 0.1774736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.0004688160564151679, + "loss": 3.294, + "step": 4215 + }, + { + "epoch": 0.1775157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00046879973416694875, + "loss": 3.0432, + "step": 4216 + }, + { + "epoch": 0.1775578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0004687834079324556, + "loss": 3.237, + "step": 4217 + }, + { + "epoch": 0.1776, + "grad_norm": 0.416015625, + "learning_rate": 0.0004687670777119858, + "loss": 3.3216, + "step": 4218 + }, + { + "epoch": 0.1776421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.0004687507435058368, + "loss": 3.2273, + "step": 4219 + }, + { + "epoch": 0.17768421052631578, + "grad_norm": 0.4140625, + "learning_rate": 0.0004687344053143063, + "loss": 3.4829, + "step": 4220 + }, + { + "epoch": 0.17772631578947368, + "grad_norm": 0.416015625, + "learning_rate": 0.00046871806313769204, + "loss": 3.2, + "step": 4221 + }, + { + "epoch": 0.17776842105263158, + "grad_norm": 0.482421875, + "learning_rate": 0.00046870171697629157, + "loss": 3.3576, + "step": 4222 + }, + { + "epoch": 0.17781052631578947, + "grad_norm": 0.400390625, + "learning_rate": 0.00046868536683040274, + "loss": 3.3935, + "step": 4223 + }, + { + "epoch": 0.17785263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.0004686690127003236, + "loss": 3.3322, + "step": 4224 + }, + { + "epoch": 0.17789473684210527, + "grad_norm": 0.4296875, + "learning_rate": 0.0004686526545863519, + "loss": 3.6801, + "step": 4225 + }, + { + "epoch": 0.17793684210526317, + "grad_norm": 0.462890625, + "learning_rate": 0.0004686362924887857, + "loss": 3.4058, + "step": 4226 + }, + { + "epoch": 0.17797894736842104, + "grad_norm": 0.443359375, + "learning_rate": 0.0004686199264079232, + "loss": 3.0714, + "step": 4227 + }, + { + "epoch": 0.17802105263157894, + "grad_norm": 0.494140625, + "learning_rate": 0.0004686035563440625, + "loss": 3.3951, + "step": 4228 + }, + { + "epoch": 0.17806315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.00046858718229750177, + "loss": 3.2651, + "step": 4229 + }, + { + "epoch": 0.17810526315789474, + "grad_norm": 0.53515625, + "learning_rate": 0.0004685708042685395, + "loss": 3.783, + "step": 4230 + }, + { + "epoch": 0.17814736842105264, + "grad_norm": 0.484375, + "learning_rate": 0.00046855442225747393, + "loss": 3.4605, + "step": 4231 + }, + { + "epoch": 0.17818947368421054, + "grad_norm": 0.4140625, + "learning_rate": 0.00046853803626460363, + "loss": 3.2673, + "step": 4232 + }, + { + "epoch": 0.1782315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.000468521646290227, + "loss": 3.2684, + "step": 4233 + }, + { + "epoch": 0.1782736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0004685052523346428, + "loss": 3.5541, + "step": 4234 + }, + { + "epoch": 0.1783157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00046848885439814965, + "loss": 3.7024, + "step": 4235 + }, + { + "epoch": 0.1783578947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.0004684724524810462, + "loss": 3.5113, + "step": 4236 + }, + { + "epoch": 0.1784, + "grad_norm": 0.4921875, + "learning_rate": 0.0004684560465836315, + "loss": 3.4779, + "step": 4237 + }, + { + "epoch": 0.1784421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.0004684396367062042, + "loss": 3.1146, + "step": 4238 + }, + { + "epoch": 0.1784842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.00046842322284906335, + "loss": 3.4847, + "step": 4239 + }, + { + "epoch": 0.17852631578947367, + "grad_norm": 0.4375, + "learning_rate": 0.0004684068050125081, + "loss": 2.9235, + "step": 4240 + }, + { + "epoch": 0.17856842105263157, + "grad_norm": 0.458984375, + "learning_rate": 0.00046839038319683744, + "loss": 3.4952, + "step": 4241 + }, + { + "epoch": 0.17861052631578947, + "grad_norm": 0.400390625, + "learning_rate": 0.00046837395740235055, + "loss": 3.2787, + "step": 4242 + }, + { + "epoch": 0.17865263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.0004683575276293468, + "loss": 3.611, + "step": 4243 + }, + { + "epoch": 0.17869473684210527, + "grad_norm": 0.416015625, + "learning_rate": 0.00046834109387812554, + "loss": 3.792, + "step": 4244 + }, + { + "epoch": 0.17873684210526317, + "grad_norm": 0.427734375, + "learning_rate": 0.0004683246561489859, + "loss": 3.3878, + "step": 4245 + }, + { + "epoch": 0.17877894736842107, + "grad_norm": 0.390625, + "learning_rate": 0.00046830821444222766, + "loss": 3.4518, + "step": 4246 + }, + { + "epoch": 0.17882105263157894, + "grad_norm": 0.384765625, + "learning_rate": 0.00046829176875815026, + "loss": 3.124, + "step": 4247 + }, + { + "epoch": 0.17886315789473684, + "grad_norm": 0.462890625, + "learning_rate": 0.0004682753190970533, + "loss": 3.5221, + "step": 4248 + }, + { + "epoch": 0.17890526315789473, + "grad_norm": 0.51953125, + "learning_rate": 0.00046825886545923644, + "loss": 3.2836, + "step": 4249 + }, + { + "epoch": 0.17894736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.0004682424078449995, + "loss": 3.4374, + "step": 4250 + }, + { + "epoch": 0.17898947368421053, + "grad_norm": 0.462890625, + "learning_rate": 0.00046822594625464236, + "loss": 3.6944, + "step": 4251 + }, + { + "epoch": 0.17903157894736843, + "grad_norm": 0.458984375, + "learning_rate": 0.0004682094806884649, + "loss": 3.4054, + "step": 4252 + }, + { + "epoch": 0.17907368421052633, + "grad_norm": 0.43359375, + "learning_rate": 0.00046819301114676704, + "loss": 3.46, + "step": 4253 + }, + { + "epoch": 0.1791157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.0004681765376298489, + "loss": 3.4442, + "step": 4254 + }, + { + "epoch": 0.1791578947368421, + "grad_norm": 0.48046875, + "learning_rate": 0.0004681600601380106, + "loss": 3.7417, + "step": 4255 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4453125, + "learning_rate": 0.0004681435786715523, + "loss": 3.4556, + "step": 4256 + }, + { + "epoch": 0.1792421052631579, + "grad_norm": 0.3984375, + "learning_rate": 0.0004681270932307743, + "loss": 3.3891, + "step": 4257 + }, + { + "epoch": 0.1792842105263158, + "grad_norm": 0.482421875, + "learning_rate": 0.000468110603815977, + "loss": 3.3337, + "step": 4258 + }, + { + "epoch": 0.1793263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.00046809411042746075, + "loss": 3.515, + "step": 4259 + }, + { + "epoch": 0.17936842105263157, + "grad_norm": 0.421875, + "learning_rate": 0.0004680776130655261, + "loss": 3.4017, + "step": 4260 + }, + { + "epoch": 0.17941052631578946, + "grad_norm": 0.515625, + "learning_rate": 0.0004680611117304735, + "loss": 3.9449, + "step": 4261 + }, + { + "epoch": 0.17945263157894736, + "grad_norm": 0.400390625, + "learning_rate": 0.0004680446064226037, + "loss": 3.157, + "step": 4262 + }, + { + "epoch": 0.17949473684210526, + "grad_norm": 0.41015625, + "learning_rate": 0.0004680280971422173, + "loss": 3.3798, + "step": 4263 + }, + { + "epoch": 0.17953684210526316, + "grad_norm": 0.48828125, + "learning_rate": 0.00046801158388961515, + "loss": 3.5937, + "step": 4264 + }, + { + "epoch": 0.17957894736842106, + "grad_norm": 0.39453125, + "learning_rate": 0.00046799506666509816, + "loss": 3.2309, + "step": 4265 + }, + { + "epoch": 0.17962105263157896, + "grad_norm": 0.392578125, + "learning_rate": 0.00046797854546896717, + "loss": 3.2346, + "step": 4266 + }, + { + "epoch": 0.17966315789473683, + "grad_norm": 0.412109375, + "learning_rate": 0.0004679620203015232, + "loss": 3.336, + "step": 4267 + }, + { + "epoch": 0.17970526315789473, + "grad_norm": 0.416015625, + "learning_rate": 0.0004679454911630673, + "loss": 3.4914, + "step": 4268 + }, + { + "epoch": 0.17974736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.00046792895805390064, + "loss": 3.5245, + "step": 4269 + }, + { + "epoch": 0.17978947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.0004679124209743244, + "loss": 3.7937, + "step": 4270 + }, + { + "epoch": 0.17983157894736843, + "grad_norm": 0.482421875, + "learning_rate": 0.0004678958799246399, + "loss": 3.2519, + "step": 4271 + }, + { + "epoch": 0.17987368421052632, + "grad_norm": 0.41015625, + "learning_rate": 0.00046787933490514846, + "loss": 3.4968, + "step": 4272 + }, + { + "epoch": 0.17991578947368422, + "grad_norm": 0.4375, + "learning_rate": 0.0004678627859161515, + "loss": 3.4933, + "step": 4273 + }, + { + "epoch": 0.1799578947368421, + "grad_norm": 0.39453125, + "learning_rate": 0.0004678462329579506, + "loss": 3.4709, + "step": 4274 + }, + { + "epoch": 0.18, + "grad_norm": 0.423828125, + "learning_rate": 0.00046782967603084736, + "loss": 3.194, + "step": 4275 + }, + { + "epoch": 0.1800421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.0004678131151351433, + "loss": 3.3363, + "step": 4276 + }, + { + "epoch": 0.1800842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0004677965502711402, + "loss": 3.0924, + "step": 4277 + }, + { + "epoch": 0.1801263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00046777998143913985, + "loss": 3.1704, + "step": 4278 + }, + { + "epoch": 0.1801684210526316, + "grad_norm": 0.4765625, + "learning_rate": 0.0004677634086394441, + "loss": 3.2711, + "step": 4279 + }, + { + "epoch": 0.18021052631578946, + "grad_norm": 0.40625, + "learning_rate": 0.000467746831872355, + "loss": 3.5172, + "step": 4280 + }, + { + "epoch": 0.18025263157894736, + "grad_norm": 0.42578125, + "learning_rate": 0.0004677302511381744, + "loss": 3.2466, + "step": 4281 + }, + { + "epoch": 0.18029473684210526, + "grad_norm": 0.451171875, + "learning_rate": 0.0004677136664372045, + "loss": 2.9772, + "step": 4282 + }, + { + "epoch": 0.18033684210526316, + "grad_norm": 0.59375, + "learning_rate": 0.0004676970777697473, + "loss": 3.4742, + "step": 4283 + }, + { + "epoch": 0.18037894736842106, + "grad_norm": 0.41796875, + "learning_rate": 0.00046768048513610517, + "loss": 3.5518, + "step": 4284 + }, + { + "epoch": 0.18042105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00046766388853658036, + "loss": 3.5894, + "step": 4285 + }, + { + "epoch": 0.18046315789473685, + "grad_norm": 0.408203125, + "learning_rate": 0.00046764728797147524, + "loss": 3.1369, + "step": 4286 + }, + { + "epoch": 0.18050526315789472, + "grad_norm": 0.412109375, + "learning_rate": 0.00046763068344109226, + "loss": 3.4405, + "step": 4287 + }, + { + "epoch": 0.18054736842105262, + "grad_norm": 0.427734375, + "learning_rate": 0.00046761407494573407, + "loss": 3.1766, + "step": 4288 + }, + { + "epoch": 0.18058947368421052, + "grad_norm": 0.412109375, + "learning_rate": 0.0004675974624857029, + "loss": 3.6131, + "step": 4289 + }, + { + "epoch": 0.18063157894736842, + "grad_norm": 0.40234375, + "learning_rate": 0.0004675808460613018, + "loss": 3.4915, + "step": 4290 + }, + { + "epoch": 0.18067368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.0004675642256728332, + "loss": 3.9375, + "step": 4291 + }, + { + "epoch": 0.18071578947368422, + "grad_norm": 0.404296875, + "learning_rate": 0.00046754760132060015, + "loss": 3.8096, + "step": 4292 + }, + { + "epoch": 0.18075789473684212, + "grad_norm": 0.4765625, + "learning_rate": 0.00046753097300490534, + "loss": 3.6347, + "step": 4293 + }, + { + "epoch": 0.1808, + "grad_norm": 0.51953125, + "learning_rate": 0.0004675143407260518, + "loss": 3.318, + "step": 4294 + }, + { + "epoch": 0.1808421052631579, + "grad_norm": 0.453125, + "learning_rate": 0.0004674977044843426, + "loss": 2.9944, + "step": 4295 + }, + { + "epoch": 0.1808842105263158, + "grad_norm": 0.474609375, + "learning_rate": 0.0004674810642800806, + "loss": 3.7377, + "step": 4296 + }, + { + "epoch": 0.18092631578947369, + "grad_norm": 0.54296875, + "learning_rate": 0.0004674644201135694, + "loss": 3.5213, + "step": 4297 + }, + { + "epoch": 0.18096842105263158, + "grad_norm": 0.52734375, + "learning_rate": 0.0004674477719851118, + "loss": 3.2998, + "step": 4298 + }, + { + "epoch": 0.18101052631578948, + "grad_norm": 0.474609375, + "learning_rate": 0.0004674311198950113, + "loss": 3.3592, + "step": 4299 + }, + { + "epoch": 0.18105263157894738, + "grad_norm": 0.5078125, + "learning_rate": 0.00046741446384357133, + "loss": 3.5434, + "step": 4300 + }, + { + "epoch": 0.18109473684210525, + "grad_norm": 0.43359375, + "learning_rate": 0.0004673978038310952, + "loss": 3.2923, + "step": 4301 + }, + { + "epoch": 0.18113684210526315, + "grad_norm": 0.46875, + "learning_rate": 0.00046738113985788656, + "loss": 3.0153, + "step": 4302 + }, + { + "epoch": 0.18117894736842105, + "grad_norm": 0.455078125, + "learning_rate": 0.000467364471924249, + "loss": 3.2006, + "step": 4303 + }, + { + "epoch": 0.18122105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.00046734780003048604, + "loss": 3.7719, + "step": 4304 + }, + { + "epoch": 0.18126315789473685, + "grad_norm": 0.52734375, + "learning_rate": 0.00046733112417690165, + "loss": 3.3493, + "step": 4305 + }, + { + "epoch": 0.18130526315789475, + "grad_norm": 0.404296875, + "learning_rate": 0.0004673144443637995, + "loss": 2.6939, + "step": 4306 + }, + { + "epoch": 0.18134736842105262, + "grad_norm": 0.412109375, + "learning_rate": 0.0004672977605914835, + "loss": 3.4434, + "step": 4307 + }, + { + "epoch": 0.18138947368421052, + "grad_norm": 0.416015625, + "learning_rate": 0.0004672810728602576, + "loss": 3.5016, + "step": 4308 + }, + { + "epoch": 0.18143157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00046726438117042585, + "loss": 3.7426, + "step": 4309 + }, + { + "epoch": 0.18147368421052631, + "grad_norm": 0.40625, + "learning_rate": 0.00046724768552229237, + "loss": 3.2753, + "step": 4310 + }, + { + "epoch": 0.1815157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00046723098591616133, + "loss": 3.2095, + "step": 4311 + }, + { + "epoch": 0.1815578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00046721428235233696, + "loss": 2.8532, + "step": 4312 + }, + { + "epoch": 0.1816, + "grad_norm": 0.41796875, + "learning_rate": 0.0004671975748311236, + "loss": 3.577, + "step": 4313 + }, + { + "epoch": 0.18164210526315788, + "grad_norm": 0.4140625, + "learning_rate": 0.0004671808633528255, + "loss": 3.7791, + "step": 4314 + }, + { + "epoch": 0.18168421052631578, + "grad_norm": 0.404296875, + "learning_rate": 0.0004671641479177474, + "loss": 3.4846, + "step": 4315 + }, + { + "epoch": 0.18172631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.0004671474285261936, + "loss": 3.9367, + "step": 4316 + }, + { + "epoch": 0.18176842105263158, + "grad_norm": 0.40625, + "learning_rate": 0.0004671307051784688, + "loss": 3.732, + "step": 4317 + }, + { + "epoch": 0.18181052631578948, + "grad_norm": 0.412109375, + "learning_rate": 0.00046711397787487766, + "loss": 3.7979, + "step": 4318 + }, + { + "epoch": 0.18185263157894738, + "grad_norm": 0.439453125, + "learning_rate": 0.00046709724661572494, + "loss": 3.3694, + "step": 4319 + }, + { + "epoch": 0.18189473684210528, + "grad_norm": 0.416015625, + "learning_rate": 0.0004670805114013155, + "loss": 3.1885, + "step": 4320 + }, + { + "epoch": 0.18193684210526315, + "grad_norm": 0.427734375, + "learning_rate": 0.0004670637722319542, + "loss": 3.1586, + "step": 4321 + }, + { + "epoch": 0.18197894736842105, + "grad_norm": 0.404296875, + "learning_rate": 0.000467047029107946, + "loss": 3.4971, + "step": 4322 + }, + { + "epoch": 0.18202105263157894, + "grad_norm": 0.439453125, + "learning_rate": 0.0004670302820295959, + "loss": 3.3221, + "step": 4323 + }, + { + "epoch": 0.18206315789473684, + "grad_norm": 0.478515625, + "learning_rate": 0.0004670135309972091, + "loss": 3.2375, + "step": 4324 + }, + { + "epoch": 0.18210526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.0004669967760110908, + "loss": 3.1671, + "step": 4325 + }, + { + "epoch": 0.18214736842105264, + "grad_norm": 0.4140625, + "learning_rate": 0.00046698001707154614, + "loss": 3.7695, + "step": 4326 + }, + { + "epoch": 0.18218947368421054, + "grad_norm": 0.4296875, + "learning_rate": 0.0004669632541788805, + "loss": 3.3264, + "step": 4327 + }, + { + "epoch": 0.1822315789473684, + "grad_norm": 0.419921875, + "learning_rate": 0.0004669464873333993, + "loss": 3.2035, + "step": 4328 + }, + { + "epoch": 0.1822736842105263, + "grad_norm": 0.412109375, + "learning_rate": 0.00046692971653540804, + "loss": 3.2817, + "step": 4329 + }, + { + "epoch": 0.1823157894736842, + "grad_norm": 0.494140625, + "learning_rate": 0.00046691294178521214, + "loss": 3.4109, + "step": 4330 + }, + { + "epoch": 0.1823578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00046689616308311744, + "loss": 3.2351, + "step": 4331 + }, + { + "epoch": 0.1824, + "grad_norm": 0.416015625, + "learning_rate": 0.0004668793804294294, + "loss": 3.4091, + "step": 4332 + }, + { + "epoch": 0.1824421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.0004668625938244538, + "loss": 3.3767, + "step": 4333 + }, + { + "epoch": 0.18248421052631578, + "grad_norm": 0.40625, + "learning_rate": 0.00046684580326849666, + "loss": 3.4928, + "step": 4334 + }, + { + "epoch": 0.18252631578947368, + "grad_norm": 0.70703125, + "learning_rate": 0.0004668290087618638, + "loss": 3.0895, + "step": 4335 + }, + { + "epoch": 0.18256842105263157, + "grad_norm": 0.40625, + "learning_rate": 0.0004668122103048611, + "loss": 3.3407, + "step": 4336 + }, + { + "epoch": 0.18261052631578947, + "grad_norm": 0.48828125, + "learning_rate": 0.00046679540789779463, + "loss": 3.6747, + "step": 4337 + }, + { + "epoch": 0.18265263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.0004667786015409706, + "loss": 3.9243, + "step": 4338 + }, + { + "epoch": 0.18269473684210527, + "grad_norm": 0.4296875, + "learning_rate": 0.0004667617912346952, + "loss": 3.0803, + "step": 4339 + }, + { + "epoch": 0.18273684210526317, + "grad_norm": 0.4375, + "learning_rate": 0.00046674497697927456, + "loss": 3.0517, + "step": 4340 + }, + { + "epoch": 0.18277894736842104, + "grad_norm": 0.4140625, + "learning_rate": 0.0004667281587750152, + "loss": 3.5114, + "step": 4341 + }, + { + "epoch": 0.18282105263157894, + "grad_norm": 0.41015625, + "learning_rate": 0.00046671133662222334, + "loss": 3.7038, + "step": 4342 + }, + { + "epoch": 0.18286315789473684, + "grad_norm": 0.384765625, + "learning_rate": 0.0004666945105212056, + "loss": 3.0405, + "step": 4343 + }, + { + "epoch": 0.18290526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.00046667768047226846, + "loss": 3.3121, + "step": 4344 + }, + { + "epoch": 0.18294736842105264, + "grad_norm": 0.396484375, + "learning_rate": 0.0004666608464757186, + "loss": 3.5014, + "step": 4345 + }, + { + "epoch": 0.18298947368421054, + "grad_norm": 0.396484375, + "learning_rate": 0.0004666440085318626, + "loss": 3.4629, + "step": 4346 + }, + { + "epoch": 0.18303157894736843, + "grad_norm": 0.369140625, + "learning_rate": 0.0004666271666410074, + "loss": 2.9162, + "step": 4347 + }, + { + "epoch": 0.1830736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.0004666103208034597, + "loss": 3.4681, + "step": 4348 + }, + { + "epoch": 0.1831157894736842, + "grad_norm": 0.49609375, + "learning_rate": 0.00046659347101952656, + "loss": 3.255, + "step": 4349 + }, + { + "epoch": 0.1831578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.0004665766172895147, + "loss": 3.046, + "step": 4350 + }, + { + "epoch": 0.1832, + "grad_norm": 5.46875, + "learning_rate": 0.00046655975961373147, + "loss": 3.4244, + "step": 4351 + }, + { + "epoch": 0.1832421052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.00046654289799248383, + "loss": 2.9638, + "step": 4352 + }, + { + "epoch": 0.1832842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.00046652603242607906, + "loss": 3.5785, + "step": 4353 + }, + { + "epoch": 0.18332631578947367, + "grad_norm": 0.392578125, + "learning_rate": 0.00046650916291482425, + "loss": 3.2352, + "step": 4354 + }, + { + "epoch": 0.18336842105263157, + "grad_norm": 0.416015625, + "learning_rate": 0.000466492289459027, + "loss": 3.4128, + "step": 4355 + }, + { + "epoch": 0.18341052631578947, + "grad_norm": 0.466796875, + "learning_rate": 0.00046647541205899456, + "loss": 3.1858, + "step": 4356 + }, + { + "epoch": 0.18345263157894737, + "grad_norm": 0.45703125, + "learning_rate": 0.0004664585307150345, + "loss": 3.5665, + "step": 4357 + }, + { + "epoch": 0.18349473684210527, + "grad_norm": 0.412109375, + "learning_rate": 0.00046644164542745427, + "loss": 3.7898, + "step": 4358 + }, + { + "epoch": 0.18353684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.00046642475619656165, + "loss": 3.0775, + "step": 4359 + }, + { + "epoch": 0.18357894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.00046640786302266425, + "loss": 3.3362, + "step": 4360 + }, + { + "epoch": 0.18362105263157893, + "grad_norm": 0.419921875, + "learning_rate": 0.00046639096590606977, + "loss": 3.3037, + "step": 4361 + }, + { + "epoch": 0.18366315789473683, + "grad_norm": 0.478515625, + "learning_rate": 0.00046637406484708623, + "loss": 3.9068, + "step": 4362 + }, + { + "epoch": 0.18370526315789473, + "grad_norm": 0.458984375, + "learning_rate": 0.0004663571598460214, + "loss": 3.4112, + "step": 4363 + }, + { + "epoch": 0.18374736842105263, + "grad_norm": 0.466796875, + "learning_rate": 0.0004663402509031833, + "loss": 3.1825, + "step": 4364 + }, + { + "epoch": 0.18378947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.0004663233380188802, + "loss": 3.2201, + "step": 4365 + }, + { + "epoch": 0.18383157894736843, + "grad_norm": 0.439453125, + "learning_rate": 0.0004663064211934198, + "loss": 3.3133, + "step": 4366 + }, + { + "epoch": 0.18387368421052633, + "grad_norm": 0.435546875, + "learning_rate": 0.0004662895004271107, + "loss": 3.0813, + "step": 4367 + }, + { + "epoch": 0.1839157894736842, + "grad_norm": 0.40625, + "learning_rate": 0.00046627257572026107, + "loss": 3.5289, + "step": 4368 + }, + { + "epoch": 0.1839578947368421, + "grad_norm": 0.388671875, + "learning_rate": 0.0004662556470731791, + "loss": 3.5994, + "step": 4369 + }, + { + "epoch": 0.184, + "grad_norm": 0.427734375, + "learning_rate": 0.0004662387144861734, + "loss": 3.9798, + "step": 4370 + }, + { + "epoch": 0.1840421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00046622177795955236, + "loss": 3.0208, + "step": 4371 + }, + { + "epoch": 0.1840842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00046620483749362465, + "loss": 3.6957, + "step": 4372 + }, + { + "epoch": 0.1841263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.00046618789308869877, + "loss": 3.1677, + "step": 4373 + }, + { + "epoch": 0.1841684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.0004661709447450835, + "loss": 3.7236, + "step": 4374 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 0.396484375, + "learning_rate": 0.00046615399246308754, + "loss": 3.4274, + "step": 4375 + }, + { + "epoch": 0.18425263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.0004661370362430199, + "loss": 3.6138, + "step": 4376 + }, + { + "epoch": 0.18429473684210526, + "grad_norm": 0.5625, + "learning_rate": 0.00046612007608518935, + "loss": 2.9665, + "step": 4377 + }, + { + "epoch": 0.18433684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.00046610311198990497, + "loss": 3.4138, + "step": 4378 + }, + { + "epoch": 0.18437894736842106, + "grad_norm": 0.4453125, + "learning_rate": 0.0004660861439574758, + "loss": 2.9325, + "step": 4379 + }, + { + "epoch": 0.18442105263157896, + "grad_norm": 0.447265625, + "learning_rate": 0.00046606917198821094, + "loss": 3.3308, + "step": 4380 + }, + { + "epoch": 0.18446315789473683, + "grad_norm": 0.4140625, + "learning_rate": 0.0004660521960824198, + "loss": 3.5993, + "step": 4381 + }, + { + "epoch": 0.18450526315789473, + "grad_norm": 0.43359375, + "learning_rate": 0.0004660352162404113, + "loss": 3.3553, + "step": 4382 + }, + { + "epoch": 0.18454736842105263, + "grad_norm": 0.4765625, + "learning_rate": 0.00046601823246249506, + "loss": 3.3686, + "step": 4383 + }, + { + "epoch": 0.18458947368421053, + "grad_norm": 0.404296875, + "learning_rate": 0.0004660012447489805, + "loss": 3.2605, + "step": 4384 + }, + { + "epoch": 0.18463157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.000465984253100177, + "loss": 3.3257, + "step": 4385 + }, + { + "epoch": 0.18467368421052632, + "grad_norm": 0.4921875, + "learning_rate": 0.00046596725751639413, + "loss": 3.4743, + "step": 4386 + }, + { + "epoch": 0.18471578947368422, + "grad_norm": 0.439453125, + "learning_rate": 0.00046595025799794166, + "loss": 3.061, + "step": 4387 + }, + { + "epoch": 0.1847578947368421, + "grad_norm": 0.453125, + "learning_rate": 0.00046593325454512914, + "loss": 3.3951, + "step": 4388 + }, + { + "epoch": 0.1848, + "grad_norm": 0.4765625, + "learning_rate": 0.00046591624715826654, + "loss": 3.3823, + "step": 4389 + }, + { + "epoch": 0.1848421052631579, + "grad_norm": 0.8359375, + "learning_rate": 0.00046589923583766345, + "loss": 3.1218, + "step": 4390 + }, + { + "epoch": 0.1848842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.0004658822205836301, + "loss": 3.3523, + "step": 4391 + }, + { + "epoch": 0.1849263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00046586520139647625, + "loss": 3.2584, + "step": 4392 + }, + { + "epoch": 0.1849684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.00046584817827651214, + "loss": 3.2319, + "step": 4393 + }, + { + "epoch": 0.1850105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.00046583115122404787, + "loss": 3.1867, + "step": 4394 + }, + { + "epoch": 0.18505263157894736, + "grad_norm": 0.453125, + "learning_rate": 0.0004658141202393935, + "loss": 3.7308, + "step": 4395 + }, + { + "epoch": 0.18509473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.00046579708532285945, + "loss": 3.4538, + "step": 4396 + }, + { + "epoch": 0.18513684210526316, + "grad_norm": 0.5625, + "learning_rate": 0.00046578004647475607, + "loss": 2.8032, + "step": 4397 + }, + { + "epoch": 0.18517894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0004657630036953938, + "loss": 3.5403, + "step": 4398 + }, + { + "epoch": 0.18522105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.0004657459569850831, + "loss": 3.6156, + "step": 4399 + }, + { + "epoch": 0.18526315789473685, + "grad_norm": 0.44140625, + "learning_rate": 0.00046572890634413456, + "loss": 3.9091, + "step": 4400 + }, + { + "epoch": 0.18530526315789475, + "grad_norm": 0.44140625, + "learning_rate": 0.0004657118517728588, + "loss": 3.2509, + "step": 4401 + }, + { + "epoch": 0.18534736842105262, + "grad_norm": 0.455078125, + "learning_rate": 0.00046569479327156655, + "loss": 3.4803, + "step": 4402 + }, + { + "epoch": 0.18538947368421052, + "grad_norm": 0.416015625, + "learning_rate": 0.0004656777308405686, + "loss": 3.1033, + "step": 4403 + }, + { + "epoch": 0.18543157894736842, + "grad_norm": 0.48046875, + "learning_rate": 0.00046566066448017586, + "loss": 3.2882, + "step": 4404 + }, + { + "epoch": 0.18547368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.00046564359419069915, + "loss": 3.4529, + "step": 4405 + }, + { + "epoch": 0.18551578947368422, + "grad_norm": 0.4609375, + "learning_rate": 0.00046562651997244955, + "loss": 3.442, + "step": 4406 + }, + { + "epoch": 0.18555789473684212, + "grad_norm": 0.419921875, + "learning_rate": 0.00046560944182573807, + "loss": 3.1917, + "step": 4407 + }, + { + "epoch": 0.1856, + "grad_norm": 0.392578125, + "learning_rate": 0.00046559235975087587, + "loss": 3.0058, + "step": 4408 + }, + { + "epoch": 0.18564210526315789, + "grad_norm": 0.455078125, + "learning_rate": 0.0004655752737481742, + "loss": 3.4125, + "step": 4409 + }, + { + "epoch": 0.18568421052631578, + "grad_norm": 0.474609375, + "learning_rate": 0.00046555818381794435, + "loss": 3.3958, + "step": 4410 + }, + { + "epoch": 0.18572631578947368, + "grad_norm": 0.498046875, + "learning_rate": 0.0004655410899604977, + "loss": 3.4025, + "step": 4411 + }, + { + "epoch": 0.18576842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.0004655239921761456, + "loss": 3.0672, + "step": 4412 + }, + { + "epoch": 0.18581052631578948, + "grad_norm": 0.42578125, + "learning_rate": 0.0004655068904651996, + "loss": 3.8892, + "step": 4413 + }, + { + "epoch": 0.18585263157894738, + "grad_norm": 0.470703125, + "learning_rate": 0.0004654897848279713, + "loss": 3.4316, + "step": 4414 + }, + { + "epoch": 0.18589473684210525, + "grad_norm": 0.40234375, + "learning_rate": 0.00046547267526477224, + "loss": 3.1254, + "step": 4415 + }, + { + "epoch": 0.18593684210526315, + "grad_norm": 0.439453125, + "learning_rate": 0.0004654555617759143, + "loss": 3.299, + "step": 4416 + }, + { + "epoch": 0.18597894736842105, + "grad_norm": 0.4453125, + "learning_rate": 0.00046543844436170914, + "loss": 3.5446, + "step": 4417 + }, + { + "epoch": 0.18602105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00046542132302246857, + "loss": 3.2613, + "step": 4418 + }, + { + "epoch": 0.18606315789473685, + "grad_norm": 0.412109375, + "learning_rate": 0.0004654041977585047, + "loss": 3.5915, + "step": 4419 + }, + { + "epoch": 0.18610526315789475, + "grad_norm": 0.431640625, + "learning_rate": 0.0004653870685701294, + "loss": 3.2797, + "step": 4420 + }, + { + "epoch": 0.18614736842105264, + "grad_norm": 0.416015625, + "learning_rate": 0.0004653699354576548, + "loss": 3.2533, + "step": 4421 + }, + { + "epoch": 0.18618947368421052, + "grad_norm": 0.392578125, + "learning_rate": 0.000465352798421393, + "loss": 3.034, + "step": 4422 + }, + { + "epoch": 0.18623157894736841, + "grad_norm": 0.423828125, + "learning_rate": 0.0004653356574616563, + "loss": 3.3064, + "step": 4423 + }, + { + "epoch": 0.1862736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.00046531851257875683, + "loss": 3.3461, + "step": 4424 + }, + { + "epoch": 0.1863157894736842, + "grad_norm": 0.50390625, + "learning_rate": 0.00046530136377300716, + "loss": 2.9325, + "step": 4425 + }, + { + "epoch": 0.1863578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.0004652842110447195, + "loss": 3.5022, + "step": 4426 + }, + { + "epoch": 0.1864, + "grad_norm": 0.408203125, + "learning_rate": 0.0004652670543942065, + "loss": 3.4207, + "step": 4427 + }, + { + "epoch": 0.18644210526315788, + "grad_norm": 0.46875, + "learning_rate": 0.00046524989382178074, + "loss": 3.3801, + "step": 4428 + }, + { + "epoch": 0.18648421052631578, + "grad_norm": 0.41015625, + "learning_rate": 0.0004652327293277548, + "loss": 3.8296, + "step": 4429 + }, + { + "epoch": 0.18652631578947368, + "grad_norm": 0.455078125, + "learning_rate": 0.0004652155609124414, + "loss": 3.2296, + "step": 4430 + }, + { + "epoch": 0.18656842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.00046519838857615335, + "loss": 3.0834, + "step": 4431 + }, + { + "epoch": 0.18661052631578948, + "grad_norm": 0.44921875, + "learning_rate": 0.0004651812123192035, + "loss": 3.469, + "step": 4432 + }, + { + "epoch": 0.18665263157894738, + "grad_norm": 0.4296875, + "learning_rate": 0.0004651640321419048, + "loss": 3.1003, + "step": 4433 + }, + { + "epoch": 0.18669473684210527, + "grad_norm": 0.4609375, + "learning_rate": 0.0004651468480445703, + "loss": 3.6867, + "step": 4434 + }, + { + "epoch": 0.18673684210526315, + "grad_norm": 0.46484375, + "learning_rate": 0.000465129660027513, + "loss": 3.2798, + "step": 4435 + }, + { + "epoch": 0.18677894736842104, + "grad_norm": 0.39453125, + "learning_rate": 0.000465112468091046, + "loss": 3.0654, + "step": 4436 + }, + { + "epoch": 0.18682105263157894, + "grad_norm": 0.419921875, + "learning_rate": 0.0004650952722354826, + "loss": 3.282, + "step": 4437 + }, + { + "epoch": 0.18686315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.0004650780724611361, + "loss": 3.398, + "step": 4438 + }, + { + "epoch": 0.18690526315789474, + "grad_norm": 0.45703125, + "learning_rate": 0.00046506086876831976, + "loss": 3.4586, + "step": 4439 + }, + { + "epoch": 0.18694736842105264, + "grad_norm": 0.40234375, + "learning_rate": 0.0004650436611573471, + "loss": 3.7511, + "step": 4440 + }, + { + "epoch": 0.18698947368421054, + "grad_norm": 0.3984375, + "learning_rate": 0.0004650264496285317, + "loss": 3.7843, + "step": 4441 + }, + { + "epoch": 0.1870315789473684, + "grad_norm": 0.490234375, + "learning_rate": 0.0004650092341821869, + "loss": 3.3451, + "step": 4442 + }, + { + "epoch": 0.1870736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.00046499201481862653, + "loss": 3.4381, + "step": 4443 + }, + { + "epoch": 0.1871157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.0004649747915381642, + "loss": 3.3073, + "step": 4444 + }, + { + "epoch": 0.1871578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0004649575643411139, + "loss": 3.0032, + "step": 4445 + }, + { + "epoch": 0.1872, + "grad_norm": 0.37109375, + "learning_rate": 0.00046494033322778917, + "loss": 3.5015, + "step": 4446 + }, + { + "epoch": 0.1872421052631579, + "grad_norm": 0.46484375, + "learning_rate": 0.0004649230981985042, + "loss": 2.8596, + "step": 4447 + }, + { + "epoch": 0.1872842105263158, + "grad_norm": 0.400390625, + "learning_rate": 0.0004649058592535729, + "loss": 3.6203, + "step": 4448 + }, + { + "epoch": 0.18732631578947367, + "grad_norm": 0.41015625, + "learning_rate": 0.0004648886163933094, + "loss": 3.1498, + "step": 4449 + }, + { + "epoch": 0.18736842105263157, + "grad_norm": 0.49609375, + "learning_rate": 0.00046487136961802765, + "loss": 3.2854, + "step": 4450 + }, + { + "epoch": 0.18741052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00046485411892804217, + "loss": 3.1656, + "step": 4451 + }, + { + "epoch": 0.18745263157894737, + "grad_norm": 0.455078125, + "learning_rate": 0.000464836864323667, + "loss": 3.347, + "step": 4452 + }, + { + "epoch": 0.18749473684210527, + "grad_norm": 0.44921875, + "learning_rate": 0.00046481960580521664, + "loss": 3.4064, + "step": 4453 + }, + { + "epoch": 0.18753684210526317, + "grad_norm": 0.4296875, + "learning_rate": 0.00046480234337300543, + "loss": 3.0811, + "step": 4454 + }, + { + "epoch": 0.18757894736842104, + "grad_norm": 0.431640625, + "learning_rate": 0.00046478507702734783, + "loss": 3.2662, + "step": 4455 + }, + { + "epoch": 0.18762105263157894, + "grad_norm": 0.416015625, + "learning_rate": 0.0004647678067685586, + "loss": 3.2447, + "step": 4456 + }, + { + "epoch": 0.18766315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.0004647505325969522, + "loss": 3.4242, + "step": 4457 + }, + { + "epoch": 0.18770526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00046473325451284347, + "loss": 3.6144, + "step": 4458 + }, + { + "epoch": 0.18774736842105264, + "grad_norm": 0.439453125, + "learning_rate": 0.00046471597251654715, + "loss": 3.3922, + "step": 4459 + }, + { + "epoch": 0.18778947368421053, + "grad_norm": 0.38671875, + "learning_rate": 0.00046469868660837805, + "loss": 3.15, + "step": 4460 + }, + { + "epoch": 0.18783157894736843, + "grad_norm": 0.41796875, + "learning_rate": 0.0004646813967886512, + "loss": 3.5424, + "step": 4461 + }, + { + "epoch": 0.1878736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.00046466410305768147, + "loss": 3.2203, + "step": 4462 + }, + { + "epoch": 0.1879157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00046464680541578396, + "loss": 3.47, + "step": 4463 + }, + { + "epoch": 0.1879578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00046462950386327395, + "loss": 3.0017, + "step": 4464 + }, + { + "epoch": 0.188, + "grad_norm": 0.4765625, + "learning_rate": 0.0004646121984004665, + "loss": 3.4457, + "step": 4465 + }, + { + "epoch": 0.1880421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00046459488902767703, + "loss": 3.3676, + "step": 4466 + }, + { + "epoch": 0.1880842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.00046457757574522074, + "loss": 2.9927, + "step": 4467 + }, + { + "epoch": 0.1881263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0004645602585534132, + "loss": 3.2295, + "step": 4468 + }, + { + "epoch": 0.18816842105263157, + "grad_norm": 0.408203125, + "learning_rate": 0.0004645429374525698, + "loss": 3.4414, + "step": 4469 + }, + { + "epoch": 0.18821052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.0004645256124430062, + "loss": 3.1934, + "step": 4470 + }, + { + "epoch": 0.18825263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.000464508283525038, + "loss": 3.0759, + "step": 4471 + }, + { + "epoch": 0.18829473684210526, + "grad_norm": 0.453125, + "learning_rate": 0.0004644909506989808, + "loss": 3.4296, + "step": 4472 + }, + { + "epoch": 0.18833684210526316, + "grad_norm": 0.482421875, + "learning_rate": 0.00046447361396515066, + "loss": 3.2265, + "step": 4473 + }, + { + "epoch": 0.18837894736842106, + "grad_norm": 0.40234375, + "learning_rate": 0.0004644562733238632, + "loss": 3.3423, + "step": 4474 + }, + { + "epoch": 0.18842105263157893, + "grad_norm": 0.44921875, + "learning_rate": 0.00046443892877543434, + "loss": 2.8664, + "step": 4475 + }, + { + "epoch": 0.18846315789473683, + "grad_norm": 0.423828125, + "learning_rate": 0.0004644215803201803, + "loss": 3.5501, + "step": 4476 + }, + { + "epoch": 0.18850526315789473, + "grad_norm": 0.435546875, + "learning_rate": 0.00046440422795841687, + "loss": 3.3441, + "step": 4477 + }, + { + "epoch": 0.18854736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0004643868716904604, + "loss": 3.5883, + "step": 4478 + }, + { + "epoch": 0.18858947368421053, + "grad_norm": 0.404296875, + "learning_rate": 0.000464369511516627, + "loss": 3.4756, + "step": 4479 + }, + { + "epoch": 0.18863157894736843, + "grad_norm": 0.42578125, + "learning_rate": 0.00046435214743723296, + "loss": 3.1863, + "step": 4480 + }, + { + "epoch": 0.18867368421052633, + "grad_norm": 0.408203125, + "learning_rate": 0.00046433477945259465, + "loss": 3.5316, + "step": 4481 + }, + { + "epoch": 0.1887157894736842, + "grad_norm": 0.40234375, + "learning_rate": 0.0004643174075630285, + "loss": 3.5398, + "step": 4482 + }, + { + "epoch": 0.1887578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00046430003176885107, + "loss": 3.3589, + "step": 4483 + }, + { + "epoch": 0.1888, + "grad_norm": 0.44140625, + "learning_rate": 0.00046428265207037876, + "loss": 3.7751, + "step": 4484 + }, + { + "epoch": 0.1888421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.0004642652684679283, + "loss": 3.1204, + "step": 4485 + }, + { + "epoch": 0.1888842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0004642478809618164, + "loss": 3.6897, + "step": 4486 + }, + { + "epoch": 0.1889263157894737, + "grad_norm": 0.384765625, + "learning_rate": 0.0004642304895523599, + "loss": 3.4737, + "step": 4487 + }, + { + "epoch": 0.1889684210526316, + "grad_norm": 0.392578125, + "learning_rate": 0.00046421309423987556, + "loss": 3.6536, + "step": 4488 + }, + { + "epoch": 0.18901052631578946, + "grad_norm": 0.40234375, + "learning_rate": 0.00046419569502468026, + "loss": 3.6034, + "step": 4489 + }, + { + "epoch": 0.18905263157894736, + "grad_norm": 0.41015625, + "learning_rate": 0.00046417829190709115, + "loss": 3.3792, + "step": 4490 + }, + { + "epoch": 0.18909473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.0004641608848874252, + "loss": 3.3922, + "step": 4491 + }, + { + "epoch": 0.18913684210526316, + "grad_norm": 0.3984375, + "learning_rate": 0.00046414347396599956, + "loss": 3.6969, + "step": 4492 + }, + { + "epoch": 0.18917894736842106, + "grad_norm": 0.392578125, + "learning_rate": 0.00046412605914313145, + "loss": 3.4315, + "step": 4493 + }, + { + "epoch": 0.18922105263157896, + "grad_norm": 0.419921875, + "learning_rate": 0.00046410864041913804, + "loss": 3.6181, + "step": 4494 + }, + { + "epoch": 0.18926315789473686, + "grad_norm": 0.408203125, + "learning_rate": 0.00046409121779433684, + "loss": 3.3158, + "step": 4495 + }, + { + "epoch": 0.18930526315789473, + "grad_norm": 0.4140625, + "learning_rate": 0.0004640737912690452, + "loss": 3.4135, + "step": 4496 + }, + { + "epoch": 0.18934736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0004640563608435806, + "loss": 3.5823, + "step": 4497 + }, + { + "epoch": 0.18938947368421052, + "grad_norm": 0.392578125, + "learning_rate": 0.00046403892651826056, + "loss": 3.4215, + "step": 4498 + }, + { + "epoch": 0.18943157894736842, + "grad_norm": 0.39453125, + "learning_rate": 0.00046402148829340284, + "loss": 3.55, + "step": 4499 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 0.423828125, + "learning_rate": 0.000464004046169325, + "loss": 3.1173, + "step": 4500 + }, + { + "epoch": 0.18951578947368422, + "grad_norm": 0.396484375, + "learning_rate": 0.00046398660014634495, + "loss": 3.1488, + "step": 4501 + }, + { + "epoch": 0.1895578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.0004639691502247805, + "loss": 3.5974, + "step": 4502 + }, + { + "epoch": 0.1896, + "grad_norm": 0.421875, + "learning_rate": 0.00046395169640494943, + "loss": 3.386, + "step": 4503 + }, + { + "epoch": 0.1896421052631579, + "grad_norm": 0.3984375, + "learning_rate": 0.0004639342386871699, + "loss": 3.2139, + "step": 4504 + }, + { + "epoch": 0.1896842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.00046391677707175984, + "loss": 3.4286, + "step": 4505 + }, + { + "epoch": 0.1897263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.0004638993115590375, + "loss": 3.4419, + "step": 4506 + }, + { + "epoch": 0.1897684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.00046388184214932106, + "loss": 3.2082, + "step": 4507 + }, + { + "epoch": 0.18981052631578949, + "grad_norm": 0.435546875, + "learning_rate": 0.0004638643688429287, + "loss": 3.6518, + "step": 4508 + }, + { + "epoch": 0.18985263157894736, + "grad_norm": 0.4140625, + "learning_rate": 0.0004638468916401788, + "loss": 3.9826, + "step": 4509 + }, + { + "epoch": 0.18989473684210526, + "grad_norm": 0.439453125, + "learning_rate": 0.0004638294105413898, + "loss": 3.1157, + "step": 4510 + }, + { + "epoch": 0.18993684210526315, + "grad_norm": 0.39453125, + "learning_rate": 0.00046381192554688025, + "loss": 3.6597, + "step": 4511 + }, + { + "epoch": 0.18997894736842105, + "grad_norm": 0.390625, + "learning_rate": 0.0004637944366569686, + "loss": 3.486, + "step": 4512 + }, + { + "epoch": 0.19002105263157895, + "grad_norm": 0.3984375, + "learning_rate": 0.0004637769438719735, + "loss": 3.5676, + "step": 4513 + }, + { + "epoch": 0.19006315789473685, + "grad_norm": 0.400390625, + "learning_rate": 0.00046375944719221366, + "loss": 3.2566, + "step": 4514 + }, + { + "epoch": 0.19010526315789475, + "grad_norm": 0.37890625, + "learning_rate": 0.0004637419466180078, + "loss": 3.4759, + "step": 4515 + }, + { + "epoch": 0.19014736842105262, + "grad_norm": 0.41015625, + "learning_rate": 0.0004637244421496749, + "loss": 3.5676, + "step": 4516 + }, + { + "epoch": 0.19018947368421052, + "grad_norm": 0.4296875, + "learning_rate": 0.00046370693378753377, + "loss": 3.2276, + "step": 4517 + }, + { + "epoch": 0.19023157894736842, + "grad_norm": 0.404296875, + "learning_rate": 0.0004636894215319033, + "loss": 3.2458, + "step": 4518 + }, + { + "epoch": 0.19027368421052632, + "grad_norm": 0.388671875, + "learning_rate": 0.00046367190538310277, + "loss": 3.2596, + "step": 4519 + }, + { + "epoch": 0.19031578947368422, + "grad_norm": 0.388671875, + "learning_rate": 0.0004636543853414511, + "loss": 3.7653, + "step": 4520 + }, + { + "epoch": 0.19035789473684211, + "grad_norm": 0.453125, + "learning_rate": 0.0004636368614072676, + "loss": 3.7527, + "step": 4521 + }, + { + "epoch": 0.1904, + "grad_norm": 0.408203125, + "learning_rate": 0.0004636193335808715, + "loss": 3.5574, + "step": 4522 + }, + { + "epoch": 0.19044210526315788, + "grad_norm": 0.435546875, + "learning_rate": 0.0004636018018625821, + "loss": 3.4611, + "step": 4523 + }, + { + "epoch": 0.19048421052631578, + "grad_norm": 0.458984375, + "learning_rate": 0.0004635842662527189, + "loss": 2.6897, + "step": 4524 + }, + { + "epoch": 0.19052631578947368, + "grad_norm": 0.388671875, + "learning_rate": 0.0004635667267516013, + "loss": 3.4963, + "step": 4525 + }, + { + "epoch": 0.19056842105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.00046354918335954874, + "loss": 3.6976, + "step": 4526 + }, + { + "epoch": 0.19061052631578948, + "grad_norm": 0.462890625, + "learning_rate": 0.00046353163607688115, + "loss": 3.523, + "step": 4527 + }, + { + "epoch": 0.19065263157894738, + "grad_norm": 0.404296875, + "learning_rate": 0.000463514084903918, + "loss": 3.7666, + "step": 4528 + }, + { + "epoch": 0.19069473684210525, + "grad_norm": 0.439453125, + "learning_rate": 0.000463496529840979, + "loss": 3.3872, + "step": 4529 + }, + { + "epoch": 0.19073684210526315, + "grad_norm": 0.40625, + "learning_rate": 0.0004634789708883842, + "loss": 3.4158, + "step": 4530 + }, + { + "epoch": 0.19077894736842105, + "grad_norm": 0.40234375, + "learning_rate": 0.00046346140804645326, + "loss": 3.4758, + "step": 4531 + }, + { + "epoch": 0.19082105263157895, + "grad_norm": 0.439453125, + "learning_rate": 0.0004634438413155063, + "loss": 3.3974, + "step": 4532 + }, + { + "epoch": 0.19086315789473685, + "grad_norm": 0.4296875, + "learning_rate": 0.0004634262706958634, + "loss": 2.8901, + "step": 4533 + }, + { + "epoch": 0.19090526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.0004634086961878446, + "loss": 3.608, + "step": 4534 + }, + { + "epoch": 0.19094736842105264, + "grad_norm": 0.412109375, + "learning_rate": 0.00046339111779177005, + "loss": 3.8581, + "step": 4535 + }, + { + "epoch": 0.19098947368421051, + "grad_norm": 0.3984375, + "learning_rate": 0.00046337353550796013, + "loss": 3.2613, + "step": 4536 + }, + { + "epoch": 0.1910315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.000463355949336735, + "loss": 2.9671, + "step": 4537 + }, + { + "epoch": 0.1910736842105263, + "grad_norm": 0.37890625, + "learning_rate": 0.0004633383592784152, + "loss": 3.9004, + "step": 4538 + }, + { + "epoch": 0.1911157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00046332076533332116, + "loss": 3.4956, + "step": 4539 + }, + { + "epoch": 0.1911578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.0004633031675017735, + "loss": 3.2314, + "step": 4540 + }, + { + "epoch": 0.1912, + "grad_norm": 0.4140625, + "learning_rate": 0.00046328556578409267, + "loss": 3.8581, + "step": 4541 + }, + { + "epoch": 0.1912421052631579, + "grad_norm": 0.466796875, + "learning_rate": 0.00046326796018059937, + "loss": 3.1897, + "step": 4542 + }, + { + "epoch": 0.19128421052631578, + "grad_norm": 0.41015625, + "learning_rate": 0.0004632503506916145, + "loss": 3.6953, + "step": 4543 + }, + { + "epoch": 0.19132631578947368, + "grad_norm": 0.427734375, + "learning_rate": 0.0004632327373174587, + "loss": 3.5021, + "step": 4544 + }, + { + "epoch": 0.19136842105263158, + "grad_norm": 0.375, + "learning_rate": 0.00046321512005845314, + "loss": 3.2975, + "step": 4545 + }, + { + "epoch": 0.19141052631578948, + "grad_norm": 0.42578125, + "learning_rate": 0.0004631974989149185, + "loss": 3.2312, + "step": 4546 + }, + { + "epoch": 0.19145263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0004631798738871759, + "loss": 3.6851, + "step": 4547 + }, + { + "epoch": 0.19149473684210527, + "grad_norm": 0.4453125, + "learning_rate": 0.00046316224497554656, + "loss": 3.1115, + "step": 4548 + }, + { + "epoch": 0.19153684210526314, + "grad_norm": 0.4296875, + "learning_rate": 0.0004631446121803515, + "loss": 3.1139, + "step": 4549 + }, + { + "epoch": 0.19157894736842104, + "grad_norm": 0.419921875, + "learning_rate": 0.00046312697550191206, + "loss": 3.1891, + "step": 4550 + }, + { + "epoch": 0.19162105263157894, + "grad_norm": 0.404296875, + "learning_rate": 0.00046310933494054954, + "loss": 3.4793, + "step": 4551 + }, + { + "epoch": 0.19166315789473684, + "grad_norm": 0.90234375, + "learning_rate": 0.0004630916904965853, + "loss": 3.0458, + "step": 4552 + }, + { + "epoch": 0.19170526315789474, + "grad_norm": 0.439453125, + "learning_rate": 0.0004630740421703409, + "loss": 3.2249, + "step": 4553 + }, + { + "epoch": 0.19174736842105264, + "grad_norm": 0.4375, + "learning_rate": 0.00046305638996213777, + "loss": 3.0532, + "step": 4554 + }, + { + "epoch": 0.19178947368421054, + "grad_norm": 0.4375, + "learning_rate": 0.00046303873387229755, + "loss": 3.3277, + "step": 4555 + }, + { + "epoch": 0.1918315789473684, + "grad_norm": 0.4921875, + "learning_rate": 0.0004630210739011419, + "loss": 3.5759, + "step": 4556 + }, + { + "epoch": 0.1918736842105263, + "grad_norm": 0.4921875, + "learning_rate": 0.0004630034100489926, + "loss": 3.2083, + "step": 4557 + }, + { + "epoch": 0.1919157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00046298574231617144, + "loss": 3.17, + "step": 4558 + }, + { + "epoch": 0.1919578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.0004629680707030002, + "loss": 3.1794, + "step": 4559 + }, + { + "epoch": 0.192, + "grad_norm": 0.392578125, + "learning_rate": 0.0004629503952098011, + "loss": 3.2657, + "step": 4560 + }, + { + "epoch": 0.1920421052631579, + "grad_norm": 0.392578125, + "learning_rate": 0.0004629327158368959, + "loss": 3.4405, + "step": 4561 + }, + { + "epoch": 0.1920842105263158, + "grad_norm": 0.400390625, + "learning_rate": 0.0004629150325846068, + "loss": 3.9288, + "step": 4562 + }, + { + "epoch": 0.19212631578947367, + "grad_norm": 0.3828125, + "learning_rate": 0.000462897345453256, + "loss": 3.5464, + "step": 4563 + }, + { + "epoch": 0.19216842105263157, + "grad_norm": 0.408203125, + "learning_rate": 0.0004628796544431657, + "loss": 3.5072, + "step": 4564 + }, + { + "epoch": 0.19221052631578947, + "grad_norm": 0.4140625, + "learning_rate": 0.00046286195955465824, + "loss": 3.4338, + "step": 4565 + }, + { + "epoch": 0.19225263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.000462844260788056, + "loss": 3.0108, + "step": 4566 + }, + { + "epoch": 0.19229473684210527, + "grad_norm": 0.458984375, + "learning_rate": 0.00046282655814368134, + "loss": 3.4061, + "step": 4567 + }, + { + "epoch": 0.19233684210526317, + "grad_norm": 0.609375, + "learning_rate": 0.0004628088516218569, + "loss": 3.4992, + "step": 4568 + }, + { + "epoch": 0.19237894736842107, + "grad_norm": 0.494140625, + "learning_rate": 0.0004627911412229052, + "loss": 2.7391, + "step": 4569 + }, + { + "epoch": 0.19242105263157894, + "grad_norm": 0.48046875, + "learning_rate": 0.00046277342694714895, + "loss": 3.5581, + "step": 4570 + }, + { + "epoch": 0.19246315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.0004627557087949108, + "loss": 3.6555, + "step": 4571 + }, + { + "epoch": 0.19250526315789473, + "grad_norm": 0.416015625, + "learning_rate": 0.00046273798676651367, + "loss": 3.6472, + "step": 4572 + }, + { + "epoch": 0.19254736842105263, + "grad_norm": 0.390625, + "learning_rate": 0.00046272026086228036, + "loss": 3.7668, + "step": 4573 + }, + { + "epoch": 0.19258947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.0004627025310825339, + "loss": 3.4273, + "step": 4574 + }, + { + "epoch": 0.19263157894736843, + "grad_norm": 0.451171875, + "learning_rate": 0.00046268479742759714, + "loss": 3.5233, + "step": 4575 + }, + { + "epoch": 0.1926736842105263, + "grad_norm": 0.5, + "learning_rate": 0.0004626670598977933, + "loss": 3.2058, + "step": 4576 + }, + { + "epoch": 0.1927157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.00046264931849344547, + "loss": 3.0236, + "step": 4577 + }, + { + "epoch": 0.1927578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00046263157321487695, + "loss": 3.2673, + "step": 4578 + }, + { + "epoch": 0.1928, + "grad_norm": 0.4296875, + "learning_rate": 0.00046261382406241104, + "loss": 3.5448, + "step": 4579 + }, + { + "epoch": 0.1928421052631579, + "grad_norm": 0.384765625, + "learning_rate": 0.000462596071036371, + "loss": 3.0543, + "step": 4580 + }, + { + "epoch": 0.1928842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0004625783141370804, + "loss": 3.5139, + "step": 4581 + }, + { + "epoch": 0.1929263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.0004625605533648626, + "loss": 2.7327, + "step": 4582 + }, + { + "epoch": 0.19296842105263157, + "grad_norm": 0.40234375, + "learning_rate": 0.0004625427887200413, + "loss": 3.6466, + "step": 4583 + }, + { + "epoch": 0.19301052631578947, + "grad_norm": 0.4453125, + "learning_rate": 0.00046252502020294016, + "loss": 3.2897, + "step": 4584 + }, + { + "epoch": 0.19305263157894736, + "grad_norm": 0.390625, + "learning_rate": 0.0004625072478138828, + "loss": 3.5112, + "step": 4585 + }, + { + "epoch": 0.19309473684210526, + "grad_norm": 0.435546875, + "learning_rate": 0.0004624894715531931, + "loss": 3.4037, + "step": 4586 + }, + { + "epoch": 0.19313684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0004624716914211949, + "loss": 3.0236, + "step": 4587 + }, + { + "epoch": 0.19317894736842106, + "grad_norm": 0.421875, + "learning_rate": 0.0004624539074182121, + "loss": 3.5033, + "step": 4588 + }, + { + "epoch": 0.19322105263157896, + "grad_norm": 0.38671875, + "learning_rate": 0.0004624361195445688, + "loss": 3.2123, + "step": 4589 + }, + { + "epoch": 0.19326315789473683, + "grad_norm": 0.392578125, + "learning_rate": 0.0004624183278005889, + "loss": 2.9114, + "step": 4590 + }, + { + "epoch": 0.19330526315789473, + "grad_norm": 0.5078125, + "learning_rate": 0.00046240053218659673, + "loss": 3.2894, + "step": 4591 + }, + { + "epoch": 0.19334736842105263, + "grad_norm": 0.5625, + "learning_rate": 0.00046238273270291636, + "loss": 3.2422, + "step": 4592 + }, + { + "epoch": 0.19338947368421053, + "grad_norm": 0.4140625, + "learning_rate": 0.00046236492934987217, + "loss": 3.2904, + "step": 4593 + }, + { + "epoch": 0.19343157894736843, + "grad_norm": 0.4375, + "learning_rate": 0.0004623471221277885, + "loss": 3.5449, + "step": 4594 + }, + { + "epoch": 0.19347368421052633, + "grad_norm": 0.484375, + "learning_rate": 0.0004623293110369897, + "loss": 3.7585, + "step": 4595 + }, + { + "epoch": 0.19351578947368422, + "grad_norm": 0.466796875, + "learning_rate": 0.00046231149607780045, + "loss": 3.6394, + "step": 4596 + }, + { + "epoch": 0.1935578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.00046229367725054505, + "loss": 3.8085, + "step": 4597 + }, + { + "epoch": 0.1936, + "grad_norm": 0.412109375, + "learning_rate": 0.00046227585455554845, + "loss": 3.2591, + "step": 4598 + }, + { + "epoch": 0.1936421052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.00046225802799313505, + "loss": 3.3793, + "step": 4599 + }, + { + "epoch": 0.1936842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00046224019756362984, + "loss": 3.9237, + "step": 4600 + }, + { + "epoch": 0.1937263157894737, + "grad_norm": 0.400390625, + "learning_rate": 0.0004622223632673576, + "loss": 3.448, + "step": 4601 + }, + { + "epoch": 0.1937684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.0004622045251046432, + "loss": 3.3553, + "step": 4602 + }, + { + "epoch": 0.19381052631578946, + "grad_norm": 0.451171875, + "learning_rate": 0.00046218668307581174, + "loss": 3.3716, + "step": 4603 + }, + { + "epoch": 0.19385263157894736, + "grad_norm": 0.4375, + "learning_rate": 0.00046216883718118825, + "loss": 3.4757, + "step": 4604 + }, + { + "epoch": 0.19389473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.0004621509874210977, + "loss": 3.5335, + "step": 4605 + }, + { + "epoch": 0.19393684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.0004621331337958656, + "loss": 3.3825, + "step": 4606 + }, + { + "epoch": 0.19397894736842106, + "grad_norm": 0.6171875, + "learning_rate": 0.00046211527630581696, + "loss": 2.9603, + "step": 4607 + }, + { + "epoch": 0.19402105263157896, + "grad_norm": 0.40625, + "learning_rate": 0.00046209741495127724, + "loss": 3.5945, + "step": 4608 + }, + { + "epoch": 0.19406315789473685, + "grad_norm": 0.41796875, + "learning_rate": 0.00046207954973257175, + "loss": 3.1631, + "step": 4609 + }, + { + "epoch": 0.19410526315789473, + "grad_norm": 0.416015625, + "learning_rate": 0.00046206168065002613, + "loss": 2.9896, + "step": 4610 + }, + { + "epoch": 0.19414736842105262, + "grad_norm": 0.53515625, + "learning_rate": 0.0004620438077039658, + "loss": 3.2623, + "step": 4611 + }, + { + "epoch": 0.19418947368421052, + "grad_norm": 0.427734375, + "learning_rate": 0.0004620259308947165, + "loss": 3.1442, + "step": 4612 + }, + { + "epoch": 0.19423157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.00046200805022260377, + "loss": 3.0004, + "step": 4613 + }, + { + "epoch": 0.19427368421052632, + "grad_norm": 0.404296875, + "learning_rate": 0.0004619901656879535, + "loss": 3.3269, + "step": 4614 + }, + { + "epoch": 0.19431578947368422, + "grad_norm": 0.396484375, + "learning_rate": 0.00046197227729109146, + "loss": 3.2997, + "step": 4615 + }, + { + "epoch": 0.19435789473684212, + "grad_norm": 0.421875, + "learning_rate": 0.00046195438503234367, + "loss": 3.3216, + "step": 4616 + }, + { + "epoch": 0.1944, + "grad_norm": 0.4140625, + "learning_rate": 0.000461936488912036, + "loss": 3.2208, + "step": 4617 + }, + { + "epoch": 0.1944421052631579, + "grad_norm": 0.390625, + "learning_rate": 0.0004619185889304945, + "loss": 3.4379, + "step": 4618 + }, + { + "epoch": 0.1944842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0004619006850880453, + "loss": 3.0507, + "step": 4619 + }, + { + "epoch": 0.19452631578947369, + "grad_norm": 0.392578125, + "learning_rate": 0.00046188277738501454, + "loss": 3.3616, + "step": 4620 + }, + { + "epoch": 0.19456842105263158, + "grad_norm": 0.388671875, + "learning_rate": 0.0004618648658217286, + "loss": 3.1964, + "step": 4621 + }, + { + "epoch": 0.19461052631578948, + "grad_norm": 0.46875, + "learning_rate": 0.0004618469503985137, + "loss": 3.2916, + "step": 4622 + }, + { + "epoch": 0.19465263157894735, + "grad_norm": 0.427734375, + "learning_rate": 0.0004618290311156963, + "loss": 3.4239, + "step": 4623 + }, + { + "epoch": 0.19469473684210525, + "grad_norm": 0.40234375, + "learning_rate": 0.00046181110797360284, + "loss": 3.3782, + "step": 4624 + }, + { + "epoch": 0.19473684210526315, + "grad_norm": 0.41015625, + "learning_rate": 0.00046179318097255986, + "loss": 3.4291, + "step": 4625 + }, + { + "epoch": 0.19477894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00046177525011289387, + "loss": 3.6038, + "step": 4626 + }, + { + "epoch": 0.19482105263157895, + "grad_norm": 0.392578125, + "learning_rate": 0.0004617573153949318, + "loss": 3.7854, + "step": 4627 + }, + { + "epoch": 0.19486315789473685, + "grad_norm": 0.443359375, + "learning_rate": 0.0004617393768190001, + "loss": 3.4795, + "step": 4628 + }, + { + "epoch": 0.19490526315789475, + "grad_norm": 0.419921875, + "learning_rate": 0.00046172143438542586, + "loss": 3.0325, + "step": 4629 + }, + { + "epoch": 0.19494736842105262, + "grad_norm": 0.3984375, + "learning_rate": 0.0004617034880945358, + "loss": 2.9884, + "step": 4630 + }, + { + "epoch": 0.19498947368421052, + "grad_norm": 0.396484375, + "learning_rate": 0.00046168553794665693, + "loss": 3.435, + "step": 4631 + }, + { + "epoch": 0.19503157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.00046166758394211626, + "loss": 3.5957, + "step": 4632 + }, + { + "epoch": 0.19507368421052632, + "grad_norm": 0.412109375, + "learning_rate": 0.00046164962608124095, + "loss": 3.0424, + "step": 4633 + }, + { + "epoch": 0.19511578947368421, + "grad_norm": 0.3984375, + "learning_rate": 0.0004616316643643582, + "loss": 3.3512, + "step": 4634 + }, + { + "epoch": 0.1951578947368421, + "grad_norm": 0.380859375, + "learning_rate": 0.00046161369879179503, + "loss": 3.4092, + "step": 4635 + }, + { + "epoch": 0.1952, + "grad_norm": 0.48828125, + "learning_rate": 0.00046159572936387895, + "loss": 3.3861, + "step": 4636 + }, + { + "epoch": 0.19524210526315788, + "grad_norm": 0.3984375, + "learning_rate": 0.00046157775608093735, + "loss": 3.3005, + "step": 4637 + }, + { + "epoch": 0.19528421052631578, + "grad_norm": 0.58203125, + "learning_rate": 0.0004615597789432976, + "loss": 3.4246, + "step": 4638 + }, + { + "epoch": 0.19532631578947368, + "grad_norm": 0.416015625, + "learning_rate": 0.00046154179795128723, + "loss": 3.3376, + "step": 4639 + }, + { + "epoch": 0.19536842105263158, + "grad_norm": 0.390625, + "learning_rate": 0.00046152381310523384, + "loss": 3.1749, + "step": 4640 + }, + { + "epoch": 0.19541052631578948, + "grad_norm": 0.40625, + "learning_rate": 0.0004615058244054652, + "loss": 3.6176, + "step": 4641 + }, + { + "epoch": 0.19545263157894738, + "grad_norm": 0.392578125, + "learning_rate": 0.0004614878318523088, + "loss": 3.3055, + "step": 4642 + }, + { + "epoch": 0.19549473684210528, + "grad_norm": 0.515625, + "learning_rate": 0.00046146983544609273, + "loss": 3.6996, + "step": 4643 + }, + { + "epoch": 0.19553684210526315, + "grad_norm": 0.46484375, + "learning_rate": 0.00046145183518714463, + "loss": 2.9007, + "step": 4644 + }, + { + "epoch": 0.19557894736842105, + "grad_norm": 0.71875, + "learning_rate": 0.00046143383107579263, + "loss": 3.1735, + "step": 4645 + }, + { + "epoch": 0.19562105263157895, + "grad_norm": 0.6328125, + "learning_rate": 0.0004614158231123646, + "loss": 3.333, + "step": 4646 + }, + { + "epoch": 0.19566315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.00046139781129718866, + "loss": 3.3323, + "step": 4647 + }, + { + "epoch": 0.19570526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.000461379795630593, + "loss": 3.4821, + "step": 4648 + }, + { + "epoch": 0.19574736842105264, + "grad_norm": 0.447265625, + "learning_rate": 0.0004613617761129058, + "loss": 3.6953, + "step": 4649 + }, + { + "epoch": 0.1957894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.0004613437527444554, + "loss": 3.6383, + "step": 4650 + }, + { + "epoch": 0.1958315789473684, + "grad_norm": 0.494140625, + "learning_rate": 0.0004613257255255702, + "loss": 3.2791, + "step": 4651 + }, + { + "epoch": 0.1958736842105263, + "grad_norm": 0.56640625, + "learning_rate": 0.0004613076944565785, + "loss": 3.3195, + "step": 4652 + }, + { + "epoch": 0.1959157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.0004612896595378089, + "loss": 3.3579, + "step": 4653 + }, + { + "epoch": 0.1959578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00046127162076958994, + "loss": 3.3907, + "step": 4654 + }, + { + "epoch": 0.196, + "grad_norm": 0.439453125, + "learning_rate": 0.0004612535781522503, + "loss": 3.6124, + "step": 4655 + }, + { + "epoch": 0.1960421052631579, + "grad_norm": 0.396484375, + "learning_rate": 0.0004612355316861187, + "loss": 3.6001, + "step": 4656 + }, + { + "epoch": 0.19608421052631578, + "grad_norm": 0.40625, + "learning_rate": 0.00046121748137152384, + "loss": 3.1411, + "step": 4657 + }, + { + "epoch": 0.19612631578947368, + "grad_norm": 0.423828125, + "learning_rate": 0.00046119942720879473, + "loss": 3.6213, + "step": 4658 + }, + { + "epoch": 0.19616842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.00046118136919826014, + "loss": 3.0817, + "step": 4659 + }, + { + "epoch": 0.19621052631578947, + "grad_norm": 0.396484375, + "learning_rate": 0.0004611633073402491, + "loss": 3.4891, + "step": 4660 + }, + { + "epoch": 0.19625263157894737, + "grad_norm": 0.5625, + "learning_rate": 0.0004611452416350909, + "loss": 3.4957, + "step": 4661 + }, + { + "epoch": 0.19629473684210527, + "grad_norm": 0.416015625, + "learning_rate": 0.0004611271720831143, + "loss": 3.6496, + "step": 4662 + }, + { + "epoch": 0.19633684210526317, + "grad_norm": 0.65625, + "learning_rate": 0.0004611090986846487, + "loss": 3.3819, + "step": 4663 + }, + { + "epoch": 0.19637894736842104, + "grad_norm": 0.408203125, + "learning_rate": 0.0004610910214400235, + "loss": 3.478, + "step": 4664 + }, + { + "epoch": 0.19642105263157894, + "grad_norm": 0.478515625, + "learning_rate": 0.0004610729403495677, + "loss": 3.3809, + "step": 4665 + }, + { + "epoch": 0.19646315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.000461054855413611, + "loss": 3.405, + "step": 4666 + }, + { + "epoch": 0.19650526315789474, + "grad_norm": 0.400390625, + "learning_rate": 0.00046103676663248293, + "loss": 3.1792, + "step": 4667 + }, + { + "epoch": 0.19654736842105264, + "grad_norm": 0.427734375, + "learning_rate": 0.00046101867400651287, + "loss": 3.8562, + "step": 4668 + }, + { + "epoch": 0.19658947368421054, + "grad_norm": 0.42578125, + "learning_rate": 0.00046100057753603043, + "loss": 3.3059, + "step": 4669 + }, + { + "epoch": 0.1966315789473684, + "grad_norm": 0.9453125, + "learning_rate": 0.0004609824772213654, + "loss": 3.5399, + "step": 4670 + }, + { + "epoch": 0.1966736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.00046096437306284766, + "loss": 3.4827, + "step": 4671 + }, + { + "epoch": 0.1967157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.00046094626506080685, + "loss": 3.186, + "step": 4672 + }, + { + "epoch": 0.1967578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00046092815321557287, + "loss": 3.6666, + "step": 4673 + }, + { + "epoch": 0.1968, + "grad_norm": 0.443359375, + "learning_rate": 0.00046091003752747584, + "loss": 3.3509, + "step": 4674 + }, + { + "epoch": 0.1968421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.0004608919179968457, + "loss": 3.2043, + "step": 4675 + }, + { + "epoch": 0.1968842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.0004608737946240126, + "loss": 3.1292, + "step": 4676 + }, + { + "epoch": 0.19692631578947367, + "grad_norm": 0.50390625, + "learning_rate": 0.0004608556674093067, + "loss": 2.6994, + "step": 4677 + }, + { + "epoch": 0.19696842105263157, + "grad_norm": 0.4140625, + "learning_rate": 0.00046083753635305834, + "loss": 3.4107, + "step": 4678 + }, + { + "epoch": 0.19701052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.00046081940145559783, + "loss": 3.6181, + "step": 4679 + }, + { + "epoch": 0.19705263157894737, + "grad_norm": 0.48828125, + "learning_rate": 0.00046080126271725544, + "loss": 3.1048, + "step": 4680 + }, + { + "epoch": 0.19709473684210527, + "grad_norm": 0.453125, + "learning_rate": 0.00046078312013836175, + "loss": 3.0991, + "step": 4681 + }, + { + "epoch": 0.19713684210526317, + "grad_norm": 0.431640625, + "learning_rate": 0.00046076497371924733, + "loss": 3.2433, + "step": 4682 + }, + { + "epoch": 0.19717894736842106, + "grad_norm": 0.466796875, + "learning_rate": 0.00046074682346024264, + "loss": 2.6989, + "step": 4683 + }, + { + "epoch": 0.19722105263157894, + "grad_norm": 0.404296875, + "learning_rate": 0.00046072866936167854, + "loss": 3.5098, + "step": 4684 + }, + { + "epoch": 0.19726315789473683, + "grad_norm": 0.41015625, + "learning_rate": 0.0004607105114238855, + "loss": 3.2954, + "step": 4685 + }, + { + "epoch": 0.19730526315789473, + "grad_norm": 0.419921875, + "learning_rate": 0.0004606923496471947, + "loss": 3.4342, + "step": 4686 + }, + { + "epoch": 0.19734736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.0004606741840319368, + "loss": 3.8941, + "step": 4687 + }, + { + "epoch": 0.19738947368421053, + "grad_norm": 0.49609375, + "learning_rate": 0.00046065601457844277, + "loss": 3.826, + "step": 4688 + }, + { + "epoch": 0.19743157894736843, + "grad_norm": 0.421875, + "learning_rate": 0.0004606378412870437, + "loss": 3.5838, + "step": 4689 + }, + { + "epoch": 0.19747368421052633, + "grad_norm": 0.431640625, + "learning_rate": 0.0004606196641580706, + "loss": 3.4745, + "step": 4690 + }, + { + "epoch": 0.1975157894736842, + "grad_norm": 0.46875, + "learning_rate": 0.00046060148319185474, + "loss": 2.8536, + "step": 4691 + }, + { + "epoch": 0.1975578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0004605832983887273, + "loss": 3.1668, + "step": 4692 + }, + { + "epoch": 0.1976, + "grad_norm": 0.4140625, + "learning_rate": 0.00046056510974901957, + "loss": 3.4817, + "step": 4693 + }, + { + "epoch": 0.1976421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.0004605469172730629, + "loss": 3.6233, + "step": 4694 + }, + { + "epoch": 0.1976842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0004605287209611888, + "loss": 3.3508, + "step": 4695 + }, + { + "epoch": 0.1977263157894737, + "grad_norm": 0.3828125, + "learning_rate": 0.0004605105208137288, + "loss": 3.308, + "step": 4696 + }, + { + "epoch": 0.19776842105263157, + "grad_norm": 0.421875, + "learning_rate": 0.00046049231683101445, + "loss": 2.9944, + "step": 4697 + }, + { + "epoch": 0.19781052631578946, + "grad_norm": 0.515625, + "learning_rate": 0.00046047410901337737, + "loss": 3.5698, + "step": 4698 + }, + { + "epoch": 0.19785263157894736, + "grad_norm": 0.451171875, + "learning_rate": 0.00046045589736114933, + "loss": 3.5788, + "step": 4699 + }, + { + "epoch": 0.19789473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.00046043768187466216, + "loss": 3.4954, + "step": 4700 + }, + { + "epoch": 0.19793684210526316, + "grad_norm": 0.59765625, + "learning_rate": 0.00046041946255424767, + "loss": 3.1578, + "step": 4701 + }, + { + "epoch": 0.19797894736842106, + "grad_norm": 0.423828125, + "learning_rate": 0.00046040123940023774, + "loss": 3.0436, + "step": 4702 + }, + { + "epoch": 0.19802105263157896, + "grad_norm": 0.41015625, + "learning_rate": 0.0004603830124129645, + "loss": 3.1099, + "step": 4703 + }, + { + "epoch": 0.19806315789473683, + "grad_norm": 0.447265625, + "learning_rate": 0.00046036478159275997, + "loss": 3.659, + "step": 4704 + }, + { + "epoch": 0.19810526315789473, + "grad_norm": 0.41796875, + "learning_rate": 0.00046034654693995626, + "loss": 3.3704, + "step": 4705 + }, + { + "epoch": 0.19814736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.0004603283084548856, + "loss": 3.6293, + "step": 4706 + }, + { + "epoch": 0.19818947368421053, + "grad_norm": 0.443359375, + "learning_rate": 0.0004603100661378803, + "loss": 3.2193, + "step": 4707 + }, + { + "epoch": 0.19823157894736843, + "grad_norm": 0.412109375, + "learning_rate": 0.0004602918199892727, + "loss": 3.2627, + "step": 4708 + }, + { + "epoch": 0.19827368421052632, + "grad_norm": 0.412109375, + "learning_rate": 0.00046027357000939524, + "loss": 3.2634, + "step": 4709 + }, + { + "epoch": 0.19831578947368422, + "grad_norm": 0.478515625, + "learning_rate": 0.00046025531619858036, + "loss": 3.4003, + "step": 4710 + }, + { + "epoch": 0.1983578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0004602370585571607, + "loss": 3.6085, + "step": 4711 + }, + { + "epoch": 0.1984, + "grad_norm": 0.69921875, + "learning_rate": 0.00046021879708546884, + "loss": 2.9857, + "step": 4712 + }, + { + "epoch": 0.1984421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00046020053178383747, + "loss": 3.7464, + "step": 4713 + }, + { + "epoch": 0.1984842105263158, + "grad_norm": 0.39453125, + "learning_rate": 0.0004601822626525994, + "loss": 3.5108, + "step": 4714 + }, + { + "epoch": 0.1985263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.0004601639896920874, + "loss": 3.222, + "step": 4715 + }, + { + "epoch": 0.1985684210526316, + "grad_norm": 0.453125, + "learning_rate": 0.0004601457129026345, + "loss": 2.9223, + "step": 4716 + }, + { + "epoch": 0.1986105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.00046012743228457365, + "loss": 3.4981, + "step": 4717 + }, + { + "epoch": 0.19865263157894736, + "grad_norm": 0.439453125, + "learning_rate": 0.00046010914783823785, + "loss": 3.2794, + "step": 4718 + }, + { + "epoch": 0.19869473684210526, + "grad_norm": 0.4375, + "learning_rate": 0.00046009085956396023, + "loss": 3.3854, + "step": 4719 + }, + { + "epoch": 0.19873684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.000460072567462074, + "loss": 3.7012, + "step": 4720 + }, + { + "epoch": 0.19877894736842106, + "grad_norm": 0.396484375, + "learning_rate": 0.00046005427153291247, + "loss": 3.6206, + "step": 4721 + }, + { + "epoch": 0.19882105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.00046003597177680884, + "loss": 3.8439, + "step": 4722 + }, + { + "epoch": 0.19886315789473685, + "grad_norm": 0.4375, + "learning_rate": 0.00046001766819409664, + "loss": 3.293, + "step": 4723 + }, + { + "epoch": 0.19890526315789472, + "grad_norm": 0.404296875, + "learning_rate": 0.0004599993607851092, + "loss": 3.616, + "step": 4724 + }, + { + "epoch": 0.19894736842105262, + "grad_norm": 0.470703125, + "learning_rate": 0.00045998104955018024, + "loss": 3.369, + "step": 4725 + }, + { + "epoch": 0.19898947368421052, + "grad_norm": 1.4140625, + "learning_rate": 0.0004599627344896432, + "loss": 3.7414, + "step": 4726 + }, + { + "epoch": 0.19903157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.0004599444156038319, + "loss": 3.4799, + "step": 4727 + }, + { + "epoch": 0.19907368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.00045992609289308, + "loss": 2.8465, + "step": 4728 + }, + { + "epoch": 0.19911578947368422, + "grad_norm": 0.462890625, + "learning_rate": 0.00045990776635772133, + "loss": 3.36, + "step": 4729 + }, + { + "epoch": 0.19915789473684212, + "grad_norm": 0.390625, + "learning_rate": 0.00045988943599808974, + "loss": 3.2382, + "step": 4730 + }, + { + "epoch": 0.1992, + "grad_norm": 0.400390625, + "learning_rate": 0.0004598711018145193, + "loss": 3.2816, + "step": 4731 + }, + { + "epoch": 0.1992421052631579, + "grad_norm": 0.53125, + "learning_rate": 0.000459852763807344, + "loss": 3.1602, + "step": 4732 + }, + { + "epoch": 0.19928421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.0004598344219768978, + "loss": 3.5339, + "step": 4733 + }, + { + "epoch": 0.19932631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.0004598160763235151, + "loss": 3.5453, + "step": 4734 + }, + { + "epoch": 0.19936842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.00045979772684753, + "loss": 3.3341, + "step": 4735 + }, + { + "epoch": 0.19941052631578948, + "grad_norm": 0.451171875, + "learning_rate": 0.00045977937354927675, + "loss": 3.141, + "step": 4736 + }, + { + "epoch": 0.19945263157894738, + "grad_norm": 0.392578125, + "learning_rate": 0.00045976101642908985, + "loss": 3.5318, + "step": 4737 + }, + { + "epoch": 0.19949473684210525, + "grad_norm": 0.404296875, + "learning_rate": 0.00045974265548730367, + "loss": 3.2621, + "step": 4738 + }, + { + "epoch": 0.19953684210526315, + "grad_norm": 0.396484375, + "learning_rate": 0.00045972429072425273, + "loss": 3.3693, + "step": 4739 + }, + { + "epoch": 0.19957894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.0004597059221402716, + "loss": 3.5329, + "step": 4740 + }, + { + "epoch": 0.19962105263157895, + "grad_norm": 0.39453125, + "learning_rate": 0.000459687549735695, + "loss": 3.6713, + "step": 4741 + }, + { + "epoch": 0.19966315789473685, + "grad_norm": 0.408203125, + "learning_rate": 0.00045966917351085765, + "loss": 3.366, + "step": 4742 + }, + { + "epoch": 0.19970526315789475, + "grad_norm": 0.435546875, + "learning_rate": 0.0004596507934660943, + "loss": 3.6754, + "step": 4743 + }, + { + "epoch": 0.19974736842105262, + "grad_norm": 0.482421875, + "learning_rate": 0.0004596324096017398, + "loss": 3.3526, + "step": 4744 + }, + { + "epoch": 0.19978947368421052, + "grad_norm": 0.390625, + "learning_rate": 0.00045961402191812905, + "loss": 3.4811, + "step": 4745 + }, + { + "epoch": 0.19983157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.00045959563041559715, + "loss": 3.3433, + "step": 4746 + }, + { + "epoch": 0.19987368421052631, + "grad_norm": 0.431640625, + "learning_rate": 0.00045957723509447917, + "loss": 3.3926, + "step": 4747 + }, + { + "epoch": 0.1999157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.00045955883595511014, + "loss": 3.8522, + "step": 4748 + }, + { + "epoch": 0.1999578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00045954043299782546, + "loss": 3.4289, + "step": 4749 + }, + { + "epoch": 0.2, + "grad_norm": 0.50390625, + "learning_rate": 0.00045952202622296013, + "loss": 3.3673, + "step": 4750 + }, + { + "epoch": 0.20004210526315788, + "grad_norm": 0.453125, + "learning_rate": 0.0004595036156308498, + "loss": 3.0865, + "step": 4751 + }, + { + "epoch": 0.20008421052631578, + "grad_norm": 0.421875, + "learning_rate": 0.0004594852012218297, + "loss": 3.1619, + "step": 4752 + }, + { + "epoch": 0.20012631578947368, + "grad_norm": 0.4609375, + "learning_rate": 0.00045946678299623535, + "loss": 3.3714, + "step": 4753 + }, + { + "epoch": 0.20016842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.0004594483609544023, + "loss": 3.5813, + "step": 4754 + }, + { + "epoch": 0.20021052631578948, + "grad_norm": 0.392578125, + "learning_rate": 0.0004594299350966663, + "loss": 3.5254, + "step": 4755 + }, + { + "epoch": 0.20025263157894738, + "grad_norm": 0.41015625, + "learning_rate": 0.00045941150542336284, + "loss": 3.4176, + "step": 4756 + }, + { + "epoch": 0.20029473684210528, + "grad_norm": 0.427734375, + "learning_rate": 0.0004593930719348279, + "loss": 3.0222, + "step": 4757 + }, + { + "epoch": 0.20033684210526315, + "grad_norm": 0.421875, + "learning_rate": 0.00045937463463139715, + "loss": 3.5983, + "step": 4758 + }, + { + "epoch": 0.20037894736842105, + "grad_norm": 0.392578125, + "learning_rate": 0.00045935619351340664, + "loss": 3.2003, + "step": 4759 + }, + { + "epoch": 0.20042105263157894, + "grad_norm": 0.400390625, + "learning_rate": 0.00045933774858119213, + "loss": 3.3637, + "step": 4760 + }, + { + "epoch": 0.20046315789473684, + "grad_norm": 0.482421875, + "learning_rate": 0.0004593192998350899, + "loss": 3.5309, + "step": 4761 + }, + { + "epoch": 0.20050526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00045930084727543595, + "loss": 3.4571, + "step": 4762 + }, + { + "epoch": 0.20054736842105264, + "grad_norm": 0.408203125, + "learning_rate": 0.00045928239090256654, + "loss": 3.2884, + "step": 4763 + }, + { + "epoch": 0.20058947368421054, + "grad_norm": 0.408203125, + "learning_rate": 0.00045926393071681775, + "loss": 3.0831, + "step": 4764 + }, + { + "epoch": 0.2006315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.0004592454667185261, + "loss": 3.57, + "step": 4765 + }, + { + "epoch": 0.2006736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.0004592269989080279, + "loss": 3.3712, + "step": 4766 + }, + { + "epoch": 0.2007157894736842, + "grad_norm": 0.65234375, + "learning_rate": 0.00045920852728565954, + "loss": 3.4081, + "step": 4767 + }, + { + "epoch": 0.2007578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00045919005185175764, + "loss": 3.5499, + "step": 4768 + }, + { + "epoch": 0.2008, + "grad_norm": 0.388671875, + "learning_rate": 0.0004591715726066589, + "loss": 2.6562, + "step": 4769 + }, + { + "epoch": 0.2008421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00045915308955069966, + "loss": 3.3569, + "step": 4770 + }, + { + "epoch": 0.20088421052631578, + "grad_norm": 0.40625, + "learning_rate": 0.000459134602684217, + "loss": 3.0852, + "step": 4771 + }, + { + "epoch": 0.20092631578947368, + "grad_norm": 0.4140625, + "learning_rate": 0.00045911611200754763, + "loss": 3.257, + "step": 4772 + }, + { + "epoch": 0.20096842105263157, + "grad_norm": 0.453125, + "learning_rate": 0.0004590976175210284, + "loss": 2.9791, + "step": 4773 + }, + { + "epoch": 0.20101052631578947, + "grad_norm": 0.40625, + "learning_rate": 0.00045907911922499624, + "loss": 3.4335, + "step": 4774 + }, + { + "epoch": 0.20105263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00045906061711978817, + "loss": 3.0195, + "step": 4775 + }, + { + "epoch": 0.20109473684210527, + "grad_norm": 0.408203125, + "learning_rate": 0.0004590421112057413, + "loss": 3.5424, + "step": 4776 + }, + { + "epoch": 0.20113684210526317, + "grad_norm": 0.41796875, + "learning_rate": 0.00045902360148319276, + "loss": 3.0324, + "step": 4777 + }, + { + "epoch": 0.20117894736842104, + "grad_norm": 0.439453125, + "learning_rate": 0.0004590050879524798, + "loss": 3.0185, + "step": 4778 + }, + { + "epoch": 0.20122105263157894, + "grad_norm": 0.404296875, + "learning_rate": 0.0004589865706139397, + "loss": 3.5402, + "step": 4779 + }, + { + "epoch": 0.20126315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.0004589680494679099, + "loss": 3.4977, + "step": 4780 + }, + { + "epoch": 0.20130526315789474, + "grad_norm": 0.3828125, + "learning_rate": 0.0004589495245147277, + "loss": 3.7023, + "step": 4781 + }, + { + "epoch": 0.20134736842105264, + "grad_norm": 0.40625, + "learning_rate": 0.0004589309957547307, + "loss": 3.3714, + "step": 4782 + }, + { + "epoch": 0.20138947368421053, + "grad_norm": 0.482421875, + "learning_rate": 0.0004589124631882564, + "loss": 3.2131, + "step": 4783 + }, + { + "epoch": 0.20143157894736843, + "grad_norm": 0.392578125, + "learning_rate": 0.0004588939268156425, + "loss": 3.7069, + "step": 4784 + }, + { + "epoch": 0.2014736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.0004588753866372267, + "loss": 3.8827, + "step": 4785 + }, + { + "epoch": 0.2015157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.00045885684265334684, + "loss": 3.6997, + "step": 4786 + }, + { + "epoch": 0.2015578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.0004588382948643406, + "loss": 3.2795, + "step": 4787 + }, + { + "epoch": 0.2016, + "grad_norm": 0.392578125, + "learning_rate": 0.00045881974327054604, + "loss": 3.3301, + "step": 4788 + }, + { + "epoch": 0.2016421052631579, + "grad_norm": 0.953125, + "learning_rate": 0.00045880118787230105, + "loss": 3.3275, + "step": 4789 + }, + { + "epoch": 0.2016842105263158, + "grad_norm": 0.48046875, + "learning_rate": 0.0004587826286699439, + "loss": 2.9759, + "step": 4790 + }, + { + "epoch": 0.2017263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.00045876406566381246, + "loss": 3.4448, + "step": 4791 + }, + { + "epoch": 0.20176842105263157, + "grad_norm": 0.7265625, + "learning_rate": 0.000458745498854245, + "loss": 3.567, + "step": 4792 + }, + { + "epoch": 0.20181052631578947, + "grad_norm": 0.478515625, + "learning_rate": 0.0004587269282415799, + "loss": 3.2422, + "step": 4793 + }, + { + "epoch": 0.20185263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.0004587083538261554, + "loss": 3.1943, + "step": 4794 + }, + { + "epoch": 0.20189473684210527, + "grad_norm": 0.466796875, + "learning_rate": 0.0004586897756083098, + "loss": 3.2912, + "step": 4795 + }, + { + "epoch": 0.20193684210526316, + "grad_norm": 0.5234375, + "learning_rate": 0.00045867119358838183, + "loss": 3.5231, + "step": 4796 + }, + { + "epoch": 0.20197894736842106, + "grad_norm": 0.53515625, + "learning_rate": 0.00045865260776670983, + "loss": 3.1257, + "step": 4797 + }, + { + "epoch": 0.20202105263157893, + "grad_norm": 0.466796875, + "learning_rate": 0.00045863401814363246, + "loss": 3.237, + "step": 4798 + }, + { + "epoch": 0.20206315789473683, + "grad_norm": 0.458984375, + "learning_rate": 0.0004586154247194885, + "loss": 3.3493, + "step": 4799 + }, + { + "epoch": 0.20210526315789473, + "grad_norm": 0.466796875, + "learning_rate": 0.00045859682749461653, + "loss": 3.3089, + "step": 4800 + }, + { + "epoch": 0.20214736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.0004585782264693555, + "loss": 3.1251, + "step": 4801 + }, + { + "epoch": 0.20218947368421053, + "grad_norm": 0.52734375, + "learning_rate": 0.00045855962164404417, + "loss": 3.708, + "step": 4802 + }, + { + "epoch": 0.20223157894736843, + "grad_norm": 0.44140625, + "learning_rate": 0.0004585410130190217, + "loss": 3.7962, + "step": 4803 + }, + { + "epoch": 0.20227368421052633, + "grad_norm": 0.44921875, + "learning_rate": 0.0004585224005946269, + "loss": 3.7885, + "step": 4804 + }, + { + "epoch": 0.2023157894736842, + "grad_norm": 0.451171875, + "learning_rate": 0.000458503784371199, + "loss": 3.2529, + "step": 4805 + }, + { + "epoch": 0.2023578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.00045848516434907717, + "loss": 3.2713, + "step": 4806 + }, + { + "epoch": 0.2024, + "grad_norm": 0.498046875, + "learning_rate": 0.0004584665405286006, + "loss": 3.2695, + "step": 4807 + }, + { + "epoch": 0.2024421052631579, + "grad_norm": 0.400390625, + "learning_rate": 0.0004584479129101085, + "loss": 3.056, + "step": 4808 + }, + { + "epoch": 0.2024842105263158, + "grad_norm": 0.400390625, + "learning_rate": 0.00045842928149394046, + "loss": 3.1304, + "step": 4809 + }, + { + "epoch": 0.2025263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0004584106462804357, + "loss": 3.2922, + "step": 4810 + }, + { + "epoch": 0.2025684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.0004583920072699339, + "loss": 3.6885, + "step": 4811 + }, + { + "epoch": 0.20261052631578946, + "grad_norm": 0.3984375, + "learning_rate": 0.0004583733644627746, + "loss": 3.5408, + "step": 4812 + }, + { + "epoch": 0.20265263157894736, + "grad_norm": 0.421875, + "learning_rate": 0.0004583547178592974, + "loss": 2.9682, + "step": 4813 + }, + { + "epoch": 0.20269473684210526, + "grad_norm": 0.55859375, + "learning_rate": 0.000458336067459842, + "loss": 3.1798, + "step": 4814 + }, + { + "epoch": 0.20273684210526316, + "grad_norm": 0.388671875, + "learning_rate": 0.00045831741326474835, + "loss": 3.387, + "step": 4815 + }, + { + "epoch": 0.20277894736842106, + "grad_norm": 0.427734375, + "learning_rate": 0.0004582987552743562, + "loss": 3.6772, + "step": 4816 + }, + { + "epoch": 0.20282105263157896, + "grad_norm": 0.51171875, + "learning_rate": 0.00045828009348900537, + "loss": 3.3466, + "step": 4817 + }, + { + "epoch": 0.20286315789473683, + "grad_norm": 0.4140625, + "learning_rate": 0.0004582614279090359, + "loss": 3.4067, + "step": 4818 + }, + { + "epoch": 0.20290526315789473, + "grad_norm": 0.4453125, + "learning_rate": 0.000458242758534788, + "loss": 3.0989, + "step": 4819 + }, + { + "epoch": 0.20294736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.0004582240853666018, + "loss": 3.3641, + "step": 4820 + }, + { + "epoch": 0.20298947368421053, + "grad_norm": 0.451171875, + "learning_rate": 0.0004582054084048173, + "loss": 2.9067, + "step": 4821 + }, + { + "epoch": 0.20303157894736842, + "grad_norm": 0.40625, + "learning_rate": 0.0004581867276497749, + "loss": 3.5387, + "step": 4822 + }, + { + "epoch": 0.20307368421052632, + "grad_norm": 0.486328125, + "learning_rate": 0.00045816804310181493, + "loss": 3.6092, + "step": 4823 + }, + { + "epoch": 0.20311578947368422, + "grad_norm": 0.55078125, + "learning_rate": 0.00045814935476127784, + "loss": 3.6426, + "step": 4824 + }, + { + "epoch": 0.2031578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.000458130662628504, + "loss": 3.531, + "step": 4825 + }, + { + "epoch": 0.2032, + "grad_norm": 0.45703125, + "learning_rate": 0.00045811196670383415, + "loss": 3.583, + "step": 4826 + }, + { + "epoch": 0.2032421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0004580932669876087, + "loss": 3.159, + "step": 4827 + }, + { + "epoch": 0.2032842105263158, + "grad_norm": 0.498046875, + "learning_rate": 0.0004580745634801684, + "loss": 3.3291, + "step": 4828 + }, + { + "epoch": 0.2033263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00045805585618185406, + "loss": 3.4634, + "step": 4829 + }, + { + "epoch": 0.2033684210526316, + "grad_norm": 0.478515625, + "learning_rate": 0.0004580371450930065, + "loss": 3.1244, + "step": 4830 + }, + { + "epoch": 0.20341052631578949, + "grad_norm": 0.431640625, + "learning_rate": 0.0004580184302139666, + "loss": 3.0725, + "step": 4831 + }, + { + "epoch": 0.20345263157894736, + "grad_norm": 0.42578125, + "learning_rate": 0.00045799971154507536, + "loss": 3.7631, + "step": 4832 + }, + { + "epoch": 0.20349473684210526, + "grad_norm": 0.388671875, + "learning_rate": 0.00045798098908667365, + "loss": 3.0916, + "step": 4833 + }, + { + "epoch": 0.20353684210526315, + "grad_norm": 0.455078125, + "learning_rate": 0.00045796226283910277, + "loss": 3.3006, + "step": 4834 + }, + { + "epoch": 0.20357894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.00045794353280270375, + "loss": 3.1864, + "step": 4835 + }, + { + "epoch": 0.20362105263157895, + "grad_norm": 0.44140625, + "learning_rate": 0.0004579247989778179, + "loss": 3.3763, + "step": 4836 + }, + { + "epoch": 0.20366315789473685, + "grad_norm": 0.41796875, + "learning_rate": 0.0004579060613647865, + "loss": 3.3983, + "step": 4837 + }, + { + "epoch": 0.20370526315789475, + "grad_norm": 0.431640625, + "learning_rate": 0.000457887319963951, + "loss": 3.445, + "step": 4838 + }, + { + "epoch": 0.20374736842105262, + "grad_norm": 0.435546875, + "learning_rate": 0.0004578685747756528, + "loss": 3.4286, + "step": 4839 + }, + { + "epoch": 0.20378947368421052, + "grad_norm": 0.4296875, + "learning_rate": 0.00045784982580023334, + "loss": 3.1014, + "step": 4840 + }, + { + "epoch": 0.20383157894736842, + "grad_norm": 0.458984375, + "learning_rate": 0.0004578310730380343, + "loss": 3.7364, + "step": 4841 + }, + { + "epoch": 0.20387368421052632, + "grad_norm": 0.3984375, + "learning_rate": 0.00045781231648939723, + "loss": 3.2253, + "step": 4842 + }, + { + "epoch": 0.20391578947368422, + "grad_norm": 0.416015625, + "learning_rate": 0.000457793556154664, + "loss": 3.611, + "step": 4843 + }, + { + "epoch": 0.20395789473684212, + "grad_norm": 0.404296875, + "learning_rate": 0.0004577747920341763, + "loss": 3.1724, + "step": 4844 + }, + { + "epoch": 0.204, + "grad_norm": 0.67578125, + "learning_rate": 0.000457756024128276, + "loss": 3.1954, + "step": 4845 + }, + { + "epoch": 0.20404210526315789, + "grad_norm": 0.41015625, + "learning_rate": 0.00045773725243730505, + "loss": 3.2098, + "step": 4846 + }, + { + "epoch": 0.20408421052631578, + "grad_norm": 0.388671875, + "learning_rate": 0.0004577184769616055, + "loss": 3.0881, + "step": 4847 + }, + { + "epoch": 0.20412631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.00045769969770151933, + "loss": 3.1638, + "step": 4848 + }, + { + "epoch": 0.20416842105263158, + "grad_norm": 0.3828125, + "learning_rate": 0.00045768091465738864, + "loss": 3.2756, + "step": 4849 + }, + { + "epoch": 0.20421052631578948, + "grad_norm": 0.4375, + "learning_rate": 0.00045766212782955576, + "loss": 3.456, + "step": 4850 + }, + { + "epoch": 0.20425263157894738, + "grad_norm": 0.392578125, + "learning_rate": 0.00045764333721836286, + "loss": 3.23, + "step": 4851 + }, + { + "epoch": 0.20429473684210525, + "grad_norm": 0.4375, + "learning_rate": 0.0004576245428241524, + "loss": 3.3233, + "step": 4852 + }, + { + "epoch": 0.20433684210526315, + "grad_norm": 0.5390625, + "learning_rate": 0.0004576057446472667, + "loss": 2.8044, + "step": 4853 + }, + { + "epoch": 0.20437894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.0004575869426880482, + "loss": 3.1849, + "step": 4854 + }, + { + "epoch": 0.20442105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.0004575681369468395, + "loss": 3.3302, + "step": 4855 + }, + { + "epoch": 0.20446315789473685, + "grad_norm": 0.439453125, + "learning_rate": 0.0004575493274239833, + "loss": 3.3471, + "step": 4856 + }, + { + "epoch": 0.20450526315789475, + "grad_norm": 0.388671875, + "learning_rate": 0.0004575305141198222, + "loss": 3.2048, + "step": 4857 + }, + { + "epoch": 0.20454736842105264, + "grad_norm": 0.39453125, + "learning_rate": 0.00045751169703469897, + "loss": 3.5034, + "step": 4858 + }, + { + "epoch": 0.20458947368421052, + "grad_norm": 0.4140625, + "learning_rate": 0.0004574928761689564, + "loss": 3.8968, + "step": 4859 + }, + { + "epoch": 0.20463157894736841, + "grad_norm": 0.40625, + "learning_rate": 0.0004574740515229374, + "loss": 3.3529, + "step": 4860 + }, + { + "epoch": 0.2046736842105263, + "grad_norm": 0.470703125, + "learning_rate": 0.00045745522309698506, + "loss": 3.7565, + "step": 4861 + }, + { + "epoch": 0.2047157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0004574363908914422, + "loss": 3.8542, + "step": 4862 + }, + { + "epoch": 0.2047578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00045741755490665204, + "loss": 3.021, + "step": 4863 + }, + { + "epoch": 0.2048, + "grad_norm": 0.40625, + "learning_rate": 0.00045739871514295785, + "loss": 2.9405, + "step": 4864 + }, + { + "epoch": 0.20484210526315788, + "grad_norm": 0.435546875, + "learning_rate": 0.0004573798716007026, + "loss": 3.1506, + "step": 4865 + }, + { + "epoch": 0.20488421052631578, + "grad_norm": 0.447265625, + "learning_rate": 0.00045736102428022983, + "loss": 3.4479, + "step": 4866 + }, + { + "epoch": 0.20492631578947368, + "grad_norm": 0.404296875, + "learning_rate": 0.00045734217318188286, + "loss": 3.5769, + "step": 4867 + }, + { + "epoch": 0.20496842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.000457323318306005, + "loss": 3.5091, + "step": 4868 + }, + { + "epoch": 0.20501052631578948, + "grad_norm": 0.44921875, + "learning_rate": 0.00045730445965293996, + "loss": 3.3422, + "step": 4869 + }, + { + "epoch": 0.20505263157894738, + "grad_norm": 0.392578125, + "learning_rate": 0.00045728559722303127, + "loss": 2.8421, + "step": 4870 + }, + { + "epoch": 0.20509473684210527, + "grad_norm": 0.44921875, + "learning_rate": 0.0004572667310166225, + "loss": 3.1501, + "step": 4871 + }, + { + "epoch": 0.20513684210526315, + "grad_norm": 0.39453125, + "learning_rate": 0.0004572478610340574, + "loss": 3.6378, + "step": 4872 + }, + { + "epoch": 0.20517894736842104, + "grad_norm": 0.451171875, + "learning_rate": 0.00045722898727567984, + "loss": 3.2267, + "step": 4873 + }, + { + "epoch": 0.20522105263157894, + "grad_norm": 0.44140625, + "learning_rate": 0.00045721010974183356, + "loss": 3.6669, + "step": 4874 + }, + { + "epoch": 0.20526315789473684, + "grad_norm": 0.400390625, + "learning_rate": 0.00045719122843286254, + "loss": 3.2277, + "step": 4875 + }, + { + "epoch": 0.20530526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.00045717234334911085, + "loss": 3.5051, + "step": 4876 + }, + { + "epoch": 0.20534736842105264, + "grad_norm": 0.416015625, + "learning_rate": 0.0004571534544909224, + "loss": 3.6637, + "step": 4877 + }, + { + "epoch": 0.20538947368421054, + "grad_norm": 0.408203125, + "learning_rate": 0.0004571345618586414, + "loss": 3.2662, + "step": 4878 + }, + { + "epoch": 0.2054315789473684, + "grad_norm": 0.47265625, + "learning_rate": 0.00045711566545261214, + "loss": 3.4582, + "step": 4879 + }, + { + "epoch": 0.2054736842105263, + "grad_norm": 0.39453125, + "learning_rate": 0.0004570967652731788, + "loss": 3.3523, + "step": 4880 + }, + { + "epoch": 0.2055157894736842, + "grad_norm": 0.388671875, + "learning_rate": 0.00045707786132068563, + "loss": 3.1303, + "step": 4881 + }, + { + "epoch": 0.2055578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.0004570589535954772, + "loss": 3.6112, + "step": 4882 + }, + { + "epoch": 0.2056, + "grad_norm": 0.40234375, + "learning_rate": 0.0004570400420978979, + "loss": 3.0822, + "step": 4883 + }, + { + "epoch": 0.2056421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00045702112682829234, + "loss": 3.4396, + "step": 4884 + }, + { + "epoch": 0.2056842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.000457002207787005, + "loss": 3.0132, + "step": 4885 + }, + { + "epoch": 0.20572631578947367, + "grad_norm": 0.396484375, + "learning_rate": 0.00045698328497438073, + "loss": 3.7644, + "step": 4886 + }, + { + "epoch": 0.20576842105263157, + "grad_norm": 0.435546875, + "learning_rate": 0.0004569643583907642, + "loss": 3.7482, + "step": 4887 + }, + { + "epoch": 0.20581052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.0004569454280365002, + "loss": 3.4677, + "step": 4888 + }, + { + "epoch": 0.20585263157894737, + "grad_norm": 0.40234375, + "learning_rate": 0.0004569264939119336, + "loss": 3.2267, + "step": 4889 + }, + { + "epoch": 0.20589473684210527, + "grad_norm": 0.4453125, + "learning_rate": 0.0004569075560174095, + "loss": 3.4683, + "step": 4890 + }, + { + "epoch": 0.20593684210526317, + "grad_norm": 0.482421875, + "learning_rate": 0.00045688861435327276, + "loss": 2.7343, + "step": 4891 + }, + { + "epoch": 0.20597894736842104, + "grad_norm": 0.42578125, + "learning_rate": 0.00045686966891986865, + "loss": 3.3173, + "step": 4892 + }, + { + "epoch": 0.20602105263157894, + "grad_norm": 0.404296875, + "learning_rate": 0.0004568507197175422, + "loss": 3.542, + "step": 4893 + }, + { + "epoch": 0.20606315789473684, + "grad_norm": 0.390625, + "learning_rate": 0.0004568317667466386, + "loss": 3.409, + "step": 4894 + }, + { + "epoch": 0.20610526315789474, + "grad_norm": 0.39453125, + "learning_rate": 0.0004568128100075033, + "loss": 3.2829, + "step": 4895 + }, + { + "epoch": 0.20614736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00045679384950048167, + "loss": 3.3108, + "step": 4896 + }, + { + "epoch": 0.20618947368421053, + "grad_norm": 0.40234375, + "learning_rate": 0.00045677488522591895, + "loss": 3.0382, + "step": 4897 + }, + { + "epoch": 0.20623157894736843, + "grad_norm": 0.43359375, + "learning_rate": 0.00045675591718416075, + "loss": 3.2969, + "step": 4898 + }, + { + "epoch": 0.2062736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.00045673694537555276, + "loss": 3.7491, + "step": 4899 + }, + { + "epoch": 0.2063157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.0004567179698004405, + "loss": 3.5097, + "step": 4900 + }, + { + "epoch": 0.2063578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.0004566989904591697, + "loss": 3.8193, + "step": 4901 + }, + { + "epoch": 0.2064, + "grad_norm": 0.416015625, + "learning_rate": 0.00045668000735208617, + "loss": 3.0735, + "step": 4902 + }, + { + "epoch": 0.2064421052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.00045666102047953573, + "loss": 3.3607, + "step": 4903 + }, + { + "epoch": 0.2064842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00045664202984186433, + "loss": 3.3636, + "step": 4904 + }, + { + "epoch": 0.2065263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00045662303543941797, + "loss": 3.6543, + "step": 4905 + }, + { + "epoch": 0.20656842105263157, + "grad_norm": 0.4375, + "learning_rate": 0.00045660403727254263, + "loss": 3.4569, + "step": 4906 + }, + { + "epoch": 0.20661052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.00045658503534158444, + "loss": 3.1884, + "step": 4907 + }, + { + "epoch": 0.20665263157894737, + "grad_norm": 0.453125, + "learning_rate": 0.00045656602964688975, + "loss": 3.5567, + "step": 4908 + }, + { + "epoch": 0.20669473684210526, + "grad_norm": 0.390625, + "learning_rate": 0.0004565470201888046, + "loss": 3.2059, + "step": 4909 + }, + { + "epoch": 0.20673684210526316, + "grad_norm": 0.447265625, + "learning_rate": 0.0004565280069676755, + "loss": 2.9784, + "step": 4910 + }, + { + "epoch": 0.20677894736842106, + "grad_norm": 0.4296875, + "learning_rate": 0.0004565089899838487, + "loss": 3.223, + "step": 4911 + }, + { + "epoch": 0.20682105263157896, + "grad_norm": 0.421875, + "learning_rate": 0.00045648996923767084, + "loss": 3.146, + "step": 4912 + }, + { + "epoch": 0.20686315789473683, + "grad_norm": 0.392578125, + "learning_rate": 0.00045647094472948827, + "loss": 3.3057, + "step": 4913 + }, + { + "epoch": 0.20690526315789473, + "grad_norm": 0.431640625, + "learning_rate": 0.00045645191645964767, + "loss": 3.2634, + "step": 4914 + }, + { + "epoch": 0.20694736842105263, + "grad_norm": 0.462890625, + "learning_rate": 0.0004564328844284957, + "loss": 3.5713, + "step": 4915 + }, + { + "epoch": 0.20698947368421053, + "grad_norm": 0.404296875, + "learning_rate": 0.00045641384863637924, + "loss": 3.5453, + "step": 4916 + }, + { + "epoch": 0.20703157894736843, + "grad_norm": 0.423828125, + "learning_rate": 0.00045639480908364497, + "loss": 3.3009, + "step": 4917 + }, + { + "epoch": 0.20707368421052633, + "grad_norm": 0.4375, + "learning_rate": 0.0004563757657706397, + "loss": 3.7999, + "step": 4918 + }, + { + "epoch": 0.2071157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.0004563567186977105, + "loss": 3.432, + "step": 4919 + }, + { + "epoch": 0.2071578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00045633766786520435, + "loss": 3.0901, + "step": 4920 + }, + { + "epoch": 0.2072, + "grad_norm": 0.3828125, + "learning_rate": 0.00045631861327346835, + "loss": 3.4383, + "step": 4921 + }, + { + "epoch": 0.2072421052631579, + "grad_norm": 0.3984375, + "learning_rate": 0.0004562995549228496, + "loss": 2.9675, + "step": 4922 + }, + { + "epoch": 0.2072842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00045628049281369533, + "loss": 3.1739, + "step": 4923 + }, + { + "epoch": 0.2073263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.00045626142694635285, + "loss": 3.3017, + "step": 4924 + }, + { + "epoch": 0.2073684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.0004562423573211696, + "loss": 3.4198, + "step": 4925 + }, + { + "epoch": 0.20741052631578946, + "grad_norm": 0.46484375, + "learning_rate": 0.00045622328393849285, + "loss": 3.1396, + "step": 4926 + }, + { + "epoch": 0.20745263157894736, + "grad_norm": 0.404296875, + "learning_rate": 0.00045620420679867013, + "loss": 3.1957, + "step": 4927 + }, + { + "epoch": 0.20749473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.00045618512590204916, + "loss": 3.1129, + "step": 4928 + }, + { + "epoch": 0.20753684210526316, + "grad_norm": 0.400390625, + "learning_rate": 0.0004561660412489773, + "loss": 3.1561, + "step": 4929 + }, + { + "epoch": 0.20757894736842106, + "grad_norm": 0.41796875, + "learning_rate": 0.00045614695283980255, + "loss": 3.5342, + "step": 4930 + }, + { + "epoch": 0.20762105263157896, + "grad_norm": 0.390625, + "learning_rate": 0.0004561278606748725, + "loss": 3.1264, + "step": 4931 + }, + { + "epoch": 0.20766315789473686, + "grad_norm": 0.458984375, + "learning_rate": 0.000456108764754535, + "loss": 3.1724, + "step": 4932 + }, + { + "epoch": 0.20770526315789473, + "grad_norm": 0.3984375, + "learning_rate": 0.00045608966507913794, + "loss": 3.3363, + "step": 4933 + }, + { + "epoch": 0.20774736842105263, + "grad_norm": 0.388671875, + "learning_rate": 0.00045607056164902936, + "loss": 3.0936, + "step": 4934 + }, + { + "epoch": 0.20778947368421052, + "grad_norm": 0.41015625, + "learning_rate": 0.00045605145446455726, + "loss": 3.6235, + "step": 4935 + }, + { + "epoch": 0.20783157894736842, + "grad_norm": 0.3984375, + "learning_rate": 0.00045603234352606983, + "loss": 3.071, + "step": 4936 + }, + { + "epoch": 0.20787368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.00045601322883391505, + "loss": 3.4229, + "step": 4937 + }, + { + "epoch": 0.20791578947368422, + "grad_norm": 0.419921875, + "learning_rate": 0.0004559941103884414, + "loss": 3.5181, + "step": 4938 + }, + { + "epoch": 0.2079578947368421, + "grad_norm": 0.546875, + "learning_rate": 0.00045597498818999706, + "loss": 3.0271, + "step": 4939 + }, + { + "epoch": 0.208, + "grad_norm": 0.375, + "learning_rate": 0.0004559558622389304, + "loss": 3.0135, + "step": 4940 + }, + { + "epoch": 0.2080421052631579, + "grad_norm": 0.400390625, + "learning_rate": 0.00045593673253558996, + "loss": 3.7471, + "step": 4941 + }, + { + "epoch": 0.2080842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.0004559175990803242, + "loss": 3.1358, + "step": 4942 + }, + { + "epoch": 0.2081263157894737, + "grad_norm": 0.404296875, + "learning_rate": 0.0004558984618734818, + "loss": 3.1351, + "step": 4943 + }, + { + "epoch": 0.20816842105263159, + "grad_norm": 0.53125, + "learning_rate": 0.0004558793209154113, + "loss": 3.2653, + "step": 4944 + }, + { + "epoch": 0.20821052631578948, + "grad_norm": 0.412109375, + "learning_rate": 0.0004558601762064614, + "loss": 3.4543, + "step": 4945 + }, + { + "epoch": 0.20825263157894736, + "grad_norm": 0.466796875, + "learning_rate": 0.00045584102774698105, + "loss": 3.485, + "step": 4946 + }, + { + "epoch": 0.20829473684210525, + "grad_norm": 0.431640625, + "learning_rate": 0.000455821875537319, + "loss": 3.4329, + "step": 4947 + }, + { + "epoch": 0.20833684210526315, + "grad_norm": 0.408203125, + "learning_rate": 0.0004558027195778242, + "loss": 3.7951, + "step": 4948 + }, + { + "epoch": 0.20837894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.0004557835598688457, + "loss": 3.5596, + "step": 4949 + }, + { + "epoch": 0.20842105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.00045576439641073247, + "loss": 3.4217, + "step": 4950 + }, + { + "epoch": 0.20846315789473685, + "grad_norm": 0.419921875, + "learning_rate": 0.0004557452292038338, + "loss": 3.7025, + "step": 4951 + }, + { + "epoch": 0.20850526315789475, + "grad_norm": 0.42578125, + "learning_rate": 0.0004557260582484987, + "loss": 3.4397, + "step": 4952 + }, + { + "epoch": 0.20854736842105262, + "grad_norm": 0.40234375, + "learning_rate": 0.0004557068835450765, + "loss": 3.3109, + "step": 4953 + }, + { + "epoch": 0.20858947368421052, + "grad_norm": 0.404296875, + "learning_rate": 0.00045568770509391666, + "loss": 3.5784, + "step": 4954 + }, + { + "epoch": 0.20863157894736842, + "grad_norm": 0.3984375, + "learning_rate": 0.00045566852289536845, + "loss": 3.5591, + "step": 4955 + }, + { + "epoch": 0.20867368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.00045564933694978147, + "loss": 3.0718, + "step": 4956 + }, + { + "epoch": 0.20871578947368422, + "grad_norm": 0.451171875, + "learning_rate": 0.0004556301472575052, + "loss": 3.4293, + "step": 4957 + }, + { + "epoch": 0.20875789473684211, + "grad_norm": 0.48046875, + "learning_rate": 0.00045561095381888917, + "loss": 3.6398, + "step": 4958 + }, + { + "epoch": 0.2088, + "grad_norm": 0.41796875, + "learning_rate": 0.0004555917566342832, + "loss": 3.1091, + "step": 4959 + }, + { + "epoch": 0.20884210526315788, + "grad_norm": 0.4375, + "learning_rate": 0.000455572555704037, + "loss": 3.0131, + "step": 4960 + }, + { + "epoch": 0.20888421052631578, + "grad_norm": 0.439453125, + "learning_rate": 0.0004555533510285003, + "loss": 3.2339, + "step": 4961 + }, + { + "epoch": 0.20892631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.0004555341426080231, + "loss": 3.3304, + "step": 4962 + }, + { + "epoch": 0.20896842105263158, + "grad_norm": 0.4609375, + "learning_rate": 0.00045551493044295534, + "loss": 3.1964, + "step": 4963 + }, + { + "epoch": 0.20901052631578948, + "grad_norm": 0.4296875, + "learning_rate": 0.00045549571453364703, + "loss": 3.4942, + "step": 4964 + }, + { + "epoch": 0.20905263157894738, + "grad_norm": 0.416015625, + "learning_rate": 0.00045547649488044816, + "loss": 3.5193, + "step": 4965 + }, + { + "epoch": 0.20909473684210525, + "grad_norm": 0.416015625, + "learning_rate": 0.0004554572714837091, + "loss": 3.3332, + "step": 4966 + }, + { + "epoch": 0.20913684210526315, + "grad_norm": 0.408203125, + "learning_rate": 0.00045543804434377985, + "loss": 3.3037, + "step": 4967 + }, + { + "epoch": 0.20917894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.00045541881346101087, + "loss": 3.0467, + "step": 4968 + }, + { + "epoch": 0.20922105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.00045539957883575244, + "loss": 3.2079, + "step": 4969 + }, + { + "epoch": 0.20926315789473685, + "grad_norm": 0.4453125, + "learning_rate": 0.00045538034046835503, + "loss": 3.2942, + "step": 4970 + }, + { + "epoch": 0.20930526315789474, + "grad_norm": 0.40234375, + "learning_rate": 0.0004553610983591691, + "loss": 3.3587, + "step": 4971 + }, + { + "epoch": 0.20934736842105264, + "grad_norm": 0.42578125, + "learning_rate": 0.0004553418525085453, + "loss": 3.3851, + "step": 4972 + }, + { + "epoch": 0.20938947368421051, + "grad_norm": 0.439453125, + "learning_rate": 0.00045532260291683416, + "loss": 3.4295, + "step": 4973 + }, + { + "epoch": 0.2094315789473684, + "grad_norm": 0.419921875, + "learning_rate": 0.00045530334958438645, + "loss": 3.2109, + "step": 4974 + }, + { + "epoch": 0.2094736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.000455284092511553, + "loss": 3.3413, + "step": 4975 + }, + { + "epoch": 0.2095157894736842, + "grad_norm": 0.45703125, + "learning_rate": 0.0004552648316986845, + "loss": 3.4819, + "step": 4976 + }, + { + "epoch": 0.2095578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.000455245567146132, + "loss": 3.3589, + "step": 4977 + }, + { + "epoch": 0.2096, + "grad_norm": 0.478515625, + "learning_rate": 0.0004552262988542464, + "loss": 3.427, + "step": 4978 + }, + { + "epoch": 0.2096421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00045520702682337876, + "loss": 3.5033, + "step": 4979 + }, + { + "epoch": 0.20968421052631578, + "grad_norm": 0.478515625, + "learning_rate": 0.0004551877510538802, + "loss": 3.6604, + "step": 4980 + }, + { + "epoch": 0.20972631578947368, + "grad_norm": 0.40625, + "learning_rate": 0.00045516847154610185, + "loss": 3.6099, + "step": 4981 + }, + { + "epoch": 0.20976842105263158, + "grad_norm": 0.376953125, + "learning_rate": 0.0004551491883003951, + "loss": 3.2591, + "step": 4982 + }, + { + "epoch": 0.20981052631578948, + "grad_norm": 0.419921875, + "learning_rate": 0.00045512990131711114, + "loss": 3.6699, + "step": 4983 + }, + { + "epoch": 0.20985263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00045511061059660144, + "loss": 3.2984, + "step": 4984 + }, + { + "epoch": 0.20989473684210527, + "grad_norm": 0.400390625, + "learning_rate": 0.00045509131613921726, + "loss": 3.3488, + "step": 4985 + }, + { + "epoch": 0.20993684210526317, + "grad_norm": 0.453125, + "learning_rate": 0.00045507201794531044, + "loss": 3.7559, + "step": 4986 + }, + { + "epoch": 0.20997894736842104, + "grad_norm": 0.404296875, + "learning_rate": 0.00045505271601523235, + "loss": 3.5581, + "step": 4987 + }, + { + "epoch": 0.21002105263157894, + "grad_norm": 0.427734375, + "learning_rate": 0.0004550334103493347, + "loss": 3.7323, + "step": 4988 + }, + { + "epoch": 0.21006315789473684, + "grad_norm": 0.390625, + "learning_rate": 0.00045501410094796917, + "loss": 3.2852, + "step": 4989 + }, + { + "epoch": 0.21010526315789474, + "grad_norm": 0.392578125, + "learning_rate": 0.00045499478781148766, + "loss": 3.7183, + "step": 4990 + }, + { + "epoch": 0.21014736842105264, + "grad_norm": 0.435546875, + "learning_rate": 0.000454975470940242, + "loss": 3.9877, + "step": 4991 + }, + { + "epoch": 0.21018947368421054, + "grad_norm": 0.443359375, + "learning_rate": 0.00045495615033458404, + "loss": 3.6588, + "step": 4992 + }, + { + "epoch": 0.2102315789473684, + "grad_norm": 0.458984375, + "learning_rate": 0.00045493682599486584, + "loss": 2.7438, + "step": 4993 + }, + { + "epoch": 0.2102736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.0004549174979214394, + "loss": 3.5225, + "step": 4994 + }, + { + "epoch": 0.2103157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.00045489816611465704, + "loss": 3.0684, + "step": 4995 + }, + { + "epoch": 0.2103578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0004548788305748708, + "loss": 3.4274, + "step": 4996 + }, + { + "epoch": 0.2104, + "grad_norm": 0.40625, + "learning_rate": 0.000454859491302433, + "loss": 3.198, + "step": 4997 + }, + { + "epoch": 0.2104421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.00045484014829769593, + "loss": 3.8203, + "step": 4998 + }, + { + "epoch": 0.2104842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.000454820801561012, + "loss": 3.8871, + "step": 4999 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.41015625, + "learning_rate": 0.00045480145109273387, + "loss": 3.4842, + "step": 5000 + }, + { + "epoch": 0.21056842105263157, + "grad_norm": 0.416015625, + "learning_rate": 0.0004547820968932138, + "loss": 3.4447, + "step": 5001 + }, + { + "epoch": 0.21061052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.0004547627389628045, + "loss": 3.35, + "step": 5002 + }, + { + "epoch": 0.21065263157894737, + "grad_norm": 0.5234375, + "learning_rate": 0.0004547433773018588, + "loss": 3.1837, + "step": 5003 + }, + { + "epoch": 0.21069473684210527, + "grad_norm": 0.41796875, + "learning_rate": 0.00045472401191072934, + "loss": 3.2302, + "step": 5004 + }, + { + "epoch": 0.21073684210526317, + "grad_norm": 0.416015625, + "learning_rate": 0.00045470464278976873, + "loss": 3.1655, + "step": 5005 + }, + { + "epoch": 0.21077894736842107, + "grad_norm": 0.42578125, + "learning_rate": 0.00045468526993933025, + "loss": 3.4476, + "step": 5006 + }, + { + "epoch": 0.21082105263157894, + "grad_norm": 0.400390625, + "learning_rate": 0.0004546658933597666, + "loss": 3.1952, + "step": 5007 + }, + { + "epoch": 0.21086315789473684, + "grad_norm": 0.462890625, + "learning_rate": 0.0004546465130514308, + "loss": 2.9211, + "step": 5008 + }, + { + "epoch": 0.21090526315789473, + "grad_norm": 0.6015625, + "learning_rate": 0.00045462712901467593, + "loss": 3.5233, + "step": 5009 + }, + { + "epoch": 0.21094736842105263, + "grad_norm": 0.4453125, + "learning_rate": 0.0004546077412498553, + "loss": 3.2, + "step": 5010 + }, + { + "epoch": 0.21098947368421053, + "grad_norm": 0.392578125, + "learning_rate": 0.000454588349757322, + "loss": 3.2371, + "step": 5011 + }, + { + "epoch": 0.21103157894736843, + "grad_norm": 0.455078125, + "learning_rate": 0.0004545689545374294, + "loss": 3.3118, + "step": 5012 + }, + { + "epoch": 0.2110736842105263, + "grad_norm": 0.400390625, + "learning_rate": 0.00045454955559053067, + "loss": 2.7364, + "step": 5013 + }, + { + "epoch": 0.2111157894736842, + "grad_norm": 0.392578125, + "learning_rate": 0.0004545301529169795, + "loss": 3.2338, + "step": 5014 + }, + { + "epoch": 0.2111578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.0004545107465171292, + "loss": 3.6475, + "step": 5015 + }, + { + "epoch": 0.2112, + "grad_norm": 0.427734375, + "learning_rate": 0.0004544913363913334, + "loss": 3.2459, + "step": 5016 + }, + { + "epoch": 0.2112421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.0004544719225399457, + "loss": 3.4978, + "step": 5017 + }, + { + "epoch": 0.2112842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0004544525049633199, + "loss": 3.5852, + "step": 5018 + }, + { + "epoch": 0.2113263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.0004544330836618096, + "loss": 3.4177, + "step": 5019 + }, + { + "epoch": 0.21136842105263157, + "grad_norm": 0.5703125, + "learning_rate": 0.00045441365863576873, + "loss": 3.4454, + "step": 5020 + }, + { + "epoch": 0.21141052631578947, + "grad_norm": 0.408203125, + "learning_rate": 0.0004543942298855512, + "loss": 3.3222, + "step": 5021 + }, + { + "epoch": 0.21145263157894736, + "grad_norm": 0.453125, + "learning_rate": 0.0004543747974115109, + "loss": 3.0539, + "step": 5022 + }, + { + "epoch": 0.21149473684210526, + "grad_norm": 0.396484375, + "learning_rate": 0.00045435536121400194, + "loss": 3.5983, + "step": 5023 + }, + { + "epoch": 0.21153684210526316, + "grad_norm": 0.400390625, + "learning_rate": 0.00045433592129337844, + "loss": 3.5145, + "step": 5024 + }, + { + "epoch": 0.21157894736842106, + "grad_norm": 0.40625, + "learning_rate": 0.0004543164776499945, + "loss": 3.4578, + "step": 5025 + }, + { + "epoch": 0.21162105263157896, + "grad_norm": 0.41015625, + "learning_rate": 0.00045429703028420444, + "loss": 3.4846, + "step": 5026 + }, + { + "epoch": 0.21166315789473683, + "grad_norm": 0.431640625, + "learning_rate": 0.0004542775791963625, + "loss": 3.6145, + "step": 5027 + }, + { + "epoch": 0.21170526315789473, + "grad_norm": 0.41015625, + "learning_rate": 0.00045425812438682313, + "loss": 3.1894, + "step": 5028 + }, + { + "epoch": 0.21174736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.0004542386658559406, + "loss": 3.3676, + "step": 5029 + }, + { + "epoch": 0.21178947368421053, + "grad_norm": 0.400390625, + "learning_rate": 0.00045421920360406965, + "loss": 3.5897, + "step": 5030 + }, + { + "epoch": 0.21183157894736843, + "grad_norm": 0.40234375, + "learning_rate": 0.00045419973763156474, + "loss": 3.6415, + "step": 5031 + }, + { + "epoch": 0.21187368421052633, + "grad_norm": 0.4140625, + "learning_rate": 0.0004541802679387806, + "loss": 3.4499, + "step": 5032 + }, + { + "epoch": 0.21191578947368422, + "grad_norm": 0.423828125, + "learning_rate": 0.0004541607945260718, + "loss": 3.0771, + "step": 5033 + }, + { + "epoch": 0.2119578947368421, + "grad_norm": 0.384765625, + "learning_rate": 0.0004541413173937932, + "loss": 3.5545, + "step": 5034 + }, + { + "epoch": 0.212, + "grad_norm": 0.37890625, + "learning_rate": 0.00045412183654229966, + "loss": 3.219, + "step": 5035 + }, + { + "epoch": 0.2120421052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.0004541023519719462, + "loss": 3.4325, + "step": 5036 + }, + { + "epoch": 0.2120842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.0004540828636830875, + "loss": 3.3257, + "step": 5037 + }, + { + "epoch": 0.2121263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.0004540633716760789, + "loss": 3.0871, + "step": 5038 + }, + { + "epoch": 0.2121684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0004540438759512755, + "loss": 3.248, + "step": 5039 + }, + { + "epoch": 0.21221052631578946, + "grad_norm": 0.421875, + "learning_rate": 0.00045402437650903237, + "loss": 3.2909, + "step": 5040 + }, + { + "epoch": 0.21225263157894736, + "grad_norm": 0.42578125, + "learning_rate": 0.0004540048733497048, + "loss": 3.6888, + "step": 5041 + }, + { + "epoch": 0.21229473684210526, + "grad_norm": 0.490234375, + "learning_rate": 0.0004539853664736482, + "loss": 3.2015, + "step": 5042 + }, + { + "epoch": 0.21233684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.0004539658558812178, + "loss": 3.1744, + "step": 5043 + }, + { + "epoch": 0.21237894736842106, + "grad_norm": 0.4375, + "learning_rate": 0.0004539463415727692, + "loss": 3.5649, + "step": 5044 + }, + { + "epoch": 0.21242105263157895, + "grad_norm": 0.396484375, + "learning_rate": 0.0004539268235486579, + "loss": 3.6747, + "step": 5045 + }, + { + "epoch": 0.21246315789473685, + "grad_norm": 0.80078125, + "learning_rate": 0.00045390730180923947, + "loss": 3.2573, + "step": 5046 + }, + { + "epoch": 0.21250526315789472, + "grad_norm": 0.5546875, + "learning_rate": 0.00045388777635486954, + "loss": 2.9376, + "step": 5047 + }, + { + "epoch": 0.21254736842105262, + "grad_norm": 0.42578125, + "learning_rate": 0.0004538682471859039, + "loss": 3.3826, + "step": 5048 + }, + { + "epoch": 0.21258947368421052, + "grad_norm": 0.416015625, + "learning_rate": 0.00045384871430269834, + "loss": 3.7058, + "step": 5049 + }, + { + "epoch": 0.21263157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.00045382917770560885, + "loss": 3.041, + "step": 5050 + }, + { + "epoch": 0.21267368421052632, + "grad_norm": 0.40625, + "learning_rate": 0.000453809637394991, + "loss": 2.7356, + "step": 5051 + }, + { + "epoch": 0.21271578947368422, + "grad_norm": 0.400390625, + "learning_rate": 0.0004537900933712012, + "loss": 2.9815, + "step": 5052 + }, + { + "epoch": 0.21275789473684212, + "grad_norm": 0.53125, + "learning_rate": 0.0004537705456345953, + "loss": 3.2517, + "step": 5053 + }, + { + "epoch": 0.2128, + "grad_norm": 0.4609375, + "learning_rate": 0.00045375099418552947, + "loss": 3.5416, + "step": 5054 + }, + { + "epoch": 0.2128421052631579, + "grad_norm": 0.38671875, + "learning_rate": 0.00045373143902435986, + "loss": 3.5455, + "step": 5055 + }, + { + "epoch": 0.2128842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.0004537118801514428, + "loss": 3.2139, + "step": 5056 + }, + { + "epoch": 0.21292631578947369, + "grad_norm": 0.40625, + "learning_rate": 0.00045369231756713476, + "loss": 2.9617, + "step": 5057 + }, + { + "epoch": 0.21296842105263158, + "grad_norm": 0.45703125, + "learning_rate": 0.00045367275127179197, + "loss": 3.3845, + "step": 5058 + }, + { + "epoch": 0.21301052631578948, + "grad_norm": 0.451171875, + "learning_rate": 0.00045365318126577094, + "loss": 3.5087, + "step": 5059 + }, + { + "epoch": 0.21305263157894735, + "grad_norm": 0.392578125, + "learning_rate": 0.0004536336075494282, + "loss": 3.1787, + "step": 5060 + }, + { + "epoch": 0.21309473684210525, + "grad_norm": 0.412109375, + "learning_rate": 0.00045361403012312044, + "loss": 3.0566, + "step": 5061 + }, + { + "epoch": 0.21313684210526315, + "grad_norm": 0.4609375, + "learning_rate": 0.0004535944489872043, + "loss": 3.137, + "step": 5062 + }, + { + "epoch": 0.21317894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.0004535748641420364, + "loss": 3.7776, + "step": 5063 + }, + { + "epoch": 0.21322105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.0004535552755879738, + "loss": 3.4562, + "step": 5064 + }, + { + "epoch": 0.21326315789473685, + "grad_norm": 0.435546875, + "learning_rate": 0.0004535356833253732, + "loss": 3.5042, + "step": 5065 + }, + { + "epoch": 0.21330526315789475, + "grad_norm": 0.46875, + "learning_rate": 0.00045351608735459155, + "loss": 3.3419, + "step": 5066 + }, + { + "epoch": 0.21334736842105262, + "grad_norm": 0.388671875, + "learning_rate": 0.0004534964876759859, + "loss": 3.6556, + "step": 5067 + }, + { + "epoch": 0.21338947368421052, + "grad_norm": 0.396484375, + "learning_rate": 0.0004534768842899134, + "loss": 3.3249, + "step": 5068 + }, + { + "epoch": 0.21343157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00045345727719673116, + "loss": 3.4426, + "step": 5069 + }, + { + "epoch": 0.21347368421052632, + "grad_norm": 0.41015625, + "learning_rate": 0.0004534376663967964, + "loss": 3.2209, + "step": 5070 + }, + { + "epoch": 0.21351578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.00045341805189046634, + "loss": 3.4591, + "step": 5071 + }, + { + "epoch": 0.2135578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.0004533984336780984, + "loss": 3.2081, + "step": 5072 + }, + { + "epoch": 0.2136, + "grad_norm": 0.40625, + "learning_rate": 0.0004533788117600499, + "loss": 3.2632, + "step": 5073 + }, + { + "epoch": 0.21364210526315788, + "grad_norm": 0.421875, + "learning_rate": 0.00045335918613667847, + "loss": 3.4523, + "step": 5074 + }, + { + "epoch": 0.21368421052631578, + "grad_norm": 0.490234375, + "learning_rate": 0.00045333955680834165, + "loss": 3.5172, + "step": 5075 + }, + { + "epoch": 0.21372631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.000453319923775397, + "loss": 3.3143, + "step": 5076 + }, + { + "epoch": 0.21376842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.0004533002870382022, + "loss": 3.6705, + "step": 5077 + }, + { + "epoch": 0.21381052631578948, + "grad_norm": 0.423828125, + "learning_rate": 0.00045328064659711507, + "loss": 3.6537, + "step": 5078 + }, + { + "epoch": 0.21385263157894738, + "grad_norm": 0.447265625, + "learning_rate": 0.00045326100245249334, + "loss": 2.9073, + "step": 5079 + }, + { + "epoch": 0.21389473684210528, + "grad_norm": 0.58984375, + "learning_rate": 0.00045324135460469506, + "loss": 3.3215, + "step": 5080 + }, + { + "epoch": 0.21393684210526315, + "grad_norm": 0.443359375, + "learning_rate": 0.0004532217030540781, + "loss": 3.5495, + "step": 5081 + }, + { + "epoch": 0.21397894736842105, + "grad_norm": 0.404296875, + "learning_rate": 0.0004532020478010004, + "loss": 3.2773, + "step": 5082 + }, + { + "epoch": 0.21402105263157895, + "grad_norm": 0.408203125, + "learning_rate": 0.0004531823888458201, + "loss": 3.427, + "step": 5083 + }, + { + "epoch": 0.21406315789473684, + "grad_norm": 0.396484375, + "learning_rate": 0.00045316272618889554, + "loss": 3.2485, + "step": 5084 + }, + { + "epoch": 0.21410526315789474, + "grad_norm": 0.53125, + "learning_rate": 0.00045314305983058475, + "loss": 3.1793, + "step": 5085 + }, + { + "epoch": 0.21414736842105264, + "grad_norm": 0.54296875, + "learning_rate": 0.0004531233897712461, + "loss": 3.2069, + "step": 5086 + }, + { + "epoch": 0.2141894736842105, + "grad_norm": 0.455078125, + "learning_rate": 0.0004531037160112379, + "loss": 3.405, + "step": 5087 + }, + { + "epoch": 0.2142315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.0004530840385509187, + "loss": 3.4204, + "step": 5088 + }, + { + "epoch": 0.2142736842105263, + "grad_norm": 0.453125, + "learning_rate": 0.0004530643573906469, + "loss": 3.1079, + "step": 5089 + }, + { + "epoch": 0.2143157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.0004530446725307811, + "loss": 3.5732, + "step": 5090 + }, + { + "epoch": 0.2143578947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.00045302498397168, + "loss": 2.8091, + "step": 5091 + }, + { + "epoch": 0.2144, + "grad_norm": 0.412109375, + "learning_rate": 0.0004530052917137022, + "loss": 3.1624, + "step": 5092 + }, + { + "epoch": 0.2144421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.0004529855957572064, + "loss": 3.3619, + "step": 5093 + }, + { + "epoch": 0.21448421052631578, + "grad_norm": 0.455078125, + "learning_rate": 0.0004529658961025517, + "loss": 2.9494, + "step": 5094 + }, + { + "epoch": 0.21452631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.0004529461927500967, + "loss": 3.6869, + "step": 5095 + }, + { + "epoch": 0.21456842105263157, + "grad_norm": 0.419921875, + "learning_rate": 0.0004529264857002007, + "loss": 3.4809, + "step": 5096 + }, + { + "epoch": 0.21461052631578947, + "grad_norm": 0.4609375, + "learning_rate": 0.0004529067749532224, + "loss": 2.87, + "step": 5097 + }, + { + "epoch": 0.21465263157894737, + "grad_norm": 0.404296875, + "learning_rate": 0.00045288706050952113, + "loss": 3.2988, + "step": 5098 + }, + { + "epoch": 0.21469473684210527, + "grad_norm": 0.462890625, + "learning_rate": 0.00045286734236945605, + "loss": 3.6759, + "step": 5099 + }, + { + "epoch": 0.21473684210526317, + "grad_norm": 0.41796875, + "learning_rate": 0.0004528476205333862, + "loss": 3.4897, + "step": 5100 + }, + { + "epoch": 0.21477894736842104, + "grad_norm": 0.419921875, + "learning_rate": 0.0004528278950016712, + "loss": 3.3378, + "step": 5101 + }, + { + "epoch": 0.21482105263157894, + "grad_norm": 0.39453125, + "learning_rate": 0.0004528081657746702, + "loss": 3.4187, + "step": 5102 + }, + { + "epoch": 0.21486315789473684, + "grad_norm": 0.53125, + "learning_rate": 0.00045278843285274266, + "loss": 3.4823, + "step": 5103 + }, + { + "epoch": 0.21490526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.0004527686962362482, + "loss": 3.1691, + "step": 5104 + }, + { + "epoch": 0.21494736842105264, + "grad_norm": 0.443359375, + "learning_rate": 0.00045274895592554634, + "loss": 3.2961, + "step": 5105 + }, + { + "epoch": 0.21498947368421054, + "grad_norm": 0.44921875, + "learning_rate": 0.0004527292119209967, + "loss": 2.6684, + "step": 5106 + }, + { + "epoch": 0.21503157894736843, + "grad_norm": 0.439453125, + "learning_rate": 0.00045270946422295904, + "loss": 3.134, + "step": 5107 + }, + { + "epoch": 0.2150736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.0004526897128317931, + "loss": 3.2913, + "step": 5108 + }, + { + "epoch": 0.2151157894736842, + "grad_norm": 0.82421875, + "learning_rate": 0.0004526699577478588, + "loss": 3.2627, + "step": 5109 + }, + { + "epoch": 0.2151578947368421, + "grad_norm": 0.484375, + "learning_rate": 0.0004526501989715159, + "loss": 3.5957, + "step": 5110 + }, + { + "epoch": 0.2152, + "grad_norm": 0.404296875, + "learning_rate": 0.00045263043650312453, + "loss": 3.3444, + "step": 5111 + }, + { + "epoch": 0.2152421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.0004526106703430447, + "loss": 3.3949, + "step": 5112 + }, + { + "epoch": 0.2152842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.0004525909004916365, + "loss": 3.7753, + "step": 5113 + }, + { + "epoch": 0.21532631578947367, + "grad_norm": 0.46484375, + "learning_rate": 0.00045257112694926005, + "loss": 3.4975, + "step": 5114 + }, + { + "epoch": 0.21536842105263157, + "grad_norm": 0.4140625, + "learning_rate": 0.0004525513497162758, + "loss": 3.5303, + "step": 5115 + }, + { + "epoch": 0.21541052631578947, + "grad_norm": 0.390625, + "learning_rate": 0.0004525315687930439, + "loss": 3.1494, + "step": 5116 + }, + { + "epoch": 0.21545263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.0004525117841799248, + "loss": 3.5098, + "step": 5117 + }, + { + "epoch": 0.21549473684210527, + "grad_norm": 0.41015625, + "learning_rate": 0.0004524919958772788, + "loss": 3.4156, + "step": 5118 + }, + { + "epoch": 0.21553684210526317, + "grad_norm": 0.40234375, + "learning_rate": 0.0004524722038854667, + "loss": 3.4661, + "step": 5119 + }, + { + "epoch": 0.21557894736842106, + "grad_norm": 0.3984375, + "learning_rate": 0.00045245240820484894, + "loss": 3.7576, + "step": 5120 + }, + { + "epoch": 0.21562105263157894, + "grad_norm": 0.4296875, + "learning_rate": 0.0004524326088357862, + "loss": 3.0054, + "step": 5121 + }, + { + "epoch": 0.21566315789473683, + "grad_norm": 0.416015625, + "learning_rate": 0.0004524128057786391, + "loss": 3.3508, + "step": 5122 + }, + { + "epoch": 0.21570526315789473, + "grad_norm": 0.40625, + "learning_rate": 0.0004523929990337685, + "loss": 3.0819, + "step": 5123 + }, + { + "epoch": 0.21574736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.00045237318860153527, + "loss": 3.3521, + "step": 5124 + }, + { + "epoch": 0.21578947368421053, + "grad_norm": 0.51953125, + "learning_rate": 0.00045235337448230027, + "loss": 3.0247, + "step": 5125 + }, + { + "epoch": 0.21583157894736843, + "grad_norm": 0.421875, + "learning_rate": 0.0004523335566764246, + "loss": 3.7046, + "step": 5126 + }, + { + "epoch": 0.21587368421052633, + "grad_norm": 0.38671875, + "learning_rate": 0.00045231373518426934, + "loss": 3.3787, + "step": 5127 + }, + { + "epoch": 0.2159157894736842, + "grad_norm": 0.45703125, + "learning_rate": 0.00045229391000619544, + "loss": 3.2496, + "step": 5128 + }, + { + "epoch": 0.2159578947368421, + "grad_norm": 0.39453125, + "learning_rate": 0.0004522740811425642, + "loss": 3.1712, + "step": 5129 + }, + { + "epoch": 0.216, + "grad_norm": 0.455078125, + "learning_rate": 0.0004522542485937369, + "loss": 3.0353, + "step": 5130 + }, + { + "epoch": 0.2160421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.0004522344123600748, + "loss": 3.213, + "step": 5131 + }, + { + "epoch": 0.2160842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00045221457244193933, + "loss": 3.7437, + "step": 5132 + }, + { + "epoch": 0.2161263157894737, + "grad_norm": 0.5390625, + "learning_rate": 0.00045219472883969194, + "loss": 3.1857, + "step": 5133 + }, + { + "epoch": 0.21616842105263157, + "grad_norm": 0.38671875, + "learning_rate": 0.00045217488155369415, + "loss": 3.3486, + "step": 5134 + }, + { + "epoch": 0.21621052631578946, + "grad_norm": 0.412109375, + "learning_rate": 0.00045215503058430753, + "loss": 3.2492, + "step": 5135 + }, + { + "epoch": 0.21625263157894736, + "grad_norm": 0.47265625, + "learning_rate": 0.0004521351759318938, + "loss": 3.6465, + "step": 5136 + }, + { + "epoch": 0.21629473684210526, + "grad_norm": 0.53515625, + "learning_rate": 0.0004521153175968147, + "loss": 3.0847, + "step": 5137 + }, + { + "epoch": 0.21633684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.00045209545557943194, + "loss": 3.0244, + "step": 5138 + }, + { + "epoch": 0.21637894736842106, + "grad_norm": 0.419921875, + "learning_rate": 0.00045207558988010746, + "loss": 3.5579, + "step": 5139 + }, + { + "epoch": 0.21642105263157896, + "grad_norm": 0.43359375, + "learning_rate": 0.00045205572049920314, + "loss": 3.1917, + "step": 5140 + }, + { + "epoch": 0.21646315789473683, + "grad_norm": 0.43359375, + "learning_rate": 0.00045203584743708106, + "loss": 3.401, + "step": 5141 + }, + { + "epoch": 0.21650526315789473, + "grad_norm": 0.412109375, + "learning_rate": 0.00045201597069410317, + "loss": 3.3406, + "step": 5142 + }, + { + "epoch": 0.21654736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.0004519960902706316, + "loss": 3.3438, + "step": 5143 + }, + { + "epoch": 0.21658947368421053, + "grad_norm": 0.40625, + "learning_rate": 0.00045197620616702864, + "loss": 3.2168, + "step": 5144 + }, + { + "epoch": 0.21663157894736843, + "grad_norm": 0.412109375, + "learning_rate": 0.00045195631838365656, + "loss": 3.2245, + "step": 5145 + }, + { + "epoch": 0.21667368421052632, + "grad_norm": 0.404296875, + "learning_rate": 0.00045193642692087767, + "loss": 3.2725, + "step": 5146 + }, + { + "epoch": 0.21671578947368422, + "grad_norm": 0.44140625, + "learning_rate": 0.00045191653177905424, + "loss": 3.8455, + "step": 5147 + }, + { + "epoch": 0.2167578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.0004518966329585489, + "loss": 3.3189, + "step": 5148 + }, + { + "epoch": 0.2168, + "grad_norm": 0.453125, + "learning_rate": 0.0004518767304597241, + "loss": 3.0322, + "step": 5149 + }, + { + "epoch": 0.2168421052631579, + "grad_norm": 0.5078125, + "learning_rate": 0.0004518568242829425, + "loss": 3.2413, + "step": 5150 + }, + { + "epoch": 0.2168842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.0004518369144285668, + "loss": 3.7623, + "step": 5151 + }, + { + "epoch": 0.2169263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.00045181700089695956, + "loss": 3.2499, + "step": 5152 + }, + { + "epoch": 0.2169684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.0004517970836884837, + "loss": 3.6117, + "step": 5153 + }, + { + "epoch": 0.2170105263157895, + "grad_norm": 0.470703125, + "learning_rate": 0.0004517771628035021, + "loss": 3.2788, + "step": 5154 + }, + { + "epoch": 0.21705263157894736, + "grad_norm": 0.416015625, + "learning_rate": 0.0004517572382423777, + "loss": 3.7973, + "step": 5155 + }, + { + "epoch": 0.21709473684210526, + "grad_norm": 0.40625, + "learning_rate": 0.00045173731000547346, + "loss": 3.312, + "step": 5156 + }, + { + "epoch": 0.21713684210526316, + "grad_norm": 0.4453125, + "learning_rate": 0.00045171737809315246, + "loss": 3.3567, + "step": 5157 + }, + { + "epoch": 0.21717894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.0004516974425057778, + "loss": 3.1575, + "step": 5158 + }, + { + "epoch": 0.21722105263157895, + "grad_norm": 0.474609375, + "learning_rate": 0.0004516775032437128, + "loss": 3.251, + "step": 5159 + }, + { + "epoch": 0.21726315789473685, + "grad_norm": 0.40234375, + "learning_rate": 0.00045165756030732056, + "loss": 3.3178, + "step": 5160 + }, + { + "epoch": 0.21730526315789472, + "grad_norm": 0.400390625, + "learning_rate": 0.00045163761369696457, + "loss": 3.0763, + "step": 5161 + }, + { + "epoch": 0.21734736842105262, + "grad_norm": 0.458984375, + "learning_rate": 0.00045161766341300815, + "loss": 3.2707, + "step": 5162 + }, + { + "epoch": 0.21738947368421052, + "grad_norm": 0.5234375, + "learning_rate": 0.0004515977094558148, + "loss": 3.491, + "step": 5163 + }, + { + "epoch": 0.21743157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.0004515777518257481, + "loss": 3.0893, + "step": 5164 + }, + { + "epoch": 0.21747368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.00045155779052317155, + "loss": 3.0381, + "step": 5165 + }, + { + "epoch": 0.21751578947368422, + "grad_norm": 0.4609375, + "learning_rate": 0.00045153782554844884, + "loss": 3.355, + "step": 5166 + }, + { + "epoch": 0.21755789473684212, + "grad_norm": 0.4453125, + "learning_rate": 0.0004515178569019438, + "loss": 2.8812, + "step": 5167 + }, + { + "epoch": 0.2176, + "grad_norm": 0.41015625, + "learning_rate": 0.0004514978845840202, + "loss": 2.6799, + "step": 5168 + }, + { + "epoch": 0.2176421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00045147790859504186, + "loss": 3.5175, + "step": 5169 + }, + { + "epoch": 0.21768421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.00045145792893537274, + "loss": 3.2856, + "step": 5170 + }, + { + "epoch": 0.21772631578947368, + "grad_norm": 0.46484375, + "learning_rate": 0.0004514379456053769, + "loss": 3.003, + "step": 5171 + }, + { + "epoch": 0.21776842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.00045141795860541837, + "loss": 3.3975, + "step": 5172 + }, + { + "epoch": 0.21781052631578948, + "grad_norm": 0.39453125, + "learning_rate": 0.00045139796793586127, + "loss": 3.1275, + "step": 5173 + }, + { + "epoch": 0.21785263157894738, + "grad_norm": 0.408203125, + "learning_rate": 0.00045137797359706976, + "loss": 3.5616, + "step": 5174 + }, + { + "epoch": 0.21789473684210525, + "grad_norm": 0.443359375, + "learning_rate": 0.0004513579755894083, + "loss": 3.2332, + "step": 5175 + }, + { + "epoch": 0.21793684210526315, + "grad_norm": 0.462890625, + "learning_rate": 0.0004513379739132411, + "loss": 3.2272, + "step": 5176 + }, + { + "epoch": 0.21797894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.0004513179685689325, + "loss": 3.0212, + "step": 5177 + }, + { + "epoch": 0.21802105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00045129795955684704, + "loss": 3.3967, + "step": 5178 + }, + { + "epoch": 0.21806315789473685, + "grad_norm": 0.462890625, + "learning_rate": 0.00045127794687734935, + "loss": 3.3554, + "step": 5179 + }, + { + "epoch": 0.21810526315789475, + "grad_norm": 0.89453125, + "learning_rate": 0.0004512579305308039, + "loss": 3.2028, + "step": 5180 + }, + { + "epoch": 0.21814736842105265, + "grad_norm": 0.458984375, + "learning_rate": 0.00045123791051757546, + "loss": 3.3141, + "step": 5181 + }, + { + "epoch": 0.21818947368421052, + "grad_norm": 0.474609375, + "learning_rate": 0.0004512178868380287, + "loss": 3.7758, + "step": 5182 + }, + { + "epoch": 0.21823157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0004511978594925285, + "loss": 3.1238, + "step": 5183 + }, + { + "epoch": 0.21827368421052631, + "grad_norm": 0.40625, + "learning_rate": 0.0004511778284814397, + "loss": 2.844, + "step": 5184 + }, + { + "epoch": 0.2183157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00045115779380512725, + "loss": 3.4275, + "step": 5185 + }, + { + "epoch": 0.2183578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0004511377554639561, + "loss": 3.464, + "step": 5186 + }, + { + "epoch": 0.2184, + "grad_norm": 0.60546875, + "learning_rate": 0.00045111771345829137, + "loss": 3.1737, + "step": 5187 + }, + { + "epoch": 0.21844210526315788, + "grad_norm": 0.423828125, + "learning_rate": 0.00045109766778849824, + "loss": 3.2259, + "step": 5188 + }, + { + "epoch": 0.21848421052631578, + "grad_norm": 0.43359375, + "learning_rate": 0.00045107761845494186, + "loss": 3.5325, + "step": 5189 + }, + { + "epoch": 0.21852631578947368, + "grad_norm": 0.466796875, + "learning_rate": 0.0004510575654579876, + "loss": 3.1308, + "step": 5190 + }, + { + "epoch": 0.21856842105263158, + "grad_norm": 0.400390625, + "learning_rate": 0.00045103750879800064, + "loss": 3.0776, + "step": 5191 + }, + { + "epoch": 0.21861052631578948, + "grad_norm": 0.462890625, + "learning_rate": 0.00045101744847534655, + "loss": 3.226, + "step": 5192 + }, + { + "epoch": 0.21865263157894738, + "grad_norm": 0.408203125, + "learning_rate": 0.0004509973844903906, + "loss": 2.9164, + "step": 5193 + }, + { + "epoch": 0.21869473684210528, + "grad_norm": 0.400390625, + "learning_rate": 0.0004509773168434986, + "loss": 3.288, + "step": 5194 + }, + { + "epoch": 0.21873684210526315, + "grad_norm": 0.40234375, + "learning_rate": 0.00045095724553503603, + "loss": 2.9882, + "step": 5195 + }, + { + "epoch": 0.21877894736842105, + "grad_norm": 0.400390625, + "learning_rate": 0.0004509371705653685, + "loss": 3.3799, + "step": 5196 + }, + { + "epoch": 0.21882105263157894, + "grad_norm": 0.458984375, + "learning_rate": 0.00045091709193486186, + "loss": 3.2181, + "step": 5197 + }, + { + "epoch": 0.21886315789473684, + "grad_norm": 0.4609375, + "learning_rate": 0.0004508970096438818, + "loss": 3.1634, + "step": 5198 + }, + { + "epoch": 0.21890526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.0004508769236927943, + "loss": 3.5145, + "step": 5199 + }, + { + "epoch": 0.21894736842105264, + "grad_norm": 0.4609375, + "learning_rate": 0.0004508568340819653, + "loss": 3.3697, + "step": 5200 + }, + { + "epoch": 0.21898947368421054, + "grad_norm": 0.89453125, + "learning_rate": 0.0004508367408117608, + "loss": 3.5037, + "step": 5201 + }, + { + "epoch": 0.2190315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.00045081664388254677, + "loss": 2.9871, + "step": 5202 + }, + { + "epoch": 0.2190736842105263, + "grad_norm": 0.486328125, + "learning_rate": 0.00045079654329468956, + "loss": 3.28, + "step": 5203 + }, + { + "epoch": 0.2191157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.0004507764390485551, + "loss": 3.4845, + "step": 5204 + }, + { + "epoch": 0.2191578947368421, + "grad_norm": 0.466796875, + "learning_rate": 0.00045075633114451, + "loss": 3.5159, + "step": 5205 + }, + { + "epoch": 0.2192, + "grad_norm": 0.427734375, + "learning_rate": 0.00045073621958292034, + "loss": 3.5112, + "step": 5206 + }, + { + "epoch": 0.2192421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00045071610436415256, + "loss": 3.3459, + "step": 5207 + }, + { + "epoch": 0.21928421052631578, + "grad_norm": 0.392578125, + "learning_rate": 0.00045069598548857327, + "loss": 3.6525, + "step": 5208 + }, + { + "epoch": 0.21932631578947367, + "grad_norm": 0.404296875, + "learning_rate": 0.0004506758629565489, + "loss": 3.294, + "step": 5209 + }, + { + "epoch": 0.21936842105263157, + "grad_norm": 0.40234375, + "learning_rate": 0.00045065573676844606, + "loss": 3.412, + "step": 5210 + }, + { + "epoch": 0.21941052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.0004506356069246315, + "loss": 3.3039, + "step": 5211 + }, + { + "epoch": 0.21945263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.0004506154734254719, + "loss": 3.7136, + "step": 5212 + }, + { + "epoch": 0.21949473684210527, + "grad_norm": 0.431640625, + "learning_rate": 0.0004505953362713341, + "loss": 3.3052, + "step": 5213 + }, + { + "epoch": 0.21953684210526317, + "grad_norm": 0.4375, + "learning_rate": 0.00045057519546258496, + "loss": 3.655, + "step": 5214 + }, + { + "epoch": 0.21957894736842104, + "grad_norm": 0.447265625, + "learning_rate": 0.00045055505099959143, + "loss": 3.343, + "step": 5215 + }, + { + "epoch": 0.21962105263157894, + "grad_norm": 0.462890625, + "learning_rate": 0.0004505349028827205, + "loss": 3.1474, + "step": 5216 + }, + { + "epoch": 0.21966315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.0004505147511123393, + "loss": 3.4399, + "step": 5217 + }, + { + "epoch": 0.21970526315789474, + "grad_norm": 0.4140625, + "learning_rate": 0.0004504945956888149, + "loss": 3.2831, + "step": 5218 + }, + { + "epoch": 0.21974736842105264, + "grad_norm": 0.3984375, + "learning_rate": 0.0004504744366125145, + "loss": 3.4026, + "step": 5219 + }, + { + "epoch": 0.21978947368421053, + "grad_norm": 0.50390625, + "learning_rate": 0.0004504542738838054, + "loss": 3.3068, + "step": 5220 + }, + { + "epoch": 0.21983157894736843, + "grad_norm": 0.396484375, + "learning_rate": 0.00045043410750305506, + "loss": 3.3471, + "step": 5221 + }, + { + "epoch": 0.2198736842105263, + "grad_norm": 0.462890625, + "learning_rate": 0.00045041393747063073, + "loss": 3.2265, + "step": 5222 + }, + { + "epoch": 0.2199157894736842, + "grad_norm": 0.39453125, + "learning_rate": 0.00045039376378689993, + "loss": 3.2977, + "step": 5223 + }, + { + "epoch": 0.2199578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.0004503735864522302, + "loss": 3.1652, + "step": 5224 + }, + { + "epoch": 0.22, + "grad_norm": 0.453125, + "learning_rate": 0.00045035340546698916, + "loss": 3.0997, + "step": 5225 + }, + { + "epoch": 0.2200421052631579, + "grad_norm": 0.46484375, + "learning_rate": 0.0004503332208315445, + "loss": 2.7862, + "step": 5226 + }, + { + "epoch": 0.2200842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.0004503130325462639, + "loss": 3.5518, + "step": 5227 + }, + { + "epoch": 0.2201263157894737, + "grad_norm": 0.3984375, + "learning_rate": 0.0004502928406115152, + "loss": 3.3894, + "step": 5228 + }, + { + "epoch": 0.22016842105263157, + "grad_norm": 0.3984375, + "learning_rate": 0.0004502726450276663, + "loss": 3.65, + "step": 5229 + }, + { + "epoch": 0.22021052631578947, + "grad_norm": 0.41015625, + "learning_rate": 0.0004502524457950852, + "loss": 3.6042, + "step": 5230 + }, + { + "epoch": 0.22025263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00045023224291413966, + "loss": 3.3507, + "step": 5231 + }, + { + "epoch": 0.22029473684210527, + "grad_norm": 0.4375, + "learning_rate": 0.00045021203638519804, + "loss": 2.7982, + "step": 5232 + }, + { + "epoch": 0.22033684210526316, + "grad_norm": 0.392578125, + "learning_rate": 0.00045019182620862833, + "loss": 3.0429, + "step": 5233 + }, + { + "epoch": 0.22037894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.00045017161238479876, + "loss": 3.4239, + "step": 5234 + }, + { + "epoch": 0.22042105263157893, + "grad_norm": 0.400390625, + "learning_rate": 0.0004501513949140776, + "loss": 3.3614, + "step": 5235 + }, + { + "epoch": 0.22046315789473683, + "grad_norm": 0.41796875, + "learning_rate": 0.00045013117379683314, + "loss": 3.3063, + "step": 5236 + }, + { + "epoch": 0.22050526315789473, + "grad_norm": 0.388671875, + "learning_rate": 0.0004501109490334339, + "loss": 3.3593, + "step": 5237 + }, + { + "epoch": 0.22054736842105263, + "grad_norm": 0.484375, + "learning_rate": 0.00045009072062424826, + "loss": 3.1511, + "step": 5238 + }, + { + "epoch": 0.22058947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.0004500704885696448, + "loss": 3.3692, + "step": 5239 + }, + { + "epoch": 0.22063157894736843, + "grad_norm": 0.40234375, + "learning_rate": 0.0004500502528699921, + "loss": 3.6416, + "step": 5240 + }, + { + "epoch": 0.22067368421052633, + "grad_norm": 0.427734375, + "learning_rate": 0.00045003001352565886, + "loss": 3.2563, + "step": 5241 + }, + { + "epoch": 0.2207157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0004500097705370138, + "loss": 3.4161, + "step": 5242 + }, + { + "epoch": 0.2207578947368421, + "grad_norm": 0.4921875, + "learning_rate": 0.00044998952390442573, + "loss": 3.4393, + "step": 5243 + }, + { + "epoch": 0.2208, + "grad_norm": 0.3828125, + "learning_rate": 0.00044996927362826354, + "loss": 3.4941, + "step": 5244 + }, + { + "epoch": 0.2208421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.000449949019708896, + "loss": 2.8758, + "step": 5245 + }, + { + "epoch": 0.2208842105263158, + "grad_norm": 0.4765625, + "learning_rate": 0.00044992876214669243, + "loss": 3.814, + "step": 5246 + }, + { + "epoch": 0.2209263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00044990850094202164, + "loss": 3.0105, + "step": 5247 + }, + { + "epoch": 0.2209684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.0004498882360952528, + "loss": 3.795, + "step": 5248 + }, + { + "epoch": 0.22101052631578946, + "grad_norm": 0.37890625, + "learning_rate": 0.0004498679676067552, + "loss": 3.4465, + "step": 5249 + }, + { + "epoch": 0.22105263157894736, + "grad_norm": 0.408203125, + "learning_rate": 0.0004498476954768981, + "loss": 3.0624, + "step": 5250 + }, + { + "epoch": 0.22109473684210526, + "grad_norm": 0.412109375, + "learning_rate": 0.00044982741970605076, + "loss": 3.5291, + "step": 5251 + }, + { + "epoch": 0.22113684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.0004498071402945826, + "loss": 3.3404, + "step": 5252 + }, + { + "epoch": 0.22117894736842106, + "grad_norm": 0.421875, + "learning_rate": 0.00044978685724286315, + "loss": 3.6174, + "step": 5253 + }, + { + "epoch": 0.22122105263157896, + "grad_norm": 0.380859375, + "learning_rate": 0.00044976657055126193, + "loss": 3.5667, + "step": 5254 + }, + { + "epoch": 0.22126315789473683, + "grad_norm": 0.38671875, + "learning_rate": 0.0004497462802201484, + "loss": 3.2798, + "step": 5255 + }, + { + "epoch": 0.22130526315789473, + "grad_norm": 0.41015625, + "learning_rate": 0.00044972598624989246, + "loss": 3.3554, + "step": 5256 + }, + { + "epoch": 0.22134736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.00044970568864086363, + "loss": 3.1223, + "step": 5257 + }, + { + "epoch": 0.22138947368421052, + "grad_norm": 0.451171875, + "learning_rate": 0.0004496853873934318, + "loss": 3.5604, + "step": 5258 + }, + { + "epoch": 0.22143157894736842, + "grad_norm": 0.39453125, + "learning_rate": 0.0004496650825079669, + "loss": 3.7436, + "step": 5259 + }, + { + "epoch": 0.22147368421052632, + "grad_norm": 0.384765625, + "learning_rate": 0.0004496447739848387, + "loss": 3.0985, + "step": 5260 + }, + { + "epoch": 0.22151578947368422, + "grad_norm": 0.46484375, + "learning_rate": 0.0004496244618244174, + "loss": 3.3216, + "step": 5261 + }, + { + "epoch": 0.2215578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.0004496041460270729, + "loss": 3.6747, + "step": 5262 + }, + { + "epoch": 0.2216, + "grad_norm": 0.44140625, + "learning_rate": 0.00044958382659317534, + "loss": 3.4734, + "step": 5263 + }, + { + "epoch": 0.2216421052631579, + "grad_norm": 0.38671875, + "learning_rate": 0.00044956350352309506, + "loss": 3.6832, + "step": 5264 + }, + { + "epoch": 0.2216842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00044954317681720216, + "loss": 3.6663, + "step": 5265 + }, + { + "epoch": 0.2217263157894737, + "grad_norm": 0.5234375, + "learning_rate": 0.00044952284647586704, + "loss": 3.6828, + "step": 5266 + }, + { + "epoch": 0.2217684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.00044950251249946003, + "loss": 3.691, + "step": 5267 + }, + { + "epoch": 0.22181052631578949, + "grad_norm": 0.46875, + "learning_rate": 0.00044948217488835176, + "loss": 3.2944, + "step": 5268 + }, + { + "epoch": 0.22185263157894736, + "grad_norm": 0.453125, + "learning_rate": 0.0004494618336429126, + "loss": 2.6757, + "step": 5269 + }, + { + "epoch": 0.22189473684210526, + "grad_norm": 0.451171875, + "learning_rate": 0.0004494414887635132, + "loss": 3.1438, + "step": 5270 + }, + { + "epoch": 0.22193684210526315, + "grad_norm": 0.408203125, + "learning_rate": 0.00044942114025052417, + "loss": 3.0991, + "step": 5271 + }, + { + "epoch": 0.22197894736842105, + "grad_norm": 0.39453125, + "learning_rate": 0.00044940078810431626, + "loss": 3.4627, + "step": 5272 + }, + { + "epoch": 0.22202105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00044938043232526037, + "loss": 3.7105, + "step": 5273 + }, + { + "epoch": 0.22206315789473685, + "grad_norm": 0.439453125, + "learning_rate": 0.00044936007291372713, + "loss": 3.3262, + "step": 5274 + }, + { + "epoch": 0.22210526315789475, + "grad_norm": 0.3984375, + "learning_rate": 0.0004493397098700877, + "loss": 3.8209, + "step": 5275 + }, + { + "epoch": 0.22214736842105262, + "grad_norm": 0.41015625, + "learning_rate": 0.000449319343194713, + "loss": 3.5708, + "step": 5276 + }, + { + "epoch": 0.22218947368421052, + "grad_norm": 0.55859375, + "learning_rate": 0.000449298972887974, + "loss": 3.3666, + "step": 5277 + }, + { + "epoch": 0.22223157894736842, + "grad_norm": 0.392578125, + "learning_rate": 0.00044927859895024184, + "loss": 3.5422, + "step": 5278 + }, + { + "epoch": 0.22227368421052632, + "grad_norm": 0.416015625, + "learning_rate": 0.0004492582213818878, + "loss": 3.7804, + "step": 5279 + }, + { + "epoch": 0.22231578947368422, + "grad_norm": 0.40234375, + "learning_rate": 0.00044923784018328314, + "loss": 3.3119, + "step": 5280 + }, + { + "epoch": 0.22235789473684212, + "grad_norm": 0.41796875, + "learning_rate": 0.0004492174553547991, + "loss": 3.2462, + "step": 5281 + }, + { + "epoch": 0.2224, + "grad_norm": 0.439453125, + "learning_rate": 0.00044919706689680697, + "loss": 3.061, + "step": 5282 + }, + { + "epoch": 0.22244210526315789, + "grad_norm": 0.46875, + "learning_rate": 0.00044917667480967843, + "loss": 3.3981, + "step": 5283 + }, + { + "epoch": 0.22248421052631578, + "grad_norm": 0.384765625, + "learning_rate": 0.00044915627909378485, + "loss": 3.7243, + "step": 5284 + }, + { + "epoch": 0.22252631578947368, + "grad_norm": 0.40625, + "learning_rate": 0.000449135879749498, + "loss": 3.2684, + "step": 5285 + }, + { + "epoch": 0.22256842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0004491154767771892, + "loss": 3.8089, + "step": 5286 + }, + { + "epoch": 0.22261052631578948, + "grad_norm": 0.41015625, + "learning_rate": 0.0004490950701772305, + "loss": 3.0875, + "step": 5287 + }, + { + "epoch": 0.22265263157894738, + "grad_norm": 0.4140625, + "learning_rate": 0.0004490746599499935, + "loss": 3.5963, + "step": 5288 + }, + { + "epoch": 0.22269473684210525, + "grad_norm": 0.435546875, + "learning_rate": 0.0004490542460958501, + "loss": 3.5365, + "step": 5289 + }, + { + "epoch": 0.22273684210526315, + "grad_norm": 0.4140625, + "learning_rate": 0.0004490338286151723, + "loss": 3.1544, + "step": 5290 + }, + { + "epoch": 0.22277894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.0004490134075083319, + "loss": 3.5545, + "step": 5291 + }, + { + "epoch": 0.22282105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00044899298277570104, + "loss": 3.4912, + "step": 5292 + }, + { + "epoch": 0.22286315789473685, + "grad_norm": 0.470703125, + "learning_rate": 0.0004489725544176519, + "loss": 3.7183, + "step": 5293 + }, + { + "epoch": 0.22290526315789475, + "grad_norm": 0.390625, + "learning_rate": 0.00044895212243455655, + "loss": 2.8311, + "step": 5294 + }, + { + "epoch": 0.22294736842105264, + "grad_norm": 0.4296875, + "learning_rate": 0.0004489316868267873, + "loss": 3.3294, + "step": 5295 + }, + { + "epoch": 0.22298947368421052, + "grad_norm": 0.41015625, + "learning_rate": 0.00044891124759471646, + "loss": 3.2148, + "step": 5296 + }, + { + "epoch": 0.22303157894736841, + "grad_norm": 0.43359375, + "learning_rate": 0.0004488908047387164, + "loss": 3.6311, + "step": 5297 + }, + { + "epoch": 0.2230736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00044887035825915957, + "loss": 3.3109, + "step": 5298 + }, + { + "epoch": 0.2231157894736842, + "grad_norm": 0.455078125, + "learning_rate": 0.0004488499081564185, + "loss": 3.8074, + "step": 5299 + }, + { + "epoch": 0.2231578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00044882945443086564, + "loss": 3.6378, + "step": 5300 + }, + { + "epoch": 0.2232, + "grad_norm": 0.404296875, + "learning_rate": 0.00044880899708287387, + "loss": 3.2694, + "step": 5301 + }, + { + "epoch": 0.2232421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.0004487885361128157, + "loss": 3.2198, + "step": 5302 + }, + { + "epoch": 0.22328421052631578, + "grad_norm": 0.3984375, + "learning_rate": 0.0004487680715210639, + "loss": 3.299, + "step": 5303 + }, + { + "epoch": 0.22332631578947368, + "grad_norm": 0.40234375, + "learning_rate": 0.0004487476033079915, + "loss": 3.1346, + "step": 5304 + }, + { + "epoch": 0.22336842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.00044872713147397125, + "loss": 3.0447, + "step": 5305 + }, + { + "epoch": 0.22341052631578948, + "grad_norm": 0.408203125, + "learning_rate": 0.00044870665601937613, + "loss": 3.1736, + "step": 5306 + }, + { + "epoch": 0.22345263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.0004486861769445793, + "loss": 3.3146, + "step": 5307 + }, + { + "epoch": 0.22349473684210527, + "grad_norm": 0.41796875, + "learning_rate": 0.00044866569424995364, + "loss": 3.0451, + "step": 5308 + }, + { + "epoch": 0.22353684210526314, + "grad_norm": 0.40234375, + "learning_rate": 0.00044864520793587247, + "loss": 3.5234, + "step": 5309 + }, + { + "epoch": 0.22357894736842104, + "grad_norm": 0.419921875, + "learning_rate": 0.0004486247180027091, + "loss": 3.0574, + "step": 5310 + }, + { + "epoch": 0.22362105263157894, + "grad_norm": 0.392578125, + "learning_rate": 0.00044860422445083673, + "loss": 3.5278, + "step": 5311 + }, + { + "epoch": 0.22366315789473684, + "grad_norm": 0.54296875, + "learning_rate": 0.0004485837272806287, + "loss": 3.6568, + "step": 5312 + }, + { + "epoch": 0.22370526315789474, + "grad_norm": 0.400390625, + "learning_rate": 0.00044856322649245844, + "loss": 3.4264, + "step": 5313 + }, + { + "epoch": 0.22374736842105264, + "grad_norm": 0.400390625, + "learning_rate": 0.00044854272208669953, + "loss": 3.7323, + "step": 5314 + }, + { + "epoch": 0.22378947368421054, + "grad_norm": 0.392578125, + "learning_rate": 0.0004485222140637255, + "loss": 3.2967, + "step": 5315 + }, + { + "epoch": 0.2238315789473684, + "grad_norm": 0.396484375, + "learning_rate": 0.00044850170242391, + "loss": 3.366, + "step": 5316 + }, + { + "epoch": 0.2238736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.00044848118716762675, + "loss": 3.1688, + "step": 5317 + }, + { + "epoch": 0.2239157894736842, + "grad_norm": 0.384765625, + "learning_rate": 0.00044846066829524946, + "loss": 3.3749, + "step": 5318 + }, + { + "epoch": 0.2239578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.0004484401458071519, + "loss": 3.4601, + "step": 5319 + }, + { + "epoch": 0.224, + "grad_norm": 0.478515625, + "learning_rate": 0.00044841961970370814, + "loss": 3.0214, + "step": 5320 + }, + { + "epoch": 0.2240421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.000448399089985292, + "loss": 3.2473, + "step": 5321 + }, + { + "epoch": 0.2240842105263158, + "grad_norm": 0.40625, + "learning_rate": 0.0004483785566522776, + "loss": 2.9274, + "step": 5322 + }, + { + "epoch": 0.22412631578947367, + "grad_norm": 0.40234375, + "learning_rate": 0.00044835801970503886, + "loss": 3.3684, + "step": 5323 + }, + { + "epoch": 0.22416842105263157, + "grad_norm": 0.421875, + "learning_rate": 0.00044833747914395014, + "loss": 3.6371, + "step": 5324 + }, + { + "epoch": 0.22421052631578947, + "grad_norm": 0.404296875, + "learning_rate": 0.00044831693496938553, + "loss": 3.7025, + "step": 5325 + }, + { + "epoch": 0.22425263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0004482963871817195, + "loss": 3.3776, + "step": 5326 + }, + { + "epoch": 0.22429473684210527, + "grad_norm": 0.439453125, + "learning_rate": 0.0004482758357813262, + "loss": 3.3394, + "step": 5327 + }, + { + "epoch": 0.22433684210526317, + "grad_norm": 0.431640625, + "learning_rate": 0.00044825528076858015, + "loss": 3.4329, + "step": 5328 + }, + { + "epoch": 0.22437894736842104, + "grad_norm": 0.46484375, + "learning_rate": 0.0004482347221438558, + "loss": 2.8252, + "step": 5329 + }, + { + "epoch": 0.22442105263157894, + "grad_norm": 0.3984375, + "learning_rate": 0.00044821415990752776, + "loss": 3.1779, + "step": 5330 + }, + { + "epoch": 0.22446315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.00044819359405997065, + "loss": 2.9593, + "step": 5331 + }, + { + "epoch": 0.22450526315789474, + "grad_norm": 0.404296875, + "learning_rate": 0.0004481730246015591, + "loss": 3.3424, + "step": 5332 + }, + { + "epoch": 0.22454736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.0004481524515326678, + "loss": 3.5201, + "step": 5333 + }, + { + "epoch": 0.22458947368421053, + "grad_norm": 0.41796875, + "learning_rate": 0.0004481318748536717, + "loss": 3.0077, + "step": 5334 + }, + { + "epoch": 0.22463157894736843, + "grad_norm": 0.400390625, + "learning_rate": 0.0004481112945649457, + "loss": 3.2576, + "step": 5335 + }, + { + "epoch": 0.2246736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00044809071066686455, + "loss": 3.2089, + "step": 5336 + }, + { + "epoch": 0.2247157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.00044807012315980355, + "loss": 2.9588, + "step": 5337 + }, + { + "epoch": 0.2247578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.00044804953204413754, + "loss": 3.2408, + "step": 5338 + }, + { + "epoch": 0.2248, + "grad_norm": 0.44140625, + "learning_rate": 0.0004480289373202418, + "loss": 3.6674, + "step": 5339 + }, + { + "epoch": 0.2248421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.0004480083389884915, + "loss": 3.6298, + "step": 5340 + }, + { + "epoch": 0.2248842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.00044798773704926185, + "loss": 3.1455, + "step": 5341 + }, + { + "epoch": 0.2249263157894737, + "grad_norm": 0.51953125, + "learning_rate": 0.00044796713150292835, + "loss": 2.7858, + "step": 5342 + }, + { + "epoch": 0.22496842105263157, + "grad_norm": 0.400390625, + "learning_rate": 0.0004479465223498662, + "loss": 3.0402, + "step": 5343 + }, + { + "epoch": 0.22501052631578947, + "grad_norm": 0.40234375, + "learning_rate": 0.0004479259095904511, + "loss": 3.6379, + "step": 5344 + }, + { + "epoch": 0.22505263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00044790529322505845, + "loss": 3.3131, + "step": 5345 + }, + { + "epoch": 0.22509473684210526, + "grad_norm": 0.419921875, + "learning_rate": 0.0004478846732540639, + "loss": 2.9419, + "step": 5346 + }, + { + "epoch": 0.22513684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.0004478640496778431, + "loss": 3.4673, + "step": 5347 + }, + { + "epoch": 0.22517894736842106, + "grad_norm": 0.37890625, + "learning_rate": 0.00044784342249677174, + "loss": 3.4492, + "step": 5348 + }, + { + "epoch": 0.22522105263157896, + "grad_norm": 0.388671875, + "learning_rate": 0.0004478227917112257, + "loss": 3.5149, + "step": 5349 + }, + { + "epoch": 0.22526315789473683, + "grad_norm": 0.453125, + "learning_rate": 0.0004478021573215808, + "loss": 3.6039, + "step": 5350 + }, + { + "epoch": 0.22530526315789473, + "grad_norm": 0.5078125, + "learning_rate": 0.0004477815193282131, + "loss": 2.7898, + "step": 5351 + }, + { + "epoch": 0.22534736842105263, + "grad_norm": 0.3984375, + "learning_rate": 0.00044776087773149844, + "loss": 3.0764, + "step": 5352 + }, + { + "epoch": 0.22538947368421053, + "grad_norm": 0.427734375, + "learning_rate": 0.00044774023253181294, + "loss": 3.5454, + "step": 5353 + }, + { + "epoch": 0.22543157894736843, + "grad_norm": 0.41015625, + "learning_rate": 0.00044771958372953277, + "loss": 3.3953, + "step": 5354 + }, + { + "epoch": 0.22547368421052633, + "grad_norm": 0.390625, + "learning_rate": 0.0004476989313250341, + "loss": 3.5499, + "step": 5355 + }, + { + "epoch": 0.2255157894736842, + "grad_norm": 0.400390625, + "learning_rate": 0.00044767827531869316, + "loss": 3.4708, + "step": 5356 + }, + { + "epoch": 0.2255578947368421, + "grad_norm": 0.458984375, + "learning_rate": 0.0004476576157108863, + "loss": 3.5151, + "step": 5357 + }, + { + "epoch": 0.2256, + "grad_norm": 0.447265625, + "learning_rate": 0.00044763695250198994, + "loss": 3.656, + "step": 5358 + }, + { + "epoch": 0.2256421052631579, + "grad_norm": 0.39453125, + "learning_rate": 0.0004476162856923805, + "loss": 3.3099, + "step": 5359 + }, + { + "epoch": 0.2256842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.00044759561528243455, + "loss": 3.3123, + "step": 5360 + }, + { + "epoch": 0.2257263157894737, + "grad_norm": 0.44921875, + "learning_rate": 0.0004475749412725286, + "loss": 2.8758, + "step": 5361 + }, + { + "epoch": 0.2257684210526316, + "grad_norm": 0.5, + "learning_rate": 0.00044755426366303944, + "loss": 3.0864, + "step": 5362 + }, + { + "epoch": 0.22581052631578946, + "grad_norm": 0.392578125, + "learning_rate": 0.0004475335824543436, + "loss": 3.1212, + "step": 5363 + }, + { + "epoch": 0.22585263157894736, + "grad_norm": 0.4296875, + "learning_rate": 0.0004475128976468181, + "loss": 3.3591, + "step": 5364 + }, + { + "epoch": 0.22589473684210526, + "grad_norm": 0.451171875, + "learning_rate": 0.0004474922092408397, + "loss": 3.6221, + "step": 5365 + }, + { + "epoch": 0.22593684210526316, + "grad_norm": 0.453125, + "learning_rate": 0.00044747151723678515, + "loss": 3.3725, + "step": 5366 + }, + { + "epoch": 0.22597894736842106, + "grad_norm": 0.40625, + "learning_rate": 0.00044745082163503167, + "loss": 3.2719, + "step": 5367 + }, + { + "epoch": 0.22602105263157896, + "grad_norm": 0.421875, + "learning_rate": 0.00044743012243595614, + "loss": 3.4701, + "step": 5368 + }, + { + "epoch": 0.22606315789473685, + "grad_norm": 0.435546875, + "learning_rate": 0.0004474094196399358, + "loss": 3.4782, + "step": 5369 + }, + { + "epoch": 0.22610526315789473, + "grad_norm": 0.4296875, + "learning_rate": 0.00044738871324734775, + "loss": 3.5036, + "step": 5370 + }, + { + "epoch": 0.22614736842105262, + "grad_norm": 0.48046875, + "learning_rate": 0.0004473680032585694, + "loss": 3.4156, + "step": 5371 + }, + { + "epoch": 0.22618947368421052, + "grad_norm": 0.404296875, + "learning_rate": 0.00044734728967397774, + "loss": 3.387, + "step": 5372 + }, + { + "epoch": 0.22623157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.00044732657249395047, + "loss": 3.2278, + "step": 5373 + }, + { + "epoch": 0.22627368421052632, + "grad_norm": 0.61328125, + "learning_rate": 0.0004473058517188648, + "loss": 3.0927, + "step": 5374 + }, + { + "epoch": 0.22631578947368422, + "grad_norm": 0.61328125, + "learning_rate": 0.00044728512734909845, + "loss": 3.4747, + "step": 5375 + }, + { + "epoch": 0.22635789473684212, + "grad_norm": 0.388671875, + "learning_rate": 0.0004472643993850288, + "loss": 3.2073, + "step": 5376 + }, + { + "epoch": 0.2264, + "grad_norm": 0.408203125, + "learning_rate": 0.0004472436678270336, + "loss": 3.5488, + "step": 5377 + }, + { + "epoch": 0.2264421052631579, + "grad_norm": 0.400390625, + "learning_rate": 0.0004472229326754905, + "loss": 3.3754, + "step": 5378 + }, + { + "epoch": 0.2264842105263158, + "grad_norm": 0.63671875, + "learning_rate": 0.00044720219393077726, + "loss": 3.1922, + "step": 5379 + }, + { + "epoch": 0.2265263157894737, + "grad_norm": 0.4609375, + "learning_rate": 0.00044718145159327184, + "loss": 3.1251, + "step": 5380 + }, + { + "epoch": 0.22656842105263159, + "grad_norm": 0.439453125, + "learning_rate": 0.00044716070566335197, + "loss": 3.0596, + "step": 5381 + }, + { + "epoch": 0.22661052631578948, + "grad_norm": 0.443359375, + "learning_rate": 0.0004471399561413957, + "loss": 3.2554, + "step": 5382 + }, + { + "epoch": 0.22665263157894736, + "grad_norm": 0.4296875, + "learning_rate": 0.0004471192030277811, + "loss": 2.9799, + "step": 5383 + }, + { + "epoch": 0.22669473684210525, + "grad_norm": 0.431640625, + "learning_rate": 0.0004470984463228862, + "loss": 3.1611, + "step": 5384 + }, + { + "epoch": 0.22673684210526315, + "grad_norm": 0.462890625, + "learning_rate": 0.00044707768602708924, + "loss": 3.3698, + "step": 5385 + }, + { + "epoch": 0.22677894736842105, + "grad_norm": 0.828125, + "learning_rate": 0.0004470569221407683, + "loss": 3.4199, + "step": 5386 + }, + { + "epoch": 0.22682105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.0004470361546643019, + "loss": 3.0724, + "step": 5387 + }, + { + "epoch": 0.22686315789473685, + "grad_norm": 0.44921875, + "learning_rate": 0.00044701538359806823, + "loss": 2.797, + "step": 5388 + }, + { + "epoch": 0.22690526315789475, + "grad_norm": 0.51171875, + "learning_rate": 0.0004469946089424457, + "loss": 3.6683, + "step": 5389 + }, + { + "epoch": 0.22694736842105262, + "grad_norm": 0.41796875, + "learning_rate": 0.0004469738306978128, + "loss": 3.3828, + "step": 5390 + }, + { + "epoch": 0.22698947368421052, + "grad_norm": 0.4140625, + "learning_rate": 0.0004469530488645483, + "loss": 3.5654, + "step": 5391 + }, + { + "epoch": 0.22703157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.0004469322634430306, + "loss": 3.688, + "step": 5392 + }, + { + "epoch": 0.22707368421052632, + "grad_norm": 0.451171875, + "learning_rate": 0.0004469114744336384, + "loss": 3.4679, + "step": 5393 + }, + { + "epoch": 0.22711578947368422, + "grad_norm": 0.42578125, + "learning_rate": 0.0004468906818367505, + "loss": 3.5697, + "step": 5394 + }, + { + "epoch": 0.22715789473684211, + "grad_norm": 0.53515625, + "learning_rate": 0.00044686988565274577, + "loss": 3.4219, + "step": 5395 + }, + { + "epoch": 0.2272, + "grad_norm": 0.435546875, + "learning_rate": 0.00044684908588200304, + "loss": 2.9927, + "step": 5396 + }, + { + "epoch": 0.22724210526315788, + "grad_norm": 2.15625, + "learning_rate": 0.0004468282825249012, + "loss": 3.5558, + "step": 5397 + }, + { + "epoch": 0.22728421052631578, + "grad_norm": 0.50390625, + "learning_rate": 0.00044680747558181933, + "loss": 2.798, + "step": 5398 + }, + { + "epoch": 0.22732631578947368, + "grad_norm": 0.53515625, + "learning_rate": 0.0004467866650531365, + "loss": 3.3905, + "step": 5399 + }, + { + "epoch": 0.22736842105263158, + "grad_norm": 0.55078125, + "learning_rate": 0.00044676585093923185, + "loss": 3.158, + "step": 5400 + }, + { + "epoch": 0.22741052631578948, + "grad_norm": 0.39453125, + "learning_rate": 0.00044674503324048455, + "loss": 3.6752, + "step": 5401 + }, + { + "epoch": 0.22745263157894738, + "grad_norm": 0.46484375, + "learning_rate": 0.00044672421195727396, + "loss": 3.1355, + "step": 5402 + }, + { + "epoch": 0.22749473684210525, + "grad_norm": 0.453125, + "learning_rate": 0.00044670338708997933, + "loss": 3.0773, + "step": 5403 + }, + { + "epoch": 0.22753684210526315, + "grad_norm": 0.484375, + "learning_rate": 0.00044668255863898013, + "loss": 3.7283, + "step": 5404 + }, + { + "epoch": 0.22757894736842105, + "grad_norm": 0.45703125, + "learning_rate": 0.0004466617266046558, + "loss": 3.6276, + "step": 5405 + }, + { + "epoch": 0.22762105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.00044664089098738586, + "loss": 3.3026, + "step": 5406 + }, + { + "epoch": 0.22766315789473685, + "grad_norm": 0.431640625, + "learning_rate": 0.00044662005178754995, + "loss": 3.5225, + "step": 5407 + }, + { + "epoch": 0.22770526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.0004465992090055277, + "loss": 3.5985, + "step": 5408 + }, + { + "epoch": 0.22774736842105264, + "grad_norm": 0.400390625, + "learning_rate": 0.0004465783626416989, + "loss": 3.7124, + "step": 5409 + }, + { + "epoch": 0.22778947368421051, + "grad_norm": 0.44140625, + "learning_rate": 0.0004465575126964433, + "loss": 3.3157, + "step": 5410 + }, + { + "epoch": 0.2278315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.00044653665917014076, + "loss": 3.2714, + "step": 5411 + }, + { + "epoch": 0.2278736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.0004465158020631711, + "loss": 3.5093, + "step": 5412 + }, + { + "epoch": 0.2279157894736842, + "grad_norm": 0.455078125, + "learning_rate": 0.00044649494137591453, + "loss": 3.3758, + "step": 5413 + }, + { + "epoch": 0.2279578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.000446474077108751, + "loss": 3.3725, + "step": 5414 + }, + { + "epoch": 0.228, + "grad_norm": 0.412109375, + "learning_rate": 0.0004464532092620606, + "loss": 3.4444, + "step": 5415 + }, + { + "epoch": 0.2280421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0004464323378362236, + "loss": 2.7441, + "step": 5416 + }, + { + "epoch": 0.22808421052631578, + "grad_norm": 0.50390625, + "learning_rate": 0.00044641146283162016, + "loss": 4.0745, + "step": 5417 + }, + { + "epoch": 0.22812631578947368, + "grad_norm": 0.455078125, + "learning_rate": 0.00044639058424863066, + "loss": 3.1495, + "step": 5418 + }, + { + "epoch": 0.22816842105263158, + "grad_norm": 0.45703125, + "learning_rate": 0.00044636970208763543, + "loss": 2.872, + "step": 5419 + }, + { + "epoch": 0.22821052631578947, + "grad_norm": 0.40234375, + "learning_rate": 0.000446348816349015, + "loss": 3.4887, + "step": 5420 + }, + { + "epoch": 0.22825263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.0004463279270331498, + "loss": 3.6268, + "step": 5421 + }, + { + "epoch": 0.22829473684210527, + "grad_norm": 0.40234375, + "learning_rate": 0.0004463070341404205, + "loss": 3.7353, + "step": 5422 + }, + { + "epoch": 0.22833684210526317, + "grad_norm": 0.404296875, + "learning_rate": 0.00044628613767120755, + "loss": 3.7054, + "step": 5423 + }, + { + "epoch": 0.22837894736842104, + "grad_norm": 0.5078125, + "learning_rate": 0.00044626523762589194, + "loss": 3.2892, + "step": 5424 + }, + { + "epoch": 0.22842105263157894, + "grad_norm": 0.41015625, + "learning_rate": 0.00044624433400485426, + "loss": 2.9682, + "step": 5425 + }, + { + "epoch": 0.22846315789473684, + "grad_norm": 0.8046875, + "learning_rate": 0.0004462234268084754, + "loss": 3.6513, + "step": 5426 + }, + { + "epoch": 0.22850526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.00044620251603713623, + "loss": 3.6864, + "step": 5427 + }, + { + "epoch": 0.22854736842105264, + "grad_norm": 0.42578125, + "learning_rate": 0.0004461816016912178, + "loss": 2.9612, + "step": 5428 + }, + { + "epoch": 0.22858947368421054, + "grad_norm": 0.43359375, + "learning_rate": 0.00044616068377110096, + "loss": 3.3855, + "step": 5429 + }, + { + "epoch": 0.2286315789473684, + "grad_norm": 0.4453125, + "learning_rate": 0.0004461397622771671, + "loss": 3.2489, + "step": 5430 + }, + { + "epoch": 0.2286736842105263, + "grad_norm": 0.455078125, + "learning_rate": 0.00044611883720979716, + "loss": 3.078, + "step": 5431 + }, + { + "epoch": 0.2287157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.0004460979085693724, + "loss": 3.7057, + "step": 5432 + }, + { + "epoch": 0.2287578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.0004460769763562742, + "loss": 3.3223, + "step": 5433 + }, + { + "epoch": 0.2288, + "grad_norm": 0.435546875, + "learning_rate": 0.0004460560405708839, + "loss": 3.4355, + "step": 5434 + }, + { + "epoch": 0.2288421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.0004460351012135828, + "loss": 3.4338, + "step": 5435 + }, + { + "epoch": 0.2288842105263158, + "grad_norm": 0.61328125, + "learning_rate": 0.00044601415828475255, + "loss": 3.2501, + "step": 5436 + }, + { + "epoch": 0.22892631578947367, + "grad_norm": 0.42578125, + "learning_rate": 0.0004459932117847747, + "loss": 3.2722, + "step": 5437 + }, + { + "epoch": 0.22896842105263157, + "grad_norm": 0.4453125, + "learning_rate": 0.0004459722617140307, + "loss": 3.1988, + "step": 5438 + }, + { + "epoch": 0.22901052631578947, + "grad_norm": 0.400390625, + "learning_rate": 0.0004459513080729024, + "loss": 3.2104, + "step": 5439 + }, + { + "epoch": 0.22905263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.0004459303508617715, + "loss": 3.0857, + "step": 5440 + }, + { + "epoch": 0.22909473684210527, + "grad_norm": 0.408203125, + "learning_rate": 0.0004459093900810198, + "loss": 3.3463, + "step": 5441 + }, + { + "epoch": 0.22913684210526317, + "grad_norm": 0.404296875, + "learning_rate": 0.0004458884257310293, + "loss": 2.6772, + "step": 5442 + }, + { + "epoch": 0.22917894736842107, + "grad_norm": 0.416015625, + "learning_rate": 0.0004458674578121817, + "loss": 3.5425, + "step": 5443 + }, + { + "epoch": 0.22922105263157894, + "grad_norm": 0.423828125, + "learning_rate": 0.00044584648632485925, + "loss": 3.4621, + "step": 5444 + }, + { + "epoch": 0.22926315789473684, + "grad_norm": 0.490234375, + "learning_rate": 0.0004458255112694439, + "loss": 3.3796, + "step": 5445 + }, + { + "epoch": 0.22930526315789473, + "grad_norm": 0.484375, + "learning_rate": 0.0004458045326463177, + "loss": 2.9716, + "step": 5446 + }, + { + "epoch": 0.22934736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.0004457835504558632, + "loss": 3.1893, + "step": 5447 + }, + { + "epoch": 0.22938947368421053, + "grad_norm": 0.44921875, + "learning_rate": 0.0004457625646984623, + "loss": 3.3768, + "step": 5448 + }, + { + "epoch": 0.22943157894736843, + "grad_norm": 0.427734375, + "learning_rate": 0.0004457415753744975, + "loss": 3.7506, + "step": 5449 + }, + { + "epoch": 0.2294736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.0004457205824843512, + "loss": 3.4001, + "step": 5450 + }, + { + "epoch": 0.2295157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0004456995860284059, + "loss": 3.6028, + "step": 5451 + }, + { + "epoch": 0.2295578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00044567858600704397, + "loss": 2.8487, + "step": 5452 + }, + { + "epoch": 0.2296, + "grad_norm": 0.419921875, + "learning_rate": 0.00044565758242064813, + "loss": 3.4237, + "step": 5453 + }, + { + "epoch": 0.2296421052631579, + "grad_norm": 0.392578125, + "learning_rate": 0.0004456365752696011, + "loss": 3.3297, + "step": 5454 + }, + { + "epoch": 0.2296842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00044561556455428554, + "loss": 3.5052, + "step": 5455 + }, + { + "epoch": 0.2297263157894737, + "grad_norm": 0.396484375, + "learning_rate": 0.00044559455027508424, + "loss": 3.2671, + "step": 5456 + }, + { + "epoch": 0.22976842105263157, + "grad_norm": 0.419921875, + "learning_rate": 0.00044557353243238, + "loss": 3.3195, + "step": 5457 + }, + { + "epoch": 0.22981052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.0004455525110265557, + "loss": 3.3865, + "step": 5458 + }, + { + "epoch": 0.22985263157894736, + "grad_norm": 0.4375, + "learning_rate": 0.0004455314860579945, + "loss": 3.7336, + "step": 5459 + }, + { + "epoch": 0.22989473684210526, + "grad_norm": 0.39453125, + "learning_rate": 0.0004455104575270794, + "loss": 3.104, + "step": 5460 + }, + { + "epoch": 0.22993684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.00044548942543419344, + "loss": 3.3694, + "step": 5461 + }, + { + "epoch": 0.22997894736842106, + "grad_norm": 0.388671875, + "learning_rate": 0.00044546838977971986, + "loss": 3.5706, + "step": 5462 + }, + { + "epoch": 0.23002105263157896, + "grad_norm": 0.494140625, + "learning_rate": 0.00044544735056404185, + "loss": 3.9186, + "step": 5463 + }, + { + "epoch": 0.23006315789473683, + "grad_norm": 0.40234375, + "learning_rate": 0.0004454263077875428, + "loss": 3.4084, + "step": 5464 + }, + { + "epoch": 0.23010526315789473, + "grad_norm": 0.45703125, + "learning_rate": 0.000445405261450606, + "loss": 3.7675, + "step": 5465 + }, + { + "epoch": 0.23014736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.00044538421155361496, + "loss": 3.5539, + "step": 5466 + }, + { + "epoch": 0.23018947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00044536315809695317, + "loss": 3.8473, + "step": 5467 + }, + { + "epoch": 0.23023157894736843, + "grad_norm": 0.44140625, + "learning_rate": 0.00044534210108100413, + "loss": 3.4921, + "step": 5468 + }, + { + "epoch": 0.23027368421052632, + "grad_norm": 0.388671875, + "learning_rate": 0.00044532104050615153, + "loss": 3.2213, + "step": 5469 + }, + { + "epoch": 0.23031578947368422, + "grad_norm": 0.41015625, + "learning_rate": 0.0004452999763727791, + "loss": 3.0958, + "step": 5470 + }, + { + "epoch": 0.2303578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00044527890868127054, + "loss": 2.8111, + "step": 5471 + }, + { + "epoch": 0.2304, + "grad_norm": 0.53125, + "learning_rate": 0.0004452578374320097, + "loss": 3.3192, + "step": 5472 + }, + { + "epoch": 0.2304421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.0004452367626253805, + "loss": 3.177, + "step": 5473 + }, + { + "epoch": 0.2304842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.0004452156842617668, + "loss": 2.9855, + "step": 5474 + }, + { + "epoch": 0.2305263157894737, + "grad_norm": 0.48828125, + "learning_rate": 0.0004451946023415528, + "loss": 2.9031, + "step": 5475 + }, + { + "epoch": 0.2305684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.0004451735168651225, + "loss": 2.9309, + "step": 5476 + }, + { + "epoch": 0.23061052631578946, + "grad_norm": 0.416015625, + "learning_rate": 0.00044515242783286005, + "loss": 3.6076, + "step": 5477 + }, + { + "epoch": 0.23065263157894736, + "grad_norm": 0.52734375, + "learning_rate": 0.0004451313352451496, + "loss": 3.3416, + "step": 5478 + }, + { + "epoch": 0.23069473684210526, + "grad_norm": 0.455078125, + "learning_rate": 0.00044511023910237546, + "loss": 3.4338, + "step": 5479 + }, + { + "epoch": 0.23073684210526316, + "grad_norm": 0.3984375, + "learning_rate": 0.0004450891394049221, + "loss": 3.3928, + "step": 5480 + }, + { + "epoch": 0.23077894736842106, + "grad_norm": 0.3984375, + "learning_rate": 0.0004450680361531738, + "loss": 3.6654, + "step": 5481 + }, + { + "epoch": 0.23082105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.00044504692934751506, + "loss": 2.9869, + "step": 5482 + }, + { + "epoch": 0.23086315789473685, + "grad_norm": 0.41796875, + "learning_rate": 0.00044502581898833043, + "loss": 3.0357, + "step": 5483 + }, + { + "epoch": 0.23090526315789472, + "grad_norm": 0.45703125, + "learning_rate": 0.00044500470507600456, + "loss": 3.2127, + "step": 5484 + }, + { + "epoch": 0.23094736842105262, + "grad_norm": 0.408203125, + "learning_rate": 0.0004449835876109221, + "loss": 3.1452, + "step": 5485 + }, + { + "epoch": 0.23098947368421052, + "grad_norm": 0.455078125, + "learning_rate": 0.0004449624665934677, + "loss": 2.8172, + "step": 5486 + }, + { + "epoch": 0.23103157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00044494134202402626, + "loss": 3.4768, + "step": 5487 + }, + { + "epoch": 0.23107368421052632, + "grad_norm": 0.435546875, + "learning_rate": 0.00044492021390298264, + "loss": 3.1722, + "step": 5488 + }, + { + "epoch": 0.23111578947368422, + "grad_norm": 0.419921875, + "learning_rate": 0.00044489908223072175, + "loss": 3.2747, + "step": 5489 + }, + { + "epoch": 0.23115789473684212, + "grad_norm": 0.4140625, + "learning_rate": 0.0004448779470076285, + "loss": 3.2161, + "step": 5490 + }, + { + "epoch": 0.2312, + "grad_norm": 0.4140625, + "learning_rate": 0.00044485680823408815, + "loss": 3.298, + "step": 5491 + }, + { + "epoch": 0.2312421052631579, + "grad_norm": 1.046875, + "learning_rate": 0.00044483566591048564, + "loss": 3.785, + "step": 5492 + }, + { + "epoch": 0.2312842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0004448145200372062, + "loss": 3.5281, + "step": 5493 + }, + { + "epoch": 0.23132631578947369, + "grad_norm": 0.439453125, + "learning_rate": 0.0004447933706146351, + "loss": 3.3212, + "step": 5494 + }, + { + "epoch": 0.23136842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.0004447722176431577, + "loss": 3.3681, + "step": 5495 + }, + { + "epoch": 0.23141052631578948, + "grad_norm": 0.44140625, + "learning_rate": 0.00044475106112315933, + "loss": 3.539, + "step": 5496 + }, + { + "epoch": 0.23145263157894738, + "grad_norm": 0.427734375, + "learning_rate": 0.0004447299010550254, + "loss": 3.5777, + "step": 5497 + }, + { + "epoch": 0.23149473684210525, + "grad_norm": 0.4375, + "learning_rate": 0.00044470873743914153, + "loss": 3.551, + "step": 5498 + }, + { + "epoch": 0.23153684210526315, + "grad_norm": 0.423828125, + "learning_rate": 0.0004446875702758932, + "loss": 3.1899, + "step": 5499 + }, + { + "epoch": 0.23157894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.0004446663995656661, + "loss": 3.4669, + "step": 5500 + }, + { + "epoch": 0.23162105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.00044464522530884593, + "loss": 3.4734, + "step": 5501 + }, + { + "epoch": 0.23166315789473685, + "grad_norm": 0.44921875, + "learning_rate": 0.00044462404750581847, + "loss": 3.2174, + "step": 5502 + }, + { + "epoch": 0.23170526315789475, + "grad_norm": 0.4375, + "learning_rate": 0.00044460286615696955, + "loss": 3.2266, + "step": 5503 + }, + { + "epoch": 0.23174736842105262, + "grad_norm": 0.392578125, + "learning_rate": 0.00044458168126268504, + "loss": 3.0194, + "step": 5504 + }, + { + "epoch": 0.23178947368421052, + "grad_norm": 0.474609375, + "learning_rate": 0.00044456049282335085, + "loss": 3.5788, + "step": 5505 + }, + { + "epoch": 0.23183157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.0004445393008393532, + "loss": 3.3057, + "step": 5506 + }, + { + "epoch": 0.23187368421052632, + "grad_norm": 0.416015625, + "learning_rate": 0.0004445181053110779, + "loss": 3.0511, + "step": 5507 + }, + { + "epoch": 0.23191578947368421, + "grad_norm": 0.46875, + "learning_rate": 0.0004444969062389115, + "loss": 3.1072, + "step": 5508 + }, + { + "epoch": 0.2319578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00044447570362323986, + "loss": 3.2669, + "step": 5509 + }, + { + "epoch": 0.232, + "grad_norm": 0.421875, + "learning_rate": 0.00044445449746444935, + "loss": 3.2213, + "step": 5510 + }, + { + "epoch": 0.23204210526315788, + "grad_norm": 0.41796875, + "learning_rate": 0.0004444332877629265, + "loss": 3.2559, + "step": 5511 + }, + { + "epoch": 0.23208421052631578, + "grad_norm": 0.6171875, + "learning_rate": 0.0004444120745190575, + "loss": 3.5853, + "step": 5512 + }, + { + "epoch": 0.23212631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.00044439085773322897, + "loss": 3.0958, + "step": 5513 + }, + { + "epoch": 0.23216842105263158, + "grad_norm": 0.48046875, + "learning_rate": 0.00044436963740582735, + "loss": 3.284, + "step": 5514 + }, + { + "epoch": 0.23221052631578948, + "grad_norm": 0.5078125, + "learning_rate": 0.00044434841353723935, + "loss": 3.6203, + "step": 5515 + }, + { + "epoch": 0.23225263157894738, + "grad_norm": 0.390625, + "learning_rate": 0.00044432718612785163, + "loss": 3.1295, + "step": 5516 + }, + { + "epoch": 0.23229473684210528, + "grad_norm": 0.453125, + "learning_rate": 0.0004443059551780509, + "loss": 3.1918, + "step": 5517 + }, + { + "epoch": 0.23233684210526315, + "grad_norm": 0.390625, + "learning_rate": 0.0004442847206882239, + "loss": 3.3971, + "step": 5518 + }, + { + "epoch": 0.23237894736842105, + "grad_norm": 0.5, + "learning_rate": 0.00044426348265875766, + "loss": 3.1022, + "step": 5519 + }, + { + "epoch": 0.23242105263157894, + "grad_norm": 0.451171875, + "learning_rate": 0.00044424224109003896, + "loss": 3.1693, + "step": 5520 + }, + { + "epoch": 0.23246315789473684, + "grad_norm": 0.392578125, + "learning_rate": 0.0004442209959824549, + "loss": 3.4511, + "step": 5521 + }, + { + "epoch": 0.23250526315789474, + "grad_norm": 0.408203125, + "learning_rate": 0.0004441997473363924, + "loss": 3.1361, + "step": 5522 + }, + { + "epoch": 0.23254736842105264, + "grad_norm": 0.435546875, + "learning_rate": 0.00044417849515223875, + "loss": 3.0397, + "step": 5523 + }, + { + "epoch": 0.2325894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00044415723943038105, + "loss": 3.5254, + "step": 5524 + }, + { + "epoch": 0.2326315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.0004441359801712066, + "loss": 3.3133, + "step": 5525 + }, + { + "epoch": 0.2326736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.0004441147173751027, + "loss": 3.3942, + "step": 5526 + }, + { + "epoch": 0.2327157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.00044409345104245664, + "loss": 3.2951, + "step": 5527 + }, + { + "epoch": 0.2327578947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.000444072181173656, + "loss": 3.2359, + "step": 5528 + }, + { + "epoch": 0.2328, + "grad_norm": 0.396484375, + "learning_rate": 0.0004440509077690883, + "loss": 3.3952, + "step": 5529 + }, + { + "epoch": 0.2328421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.000444029630829141, + "loss": 3.2425, + "step": 5530 + }, + { + "epoch": 0.23288421052631578, + "grad_norm": 0.439453125, + "learning_rate": 0.0004440083503542018, + "loss": 2.7607, + "step": 5531 + }, + { + "epoch": 0.23292631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.00044398706634465846, + "loss": 3.7614, + "step": 5532 + }, + { + "epoch": 0.23296842105263157, + "grad_norm": 0.421875, + "learning_rate": 0.0004439657788008986, + "loss": 3.2997, + "step": 5533 + }, + { + "epoch": 0.23301052631578947, + "grad_norm": 0.44140625, + "learning_rate": 0.0004439444877233102, + "loss": 3.2707, + "step": 5534 + }, + { + "epoch": 0.23305263157894737, + "grad_norm": 0.400390625, + "learning_rate": 0.00044392319311228114, + "loss": 3.5786, + "step": 5535 + }, + { + "epoch": 0.23309473684210527, + "grad_norm": 0.44921875, + "learning_rate": 0.00044390189496819934, + "loss": 3.3597, + "step": 5536 + }, + { + "epoch": 0.23313684210526317, + "grad_norm": 0.38671875, + "learning_rate": 0.0004438805932914528, + "loss": 3.3336, + "step": 5537 + }, + { + "epoch": 0.23317894736842104, + "grad_norm": 0.423828125, + "learning_rate": 0.0004438592880824297, + "loss": 3.1167, + "step": 5538 + }, + { + "epoch": 0.23322105263157894, + "grad_norm": 0.474609375, + "learning_rate": 0.00044383797934151813, + "loss": 3.4276, + "step": 5539 + }, + { + "epoch": 0.23326315789473684, + "grad_norm": 0.4765625, + "learning_rate": 0.0004438166670691063, + "loss": 3.4127, + "step": 5540 + }, + { + "epoch": 0.23330526315789474, + "grad_norm": 0.439453125, + "learning_rate": 0.00044379535126558257, + "loss": 3.1334, + "step": 5541 + }, + { + "epoch": 0.23334736842105264, + "grad_norm": 0.42578125, + "learning_rate": 0.0004437740319313352, + "loss": 3.3359, + "step": 5542 + }, + { + "epoch": 0.23338947368421054, + "grad_norm": 0.43359375, + "learning_rate": 0.0004437527090667527, + "loss": 3.2754, + "step": 5543 + }, + { + "epoch": 0.23343157894736843, + "grad_norm": 0.423828125, + "learning_rate": 0.0004437313826722234, + "loss": 3.2022, + "step": 5544 + }, + { + "epoch": 0.2334736842105263, + "grad_norm": 0.7421875, + "learning_rate": 0.00044371005274813604, + "loss": 3.3855, + "step": 5545 + }, + { + "epoch": 0.2335157894736842, + "grad_norm": 0.64453125, + "learning_rate": 0.00044368871929487895, + "loss": 3.1619, + "step": 5546 + }, + { + "epoch": 0.2335578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00044366738231284107, + "loss": 3.4004, + "step": 5547 + }, + { + "epoch": 0.2336, + "grad_norm": 0.45703125, + "learning_rate": 0.0004436460418024111, + "loss": 3.3367, + "step": 5548 + }, + { + "epoch": 0.2336421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.0004436246977639777, + "loss": 3.4574, + "step": 5549 + }, + { + "epoch": 0.2336842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00044360335019792983, + "loss": 3.1847, + "step": 5550 + }, + { + "epoch": 0.23372631578947367, + "grad_norm": 0.4375, + "learning_rate": 0.00044358199910465634, + "loss": 3.7824, + "step": 5551 + }, + { + "epoch": 0.23376842105263157, + "grad_norm": 0.41796875, + "learning_rate": 0.00044356064448454637, + "loss": 3.3178, + "step": 5552 + }, + { + "epoch": 0.23381052631578947, + "grad_norm": 0.466796875, + "learning_rate": 0.00044353928633798887, + "loss": 2.8391, + "step": 5553 + }, + { + "epoch": 0.23385263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.0004435179246653729, + "loss": 3.5665, + "step": 5554 + }, + { + "epoch": 0.23389473684210527, + "grad_norm": 0.43359375, + "learning_rate": 0.00044349655946708777, + "loss": 3.6181, + "step": 5555 + }, + { + "epoch": 0.23393684210526317, + "grad_norm": 0.421875, + "learning_rate": 0.0004434751907435227, + "loss": 3.6585, + "step": 5556 + }, + { + "epoch": 0.23397894736842106, + "grad_norm": 0.404296875, + "learning_rate": 0.0004434538184950669, + "loss": 3.349, + "step": 5557 + }, + { + "epoch": 0.23402105263157894, + "grad_norm": 0.4140625, + "learning_rate": 0.00044343244272210985, + "loss": 3.1035, + "step": 5558 + }, + { + "epoch": 0.23406315789473683, + "grad_norm": 0.41015625, + "learning_rate": 0.000443411063425041, + "loss": 3.4502, + "step": 5559 + }, + { + "epoch": 0.23410526315789473, + "grad_norm": 0.4609375, + "learning_rate": 0.0004433896806042498, + "loss": 3.5399, + "step": 5560 + }, + { + "epoch": 0.23414736842105263, + "grad_norm": 0.3828125, + "learning_rate": 0.0004433682942601258, + "loss": 3.7573, + "step": 5561 + }, + { + "epoch": 0.23418947368421053, + "grad_norm": 0.41796875, + "learning_rate": 0.00044334690439305877, + "loss": 3.1991, + "step": 5562 + }, + { + "epoch": 0.23423157894736843, + "grad_norm": 0.392578125, + "learning_rate": 0.0004433255110034382, + "loss": 3.3179, + "step": 5563 + }, + { + "epoch": 0.23427368421052633, + "grad_norm": 0.431640625, + "learning_rate": 0.00044330411409165404, + "loss": 3.0712, + "step": 5564 + }, + { + "epoch": 0.2343157894736842, + "grad_norm": 0.47265625, + "learning_rate": 0.0004432827136580961, + "loss": 3.1028, + "step": 5565 + }, + { + "epoch": 0.2343578947368421, + "grad_norm": 0.404296875, + "learning_rate": 0.00044326130970315415, + "loss": 3.5401, + "step": 5566 + }, + { + "epoch": 0.2344, + "grad_norm": 0.466796875, + "learning_rate": 0.00044323990222721823, + "loss": 3.0381, + "step": 5567 + }, + { + "epoch": 0.2344421052631579, + "grad_norm": 0.396484375, + "learning_rate": 0.00044321849123067826, + "loss": 3.0968, + "step": 5568 + }, + { + "epoch": 0.2344842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.0004431970767139245, + "loss": 3.4412, + "step": 5569 + }, + { + "epoch": 0.2345263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.000443175658677347, + "loss": 3.1902, + "step": 5570 + }, + { + "epoch": 0.2345684210526316, + "grad_norm": 0.7109375, + "learning_rate": 0.0004431542371213359, + "loss": 3.2213, + "step": 5571 + }, + { + "epoch": 0.23461052631578946, + "grad_norm": 0.4296875, + "learning_rate": 0.00044313281204628156, + "loss": 3.3623, + "step": 5572 + }, + { + "epoch": 0.23465263157894736, + "grad_norm": 0.431640625, + "learning_rate": 0.00044311138345257443, + "loss": 3.2425, + "step": 5573 + }, + { + "epoch": 0.23469473684210526, + "grad_norm": 0.5, + "learning_rate": 0.00044308995134060457, + "loss": 3.1465, + "step": 5574 + }, + { + "epoch": 0.23473684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00044306851571076285, + "loss": 3.6587, + "step": 5575 + }, + { + "epoch": 0.23477894736842106, + "grad_norm": 0.53125, + "learning_rate": 0.0004430470765634396, + "loss": 3.4262, + "step": 5576 + }, + { + "epoch": 0.23482105263157896, + "grad_norm": 0.416015625, + "learning_rate": 0.00044302563389902533, + "loss": 2.9537, + "step": 5577 + }, + { + "epoch": 0.23486315789473683, + "grad_norm": 0.484375, + "learning_rate": 0.00044300418771791087, + "loss": 3.2086, + "step": 5578 + }, + { + "epoch": 0.23490526315789473, + "grad_norm": 0.474609375, + "learning_rate": 0.00044298273802048687, + "loss": 2.7803, + "step": 5579 + }, + { + "epoch": 0.23494736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00044296128480714414, + "loss": 3.0605, + "step": 5580 + }, + { + "epoch": 0.23498947368421053, + "grad_norm": 0.39453125, + "learning_rate": 0.0004429398280782735, + "loss": 2.8034, + "step": 5581 + }, + { + "epoch": 0.23503157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.00044291836783426597, + "loss": 3.5961, + "step": 5582 + }, + { + "epoch": 0.23507368421052632, + "grad_norm": 0.58984375, + "learning_rate": 0.00044289690407551234, + "loss": 3.3363, + "step": 5583 + }, + { + "epoch": 0.23511578947368422, + "grad_norm": 0.423828125, + "learning_rate": 0.00044287543680240384, + "loss": 3.5325, + "step": 5584 + }, + { + "epoch": 0.2351578947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.00044285396601533147, + "loss": 3.241, + "step": 5585 + }, + { + "epoch": 0.2352, + "grad_norm": 0.400390625, + "learning_rate": 0.00044283249171468643, + "loss": 3.6339, + "step": 5586 + }, + { + "epoch": 0.2352421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00044281101390086, + "loss": 3.0311, + "step": 5587 + }, + { + "epoch": 0.2352842105263158, + "grad_norm": 0.400390625, + "learning_rate": 0.00044278953257424335, + "loss": 3.4906, + "step": 5588 + }, + { + "epoch": 0.2353263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.00044276804773522795, + "loss": 3.3244, + "step": 5589 + }, + { + "epoch": 0.2353684210526316, + "grad_norm": 2.109375, + "learning_rate": 0.0004427465593842053, + "loss": 2.8849, + "step": 5590 + }, + { + "epoch": 0.2354105263157895, + "grad_norm": 0.400390625, + "learning_rate": 0.0004427250675215667, + "loss": 2.844, + "step": 5591 + }, + { + "epoch": 0.23545263157894736, + "grad_norm": 0.41015625, + "learning_rate": 0.0004427035721477039, + "loss": 3.3261, + "step": 5592 + }, + { + "epoch": 0.23549473684210526, + "grad_norm": 0.43359375, + "learning_rate": 0.0004426820732630084, + "loss": 3.3929, + "step": 5593 + }, + { + "epoch": 0.23553684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.0004426605708678719, + "loss": 3.1813, + "step": 5594 + }, + { + "epoch": 0.23557894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.0004426390649626862, + "loss": 3.0942, + "step": 5595 + }, + { + "epoch": 0.23562105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.0004426175555478431, + "loss": 3.5262, + "step": 5596 + }, + { + "epoch": 0.23566315789473685, + "grad_norm": 0.419921875, + "learning_rate": 0.00044259604262373436, + "loss": 2.9177, + "step": 5597 + }, + { + "epoch": 0.23570526315789472, + "grad_norm": 0.416015625, + "learning_rate": 0.0004425745261907521, + "loss": 3.4885, + "step": 5598 + }, + { + "epoch": 0.23574736842105262, + "grad_norm": 0.439453125, + "learning_rate": 0.0004425530062492882, + "loss": 3.5132, + "step": 5599 + }, + { + "epoch": 0.23578947368421052, + "grad_norm": 0.421875, + "learning_rate": 0.0004425314827997348, + "loss": 3.1141, + "step": 5600 + }, + { + "epoch": 0.23583157894736842, + "grad_norm": 0.46875, + "learning_rate": 0.00044250995584248394, + "loss": 3.1155, + "step": 5601 + }, + { + "epoch": 0.23587368421052632, + "grad_norm": 0.443359375, + "learning_rate": 0.00044248842537792786, + "loss": 3.081, + "step": 5602 + }, + { + "epoch": 0.23591578947368422, + "grad_norm": 0.431640625, + "learning_rate": 0.00044246689140645893, + "loss": 3.4895, + "step": 5603 + }, + { + "epoch": 0.23595789473684212, + "grad_norm": 0.578125, + "learning_rate": 0.00044244535392846935, + "loss": 3.3859, + "step": 5604 + }, + { + "epoch": 0.236, + "grad_norm": 0.4453125, + "learning_rate": 0.00044242381294435154, + "loss": 3.5936, + "step": 5605 + }, + { + "epoch": 0.2360421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.0004424022684544979, + "loss": 3.3618, + "step": 5606 + }, + { + "epoch": 0.23608421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.000442380720459301, + "loss": 3.2641, + "step": 5607 + }, + { + "epoch": 0.23612631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.0004423591689591534, + "loss": 3.5162, + "step": 5608 + }, + { + "epoch": 0.23616842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00044233761395444783, + "loss": 3.2502, + "step": 5609 + }, + { + "epoch": 0.23621052631578948, + "grad_norm": 0.447265625, + "learning_rate": 0.00044231605544557684, + "loss": 3.3254, + "step": 5610 + }, + { + "epoch": 0.23625263157894738, + "grad_norm": 0.423828125, + "learning_rate": 0.0004422944934329333, + "loss": 3.3557, + "step": 5611 + }, + { + "epoch": 0.23629473684210525, + "grad_norm": 0.41796875, + "learning_rate": 0.0004422729279169101, + "loss": 2.804, + "step": 5612 + }, + { + "epoch": 0.23633684210526315, + "grad_norm": 0.451171875, + "learning_rate": 0.00044225135889789996, + "loss": 3.3707, + "step": 5613 + }, + { + "epoch": 0.23637894736842105, + "grad_norm": 0.447265625, + "learning_rate": 0.000442229786376296, + "loss": 3.6086, + "step": 5614 + }, + { + "epoch": 0.23642105263157895, + "grad_norm": 0.451171875, + "learning_rate": 0.00044220821035249115, + "loss": 3.3041, + "step": 5615 + }, + { + "epoch": 0.23646315789473685, + "grad_norm": 0.419921875, + "learning_rate": 0.0004421866308268786, + "loss": 3.4646, + "step": 5616 + }, + { + "epoch": 0.23650526315789475, + "grad_norm": 0.39453125, + "learning_rate": 0.00044216504779985143, + "loss": 3.0456, + "step": 5617 + }, + { + "epoch": 0.23654736842105265, + "grad_norm": 0.396484375, + "learning_rate": 0.0004421434612718028, + "loss": 2.9969, + "step": 5618 + }, + { + "epoch": 0.23658947368421052, + "grad_norm": 0.447265625, + "learning_rate": 0.0004421218712431262, + "loss": 2.7777, + "step": 5619 + }, + { + "epoch": 0.23663157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.0004421002777142148, + "loss": 3.0334, + "step": 5620 + }, + { + "epoch": 0.23667368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.000442078680685462, + "loss": 3.2926, + "step": 5621 + }, + { + "epoch": 0.2367157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.0004420570801572613, + "loss": 3.4331, + "step": 5622 + }, + { + "epoch": 0.2367578947368421, + "grad_norm": 0.462890625, + "learning_rate": 0.0004420354761300063, + "loss": 3.0736, + "step": 5623 + }, + { + "epoch": 0.2368, + "grad_norm": 0.408203125, + "learning_rate": 0.0004420138686040906, + "loss": 3.0868, + "step": 5624 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 0.51171875, + "learning_rate": 0.0004419922575799078, + "loss": 3.392, + "step": 5625 + }, + { + "epoch": 0.23688421052631578, + "grad_norm": 0.3984375, + "learning_rate": 0.00044197064305785157, + "loss": 3.2685, + "step": 5626 + }, + { + "epoch": 0.23692631578947368, + "grad_norm": 0.404296875, + "learning_rate": 0.00044194902503831584, + "loss": 3.1218, + "step": 5627 + }, + { + "epoch": 0.23696842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00044192740352169436, + "loss": 3.7138, + "step": 5628 + }, + { + "epoch": 0.23701052631578948, + "grad_norm": 0.404296875, + "learning_rate": 0.0004419057785083812, + "loss": 3.2949, + "step": 5629 + }, + { + "epoch": 0.23705263157894738, + "grad_norm": 0.41796875, + "learning_rate": 0.00044188414999877016, + "loss": 3.2364, + "step": 5630 + }, + { + "epoch": 0.23709473684210527, + "grad_norm": 0.431640625, + "learning_rate": 0.00044186251799325534, + "loss": 3.7482, + "step": 5631 + }, + { + "epoch": 0.23713684210526315, + "grad_norm": 0.396484375, + "learning_rate": 0.0004418408824922309, + "loss": 3.507, + "step": 5632 + }, + { + "epoch": 0.23717894736842104, + "grad_norm": 0.427734375, + "learning_rate": 0.00044181924349609105, + "loss": 2.9173, + "step": 5633 + }, + { + "epoch": 0.23722105263157894, + "grad_norm": 0.42578125, + "learning_rate": 0.00044179760100522986, + "loss": 3.4549, + "step": 5634 + }, + { + "epoch": 0.23726315789473684, + "grad_norm": 0.39453125, + "learning_rate": 0.00044177595502004177, + "loss": 3.0569, + "step": 5635 + }, + { + "epoch": 0.23730526315789474, + "grad_norm": 0.48046875, + "learning_rate": 0.0004417543055409211, + "loss": 3.2633, + "step": 5636 + }, + { + "epoch": 0.23734736842105264, + "grad_norm": 0.4375, + "learning_rate": 0.00044173265256826236, + "loss": 3.1601, + "step": 5637 + }, + { + "epoch": 0.23738947368421054, + "grad_norm": 0.63671875, + "learning_rate": 0.0004417109961024598, + "loss": 2.8502, + "step": 5638 + }, + { + "epoch": 0.2374315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.0004416893361439083, + "loss": 3.1021, + "step": 5639 + }, + { + "epoch": 0.2374736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00044166767269300224, + "loss": 3.3733, + "step": 5640 + }, + { + "epoch": 0.2375157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.0004416460057501364, + "loss": 3.1021, + "step": 5641 + }, + { + "epoch": 0.2375578947368421, + "grad_norm": 0.404296875, + "learning_rate": 0.00044162433531570554, + "loss": 3.3575, + "step": 5642 + }, + { + "epoch": 0.2376, + "grad_norm": 0.427734375, + "learning_rate": 0.0004416026613901044, + "loss": 3.6462, + "step": 5643 + }, + { + "epoch": 0.2376421052631579, + "grad_norm": 0.39453125, + "learning_rate": 0.0004415809839737279, + "loss": 3.2634, + "step": 5644 + }, + { + "epoch": 0.23768421052631578, + "grad_norm": 0.447265625, + "learning_rate": 0.000441559303066971, + "loss": 3.3765, + "step": 5645 + }, + { + "epoch": 0.23772631578947367, + "grad_norm": 0.38671875, + "learning_rate": 0.0004415376186702287, + "loss": 3.5372, + "step": 5646 + }, + { + "epoch": 0.23776842105263157, + "grad_norm": 0.427734375, + "learning_rate": 0.00044151593078389594, + "loss": 3.2292, + "step": 5647 + }, + { + "epoch": 0.23781052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.000441494239408368, + "loss": 3.3993, + "step": 5648 + }, + { + "epoch": 0.23785263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00044147254454403995, + "loss": 3.7321, + "step": 5649 + }, + { + "epoch": 0.23789473684210527, + "grad_norm": 0.431640625, + "learning_rate": 0.00044145084619130715, + "loss": 3.308, + "step": 5650 + }, + { + "epoch": 0.23793684210526317, + "grad_norm": 0.451171875, + "learning_rate": 0.00044142914435056494, + "loss": 3.3918, + "step": 5651 + }, + { + "epoch": 0.23797894736842104, + "grad_norm": 0.4453125, + "learning_rate": 0.0004414074390222086, + "loss": 3.5144, + "step": 5652 + }, + { + "epoch": 0.23802105263157894, + "grad_norm": 0.427734375, + "learning_rate": 0.0004413857302066336, + "loss": 3.1241, + "step": 5653 + }, + { + "epoch": 0.23806315789473684, + "grad_norm": 0.392578125, + "learning_rate": 0.00044136401790423547, + "loss": 3.5303, + "step": 5654 + }, + { + "epoch": 0.23810526315789474, + "grad_norm": 0.4375, + "learning_rate": 0.0004413423021154098, + "loss": 3.3143, + "step": 5655 + }, + { + "epoch": 0.23814736842105264, + "grad_norm": 0.412109375, + "learning_rate": 0.00044132058284055217, + "loss": 3.1831, + "step": 5656 + }, + { + "epoch": 0.23818947368421053, + "grad_norm": 0.3828125, + "learning_rate": 0.0004412988600800583, + "loss": 3.2469, + "step": 5657 + }, + { + "epoch": 0.23823157894736843, + "grad_norm": 0.45703125, + "learning_rate": 0.000441277133834324, + "loss": 3.5906, + "step": 5658 + }, + { + "epoch": 0.2382736842105263, + "grad_norm": 0.390625, + "learning_rate": 0.00044125540410374503, + "loss": 3.3973, + "step": 5659 + }, + { + "epoch": 0.2383157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.0004412336708887174, + "loss": 3.5825, + "step": 5660 + }, + { + "epoch": 0.2383578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.0004412119341896369, + "loss": 3.2473, + "step": 5661 + }, + { + "epoch": 0.2384, + "grad_norm": 0.48046875, + "learning_rate": 0.0004411901940068996, + "loss": 3.0988, + "step": 5662 + }, + { + "epoch": 0.2384421052631579, + "grad_norm": 0.453125, + "learning_rate": 0.0004411684503409017, + "loss": 2.9111, + "step": 5663 + }, + { + "epoch": 0.2384842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00044114670319203916, + "loss": 3.5432, + "step": 5664 + }, + { + "epoch": 0.2385263157894737, + "grad_norm": 0.57421875, + "learning_rate": 0.0004411249525607083, + "loss": 3.7182, + "step": 5665 + }, + { + "epoch": 0.23856842105263157, + "grad_norm": 0.392578125, + "learning_rate": 0.0004411031984473054, + "loss": 3.716, + "step": 5666 + }, + { + "epoch": 0.23861052631578947, + "grad_norm": 0.39453125, + "learning_rate": 0.0004410814408522267, + "loss": 3.7485, + "step": 5667 + }, + { + "epoch": 0.23865263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00044105967977586867, + "loss": 3.5136, + "step": 5668 + }, + { + "epoch": 0.23869473684210527, + "grad_norm": 0.384765625, + "learning_rate": 0.0004410379152186278, + "loss": 2.983, + "step": 5669 + }, + { + "epoch": 0.23873684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.00044101614718090057, + "loss": 3.216, + "step": 5670 + }, + { + "epoch": 0.23877894736842106, + "grad_norm": 0.4453125, + "learning_rate": 0.00044099437566308354, + "loss": 2.8832, + "step": 5671 + }, + { + "epoch": 0.23882105263157893, + "grad_norm": 0.427734375, + "learning_rate": 0.0004409726006655734, + "loss": 3.5001, + "step": 5672 + }, + { + "epoch": 0.23886315789473683, + "grad_norm": 0.400390625, + "learning_rate": 0.0004409508221887669, + "loss": 3.0544, + "step": 5673 + }, + { + "epoch": 0.23890526315789473, + "grad_norm": 0.390625, + "learning_rate": 0.0004409290402330608, + "loss": 3.0664, + "step": 5674 + }, + { + "epoch": 0.23894736842105263, + "grad_norm": 0.388671875, + "learning_rate": 0.00044090725479885187, + "loss": 3.3974, + "step": 5675 + }, + { + "epoch": 0.23898947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.000440885465886537, + "loss": 3.3597, + "step": 5676 + }, + { + "epoch": 0.23903157894736843, + "grad_norm": 0.39453125, + "learning_rate": 0.00044086367349651333, + "loss": 3.4833, + "step": 5677 + }, + { + "epoch": 0.23907368421052633, + "grad_norm": 0.390625, + "learning_rate": 0.0004408418776291777, + "loss": 3.2379, + "step": 5678 + }, + { + "epoch": 0.2391157894736842, + "grad_norm": 0.390625, + "learning_rate": 0.0004408200782849274, + "loss": 3.5677, + "step": 5679 + }, + { + "epoch": 0.2391578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0004407982754641594, + "loss": 3.1961, + "step": 5680 + }, + { + "epoch": 0.2392, + "grad_norm": 0.408203125, + "learning_rate": 0.000440776469167271, + "loss": 3.1835, + "step": 5681 + }, + { + "epoch": 0.2392421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.0004407546593946595, + "loss": 2.9983, + "step": 5682 + }, + { + "epoch": 0.2392842105263158, + "grad_norm": 0.396484375, + "learning_rate": 0.00044073284614672224, + "loss": 3.4474, + "step": 5683 + }, + { + "epoch": 0.2393263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.0004407110294238566, + "loss": 3.2739, + "step": 5684 + }, + { + "epoch": 0.2393684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.0004406892092264601, + "loss": 3.0298, + "step": 5685 + }, + { + "epoch": 0.23941052631578946, + "grad_norm": 0.400390625, + "learning_rate": 0.00044066738555493027, + "loss": 3.0882, + "step": 5686 + }, + { + "epoch": 0.23945263157894736, + "grad_norm": 0.4375, + "learning_rate": 0.0004406455584096647, + "loss": 2.8994, + "step": 5687 + }, + { + "epoch": 0.23949473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.00044062372779106105, + "loss": 3.1491, + "step": 5688 + }, + { + "epoch": 0.23953684210526316, + "grad_norm": 0.3984375, + "learning_rate": 0.000440601893699517, + "loss": 3.1292, + "step": 5689 + }, + { + "epoch": 0.23957894736842106, + "grad_norm": 0.48828125, + "learning_rate": 0.0004405800561354304, + "loss": 3.425, + "step": 5690 + }, + { + "epoch": 0.23962105263157896, + "grad_norm": 0.3984375, + "learning_rate": 0.00044055821509919916, + "loss": 3.1172, + "step": 5691 + }, + { + "epoch": 0.23966315789473686, + "grad_norm": 0.40625, + "learning_rate": 0.00044053637059122107, + "loss": 3.8108, + "step": 5692 + }, + { + "epoch": 0.23970526315789473, + "grad_norm": 0.44921875, + "learning_rate": 0.0004405145226118943, + "loss": 3.2928, + "step": 5693 + }, + { + "epoch": 0.23974736842105263, + "grad_norm": 0.49609375, + "learning_rate": 0.0004404926711616167, + "loss": 2.7366, + "step": 5694 + }, + { + "epoch": 0.23978947368421052, + "grad_norm": 0.4140625, + "learning_rate": 0.00044047081624078643, + "loss": 3.2447, + "step": 5695 + }, + { + "epoch": 0.23983157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00044044895784980167, + "loss": 3.202, + "step": 5696 + }, + { + "epoch": 0.23987368421052632, + "grad_norm": 0.384765625, + "learning_rate": 0.00044042709598906074, + "loss": 3.3263, + "step": 5697 + }, + { + "epoch": 0.23991578947368422, + "grad_norm": 0.4296875, + "learning_rate": 0.00044040523065896176, + "loss": 3.103, + "step": 5698 + }, + { + "epoch": 0.2399578947368421, + "grad_norm": 0.39453125, + "learning_rate": 0.00044038336185990324, + "loss": 3.2397, + "step": 5699 + }, + { + "epoch": 0.24, + "grad_norm": 0.39453125, + "learning_rate": 0.0004403614895922836, + "loss": 3.2629, + "step": 5700 + }, + { + "epoch": 0.2400421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00044033961385650126, + "loss": 3.5444, + "step": 5701 + }, + { + "epoch": 0.2400842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.00044031773465295476, + "loss": 2.864, + "step": 5702 + }, + { + "epoch": 0.2401263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.0004402958519820428, + "loss": 2.922, + "step": 5703 + }, + { + "epoch": 0.2401684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.000440273965844164, + "loss": 3.4326, + "step": 5704 + }, + { + "epoch": 0.24021052631578949, + "grad_norm": 0.423828125, + "learning_rate": 0.0004402520762397171, + "loss": 3.035, + "step": 5705 + }, + { + "epoch": 0.24025263157894736, + "grad_norm": 0.392578125, + "learning_rate": 0.00044023018316910093, + "loss": 2.7112, + "step": 5706 + }, + { + "epoch": 0.24029473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.0004402082866327143, + "loss": 3.4546, + "step": 5707 + }, + { + "epoch": 0.24033684210526315, + "grad_norm": 0.42578125, + "learning_rate": 0.00044018638663095616, + "loss": 3.508, + "step": 5708 + }, + { + "epoch": 0.24037894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.00044016448316422553, + "loss": 3.0937, + "step": 5709 + }, + { + "epoch": 0.24042105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.00044014257623292143, + "loss": 3.1282, + "step": 5710 + }, + { + "epoch": 0.24046315789473685, + "grad_norm": 0.40234375, + "learning_rate": 0.000440120665837443, + "loss": 3.3809, + "step": 5711 + }, + { + "epoch": 0.24050526315789475, + "grad_norm": 0.404296875, + "learning_rate": 0.0004400987519781895, + "loss": 3.7989, + "step": 5712 + }, + { + "epoch": 0.24054736842105262, + "grad_norm": 0.41796875, + "learning_rate": 0.00044007683465556, + "loss": 3.4561, + "step": 5713 + }, + { + "epoch": 0.24058947368421052, + "grad_norm": 0.408203125, + "learning_rate": 0.000440054913869954, + "loss": 3.6463, + "step": 5714 + }, + { + "epoch": 0.24063157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0004400329896217707, + "loss": 3.1636, + "step": 5715 + }, + { + "epoch": 0.24067368421052632, + "grad_norm": 0.4296875, + "learning_rate": 0.0004400110619114096, + "loss": 3.5094, + "step": 5716 + }, + { + "epoch": 0.24071578947368422, + "grad_norm": 0.390625, + "learning_rate": 0.00043998913073927027, + "loss": 3.3629, + "step": 5717 + }, + { + "epoch": 0.24075789473684212, + "grad_norm": 0.404296875, + "learning_rate": 0.00043996719610575217, + "loss": 3.2755, + "step": 5718 + }, + { + "epoch": 0.2408, + "grad_norm": 0.40234375, + "learning_rate": 0.00043994525801125496, + "loss": 3.3365, + "step": 5719 + }, + { + "epoch": 0.24084210526315789, + "grad_norm": 0.416015625, + "learning_rate": 0.0004399233164561783, + "loss": 3.2219, + "step": 5720 + }, + { + "epoch": 0.24088421052631578, + "grad_norm": 0.4375, + "learning_rate": 0.000439901371440922, + "loss": 3.4116, + "step": 5721 + }, + { + "epoch": 0.24092631578947368, + "grad_norm": 0.53125, + "learning_rate": 0.0004398794229658858, + "loss": 3.0025, + "step": 5722 + }, + { + "epoch": 0.24096842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.0004398574710314697, + "loss": 3.6059, + "step": 5723 + }, + { + "epoch": 0.24101052631578948, + "grad_norm": 0.427734375, + "learning_rate": 0.00043983551563807344, + "loss": 2.8624, + "step": 5724 + }, + { + "epoch": 0.24105263157894738, + "grad_norm": 0.431640625, + "learning_rate": 0.0004398135567860972, + "loss": 3.3028, + "step": 5725 + }, + { + "epoch": 0.24109473684210525, + "grad_norm": 0.462890625, + "learning_rate": 0.00043979159447594094, + "loss": 2.9742, + "step": 5726 + }, + { + "epoch": 0.24113684210526315, + "grad_norm": 0.50390625, + "learning_rate": 0.00043976962870800483, + "loss": 3.2551, + "step": 5727 + }, + { + "epoch": 0.24117894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.0004397476594826891, + "loss": 3.4834, + "step": 5728 + }, + { + "epoch": 0.24122105263157895, + "grad_norm": 0.5625, + "learning_rate": 0.0004397256868003939, + "loss": 3.0787, + "step": 5729 + }, + { + "epoch": 0.24126315789473685, + "grad_norm": 0.42578125, + "learning_rate": 0.00043970371066151965, + "loss": 3.5561, + "step": 5730 + }, + { + "epoch": 0.24130526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.0004396817310664666, + "loss": 3.5463, + "step": 5731 + }, + { + "epoch": 0.24134736842105264, + "grad_norm": 0.466796875, + "learning_rate": 0.00043965974801563534, + "loss": 3.3084, + "step": 5732 + }, + { + "epoch": 0.24138947368421051, + "grad_norm": 0.447265625, + "learning_rate": 0.00043963776150942636, + "loss": 3.1025, + "step": 5733 + }, + { + "epoch": 0.2414315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.0004396157715482402, + "loss": 3.0449, + "step": 5734 + }, + { + "epoch": 0.2414736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00043959377813247735, + "loss": 3.1708, + "step": 5735 + }, + { + "epoch": 0.2415157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0004395717812625387, + "loss": 3.8698, + "step": 5736 + }, + { + "epoch": 0.2415578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.0004395497809388249, + "loss": 3.2482, + "step": 5737 + }, + { + "epoch": 0.2416, + "grad_norm": 0.41796875, + "learning_rate": 0.00043952777716173687, + "loss": 3.3577, + "step": 5738 + }, + { + "epoch": 0.2416421052631579, + "grad_norm": 0.392578125, + "learning_rate": 0.00043950576993167536, + "loss": 3.3885, + "step": 5739 + }, + { + "epoch": 0.24168421052631578, + "grad_norm": 0.408203125, + "learning_rate": 0.00043948375924904136, + "loss": 3.5907, + "step": 5740 + }, + { + "epoch": 0.24172631578947368, + "grad_norm": 0.4140625, + "learning_rate": 0.000439461745114236, + "loss": 3.4877, + "step": 5741 + }, + { + "epoch": 0.24176842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.00043943972752766015, + "loss": 3.7744, + "step": 5742 + }, + { + "epoch": 0.24181052631578948, + "grad_norm": 0.40625, + "learning_rate": 0.0004394177064897151, + "loss": 3.2848, + "step": 5743 + }, + { + "epoch": 0.24185263157894737, + "grad_norm": 1.078125, + "learning_rate": 0.00043939568200080195, + "loss": 3.1925, + "step": 5744 + }, + { + "epoch": 0.24189473684210527, + "grad_norm": 0.408203125, + "learning_rate": 0.00043937365406132204, + "loss": 3.2824, + "step": 5745 + }, + { + "epoch": 0.24193684210526314, + "grad_norm": 0.423828125, + "learning_rate": 0.0004393516226716767, + "loss": 3.2666, + "step": 5746 + }, + { + "epoch": 0.24197894736842104, + "grad_norm": 0.412109375, + "learning_rate": 0.00043932958783226715, + "loss": 3.4416, + "step": 5747 + }, + { + "epoch": 0.24202105263157894, + "grad_norm": 0.443359375, + "learning_rate": 0.0004393075495434951, + "loss": 3.4158, + "step": 5748 + }, + { + "epoch": 0.24206315789473684, + "grad_norm": 0.5859375, + "learning_rate": 0.00043928550780576183, + "loss": 3.6666, + "step": 5749 + }, + { + "epoch": 0.24210526315789474, + "grad_norm": 0.404296875, + "learning_rate": 0.00043926346261946906, + "loss": 2.8852, + "step": 5750 + }, + { + "epoch": 0.24214736842105264, + "grad_norm": 0.404296875, + "learning_rate": 0.0004392414139850184, + "loss": 2.9076, + "step": 5751 + }, + { + "epoch": 0.24218947368421054, + "grad_norm": 0.59765625, + "learning_rate": 0.0004392193619028114, + "loss": 2.988, + "step": 5752 + }, + { + "epoch": 0.2422315789473684, + "grad_norm": 0.4453125, + "learning_rate": 0.00043919730637325007, + "loss": 3.5439, + "step": 5753 + }, + { + "epoch": 0.2422736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.0004391752473967361, + "loss": 3.2195, + "step": 5754 + }, + { + "epoch": 0.2423157894736842, + "grad_norm": 0.48046875, + "learning_rate": 0.0004391531849736714, + "loss": 3.0817, + "step": 5755 + }, + { + "epoch": 0.2423578947368421, + "grad_norm": 0.470703125, + "learning_rate": 0.0004391311191044579, + "loss": 3.3419, + "step": 5756 + }, + { + "epoch": 0.2424, + "grad_norm": 0.4375, + "learning_rate": 0.00043910904978949764, + "loss": 3.4587, + "step": 5757 + }, + { + "epoch": 0.2424421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0004390869770291926, + "loss": 3.6578, + "step": 5758 + }, + { + "epoch": 0.2424842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.00043906490082394514, + "loss": 3.1426, + "step": 5759 + }, + { + "epoch": 0.24252631578947367, + "grad_norm": 0.43359375, + "learning_rate": 0.0004390428211741572, + "loss": 3.2086, + "step": 5760 + }, + { + "epoch": 0.24256842105263157, + "grad_norm": 0.458984375, + "learning_rate": 0.00043902073808023125, + "loss": 3.4118, + "step": 5761 + }, + { + "epoch": 0.24261052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.0004389986515425696, + "loss": 3.3434, + "step": 5762 + }, + { + "epoch": 0.24265263157894737, + "grad_norm": 0.44921875, + "learning_rate": 0.00043897656156157447, + "loss": 3.5494, + "step": 5763 + }, + { + "epoch": 0.24269473684210527, + "grad_norm": 0.4375, + "learning_rate": 0.0004389544681376485, + "loss": 3.5953, + "step": 5764 + }, + { + "epoch": 0.24273684210526317, + "grad_norm": 0.45703125, + "learning_rate": 0.0004389323712711941, + "loss": 3.1458, + "step": 5765 + }, + { + "epoch": 0.24277894736842107, + "grad_norm": 0.4375, + "learning_rate": 0.0004389102709626138, + "loss": 3.3899, + "step": 5766 + }, + { + "epoch": 0.24282105263157894, + "grad_norm": 0.40234375, + "learning_rate": 0.0004388881672123105, + "loss": 3.423, + "step": 5767 + }, + { + "epoch": 0.24286315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.0004388660600206866, + "loss": 3.1283, + "step": 5768 + }, + { + "epoch": 0.24290526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00043884394938814504, + "loss": 4.1053, + "step": 5769 + }, + { + "epoch": 0.24294736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.0004388218353150886, + "loss": 3.3953, + "step": 5770 + }, + { + "epoch": 0.24298947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.0004387997178019202, + "loss": 3.7624, + "step": 5771 + }, + { + "epoch": 0.24303157894736843, + "grad_norm": 0.40625, + "learning_rate": 0.00043877759684904274, + "loss": 3.4884, + "step": 5772 + }, + { + "epoch": 0.2430736842105263, + "grad_norm": 0.400390625, + "learning_rate": 0.00043875547245685926, + "loss": 3.4067, + "step": 5773 + }, + { + "epoch": 0.2431157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.00043873334462577287, + "loss": 3.3116, + "step": 5774 + }, + { + "epoch": 0.2431578947368421, + "grad_norm": 0.376953125, + "learning_rate": 0.00043871121335618666, + "loss": 3.3819, + "step": 5775 + }, + { + "epoch": 0.2432, + "grad_norm": 0.55859375, + "learning_rate": 0.0004386890786485039, + "loss": 3.2145, + "step": 5776 + }, + { + "epoch": 0.2432421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.00043866694050312784, + "loss": 3.0936, + "step": 5777 + }, + { + "epoch": 0.2432842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0004386447989204618, + "loss": 2.9633, + "step": 5778 + }, + { + "epoch": 0.2433263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.00043862265390090917, + "loss": 3.9013, + "step": 5779 + }, + { + "epoch": 0.24336842105263157, + "grad_norm": 0.412109375, + "learning_rate": 0.0004386005054448734, + "loss": 3.2631, + "step": 5780 + }, + { + "epoch": 0.24341052631578947, + "grad_norm": 0.392578125, + "learning_rate": 0.000438578353552758, + "loss": 3.4248, + "step": 5781 + }, + { + "epoch": 0.24345263157894736, + "grad_norm": 0.396484375, + "learning_rate": 0.00043855619822496655, + "loss": 3.2423, + "step": 5782 + }, + { + "epoch": 0.24349473684210526, + "grad_norm": 0.3984375, + "learning_rate": 0.00043853403946190274, + "loss": 3.1958, + "step": 5783 + }, + { + "epoch": 0.24353684210526316, + "grad_norm": 0.39453125, + "learning_rate": 0.0004385118772639703, + "loss": 3.4908, + "step": 5784 + }, + { + "epoch": 0.24357894736842106, + "grad_norm": 0.447265625, + "learning_rate": 0.0004384897116315729, + "loss": 3.2035, + "step": 5785 + }, + { + "epoch": 0.24362105263157896, + "grad_norm": 0.41015625, + "learning_rate": 0.0004384675425651144, + "loss": 3.335, + "step": 5786 + }, + { + "epoch": 0.24366315789473683, + "grad_norm": 0.41015625, + "learning_rate": 0.00043844537006499875, + "loss": 3.5487, + "step": 5787 + }, + { + "epoch": 0.24370526315789473, + "grad_norm": 0.4296875, + "learning_rate": 0.0004384231941316299, + "loss": 3.3652, + "step": 5788 + }, + { + "epoch": 0.24374736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.00043840101476541176, + "loss": 2.6983, + "step": 5789 + }, + { + "epoch": 0.24378947368421053, + "grad_norm": 0.396484375, + "learning_rate": 0.00043837883196674854, + "loss": 3.6199, + "step": 5790 + }, + { + "epoch": 0.24383157894736843, + "grad_norm": 0.42578125, + "learning_rate": 0.0004383566457360443, + "loss": 3.2554, + "step": 5791 + }, + { + "epoch": 0.24387368421052633, + "grad_norm": 0.3828125, + "learning_rate": 0.00043833445607370335, + "loss": 3.0794, + "step": 5792 + }, + { + "epoch": 0.2439157894736842, + "grad_norm": 0.52734375, + "learning_rate": 0.0004383122629801298, + "loss": 2.9403, + "step": 5793 + }, + { + "epoch": 0.2439578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.0004382900664557282, + "loss": 3.3505, + "step": 5794 + }, + { + "epoch": 0.244, + "grad_norm": 0.4453125, + "learning_rate": 0.00043826786650090276, + "loss": 3.1737, + "step": 5795 + }, + { + "epoch": 0.2440421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00043824566311605797, + "loss": 3.395, + "step": 5796 + }, + { + "epoch": 0.2440842105263158, + "grad_norm": 0.39453125, + "learning_rate": 0.00043822345630159846, + "loss": 2.9132, + "step": 5797 + }, + { + "epoch": 0.2441263157894737, + "grad_norm": 0.3984375, + "learning_rate": 0.0004382012460579287, + "loss": 2.9797, + "step": 5798 + }, + { + "epoch": 0.2441684210526316, + "grad_norm": 0.384765625, + "learning_rate": 0.0004381790323854533, + "loss": 3.3223, + "step": 5799 + }, + { + "epoch": 0.24421052631578946, + "grad_norm": 0.39453125, + "learning_rate": 0.00043815681528457703, + "loss": 3.4199, + "step": 5800 + }, + { + "epoch": 0.24425263157894736, + "grad_norm": 0.419921875, + "learning_rate": 0.0004381345947557047, + "loss": 2.8119, + "step": 5801 + }, + { + "epoch": 0.24429473684210526, + "grad_norm": 0.44140625, + "learning_rate": 0.00043811237079924113, + "loss": 3.2709, + "step": 5802 + }, + { + "epoch": 0.24433684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.0004380901434155911, + "loss": 3.1417, + "step": 5803 + }, + { + "epoch": 0.24437894736842106, + "grad_norm": 0.3984375, + "learning_rate": 0.0004380679126051598, + "loss": 3.3475, + "step": 5804 + }, + { + "epoch": 0.24442105263157896, + "grad_norm": 0.400390625, + "learning_rate": 0.00043804567836835197, + "loss": 3.3515, + "step": 5805 + }, + { + "epoch": 0.24446315789473685, + "grad_norm": 0.41015625, + "learning_rate": 0.0004380234407055729, + "loss": 2.8609, + "step": 5806 + }, + { + "epoch": 0.24450526315789473, + "grad_norm": 0.416015625, + "learning_rate": 0.0004380011996172276, + "loss": 3.1671, + "step": 5807 + }, + { + "epoch": 0.24454736842105262, + "grad_norm": 0.431640625, + "learning_rate": 0.0004379789551037213, + "loss": 3.1463, + "step": 5808 + }, + { + "epoch": 0.24458947368421052, + "grad_norm": 0.4453125, + "learning_rate": 0.0004379567071654594, + "loss": 3.5578, + "step": 5809 + }, + { + "epoch": 0.24463157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.00043793445580284714, + "loss": 3.4545, + "step": 5810 + }, + { + "epoch": 0.24467368421052632, + "grad_norm": 0.41015625, + "learning_rate": 0.0004379122010162898, + "loss": 3.3313, + "step": 5811 + }, + { + "epoch": 0.24471578947368422, + "grad_norm": 0.431640625, + "learning_rate": 0.0004378899428061931, + "loss": 2.9929, + "step": 5812 + }, + { + "epoch": 0.24475789473684212, + "grad_norm": 0.39453125, + "learning_rate": 0.00043786768117296226, + "loss": 3.3065, + "step": 5813 + }, + { + "epoch": 0.2448, + "grad_norm": 0.388671875, + "learning_rate": 0.0004378454161170031, + "loss": 3.2392, + "step": 5814 + }, + { + "epoch": 0.2448421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.0004378231476387211, + "loss": 3.2017, + "step": 5815 + }, + { + "epoch": 0.2448842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00043780087573852213, + "loss": 3.3878, + "step": 5816 + }, + { + "epoch": 0.2449263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.0004377786004168118, + "loss": 3.7428, + "step": 5817 + }, + { + "epoch": 0.24496842105263159, + "grad_norm": 0.396484375, + "learning_rate": 0.000437756321673996, + "loss": 3.2103, + "step": 5818 + }, + { + "epoch": 0.24501052631578948, + "grad_norm": 0.416015625, + "learning_rate": 0.0004377340395104806, + "loss": 3.3718, + "step": 5819 + }, + { + "epoch": 0.24505263157894736, + "grad_norm": 0.412109375, + "learning_rate": 0.0004377117539266716, + "loss": 2.4851, + "step": 5820 + }, + { + "epoch": 0.24509473684210525, + "grad_norm": 0.40234375, + "learning_rate": 0.000437689464922975, + "loss": 3.379, + "step": 5821 + }, + { + "epoch": 0.24513684210526315, + "grad_norm": 0.455078125, + "learning_rate": 0.00043766717249979687, + "loss": 3.4554, + "step": 5822 + }, + { + "epoch": 0.24517894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0004376448766575434, + "loss": 3.4001, + "step": 5823 + }, + { + "epoch": 0.24522105263157895, + "grad_norm": 0.388671875, + "learning_rate": 0.0004376225773966206, + "loss": 3.4533, + "step": 5824 + }, + { + "epoch": 0.24526315789473685, + "grad_norm": 0.408203125, + "learning_rate": 0.00043760027471743506, + "loss": 3.8316, + "step": 5825 + }, + { + "epoch": 0.24530526315789475, + "grad_norm": 0.40234375, + "learning_rate": 0.0004375779686203929, + "loss": 3.3495, + "step": 5826 + }, + { + "epoch": 0.24534736842105262, + "grad_norm": 0.447265625, + "learning_rate": 0.0004375556591059005, + "loss": 3.1279, + "step": 5827 + }, + { + "epoch": 0.24538947368421052, + "grad_norm": 0.4453125, + "learning_rate": 0.00043753334617436433, + "loss": 3.8785, + "step": 5828 + }, + { + "epoch": 0.24543157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.00043751102982619107, + "loss": 3.7162, + "step": 5829 + }, + { + "epoch": 0.24547368421052632, + "grad_norm": 0.396484375, + "learning_rate": 0.000437488710061787, + "loss": 3.359, + "step": 5830 + }, + { + "epoch": 0.24551578947368422, + "grad_norm": 0.400390625, + "learning_rate": 0.00043746638688155903, + "loss": 3.3231, + "step": 5831 + }, + { + "epoch": 0.24555789473684211, + "grad_norm": 0.453125, + "learning_rate": 0.0004374440602859138, + "loss": 2.9204, + "step": 5832 + }, + { + "epoch": 0.2456, + "grad_norm": 0.390625, + "learning_rate": 0.00043742173027525787, + "loss": 3.3773, + "step": 5833 + }, + { + "epoch": 0.24564210526315788, + "grad_norm": 0.427734375, + "learning_rate": 0.0004373993968499983, + "loss": 4.0035, + "step": 5834 + }, + { + "epoch": 0.24568421052631578, + "grad_norm": 0.435546875, + "learning_rate": 0.00043737706001054197, + "loss": 3.3762, + "step": 5835 + }, + { + "epoch": 0.24572631578947368, + "grad_norm": 0.462890625, + "learning_rate": 0.0004373547197572957, + "loss": 3.3595, + "step": 5836 + }, + { + "epoch": 0.24576842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.00043733237609066655, + "loss": 3.4537, + "step": 5837 + }, + { + "epoch": 0.24581052631578948, + "grad_norm": 0.40234375, + "learning_rate": 0.00043731002901106167, + "loss": 2.9956, + "step": 5838 + }, + { + "epoch": 0.24585263157894738, + "grad_norm": 0.40625, + "learning_rate": 0.0004372876785188881, + "loss": 3.4069, + "step": 5839 + }, + { + "epoch": 0.24589473684210525, + "grad_norm": 0.3984375, + "learning_rate": 0.00043726532461455317, + "loss": 3.3592, + "step": 5840 + }, + { + "epoch": 0.24593684210526315, + "grad_norm": 0.50390625, + "learning_rate": 0.0004372429672984639, + "loss": 3.2404, + "step": 5841 + }, + { + "epoch": 0.24597894736842105, + "grad_norm": 0.400390625, + "learning_rate": 0.0004372206065710279, + "loss": 3.1899, + "step": 5842 + }, + { + "epoch": 0.24602105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.0004371982424326523, + "loss": 3.5338, + "step": 5843 + }, + { + "epoch": 0.24606315789473684, + "grad_norm": 0.396484375, + "learning_rate": 0.00043717587488374476, + "loss": 3.0906, + "step": 5844 + }, + { + "epoch": 0.24610526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.0004371535039247127, + "loss": 3.0455, + "step": 5845 + }, + { + "epoch": 0.24614736842105264, + "grad_norm": 0.4296875, + "learning_rate": 0.00043713112955596367, + "loss": 3.4332, + "step": 5846 + }, + { + "epoch": 0.2461894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0004371087517779053, + "loss": 2.9745, + "step": 5847 + }, + { + "epoch": 0.2462315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.0004370863705909454, + "loss": 3.3057, + "step": 5848 + }, + { + "epoch": 0.2462736842105263, + "grad_norm": 0.400390625, + "learning_rate": 0.0004370639859954916, + "loss": 3.385, + "step": 5849 + }, + { + "epoch": 0.2463157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00043704159799195174, + "loss": 3.521, + "step": 5850 + }, + { + "epoch": 0.2463578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00043701920658073377, + "loss": 3.3314, + "step": 5851 + }, + { + "epoch": 0.2464, + "grad_norm": 0.390625, + "learning_rate": 0.00043699681176224557, + "loss": 2.798, + "step": 5852 + }, + { + "epoch": 0.2464421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00043697441353689523, + "loss": 3.4144, + "step": 5853 + }, + { + "epoch": 0.24648421052631578, + "grad_norm": 0.447265625, + "learning_rate": 0.0004369520119050907, + "loss": 3.4753, + "step": 5854 + }, + { + "epoch": 0.24652631578947368, + "grad_norm": 1.1171875, + "learning_rate": 0.0004369296068672402, + "loss": 3.4717, + "step": 5855 + }, + { + "epoch": 0.24656842105263158, + "grad_norm": 0.45703125, + "learning_rate": 0.00043690719842375187, + "loss": 3.7164, + "step": 5856 + }, + { + "epoch": 0.24661052631578947, + "grad_norm": 0.41015625, + "learning_rate": 0.000436884786575034, + "loss": 2.9484, + "step": 5857 + }, + { + "epoch": 0.24665263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.00043686237132149487, + "loss": 3.4486, + "step": 5858 + }, + { + "epoch": 0.24669473684210527, + "grad_norm": 0.4296875, + "learning_rate": 0.000436839952663543, + "loss": 3.3158, + "step": 5859 + }, + { + "epoch": 0.24673684210526317, + "grad_norm": 0.3984375, + "learning_rate": 0.0004368175306015866, + "loss": 2.6301, + "step": 5860 + }, + { + "epoch": 0.24677894736842104, + "grad_norm": 0.44921875, + "learning_rate": 0.0004367951051360344, + "loss": 3.6451, + "step": 5861 + }, + { + "epoch": 0.24682105263157894, + "grad_norm": 0.421875, + "learning_rate": 0.00043677267626729475, + "loss": 3.6841, + "step": 5862 + }, + { + "epoch": 0.24686315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.00043675024399577646, + "loss": 3.0407, + "step": 5863 + }, + { + "epoch": 0.24690526315789474, + "grad_norm": 0.447265625, + "learning_rate": 0.0004367278083218881, + "loss": 3.1672, + "step": 5864 + }, + { + "epoch": 0.24694736842105264, + "grad_norm": 0.42578125, + "learning_rate": 0.0004367053692460385, + "loss": 3.4539, + "step": 5865 + }, + { + "epoch": 0.24698947368421054, + "grad_norm": 0.43359375, + "learning_rate": 0.0004366829267686364, + "loss": 3.6821, + "step": 5866 + }, + { + "epoch": 0.2470315789473684, + "grad_norm": 0.45703125, + "learning_rate": 0.0004366604808900907, + "loss": 3.3186, + "step": 5867 + }, + { + "epoch": 0.2470736842105263, + "grad_norm": 0.38671875, + "learning_rate": 0.00043663803161081053, + "loss": 3.4977, + "step": 5868 + }, + { + "epoch": 0.2471157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.0004366155789312045, + "loss": 3.2481, + "step": 5869 + }, + { + "epoch": 0.2471578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00043659312285168194, + "loss": 3.4353, + "step": 5870 + }, + { + "epoch": 0.2472, + "grad_norm": 0.427734375, + "learning_rate": 0.000436570663372652, + "loss": 2.9201, + "step": 5871 + }, + { + "epoch": 0.2472421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00043654820049452374, + "loss": 3.2972, + "step": 5872 + }, + { + "epoch": 0.2472842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.0004365257342177065, + "loss": 3.0657, + "step": 5873 + }, + { + "epoch": 0.24732631578947367, + "grad_norm": 0.416015625, + "learning_rate": 0.0004365032645426095, + "loss": 3.4154, + "step": 5874 + }, + { + "epoch": 0.24736842105263157, + "grad_norm": 0.40625, + "learning_rate": 0.0004364807914696421, + "loss": 3.1522, + "step": 5875 + }, + { + "epoch": 0.24741052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.00043645831499921386, + "loss": 3.2581, + "step": 5876 + }, + { + "epoch": 0.24745263157894737, + "grad_norm": 0.451171875, + "learning_rate": 0.0004364358351317341, + "loss": 3.457, + "step": 5877 + }, + { + "epoch": 0.24749473684210527, + "grad_norm": 0.439453125, + "learning_rate": 0.0004364133518676126, + "loss": 3.3686, + "step": 5878 + }, + { + "epoch": 0.24753684210526317, + "grad_norm": 0.458984375, + "learning_rate": 0.0004363908652072588, + "loss": 3.2853, + "step": 5879 + }, + { + "epoch": 0.24757894736842107, + "grad_norm": 0.43359375, + "learning_rate": 0.0004363683751510824, + "loss": 3.3215, + "step": 5880 + }, + { + "epoch": 0.24762105263157894, + "grad_norm": 0.412109375, + "learning_rate": 0.00043634588169949324, + "loss": 3.4715, + "step": 5881 + }, + { + "epoch": 0.24766315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.0004363233848529011, + "loss": 3.2916, + "step": 5882 + }, + { + "epoch": 0.24770526315789473, + "grad_norm": 0.439453125, + "learning_rate": 0.0004363008846117157, + "loss": 3.4356, + "step": 5883 + }, + { + "epoch": 0.24774736842105263, + "grad_norm": 0.455078125, + "learning_rate": 0.0004362783809763471, + "loss": 3.0956, + "step": 5884 + }, + { + "epoch": 0.24778947368421053, + "grad_norm": 0.41015625, + "learning_rate": 0.0004362558739472053, + "loss": 3.2828, + "step": 5885 + }, + { + "epoch": 0.24783157894736843, + "grad_norm": 0.45703125, + "learning_rate": 0.0004362333635247003, + "loss": 3.5293, + "step": 5886 + }, + { + "epoch": 0.24787368421052633, + "grad_norm": 0.412109375, + "learning_rate": 0.0004362108497092423, + "loss": 3.4688, + "step": 5887 + }, + { + "epoch": 0.2479157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00043618833250124134, + "loss": 3.1629, + "step": 5888 + }, + { + "epoch": 0.2479578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00043616581190110773, + "loss": 3.6413, + "step": 5889 + }, + { + "epoch": 0.248, + "grad_norm": 0.41796875, + "learning_rate": 0.0004361432879092518, + "loss": 3.1462, + "step": 5890 + }, + { + "epoch": 0.2480421052631579, + "grad_norm": 0.38671875, + "learning_rate": 0.0004361207605260838, + "loss": 3.4458, + "step": 5891 + }, + { + "epoch": 0.2480842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.0004360982297520143, + "loss": 3.1227, + "step": 5892 + }, + { + "epoch": 0.2481263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.00043607569558745365, + "loss": 3.321, + "step": 5893 + }, + { + "epoch": 0.24816842105263157, + "grad_norm": 0.44921875, + "learning_rate": 0.00043605315803281247, + "loss": 3.7713, + "step": 5894 + }, + { + "epoch": 0.24821052631578946, + "grad_norm": 0.41015625, + "learning_rate": 0.00043603061708850135, + "loss": 2.9461, + "step": 5895 + }, + { + "epoch": 0.24825263157894736, + "grad_norm": 0.41796875, + "learning_rate": 0.00043600807275493103, + "loss": 3.2043, + "step": 5896 + }, + { + "epoch": 0.24829473684210526, + "grad_norm": 1.0546875, + "learning_rate": 0.0004359855250325121, + "loss": 3.1347, + "step": 5897 + }, + { + "epoch": 0.24833684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.0004359629739216555, + "loss": 3.2559, + "step": 5898 + }, + { + "epoch": 0.24837894736842106, + "grad_norm": 0.443359375, + "learning_rate": 0.00043594041942277196, + "loss": 3.4053, + "step": 5899 + }, + { + "epoch": 0.24842105263157896, + "grad_norm": 0.3984375, + "learning_rate": 0.00043591786153627247, + "loss": 3.2963, + "step": 5900 + }, + { + "epoch": 0.24846315789473683, + "grad_norm": 0.453125, + "learning_rate": 0.00043589530026256796, + "loss": 3.0448, + "step": 5901 + }, + { + "epoch": 0.24850526315789473, + "grad_norm": 0.55078125, + "learning_rate": 0.0004358727356020695, + "loss": 3.1376, + "step": 5902 + }, + { + "epoch": 0.24854736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0004358501675551882, + "loss": 3.4717, + "step": 5903 + }, + { + "epoch": 0.24858947368421053, + "grad_norm": 0.447265625, + "learning_rate": 0.0004358275961223352, + "loss": 3.207, + "step": 5904 + }, + { + "epoch": 0.24863157894736843, + "grad_norm": 0.423828125, + "learning_rate": 0.0004358050213039217, + "loss": 3.4567, + "step": 5905 + }, + { + "epoch": 0.24867368421052632, + "grad_norm": 0.404296875, + "learning_rate": 0.00043578244310035915, + "loss": 2.851, + "step": 5906 + }, + { + "epoch": 0.24871578947368422, + "grad_norm": 0.578125, + "learning_rate": 0.00043575986151205866, + "loss": 3.0597, + "step": 5907 + }, + { + "epoch": 0.2487578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.00043573727653943177, + "loss": 3.1925, + "step": 5908 + }, + { + "epoch": 0.2488, + "grad_norm": 0.40625, + "learning_rate": 0.00043571468818288993, + "loss": 3.4693, + "step": 5909 + }, + { + "epoch": 0.2488421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00043569209644284476, + "loss": 3.0362, + "step": 5910 + }, + { + "epoch": 0.2488842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0004356695013197077, + "loss": 3.2697, + "step": 5911 + }, + { + "epoch": 0.2489263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.00043564690281389047, + "loss": 3.5087, + "step": 5912 + }, + { + "epoch": 0.2489684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.00043562430092580485, + "loss": 3.5611, + "step": 5913 + }, + { + "epoch": 0.24901052631578946, + "grad_norm": 0.458984375, + "learning_rate": 0.0004356016956558625, + "loss": 2.921, + "step": 5914 + }, + { + "epoch": 0.24905263157894736, + "grad_norm": 0.40625, + "learning_rate": 0.00043557908700447536, + "loss": 3.0882, + "step": 5915 + }, + { + "epoch": 0.24909473684210526, + "grad_norm": 0.451171875, + "learning_rate": 0.0004355564749720553, + "loss": 3.4369, + "step": 5916 + }, + { + "epoch": 0.24913684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.00043553385955901424, + "loss": 3.4472, + "step": 5917 + }, + { + "epoch": 0.24917894736842106, + "grad_norm": 0.435546875, + "learning_rate": 0.00043551124076576434, + "loss": 3.3542, + "step": 5918 + }, + { + "epoch": 0.24922105263157895, + "grad_norm": 0.4140625, + "learning_rate": 0.0004354886185927176, + "loss": 3.7573, + "step": 5919 + }, + { + "epoch": 0.24926315789473685, + "grad_norm": 0.4375, + "learning_rate": 0.00043546599304028613, + "loss": 3.6567, + "step": 5920 + }, + { + "epoch": 0.24930526315789472, + "grad_norm": 0.416015625, + "learning_rate": 0.00043544336410888214, + "loss": 3.8174, + "step": 5921 + }, + { + "epoch": 0.24934736842105262, + "grad_norm": 0.421875, + "learning_rate": 0.000435420731798918, + "loss": 3.4057, + "step": 5922 + }, + { + "epoch": 0.24938947368421052, + "grad_norm": 0.486328125, + "learning_rate": 0.0004353980961108059, + "loss": 3.3162, + "step": 5923 + }, + { + "epoch": 0.24943157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.00043537545704495846, + "loss": 3.4463, + "step": 5924 + }, + { + "epoch": 0.24947368421052632, + "grad_norm": 0.4140625, + "learning_rate": 0.0004353528146017879, + "loss": 3.4923, + "step": 5925 + }, + { + "epoch": 0.24951578947368422, + "grad_norm": 0.421875, + "learning_rate": 0.00043533016878170687, + "loss": 2.9915, + "step": 5926 + }, + { + "epoch": 0.24955789473684212, + "grad_norm": 0.421875, + "learning_rate": 0.0004353075195851279, + "loss": 2.6857, + "step": 5927 + }, + { + "epoch": 0.2496, + "grad_norm": 0.42578125, + "learning_rate": 0.00043528486701246375, + "loss": 3.4822, + "step": 5928 + }, + { + "epoch": 0.2496421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.0004352622110641269, + "loss": 2.8233, + "step": 5929 + }, + { + "epoch": 0.2496842105263158, + "grad_norm": 0.400390625, + "learning_rate": 0.0004352395517405303, + "loss": 3.4272, + "step": 5930 + }, + { + "epoch": 0.24972631578947369, + "grad_norm": 0.4140625, + "learning_rate": 0.0004352168890420867, + "loss": 3.3662, + "step": 5931 + }, + { + "epoch": 0.24976842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.000435194222969209, + "loss": 3.4941, + "step": 5932 + }, + { + "epoch": 0.24981052631578948, + "grad_norm": 0.396484375, + "learning_rate": 0.0004351715535223101, + "loss": 3.5766, + "step": 5933 + }, + { + "epoch": 0.24985263157894738, + "grad_norm": 0.396484375, + "learning_rate": 0.00043514888070180315, + "loss": 3.2833, + "step": 5934 + }, + { + "epoch": 0.24989473684210525, + "grad_norm": 0.4140625, + "learning_rate": 0.0004351262045081011, + "loss": 3.081, + "step": 5935 + }, + { + "epoch": 0.24993684210526315, + "grad_norm": 0.4296875, + "learning_rate": 0.0004351035249416171, + "loss": 3.6775, + "step": 5936 + }, + { + "epoch": 0.24997894736842105, + "grad_norm": 0.4609375, + "learning_rate": 0.00043508084200276446, + "loss": 3.1271, + "step": 5937 + }, + { + "epoch": 0.2500210526315789, + "grad_norm": 0.404296875, + "learning_rate": 0.0004350581556919562, + "loss": 2.9615, + "step": 5938 + }, + { + "epoch": 0.2500631578947368, + "grad_norm": 0.5234375, + "learning_rate": 0.00043503546600960586, + "loss": 2.9622, + "step": 5939 + }, + { + "epoch": 0.2501052631578947, + "grad_norm": 0.443359375, + "learning_rate": 0.00043501277295612676, + "loss": 3.4923, + "step": 5940 + }, + { + "epoch": 0.2501473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.0004349900765319322, + "loss": 2.946, + "step": 5941 + }, + { + "epoch": 0.2501894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.00043496737673743587, + "loss": 2.9072, + "step": 5942 + }, + { + "epoch": 0.2502315789473684, + "grad_norm": 0.5703125, + "learning_rate": 0.0004349446735730512, + "loss": 3.0582, + "step": 5943 + }, + { + "epoch": 0.2502736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.000434921967039192, + "loss": 3.1677, + "step": 5944 + }, + { + "epoch": 0.2503157894736842, + "grad_norm": 0.462890625, + "learning_rate": 0.0004348992571362718, + "loss": 3.1582, + "step": 5945 + }, + { + "epoch": 0.2503578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.0004348765438647043, + "loss": 3.1602, + "step": 5946 + }, + { + "epoch": 0.2504, + "grad_norm": 0.42578125, + "learning_rate": 0.0004348538272249035, + "loss": 2.8246, + "step": 5947 + }, + { + "epoch": 0.2504421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.0004348311072172831, + "loss": 3.3215, + "step": 5948 + }, + { + "epoch": 0.2504842105263158, + "grad_norm": 0.396484375, + "learning_rate": 0.00043480838384225715, + "loss": 3.189, + "step": 5949 + }, + { + "epoch": 0.2505263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.00043478565710023954, + "loss": 3.031, + "step": 5950 + }, + { + "epoch": 0.25056842105263155, + "grad_norm": 0.390625, + "learning_rate": 0.0004347629269916443, + "loss": 3.1738, + "step": 5951 + }, + { + "epoch": 0.25061052631578945, + "grad_norm": 0.392578125, + "learning_rate": 0.00043474019351688576, + "loss": 3.4134, + "step": 5952 + }, + { + "epoch": 0.25065263157894735, + "grad_norm": 0.3984375, + "learning_rate": 0.0004347174566763779, + "loss": 3.1434, + "step": 5953 + }, + { + "epoch": 0.25069473684210525, + "grad_norm": 0.400390625, + "learning_rate": 0.000434694716470535, + "loss": 3.3328, + "step": 5954 + }, + { + "epoch": 0.25073684210526315, + "grad_norm": 0.54296875, + "learning_rate": 0.00043467197289977135, + "loss": 3.3598, + "step": 5955 + }, + { + "epoch": 0.25077894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.0004346492259645014, + "loss": 2.6097, + "step": 5956 + }, + { + "epoch": 0.25082105263157894, + "grad_norm": 0.451171875, + "learning_rate": 0.00043462647566513946, + "loss": 3.6658, + "step": 5957 + }, + { + "epoch": 0.25086315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.0004346037220021001, + "loss": 3.384, + "step": 5958 + }, + { + "epoch": 0.25090526315789474, + "grad_norm": 0.484375, + "learning_rate": 0.00043458096497579773, + "loss": 3.3693, + "step": 5959 + }, + { + "epoch": 0.25094736842105264, + "grad_norm": 0.423828125, + "learning_rate": 0.0004345582045866472, + "loss": 3.3107, + "step": 5960 + }, + { + "epoch": 0.25098947368421054, + "grad_norm": 0.41015625, + "learning_rate": 0.0004345354408350629, + "loss": 3.0628, + "step": 5961 + }, + { + "epoch": 0.25103157894736844, + "grad_norm": 0.453125, + "learning_rate": 0.00043451267372145977, + "loss": 3.6457, + "step": 5962 + }, + { + "epoch": 0.25107368421052634, + "grad_norm": 0.3984375, + "learning_rate": 0.0004344899032462524, + "loss": 3.1352, + "step": 5963 + }, + { + "epoch": 0.25111578947368424, + "grad_norm": 0.404296875, + "learning_rate": 0.0004344671294098559, + "loss": 3.4436, + "step": 5964 + }, + { + "epoch": 0.2511578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.000434444352212685, + "loss": 2.9168, + "step": 5965 + }, + { + "epoch": 0.2512, + "grad_norm": 0.4296875, + "learning_rate": 0.0004344215716551547, + "loss": 3.2977, + "step": 5966 + }, + { + "epoch": 0.2512421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00043439878773768006, + "loss": 3.3153, + "step": 5967 + }, + { + "epoch": 0.2512842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.0004343760004606761, + "loss": 3.1176, + "step": 5968 + }, + { + "epoch": 0.2513263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.0004343532098245581, + "loss": 3.4627, + "step": 5969 + }, + { + "epoch": 0.2513684210526316, + "grad_norm": 0.400390625, + "learning_rate": 0.00043433041582974127, + "loss": 2.8761, + "step": 5970 + }, + { + "epoch": 0.2514105263157895, + "grad_norm": 0.392578125, + "learning_rate": 0.0004343076184766408, + "loss": 3.4108, + "step": 5971 + }, + { + "epoch": 0.25145263157894737, + "grad_norm": 0.474609375, + "learning_rate": 0.0004342848177656721, + "loss": 3.311, + "step": 5972 + }, + { + "epoch": 0.25149473684210527, + "grad_norm": 0.4453125, + "learning_rate": 0.0004342620136972505, + "loss": 3.9995, + "step": 5973 + }, + { + "epoch": 0.25153684210526317, + "grad_norm": 0.419921875, + "learning_rate": 0.0004342392062717915, + "loss": 3.451, + "step": 5974 + }, + { + "epoch": 0.25157894736842107, + "grad_norm": 0.38671875, + "learning_rate": 0.00043421639548971066, + "loss": 3.5722, + "step": 5975 + }, + { + "epoch": 0.25162105263157897, + "grad_norm": 0.41015625, + "learning_rate": 0.0004341935813514235, + "loss": 3.1107, + "step": 5976 + }, + { + "epoch": 0.25166315789473687, + "grad_norm": 0.392578125, + "learning_rate": 0.00043417076385734576, + "loss": 3.5168, + "step": 5977 + }, + { + "epoch": 0.2517052631578947, + "grad_norm": 0.4140625, + "learning_rate": 0.0004341479430078931, + "loss": 3.1381, + "step": 5978 + }, + { + "epoch": 0.2517473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.00043412511880348126, + "loss": 3.0685, + "step": 5979 + }, + { + "epoch": 0.2517894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.0004341022912445261, + "loss": 3.3139, + "step": 5980 + }, + { + "epoch": 0.2518315789473684, + "grad_norm": 0.490234375, + "learning_rate": 0.0004340794603314435, + "loss": 3.2366, + "step": 5981 + }, + { + "epoch": 0.2518736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.00043405662606464944, + "loss": 3.6517, + "step": 5982 + }, + { + "epoch": 0.2519157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.00043403378844455987, + "loss": 2.9999, + "step": 5983 + }, + { + "epoch": 0.2519578947368421, + "grad_norm": 0.388671875, + "learning_rate": 0.00043401094747159096, + "loss": 3.1826, + "step": 5984 + }, + { + "epoch": 0.252, + "grad_norm": 0.412109375, + "learning_rate": 0.0004339881031461588, + "loss": 3.0703, + "step": 5985 + }, + { + "epoch": 0.2520421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0004339652554686795, + "loss": 2.9712, + "step": 5986 + }, + { + "epoch": 0.2520842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.0004339424044395694, + "loss": 3.3076, + "step": 5987 + }, + { + "epoch": 0.2521263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.0004339195500592449, + "loss": 3.553, + "step": 5988 + }, + { + "epoch": 0.2521684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.0004338966923281222, + "loss": 3.7331, + "step": 5989 + }, + { + "epoch": 0.2522105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.0004338738312466179, + "loss": 3.6566, + "step": 5990 + }, + { + "epoch": 0.2522526315789474, + "grad_norm": 0.396484375, + "learning_rate": 0.00043385096681514834, + "loss": 3.4665, + "step": 5991 + }, + { + "epoch": 0.25229473684210524, + "grad_norm": 0.640625, + "learning_rate": 0.00043382809903413027, + "loss": 2.9238, + "step": 5992 + }, + { + "epoch": 0.25233684210526314, + "grad_norm": 0.50390625, + "learning_rate": 0.00043380522790398025, + "loss": 3.1875, + "step": 5993 + }, + { + "epoch": 0.25237894736842104, + "grad_norm": 0.4453125, + "learning_rate": 0.0004337823534251149, + "loss": 3.0295, + "step": 5994 + }, + { + "epoch": 0.25242105263157893, + "grad_norm": 0.416015625, + "learning_rate": 0.000433759475597951, + "loss": 3.146, + "step": 5995 + }, + { + "epoch": 0.25246315789473683, + "grad_norm": 0.421875, + "learning_rate": 0.00043373659442290537, + "loss": 3.1638, + "step": 5996 + }, + { + "epoch": 0.25250526315789473, + "grad_norm": 0.4609375, + "learning_rate": 0.00043371370990039483, + "loss": 3.3972, + "step": 5997 + }, + { + "epoch": 0.25254736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.00043369082203083644, + "loss": 3.4852, + "step": 5998 + }, + { + "epoch": 0.25258947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.0004336679308146471, + "loss": 3.499, + "step": 5999 + }, + { + "epoch": 0.25263157894736843, + "grad_norm": 0.435546875, + "learning_rate": 0.0004336450362522437, + "loss": 2.9337, + "step": 6000 + }, + { + "epoch": 0.25263157894736843, + "eval_loss": 3.2944118976593018, + "eval_runtime": 335.4633, + "eval_samples_per_second": 44.714, + "eval_steps_per_second": 5.589, + "step": 6000 + }, + { + "epoch": 0.25267368421052633, + "grad_norm": 0.4375, + "learning_rate": 0.0004336221383440437, + "loss": 3.6573, + "step": 6001 + }, + { + "epoch": 0.2527157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.000433599237090464, + "loss": 3.3607, + "step": 6002 + }, + { + "epoch": 0.2527578947368421, + "grad_norm": 0.38671875, + "learning_rate": 0.00043357633249192193, + "loss": 2.9241, + "step": 6003 + }, + { + "epoch": 0.2528, + "grad_norm": 0.41015625, + "learning_rate": 0.0004335534245488347, + "loss": 3.3706, + "step": 6004 + }, + { + "epoch": 0.25284210526315787, + "grad_norm": 0.408203125, + "learning_rate": 0.00043353051326161984, + "loss": 3.5743, + "step": 6005 + }, + { + "epoch": 0.25288421052631577, + "grad_norm": 0.451171875, + "learning_rate": 0.0004335075986306947, + "loss": 2.8625, + "step": 6006 + }, + { + "epoch": 0.25292631578947367, + "grad_norm": 0.44140625, + "learning_rate": 0.0004334846806564767, + "loss": 3.0568, + "step": 6007 + }, + { + "epoch": 0.25296842105263156, + "grad_norm": 0.408203125, + "learning_rate": 0.00043346175933938337, + "loss": 3.0376, + "step": 6008 + }, + { + "epoch": 0.25301052631578946, + "grad_norm": 0.4296875, + "learning_rate": 0.0004334388346798323, + "loss": 2.6586, + "step": 6009 + }, + { + "epoch": 0.25305263157894736, + "grad_norm": 0.44921875, + "learning_rate": 0.00043341590667824126, + "loss": 2.9894, + "step": 6010 + }, + { + "epoch": 0.25309473684210526, + "grad_norm": 0.62109375, + "learning_rate": 0.00043339297533502786, + "loss": 2.773, + "step": 6011 + }, + { + "epoch": 0.25313684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.00043337004065061, + "loss": 3.154, + "step": 6012 + }, + { + "epoch": 0.25317894736842106, + "grad_norm": 0.4296875, + "learning_rate": 0.00043334710262540535, + "loss": 3.4374, + "step": 6013 + }, + { + "epoch": 0.25322105263157896, + "grad_norm": 0.416015625, + "learning_rate": 0.000433324161259832, + "loss": 3.3654, + "step": 6014 + }, + { + "epoch": 0.25326315789473686, + "grad_norm": 0.412109375, + "learning_rate": 0.0004333012165543078, + "loss": 3.6574, + "step": 6015 + }, + { + "epoch": 0.25330526315789476, + "grad_norm": 0.412109375, + "learning_rate": 0.0004332782685092508, + "loss": 3.2352, + "step": 6016 + }, + { + "epoch": 0.25334736842105265, + "grad_norm": 0.421875, + "learning_rate": 0.0004332553171250791, + "loss": 3.056, + "step": 6017 + }, + { + "epoch": 0.25338947368421055, + "grad_norm": 0.41796875, + "learning_rate": 0.0004332323624022108, + "loss": 3.4178, + "step": 6018 + }, + { + "epoch": 0.2534315789473684, + "grad_norm": 0.390625, + "learning_rate": 0.0004332094043410642, + "loss": 3.4588, + "step": 6019 + }, + { + "epoch": 0.2534736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0004331864429420575, + "loss": 3.3476, + "step": 6020 + }, + { + "epoch": 0.2535157894736842, + "grad_norm": 0.404296875, + "learning_rate": 0.00043316347820560904, + "loss": 3.1959, + "step": 6021 + }, + { + "epoch": 0.2535578947368421, + "grad_norm": 0.490234375, + "learning_rate": 0.00043314051013213715, + "loss": 2.885, + "step": 6022 + }, + { + "epoch": 0.2536, + "grad_norm": 0.400390625, + "learning_rate": 0.0004331175387220605, + "loss": 3.5337, + "step": 6023 + }, + { + "epoch": 0.2536421052631579, + "grad_norm": 0.38671875, + "learning_rate": 0.00043309456397579735, + "loss": 3.0334, + "step": 6024 + }, + { + "epoch": 0.2536842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.0004330715858937664, + "loss": 3.5462, + "step": 6025 + }, + { + "epoch": 0.2537263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.00043304860447638616, + "loss": 3.3193, + "step": 6026 + }, + { + "epoch": 0.2537684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.0004330256197240755, + "loss": 3.0765, + "step": 6027 + }, + { + "epoch": 0.2538105263157895, + "grad_norm": 0.40234375, + "learning_rate": 0.0004330026316372531, + "loss": 3.1876, + "step": 6028 + }, + { + "epoch": 0.2538526315789474, + "grad_norm": 0.4921875, + "learning_rate": 0.0004329796402163378, + "loss": 3.8867, + "step": 6029 + }, + { + "epoch": 0.2538947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.0004329566454617484, + "loss": 3.5915, + "step": 6030 + }, + { + "epoch": 0.2539368421052632, + "grad_norm": 0.435546875, + "learning_rate": 0.00043293364737390387, + "loss": 3.3869, + "step": 6031 + }, + { + "epoch": 0.253978947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00043291064595322326, + "loss": 3.2226, + "step": 6032 + }, + { + "epoch": 0.2540210526315789, + "grad_norm": 0.4140625, + "learning_rate": 0.00043288764120012557, + "loss": 3.3566, + "step": 6033 + }, + { + "epoch": 0.2540631578947368, + "grad_norm": 0.408203125, + "learning_rate": 0.00043286463311503, + "loss": 3.1756, + "step": 6034 + }, + { + "epoch": 0.2541052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.00043284162169835555, + "loss": 3.1058, + "step": 6035 + }, + { + "epoch": 0.2541473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.00043281860695052165, + "loss": 3.1083, + "step": 6036 + }, + { + "epoch": 0.2541894736842105, + "grad_norm": 0.4140625, + "learning_rate": 0.0004327955888719476, + "loss": 2.6818, + "step": 6037 + }, + { + "epoch": 0.2542315789473684, + "grad_norm": 0.486328125, + "learning_rate": 0.0004327725674630526, + "loss": 2.8582, + "step": 6038 + }, + { + "epoch": 0.2542736842105263, + "grad_norm": 0.38671875, + "learning_rate": 0.00043274954272425623, + "loss": 3.0076, + "step": 6039 + }, + { + "epoch": 0.2543157894736842, + "grad_norm": 0.3984375, + "learning_rate": 0.0004327265146559779, + "loss": 3.3849, + "step": 6040 + }, + { + "epoch": 0.2543578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0004327034832586372, + "loss": 3.204, + "step": 6041 + }, + { + "epoch": 0.2544, + "grad_norm": 0.421875, + "learning_rate": 0.0004326804485326536, + "loss": 3.196, + "step": 6042 + }, + { + "epoch": 0.2544421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00043265741047844694, + "loss": 3.4673, + "step": 6043 + }, + { + "epoch": 0.2544842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.00043263436909643684, + "loss": 2.9964, + "step": 6044 + }, + { + "epoch": 0.2545263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.00043261132438704317, + "loss": 3.2746, + "step": 6045 + }, + { + "epoch": 0.25456842105263155, + "grad_norm": 0.41015625, + "learning_rate": 0.00043258827635068563, + "loss": 3.3636, + "step": 6046 + }, + { + "epoch": 0.25461052631578945, + "grad_norm": 0.45703125, + "learning_rate": 0.00043256522498778436, + "loss": 2.8073, + "step": 6047 + }, + { + "epoch": 0.25465263157894735, + "grad_norm": 0.40234375, + "learning_rate": 0.0004325421702987591, + "loss": 3.3582, + "step": 6048 + }, + { + "epoch": 0.25469473684210525, + "grad_norm": 0.419921875, + "learning_rate": 0.0004325191122840301, + "loss": 3.3618, + "step": 6049 + }, + { + "epoch": 0.25473684210526315, + "grad_norm": 0.380859375, + "learning_rate": 0.00043249605094401717, + "loss": 3.2649, + "step": 6050 + }, + { + "epoch": 0.25477894736842105, + "grad_norm": 0.384765625, + "learning_rate": 0.00043247298627914065, + "loss": 3.6377, + "step": 6051 + }, + { + "epoch": 0.25482105263157895, + "grad_norm": 0.396484375, + "learning_rate": 0.00043244991828982076, + "loss": 3.243, + "step": 6052 + }, + { + "epoch": 0.25486315789473685, + "grad_norm": 0.41015625, + "learning_rate": 0.0004324268469764777, + "loss": 3.362, + "step": 6053 + }, + { + "epoch": 0.25490526315789475, + "grad_norm": 0.423828125, + "learning_rate": 0.0004324037723395319, + "loss": 3.6201, + "step": 6054 + }, + { + "epoch": 0.25494736842105264, + "grad_norm": 0.41015625, + "learning_rate": 0.00043238069437940364, + "loss": 3.2052, + "step": 6055 + }, + { + "epoch": 0.25498947368421054, + "grad_norm": 0.3984375, + "learning_rate": 0.0004323576130965134, + "loss": 3.1994, + "step": 6056 + }, + { + "epoch": 0.25503157894736844, + "grad_norm": 0.4140625, + "learning_rate": 0.00043233452849128175, + "loss": 3.3218, + "step": 6057 + }, + { + "epoch": 0.25507368421052634, + "grad_norm": 0.384765625, + "learning_rate": 0.0004323114405641292, + "loss": 3.5504, + "step": 6058 + }, + { + "epoch": 0.2551157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.00043228834931547636, + "loss": 3.4439, + "step": 6059 + }, + { + "epoch": 0.2551578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.000432265254745744, + "loss": 3.1921, + "step": 6060 + }, + { + "epoch": 0.2552, + "grad_norm": 0.41796875, + "learning_rate": 0.00043224215685535287, + "loss": 3.0537, + "step": 6061 + }, + { + "epoch": 0.2552421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.0004322190556447238, + "loss": 2.8689, + "step": 6062 + }, + { + "epoch": 0.2552842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.0004321959511142776, + "loss": 3.309, + "step": 6063 + }, + { + "epoch": 0.2553263157894737, + "grad_norm": 0.462890625, + "learning_rate": 0.00043217284326443526, + "loss": 3.1243, + "step": 6064 + }, + { + "epoch": 0.2553684210526316, + "grad_norm": 0.3828125, + "learning_rate": 0.0004321497320956177, + "loss": 2.9133, + "step": 6065 + }, + { + "epoch": 0.2554105263157895, + "grad_norm": 0.515625, + "learning_rate": 0.00043212661760824614, + "loss": 3.1312, + "step": 6066 + }, + { + "epoch": 0.2554526315789474, + "grad_norm": 0.41015625, + "learning_rate": 0.00043210349980274154, + "loss": 3.8386, + "step": 6067 + }, + { + "epoch": 0.2554947368421053, + "grad_norm": 0.4609375, + "learning_rate": 0.0004320803786795251, + "loss": 3.4378, + "step": 6068 + }, + { + "epoch": 0.2555368421052632, + "grad_norm": 0.40234375, + "learning_rate": 0.00043205725423901814, + "loss": 3.5953, + "step": 6069 + }, + { + "epoch": 0.25557894736842107, + "grad_norm": 0.390625, + "learning_rate": 0.0004320341264816419, + "loss": 4.0064, + "step": 6070 + }, + { + "epoch": 0.25562105263157897, + "grad_norm": 0.4375, + "learning_rate": 0.0004320109954078178, + "loss": 3.8868, + "step": 6071 + }, + { + "epoch": 0.2556631578947368, + "grad_norm": 0.55078125, + "learning_rate": 0.00043198786101796715, + "loss": 3.3687, + "step": 6072 + }, + { + "epoch": 0.2557052631578947, + "grad_norm": 0.451171875, + "learning_rate": 0.00043196472331251157, + "loss": 3.568, + "step": 6073 + }, + { + "epoch": 0.2557473684210526, + "grad_norm": 0.392578125, + "learning_rate": 0.0004319415822918725, + "loss": 3.7574, + "step": 6074 + }, + { + "epoch": 0.2557894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.0004319184379564716, + "loss": 3.2719, + "step": 6075 + }, + { + "epoch": 0.2558315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.00043189529030673043, + "loss": 3.1474, + "step": 6076 + }, + { + "epoch": 0.2558736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.00043187213934307086, + "loss": 3.1235, + "step": 6077 + }, + { + "epoch": 0.2559157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00043184898506591456, + "loss": 3.1134, + "step": 6078 + }, + { + "epoch": 0.2559578947368421, + "grad_norm": 0.380859375, + "learning_rate": 0.00043182582747568345, + "loss": 3.2941, + "step": 6079 + }, + { + "epoch": 0.256, + "grad_norm": 0.427734375, + "learning_rate": 0.0004318026665727993, + "loss": 3.1172, + "step": 6080 + }, + { + "epoch": 0.2560421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00043177950235768434, + "loss": 3.325, + "step": 6081 + }, + { + "epoch": 0.2560842105263158, + "grad_norm": 0.380859375, + "learning_rate": 0.00043175633483076033, + "loss": 3.1216, + "step": 6082 + }, + { + "epoch": 0.2561263157894737, + "grad_norm": 0.392578125, + "learning_rate": 0.00043173316399244946, + "loss": 3.249, + "step": 6083 + }, + { + "epoch": 0.2561684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.0004317099898431739, + "loss": 3.6156, + "step": 6084 + }, + { + "epoch": 0.2562105263157895, + "grad_norm": 0.40234375, + "learning_rate": 0.00043168681238335576, + "loss": 3.6679, + "step": 6085 + }, + { + "epoch": 0.25625263157894734, + "grad_norm": 0.5390625, + "learning_rate": 0.00043166363161341743, + "loss": 3.1704, + "step": 6086 + }, + { + "epoch": 0.25629473684210524, + "grad_norm": 0.40234375, + "learning_rate": 0.0004316404475337812, + "loss": 3.4398, + "step": 6087 + }, + { + "epoch": 0.25633684210526314, + "grad_norm": 0.37890625, + "learning_rate": 0.0004316172601448694, + "loss": 3.2026, + "step": 6088 + }, + { + "epoch": 0.25637894736842104, + "grad_norm": 0.439453125, + "learning_rate": 0.00043159406944710447, + "loss": 3.4618, + "step": 6089 + }, + { + "epoch": 0.25642105263157894, + "grad_norm": 0.40234375, + "learning_rate": 0.000431570875440909, + "loss": 3.1465, + "step": 6090 + }, + { + "epoch": 0.25646315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.0004315476781267055, + "loss": 3.3156, + "step": 6091 + }, + { + "epoch": 0.25650526315789474, + "grad_norm": 0.40234375, + "learning_rate": 0.00043152447750491665, + "loss": 3.6675, + "step": 6092 + }, + { + "epoch": 0.25654736842105264, + "grad_norm": 0.412109375, + "learning_rate": 0.0004315012735759651, + "loss": 3.5745, + "step": 6093 + }, + { + "epoch": 0.25658947368421053, + "grad_norm": 0.41796875, + "learning_rate": 0.0004314780663402736, + "loss": 3.1197, + "step": 6094 + }, + { + "epoch": 0.25663157894736843, + "grad_norm": 0.400390625, + "learning_rate": 0.00043145485579826494, + "loss": 3.4775, + "step": 6095 + }, + { + "epoch": 0.25667368421052633, + "grad_norm": 0.388671875, + "learning_rate": 0.000431431641950362, + "loss": 3.2463, + "step": 6096 + }, + { + "epoch": 0.25671578947368423, + "grad_norm": 0.376953125, + "learning_rate": 0.0004314084247969877, + "loss": 3.4397, + "step": 6097 + }, + { + "epoch": 0.25675789473684213, + "grad_norm": 0.58984375, + "learning_rate": 0.0004313852043385651, + "loss": 3.1508, + "step": 6098 + }, + { + "epoch": 0.2568, + "grad_norm": 0.494140625, + "learning_rate": 0.0004313619805755171, + "loss": 3.3843, + "step": 6099 + }, + { + "epoch": 0.25684210526315787, + "grad_norm": 0.431640625, + "learning_rate": 0.00043133875350826704, + "loss": 2.8319, + "step": 6100 + }, + { + "epoch": 0.25688421052631577, + "grad_norm": 0.4375, + "learning_rate": 0.0004313155231372379, + "loss": 3.1014, + "step": 6101 + }, + { + "epoch": 0.25692631578947367, + "grad_norm": 0.416015625, + "learning_rate": 0.00043129228946285294, + "loss": 3.2822, + "step": 6102 + }, + { + "epoch": 0.25696842105263157, + "grad_norm": 0.427734375, + "learning_rate": 0.00043126905248553547, + "loss": 3.3325, + "step": 6103 + }, + { + "epoch": 0.25701052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.0004312458122057089, + "loss": 3.2015, + "step": 6104 + }, + { + "epoch": 0.25705263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.0004312225686237965, + "loss": 3.5371, + "step": 6105 + }, + { + "epoch": 0.25709473684210526, + "grad_norm": 0.44140625, + "learning_rate": 0.0004311993217402219, + "loss": 3.0762, + "step": 6106 + }, + { + "epoch": 0.25713684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.00043117607155540854, + "loss": 3.2927, + "step": 6107 + }, + { + "epoch": 0.25717894736842106, + "grad_norm": 0.4296875, + "learning_rate": 0.00043115281806978007, + "loss": 3.385, + "step": 6108 + }, + { + "epoch": 0.25722105263157896, + "grad_norm": 0.392578125, + "learning_rate": 0.00043112956128376, + "loss": 2.9517, + "step": 6109 + }, + { + "epoch": 0.25726315789473686, + "grad_norm": 0.388671875, + "learning_rate": 0.0004311063011977723, + "loss": 3.3454, + "step": 6110 + }, + { + "epoch": 0.25730526315789476, + "grad_norm": 0.40625, + "learning_rate": 0.00043108303781224045, + "loss": 3.1351, + "step": 6111 + }, + { + "epoch": 0.25734736842105266, + "grad_norm": 0.447265625, + "learning_rate": 0.00043105977112758845, + "loss": 3.2994, + "step": 6112 + }, + { + "epoch": 0.2573894736842105, + "grad_norm": 0.462890625, + "learning_rate": 0.00043103650114424013, + "loss": 3.3109, + "step": 6113 + }, + { + "epoch": 0.2574315789473684, + "grad_norm": 0.3984375, + "learning_rate": 0.00043101322786261956, + "loss": 3.4919, + "step": 6114 + }, + { + "epoch": 0.2574736842105263, + "grad_norm": 0.44921875, + "learning_rate": 0.00043098995128315056, + "loss": 3.5706, + "step": 6115 + }, + { + "epoch": 0.2575157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.0004309666714062573, + "loss": 3.3651, + "step": 6116 + }, + { + "epoch": 0.2575578947368421, + "grad_norm": 0.376953125, + "learning_rate": 0.000430943388232364, + "loss": 3.1717, + "step": 6117 + }, + { + "epoch": 0.2576, + "grad_norm": 0.412109375, + "learning_rate": 0.00043092010176189465, + "loss": 3.1678, + "step": 6118 + }, + { + "epoch": 0.2576421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00043089681199527376, + "loss": 2.6509, + "step": 6119 + }, + { + "epoch": 0.2576842105263158, + "grad_norm": 0.5390625, + "learning_rate": 0.00043087351893292535, + "loss": 3.6603, + "step": 6120 + }, + { + "epoch": 0.2577263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.00043085022257527406, + "loss": 3.7938, + "step": 6121 + }, + { + "epoch": 0.2577684210526316, + "grad_norm": 0.50390625, + "learning_rate": 0.0004308269229227442, + "loss": 3.2138, + "step": 6122 + }, + { + "epoch": 0.2578105263157895, + "grad_norm": 0.44140625, + "learning_rate": 0.0004308036199757602, + "loss": 3.2045, + "step": 6123 + }, + { + "epoch": 0.2578526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.0004307803137347467, + "loss": 3.5277, + "step": 6124 + }, + { + "epoch": 0.2578947368421053, + "grad_norm": 0.490234375, + "learning_rate": 0.0004307570042001283, + "loss": 2.8281, + "step": 6125 + }, + { + "epoch": 0.25793684210526313, + "grad_norm": 0.455078125, + "learning_rate": 0.00043073369137232965, + "loss": 3.424, + "step": 6126 + }, + { + "epoch": 0.25797894736842103, + "grad_norm": 0.43359375, + "learning_rate": 0.0004307103752517755, + "loss": 3.6572, + "step": 6127 + }, + { + "epoch": 0.25802105263157893, + "grad_norm": 0.41015625, + "learning_rate": 0.00043068705583889066, + "loss": 3.6863, + "step": 6128 + }, + { + "epoch": 0.2580631578947368, + "grad_norm": 0.427734375, + "learning_rate": 0.0004306637331341, + "loss": 3.4523, + "step": 6129 + }, + { + "epoch": 0.2581052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.00043064040713782837, + "loss": 2.8686, + "step": 6130 + }, + { + "epoch": 0.2581473684210526, + "grad_norm": 0.4375, + "learning_rate": 0.0004306170778505007, + "loss": 3.2661, + "step": 6131 + }, + { + "epoch": 0.2581894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.00043059374527254214, + "loss": 3.1896, + "step": 6132 + }, + { + "epoch": 0.2582315789473684, + "grad_norm": 0.396484375, + "learning_rate": 0.0004305704094043777, + "loss": 2.9932, + "step": 6133 + }, + { + "epoch": 0.2582736842105263, + "grad_norm": 0.388671875, + "learning_rate": 0.00043054707024643257, + "loss": 3.2916, + "step": 6134 + }, + { + "epoch": 0.2583157894736842, + "grad_norm": 0.380859375, + "learning_rate": 0.000430523727799132, + "loss": 2.804, + "step": 6135 + }, + { + "epoch": 0.2583578947368421, + "grad_norm": 0.3984375, + "learning_rate": 0.0004305003820629012, + "loss": 2.9091, + "step": 6136 + }, + { + "epoch": 0.2584, + "grad_norm": 0.3828125, + "learning_rate": 0.00043047703303816543, + "loss": 3.3525, + "step": 6137 + }, + { + "epoch": 0.2584421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.0004304536807253503, + "loss": 3.1367, + "step": 6138 + }, + { + "epoch": 0.2584842105263158, + "grad_norm": 0.384765625, + "learning_rate": 0.000430430325124881, + "loss": 3.4791, + "step": 6139 + }, + { + "epoch": 0.25852631578947366, + "grad_norm": 0.43359375, + "learning_rate": 0.0004304069662371833, + "loss": 3.0645, + "step": 6140 + }, + { + "epoch": 0.25856842105263156, + "grad_norm": 0.53515625, + "learning_rate": 0.00043038360406268264, + "loss": 3.1117, + "step": 6141 + }, + { + "epoch": 0.25861052631578946, + "grad_norm": 0.5625, + "learning_rate": 0.00043036023860180457, + "loss": 2.9278, + "step": 6142 + }, + { + "epoch": 0.25865263157894736, + "grad_norm": 0.47265625, + "learning_rate": 0.0004303368698549749, + "loss": 3.2543, + "step": 6143 + }, + { + "epoch": 0.25869473684210526, + "grad_norm": 0.458984375, + "learning_rate": 0.0004303134978226193, + "loss": 3.1228, + "step": 6144 + }, + { + "epoch": 0.25873684210526315, + "grad_norm": 0.44140625, + "learning_rate": 0.00043029012250516373, + "loss": 3.366, + "step": 6145 + }, + { + "epoch": 0.25877894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0004302667439030339, + "loss": 3.0633, + "step": 6146 + }, + { + "epoch": 0.25882105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.0004302433620166558, + "loss": 3.3485, + "step": 6147 + }, + { + "epoch": 0.25886315789473685, + "grad_norm": 0.423828125, + "learning_rate": 0.0004302199768464554, + "loss": 3.2263, + "step": 6148 + }, + { + "epoch": 0.25890526315789475, + "grad_norm": 0.39453125, + "learning_rate": 0.00043019658839285884, + "loss": 3.0154, + "step": 6149 + }, + { + "epoch": 0.25894736842105265, + "grad_norm": 0.48046875, + "learning_rate": 0.0004301731966562921, + "loss": 2.9147, + "step": 6150 + }, + { + "epoch": 0.25898947368421055, + "grad_norm": 0.39453125, + "learning_rate": 0.00043014980163718144, + "loss": 2.9014, + "step": 6151 + }, + { + "epoch": 0.25903157894736845, + "grad_norm": 0.40625, + "learning_rate": 0.0004301264033359531, + "loss": 3.4199, + "step": 6152 + }, + { + "epoch": 0.2590736842105263, + "grad_norm": 0.3828125, + "learning_rate": 0.0004301030017530333, + "loss": 2.9437, + "step": 6153 + }, + { + "epoch": 0.2591157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.0004300795968888484, + "loss": 3.4307, + "step": 6154 + }, + { + "epoch": 0.2591578947368421, + "grad_norm": 0.3984375, + "learning_rate": 0.00043005618874382477, + "loss": 3.0463, + "step": 6155 + }, + { + "epoch": 0.2592, + "grad_norm": 0.40625, + "learning_rate": 0.0004300327773183891, + "loss": 3.1884, + "step": 6156 + }, + { + "epoch": 0.2592421052631579, + "grad_norm": 0.39453125, + "learning_rate": 0.0004300093626129676, + "loss": 3.3494, + "step": 6157 + }, + { + "epoch": 0.2592842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.00042998594462798713, + "loss": 3.5587, + "step": 6158 + }, + { + "epoch": 0.2593263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00042996252336387414, + "loss": 3.5775, + "step": 6159 + }, + { + "epoch": 0.2593684210526316, + "grad_norm": 0.3984375, + "learning_rate": 0.00042993909882105546, + "loss": 3.1355, + "step": 6160 + }, + { + "epoch": 0.2594105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.0004299156709999578, + "loss": 3.2287, + "step": 6161 + }, + { + "epoch": 0.2594526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.00042989223990100805, + "loss": 3.4734, + "step": 6162 + }, + { + "epoch": 0.2594947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.000429868805524633, + "loss": 3.2615, + "step": 6163 + }, + { + "epoch": 0.2595368421052632, + "grad_norm": 0.40625, + "learning_rate": 0.0004298453678712597, + "loss": 3.6193, + "step": 6164 + }, + { + "epoch": 0.2595789473684211, + "grad_norm": 0.4140625, + "learning_rate": 0.00042982192694131504, + "loss": 2.8476, + "step": 6165 + }, + { + "epoch": 0.259621052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00042979848273522626, + "loss": 3.5556, + "step": 6166 + }, + { + "epoch": 0.2596631578947368, + "grad_norm": 0.462890625, + "learning_rate": 0.0004297750352534203, + "loss": 3.1545, + "step": 6167 + }, + { + "epoch": 0.2597052631578947, + "grad_norm": 0.3984375, + "learning_rate": 0.00042975158449632447, + "loss": 2.9406, + "step": 6168 + }, + { + "epoch": 0.2597473684210526, + "grad_norm": 0.462890625, + "learning_rate": 0.00042972813046436586, + "loss": 3.6385, + "step": 6169 + }, + { + "epoch": 0.2597894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.000429704673157972, + "loss": 3.2468, + "step": 6170 + }, + { + "epoch": 0.2598315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.0004296812125775701, + "loss": 3.3389, + "step": 6171 + }, + { + "epoch": 0.2598736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00042965774872358764, + "loss": 3.2557, + "step": 6172 + }, + { + "epoch": 0.2599157894736842, + "grad_norm": 0.40234375, + "learning_rate": 0.0004296342815964521, + "loss": 3.3789, + "step": 6173 + }, + { + "epoch": 0.2599578947368421, + "grad_norm": 0.39453125, + "learning_rate": 0.00042961081119659094, + "loss": 3.179, + "step": 6174 + }, + { + "epoch": 0.26, + "grad_norm": 0.43359375, + "learning_rate": 0.0004295873375244319, + "loss": 2.9999, + "step": 6175 + }, + { + "epoch": 0.2600421052631579, + "grad_norm": 0.4921875, + "learning_rate": 0.0004295638605804025, + "loss": 3.3216, + "step": 6176 + }, + { + "epoch": 0.2600842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.00042954038036493064, + "loss": 3.7098, + "step": 6177 + }, + { + "epoch": 0.2601263157894737, + "grad_norm": 0.482421875, + "learning_rate": 0.000429516896878444, + "loss": 3.2123, + "step": 6178 + }, + { + "epoch": 0.2601684210526316, + "grad_norm": 0.462890625, + "learning_rate": 0.0004294934101213703, + "loss": 3.5476, + "step": 6179 + }, + { + "epoch": 0.26021052631578945, + "grad_norm": 0.462890625, + "learning_rate": 0.00042946992009413774, + "loss": 3.0534, + "step": 6180 + }, + { + "epoch": 0.26025263157894735, + "grad_norm": 0.408203125, + "learning_rate": 0.00042944642679717393, + "loss": 3.2793, + "step": 6181 + }, + { + "epoch": 0.26029473684210525, + "grad_norm": 0.478515625, + "learning_rate": 0.00042942293023090716, + "loss": 3.2507, + "step": 6182 + }, + { + "epoch": 0.26033684210526314, + "grad_norm": 0.400390625, + "learning_rate": 0.00042939943039576543, + "loss": 3.4143, + "step": 6183 + }, + { + "epoch": 0.26037894736842104, + "grad_norm": 0.439453125, + "learning_rate": 0.00042937592729217676, + "loss": 3.0732, + "step": 6184 + }, + { + "epoch": 0.26042105263157894, + "grad_norm": 0.390625, + "learning_rate": 0.0004293524209205696, + "loss": 3.098, + "step": 6185 + }, + { + "epoch": 0.26046315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00042932891128137196, + "loss": 3.2063, + "step": 6186 + }, + { + "epoch": 0.26050526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.0004293053983750123, + "loss": 3.0457, + "step": 6187 + }, + { + "epoch": 0.26054736842105264, + "grad_norm": 0.408203125, + "learning_rate": 0.00042928188220191894, + "loss": 3.2588, + "step": 6188 + }, + { + "epoch": 0.26058947368421054, + "grad_norm": 0.39453125, + "learning_rate": 0.0004292583627625203, + "loss": 3.6176, + "step": 6189 + }, + { + "epoch": 0.26063157894736844, + "grad_norm": 0.396484375, + "learning_rate": 0.0004292348400572449, + "loss": 3.1577, + "step": 6190 + }, + { + "epoch": 0.26067368421052634, + "grad_norm": 0.470703125, + "learning_rate": 0.0004292113140865214, + "loss": 3.1727, + "step": 6191 + }, + { + "epoch": 0.26071578947368423, + "grad_norm": 0.439453125, + "learning_rate": 0.0004291877848507783, + "loss": 3.2862, + "step": 6192 + }, + { + "epoch": 0.26075789473684213, + "grad_norm": 0.408203125, + "learning_rate": 0.00042916425235044425, + "loss": 3.1198, + "step": 6193 + }, + { + "epoch": 0.2608, + "grad_norm": 0.44140625, + "learning_rate": 0.0004291407165859481, + "loss": 2.8052, + "step": 6194 + }, + { + "epoch": 0.2608421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.0004291171775577185, + "loss": 3.3822, + "step": 6195 + }, + { + "epoch": 0.2608842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.0004290936352661844, + "loss": 3.4492, + "step": 6196 + }, + { + "epoch": 0.2609263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.00042907008971177467, + "loss": 3.4028, + "step": 6197 + }, + { + "epoch": 0.26096842105263157, + "grad_norm": 0.435546875, + "learning_rate": 0.00042904654089491836, + "loss": 3.4107, + "step": 6198 + }, + { + "epoch": 0.26101052631578947, + "grad_norm": 0.427734375, + "learning_rate": 0.0004290229888160444, + "loss": 3.0222, + "step": 6199 + }, + { + "epoch": 0.26105263157894737, + "grad_norm": 0.5078125, + "learning_rate": 0.000428999433475582, + "loss": 3.7807, + "step": 6200 + }, + { + "epoch": 0.26109473684210527, + "grad_norm": 0.48828125, + "learning_rate": 0.0004289758748739602, + "loss": 3.0948, + "step": 6201 + }, + { + "epoch": 0.26113684210526317, + "grad_norm": 0.44140625, + "learning_rate": 0.0004289523130116082, + "loss": 3.2291, + "step": 6202 + }, + { + "epoch": 0.26117894736842107, + "grad_norm": 0.412109375, + "learning_rate": 0.0004289287478889553, + "loss": 3.7891, + "step": 6203 + }, + { + "epoch": 0.26122105263157896, + "grad_norm": 0.431640625, + "learning_rate": 0.0004289051795064309, + "loss": 3.4318, + "step": 6204 + }, + { + "epoch": 0.26126315789473686, + "grad_norm": 0.421875, + "learning_rate": 0.0004288816078644643, + "loss": 3.515, + "step": 6205 + }, + { + "epoch": 0.26130526315789476, + "grad_norm": 0.435546875, + "learning_rate": 0.00042885803296348503, + "loss": 3.0889, + "step": 6206 + }, + { + "epoch": 0.2613473684210526, + "grad_norm": 0.435546875, + "learning_rate": 0.00042883445480392256, + "loss": 3.4653, + "step": 6207 + }, + { + "epoch": 0.2613894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0004288108733862064, + "loss": 3.3627, + "step": 6208 + }, + { + "epoch": 0.2614315789473684, + "grad_norm": 0.4765625, + "learning_rate": 0.0004287872887107662, + "loss": 2.9652, + "step": 6209 + }, + { + "epoch": 0.2614736842105263, + "grad_norm": 0.396484375, + "learning_rate": 0.0004287637007780317, + "loss": 3.4843, + "step": 6210 + }, + { + "epoch": 0.2615157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.00042874010958843267, + "loss": 3.1182, + "step": 6211 + }, + { + "epoch": 0.2615578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.0004287165151423988, + "loss": 2.8971, + "step": 6212 + }, + { + "epoch": 0.2616, + "grad_norm": 0.4453125, + "learning_rate": 0.00042869291744036, + "loss": 3.6778, + "step": 6213 + }, + { + "epoch": 0.2616421052631579, + "grad_norm": 0.5703125, + "learning_rate": 0.0004286693164827462, + "loss": 3.3359, + "step": 6214 + }, + { + "epoch": 0.2616842105263158, + "grad_norm": 0.40625, + "learning_rate": 0.00042864571226998736, + "loss": 3.2704, + "step": 6215 + }, + { + "epoch": 0.2617263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00042862210480251365, + "loss": 3.4575, + "step": 6216 + }, + { + "epoch": 0.2617684210526316, + "grad_norm": 0.435546875, + "learning_rate": 0.0004285984940807549, + "loss": 3.4357, + "step": 6217 + }, + { + "epoch": 0.2618105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.0004285748801051416, + "loss": 3.1995, + "step": 6218 + }, + { + "epoch": 0.2618526315789474, + "grad_norm": 0.40625, + "learning_rate": 0.00042855126287610364, + "loss": 3.3123, + "step": 6219 + }, + { + "epoch": 0.26189473684210524, + "grad_norm": 0.439453125, + "learning_rate": 0.0004285276423940715, + "loss": 3.3992, + "step": 6220 + }, + { + "epoch": 0.26193684210526313, + "grad_norm": 0.40234375, + "learning_rate": 0.0004285040186594755, + "loss": 3.5687, + "step": 6221 + }, + { + "epoch": 0.26197894736842103, + "grad_norm": 0.423828125, + "learning_rate": 0.0004284803916727461, + "loss": 3.6773, + "step": 6222 + }, + { + "epoch": 0.26202105263157893, + "grad_norm": 0.412109375, + "learning_rate": 0.00042845676143431357, + "loss": 2.8695, + "step": 6223 + }, + { + "epoch": 0.26206315789473683, + "grad_norm": 0.39453125, + "learning_rate": 0.00042843312794460856, + "loss": 3.3007, + "step": 6224 + }, + { + "epoch": 0.26210526315789473, + "grad_norm": 0.83203125, + "learning_rate": 0.00042840949120406157, + "loss": 3.2453, + "step": 6225 + }, + { + "epoch": 0.26214736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.0004283858512131033, + "loss": 3.3979, + "step": 6226 + }, + { + "epoch": 0.26218947368421053, + "grad_norm": 0.4296875, + "learning_rate": 0.0004283622079721644, + "loss": 3.207, + "step": 6227 + }, + { + "epoch": 0.2622315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.00042833856148167567, + "loss": 3.3125, + "step": 6228 + }, + { + "epoch": 0.2622736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00042831491174206786, + "loss": 3.0378, + "step": 6229 + }, + { + "epoch": 0.2623157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.0004282912587537719, + "loss": 3.5209, + "step": 6230 + }, + { + "epoch": 0.2623578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.00042826760251721865, + "loss": 3.2132, + "step": 6231 + }, + { + "epoch": 0.2624, + "grad_norm": 0.41796875, + "learning_rate": 0.0004282439430328392, + "loss": 3.4563, + "step": 6232 + }, + { + "epoch": 0.2624421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.00042822028030106447, + "loss": 3.312, + "step": 6233 + }, + { + "epoch": 0.26248421052631576, + "grad_norm": 0.4609375, + "learning_rate": 0.00042819661432232564, + "loss": 3.0309, + "step": 6234 + }, + { + "epoch": 0.26252631578947366, + "grad_norm": 0.39453125, + "learning_rate": 0.0004281729450970539, + "loss": 3.2094, + "step": 6235 + }, + { + "epoch": 0.26256842105263156, + "grad_norm": 0.400390625, + "learning_rate": 0.00042814927262568045, + "loss": 3.2903, + "step": 6236 + }, + { + "epoch": 0.26261052631578946, + "grad_norm": 0.447265625, + "learning_rate": 0.0004281255969086365, + "loss": 3.2936, + "step": 6237 + }, + { + "epoch": 0.26265263157894736, + "grad_norm": 0.52734375, + "learning_rate": 0.00042810191794635355, + "loss": 2.7746, + "step": 6238 + }, + { + "epoch": 0.26269473684210526, + "grad_norm": 0.45703125, + "learning_rate": 0.00042807823573926283, + "loss": 3.033, + "step": 6239 + }, + { + "epoch": 0.26273684210526316, + "grad_norm": 0.4453125, + "learning_rate": 0.00042805455028779594, + "loss": 3.1633, + "step": 6240 + }, + { + "epoch": 0.26277894736842106, + "grad_norm": 0.41015625, + "learning_rate": 0.00042803086159238444, + "loss": 3.1929, + "step": 6241 + }, + { + "epoch": 0.26282105263157896, + "grad_norm": 0.404296875, + "learning_rate": 0.0004280071696534597, + "loss": 3.1701, + "step": 6242 + }, + { + "epoch": 0.26286315789473685, + "grad_norm": 0.46484375, + "learning_rate": 0.00042798347447145344, + "loss": 3.3468, + "step": 6243 + }, + { + "epoch": 0.26290526315789475, + "grad_norm": 0.427734375, + "learning_rate": 0.00042795977604679746, + "loss": 3.1284, + "step": 6244 + }, + { + "epoch": 0.26294736842105265, + "grad_norm": 0.419921875, + "learning_rate": 0.00042793607437992357, + "loss": 3.2611, + "step": 6245 + }, + { + "epoch": 0.26298947368421055, + "grad_norm": 0.408203125, + "learning_rate": 0.00042791236947126334, + "loss": 3.4824, + "step": 6246 + }, + { + "epoch": 0.2630315789473684, + "grad_norm": 0.447265625, + "learning_rate": 0.00042788866132124876, + "loss": 3.5099, + "step": 6247 + }, + { + "epoch": 0.2630736842105263, + "grad_norm": 0.44140625, + "learning_rate": 0.00042786494993031184, + "loss": 3.0518, + "step": 6248 + }, + { + "epoch": 0.2631157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0004278412352988845, + "loss": 3.2429, + "step": 6249 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0004278175174273988, + "loss": 3.5243, + "step": 6250 + }, + { + "epoch": 0.2632, + "grad_norm": 0.41796875, + "learning_rate": 0.0004277937963162868, + "loss": 3.257, + "step": 6251 + }, + { + "epoch": 0.2632421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.0004277700719659808, + "loss": 3.2812, + "step": 6252 + }, + { + "epoch": 0.2632842105263158, + "grad_norm": 0.466796875, + "learning_rate": 0.00042774634437691295, + "loss": 3.097, + "step": 6253 + }, + { + "epoch": 0.2633263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.00042772261354951556, + "loss": 3.5116, + "step": 6254 + }, + { + "epoch": 0.2633684210526316, + "grad_norm": 0.384765625, + "learning_rate": 0.000427698879484221, + "loss": 3.3276, + "step": 6255 + }, + { + "epoch": 0.2634105263157895, + "grad_norm": 0.396484375, + "learning_rate": 0.0004276751421814615, + "loss": 3.3316, + "step": 6256 + }, + { + "epoch": 0.2634526315789474, + "grad_norm": 0.439453125, + "learning_rate": 0.0004276514016416698, + "loss": 3.3754, + "step": 6257 + }, + { + "epoch": 0.2634947368421053, + "grad_norm": 0.396484375, + "learning_rate": 0.0004276276578652783, + "loss": 3.0314, + "step": 6258 + }, + { + "epoch": 0.2635368421052632, + "grad_norm": 0.39453125, + "learning_rate": 0.00042760391085271956, + "loss": 3.4036, + "step": 6259 + }, + { + "epoch": 0.2635789473684211, + "grad_norm": 0.400390625, + "learning_rate": 0.0004275801606044262, + "loss": 3.1138, + "step": 6260 + }, + { + "epoch": 0.2636210526315789, + "grad_norm": 0.400390625, + "learning_rate": 0.00042755640712083097, + "loss": 3.5388, + "step": 6261 + }, + { + "epoch": 0.2636631578947368, + "grad_norm": 0.380859375, + "learning_rate": 0.0004275326504023667, + "loss": 3.8405, + "step": 6262 + }, + { + "epoch": 0.2637052631578947, + "grad_norm": 0.396484375, + "learning_rate": 0.00042750889044946606, + "loss": 3.125, + "step": 6263 + }, + { + "epoch": 0.2637473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.00042748512726256194, + "loss": 3.291, + "step": 6264 + }, + { + "epoch": 0.2637894736842105, + "grad_norm": 0.66796875, + "learning_rate": 0.0004274613608420874, + "loss": 2.6533, + "step": 6265 + }, + { + "epoch": 0.2638315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.0004274375911884754, + "loss": 2.9643, + "step": 6266 + }, + { + "epoch": 0.2638736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.00042741381830215897, + "loss": 3.3915, + "step": 6267 + }, + { + "epoch": 0.2639157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0004273900421835712, + "loss": 3.024, + "step": 6268 + }, + { + "epoch": 0.2639578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.00042736626283314526, + "loss": 3.1551, + "step": 6269 + }, + { + "epoch": 0.264, + "grad_norm": 0.412109375, + "learning_rate": 0.00042734248025131445, + "loss": 3.1435, + "step": 6270 + }, + { + "epoch": 0.2640421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.000427318694438512, + "loss": 3.1671, + "step": 6271 + }, + { + "epoch": 0.2640842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.00042729490539517126, + "loss": 3.1248, + "step": 6272 + }, + { + "epoch": 0.2641263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00042727111312172575, + "loss": 3.5478, + "step": 6273 + }, + { + "epoch": 0.26416842105263155, + "grad_norm": 0.421875, + "learning_rate": 0.00042724731761860867, + "loss": 3.694, + "step": 6274 + }, + { + "epoch": 0.26421052631578945, + "grad_norm": 0.421875, + "learning_rate": 0.00042722351888625386, + "loss": 2.9388, + "step": 6275 + }, + { + "epoch": 0.26425263157894735, + "grad_norm": 0.44921875, + "learning_rate": 0.0004271997169250947, + "loss": 2.8644, + "step": 6276 + }, + { + "epoch": 0.26429473684210525, + "grad_norm": 0.42578125, + "learning_rate": 0.000427175911735565, + "loss": 3.2576, + "step": 6277 + }, + { + "epoch": 0.26433684210526315, + "grad_norm": 0.408203125, + "learning_rate": 0.00042715210331809825, + "loss": 3.0457, + "step": 6278 + }, + { + "epoch": 0.26437894736842105, + "grad_norm": 0.45703125, + "learning_rate": 0.0004271282916731284, + "loss": 2.9549, + "step": 6279 + }, + { + "epoch": 0.26442105263157895, + "grad_norm": 0.95703125, + "learning_rate": 0.0004271044768010891, + "loss": 2.9924, + "step": 6280 + }, + { + "epoch": 0.26446315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.00042708065870241443, + "loss": 3.4148, + "step": 6281 + }, + { + "epoch": 0.26450526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00042705683737753806, + "loss": 3.0394, + "step": 6282 + }, + { + "epoch": 0.26454736842105264, + "grad_norm": 0.41015625, + "learning_rate": 0.00042703301282689433, + "loss": 3.3899, + "step": 6283 + }, + { + "epoch": 0.26458947368421054, + "grad_norm": 0.443359375, + "learning_rate": 0.00042700918505091703, + "loss": 3.3845, + "step": 6284 + }, + { + "epoch": 0.26463157894736844, + "grad_norm": 0.4296875, + "learning_rate": 0.0004269853540500404, + "loss": 3.3631, + "step": 6285 + }, + { + "epoch": 0.26467368421052634, + "grad_norm": 0.431640625, + "learning_rate": 0.00042696151982469843, + "loss": 3.5155, + "step": 6286 + }, + { + "epoch": 0.26471578947368424, + "grad_norm": 0.41796875, + "learning_rate": 0.0004269376823753256, + "loss": 3.1301, + "step": 6287 + }, + { + "epoch": 0.2647578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0004269138417023561, + "loss": 2.8259, + "step": 6288 + }, + { + "epoch": 0.2648, + "grad_norm": 0.41796875, + "learning_rate": 0.00042688999780622427, + "loss": 2.852, + "step": 6289 + }, + { + "epoch": 0.2648421052631579, + "grad_norm": 0.498046875, + "learning_rate": 0.00042686615068736447, + "loss": 3.1415, + "step": 6290 + }, + { + "epoch": 0.2648842105263158, + "grad_norm": 0.392578125, + "learning_rate": 0.0004268423003462112, + "loss": 2.9191, + "step": 6291 + }, + { + "epoch": 0.2649263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.000426818446783199, + "loss": 3.1254, + "step": 6292 + }, + { + "epoch": 0.2649684210526316, + "grad_norm": 0.40234375, + "learning_rate": 0.00042679458999876255, + "loss": 3.2031, + "step": 6293 + }, + { + "epoch": 0.2650105263157895, + "grad_norm": 0.44921875, + "learning_rate": 0.0004267707299933363, + "loss": 3.2816, + "step": 6294 + }, + { + "epoch": 0.2650526315789474, + "grad_norm": 0.40625, + "learning_rate": 0.0004267468667673551, + "loss": 3.1382, + "step": 6295 + }, + { + "epoch": 0.26509473684210527, + "grad_norm": 0.4140625, + "learning_rate": 0.00042672300032125366, + "loss": 3.3349, + "step": 6296 + }, + { + "epoch": 0.26513684210526317, + "grad_norm": 0.416015625, + "learning_rate": 0.0004266991306554668, + "loss": 3.6174, + "step": 6297 + }, + { + "epoch": 0.26517894736842107, + "grad_norm": 0.404296875, + "learning_rate": 0.0004266752577704293, + "loss": 3.3813, + "step": 6298 + }, + { + "epoch": 0.26522105263157897, + "grad_norm": 0.41796875, + "learning_rate": 0.0004266513816665762, + "loss": 3.0026, + "step": 6299 + }, + { + "epoch": 0.26526315789473687, + "grad_norm": 0.392578125, + "learning_rate": 0.00042662750234434256, + "loss": 3.7354, + "step": 6300 + }, + { + "epoch": 0.2653052631578947, + "grad_norm": 0.37890625, + "learning_rate": 0.0004266036198041634, + "loss": 3.3894, + "step": 6301 + }, + { + "epoch": 0.2653473684210526, + "grad_norm": 0.419921875, + "learning_rate": 0.00042657973404647373, + "loss": 3.3925, + "step": 6302 + }, + { + "epoch": 0.2653894736842105, + "grad_norm": 0.384765625, + "learning_rate": 0.00042655584507170875, + "loss": 3.3881, + "step": 6303 + }, + { + "epoch": 0.2654315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.0004265319528803037, + "loss": 3.0366, + "step": 6304 + }, + { + "epoch": 0.2654736842105263, + "grad_norm": 0.474609375, + "learning_rate": 0.00042650805747269395, + "loss": 3.53, + "step": 6305 + }, + { + "epoch": 0.2655157894736842, + "grad_norm": 0.384765625, + "learning_rate": 0.00042648415884931476, + "loss": 3.2141, + "step": 6306 + }, + { + "epoch": 0.2655578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00042646025701060153, + "loss": 3.2089, + "step": 6307 + }, + { + "epoch": 0.2656, + "grad_norm": 0.44140625, + "learning_rate": 0.0004264363519569898, + "loss": 3.1877, + "step": 6308 + }, + { + "epoch": 0.2656421052631579, + "grad_norm": 1.234375, + "learning_rate": 0.00042641244368891496, + "loss": 3.268, + "step": 6309 + }, + { + "epoch": 0.2656842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.00042638853220681273, + "loss": 3.2334, + "step": 6310 + }, + { + "epoch": 0.2657263157894737, + "grad_norm": 0.51171875, + "learning_rate": 0.00042636461751111867, + "loss": 2.8376, + "step": 6311 + }, + { + "epoch": 0.2657684210526316, + "grad_norm": 0.435546875, + "learning_rate": 0.0004263406996022685, + "loss": 2.9539, + "step": 6312 + }, + { + "epoch": 0.2658105263157895, + "grad_norm": 0.458984375, + "learning_rate": 0.00042631677848069797, + "loss": 3.253, + "step": 6313 + }, + { + "epoch": 0.2658526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.0004262928541468429, + "loss": 3.5099, + "step": 6314 + }, + { + "epoch": 0.26589473684210524, + "grad_norm": 0.41015625, + "learning_rate": 0.00042626892660113923, + "loss": 3.2963, + "step": 6315 + }, + { + "epoch": 0.26593684210526314, + "grad_norm": 0.4609375, + "learning_rate": 0.00042624499584402274, + "loss": 2.8445, + "step": 6316 + }, + { + "epoch": 0.26597894736842104, + "grad_norm": 0.41796875, + "learning_rate": 0.0004262210618759296, + "loss": 3.2586, + "step": 6317 + }, + { + "epoch": 0.26602105263157894, + "grad_norm": 0.404296875, + "learning_rate": 0.00042619712469729563, + "loss": 3.3481, + "step": 6318 + }, + { + "epoch": 0.26606315789473683, + "grad_norm": 0.431640625, + "learning_rate": 0.00042617318430855715, + "loss": 3.3241, + "step": 6319 + }, + { + "epoch": 0.26610526315789473, + "grad_norm": 0.462890625, + "learning_rate": 0.0004261492407101503, + "loss": 3.1188, + "step": 6320 + }, + { + "epoch": 0.26614736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.0004261252939025111, + "loss": 3.4091, + "step": 6321 + }, + { + "epoch": 0.26618947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.00042610134388607616, + "loss": 2.8455, + "step": 6322 + }, + { + "epoch": 0.26623157894736843, + "grad_norm": 0.453125, + "learning_rate": 0.00042607739066128156, + "loss": 2.8574, + "step": 6323 + }, + { + "epoch": 0.26627368421052633, + "grad_norm": 0.43359375, + "learning_rate": 0.00042605343422856377, + "loss": 2.6856, + "step": 6324 + }, + { + "epoch": 0.26631578947368423, + "grad_norm": 0.443359375, + "learning_rate": 0.0004260294745883593, + "loss": 3.5231, + "step": 6325 + }, + { + "epoch": 0.2663578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.0004260055117411046, + "loss": 3.4352, + "step": 6326 + }, + { + "epoch": 0.2664, + "grad_norm": 0.46875, + "learning_rate": 0.00042598154568723626, + "loss": 3.0456, + "step": 6327 + }, + { + "epoch": 0.26644210526315787, + "grad_norm": 0.43359375, + "learning_rate": 0.0004259575764271909, + "loss": 3.0534, + "step": 6328 + }, + { + "epoch": 0.26648421052631577, + "grad_norm": 0.4375, + "learning_rate": 0.00042593360396140533, + "loss": 3.0856, + "step": 6329 + }, + { + "epoch": 0.26652631578947367, + "grad_norm": 0.39453125, + "learning_rate": 0.0004259096282903161, + "loss": 3.2412, + "step": 6330 + }, + { + "epoch": 0.26656842105263157, + "grad_norm": 0.412109375, + "learning_rate": 0.00042588564941436027, + "loss": 3.2838, + "step": 6331 + }, + { + "epoch": 0.26661052631578946, + "grad_norm": 0.435546875, + "learning_rate": 0.00042586166733397446, + "loss": 3.4664, + "step": 6332 + }, + { + "epoch": 0.26665263157894736, + "grad_norm": 0.455078125, + "learning_rate": 0.0004258376820495957, + "loss": 3.0562, + "step": 6333 + }, + { + "epoch": 0.26669473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.000425813693561661, + "loss": 3.3986, + "step": 6334 + }, + { + "epoch": 0.26673684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.0004257897018706074, + "loss": 3.2179, + "step": 6335 + }, + { + "epoch": 0.26677894736842106, + "grad_norm": 0.41796875, + "learning_rate": 0.000425765706976872, + "loss": 3.1342, + "step": 6336 + }, + { + "epoch": 0.26682105263157896, + "grad_norm": 0.423828125, + "learning_rate": 0.00042574170888089185, + "loss": 3.2029, + "step": 6337 + }, + { + "epoch": 0.26686315789473686, + "grad_norm": 0.3984375, + "learning_rate": 0.00042571770758310435, + "loss": 3.1142, + "step": 6338 + }, + { + "epoch": 0.26690526315789476, + "grad_norm": 0.43359375, + "learning_rate": 0.00042569370308394663, + "loss": 3.2718, + "step": 6339 + }, + { + "epoch": 0.26694736842105266, + "grad_norm": 0.5234375, + "learning_rate": 0.000425669695383856, + "loss": 3.1049, + "step": 6340 + }, + { + "epoch": 0.2669894736842105, + "grad_norm": 0.384765625, + "learning_rate": 0.00042564568448326997, + "loss": 3.4215, + "step": 6341 + }, + { + "epoch": 0.2670315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.000425621670382626, + "loss": 3.1289, + "step": 6342 + }, + { + "epoch": 0.2670736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.00042559765308236146, + "loss": 3.2794, + "step": 6343 + }, + { + "epoch": 0.2671157894736842, + "grad_norm": 0.51953125, + "learning_rate": 0.00042557363258291406, + "loss": 3.2385, + "step": 6344 + }, + { + "epoch": 0.2671578947368421, + "grad_norm": 0.453125, + "learning_rate": 0.0004255496088847214, + "loss": 3.6462, + "step": 6345 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4453125, + "learning_rate": 0.000425525581988221, + "loss": 3.5533, + "step": 6346 + }, + { + "epoch": 0.2672421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00042550155189385083, + "loss": 3.5518, + "step": 6347 + }, + { + "epoch": 0.2672842105263158, + "grad_norm": 0.50390625, + "learning_rate": 0.0004254775186020485, + "loss": 3.0705, + "step": 6348 + }, + { + "epoch": 0.2673263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00042545348211325205, + "loss": 3.3356, + "step": 6349 + }, + { + "epoch": 0.2673684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.0004254294424278993, + "loss": 3.2119, + "step": 6350 + }, + { + "epoch": 0.2674105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.0004254053995464282, + "loss": 3.348, + "step": 6351 + }, + { + "epoch": 0.2674526315789474, + "grad_norm": 0.498046875, + "learning_rate": 0.00042538135346927686, + "loss": 3.1821, + "step": 6352 + }, + { + "epoch": 0.2674947368421053, + "grad_norm": 0.439453125, + "learning_rate": 0.00042535730419688325, + "loss": 3.0415, + "step": 6353 + }, + { + "epoch": 0.2675368421052632, + "grad_norm": 0.41796875, + "learning_rate": 0.0004253332517296856, + "loss": 3.2275, + "step": 6354 + }, + { + "epoch": 0.267578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00042530919606812215, + "loss": 3.6709, + "step": 6355 + }, + { + "epoch": 0.2676210526315789, + "grad_norm": 0.4765625, + "learning_rate": 0.0004252851372126312, + "loss": 3.2427, + "step": 6356 + }, + { + "epoch": 0.2676631578947368, + "grad_norm": 0.515625, + "learning_rate": 0.0004252610751636509, + "loss": 3.3825, + "step": 6357 + }, + { + "epoch": 0.2677052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.00042523700992161975, + "loss": 3.0383, + "step": 6358 + }, + { + "epoch": 0.2677473684210526, + "grad_norm": 0.43359375, + "learning_rate": 0.0004252129414869762, + "loss": 3.279, + "step": 6359 + }, + { + "epoch": 0.2677894736842105, + "grad_norm": 0.40234375, + "learning_rate": 0.00042518886986015863, + "loss": 3.3094, + "step": 6360 + }, + { + "epoch": 0.2678315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00042516479504160575, + "loss": 3.6452, + "step": 6361 + }, + { + "epoch": 0.2678736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00042514071703175614, + "loss": 3.4311, + "step": 6362 + }, + { + "epoch": 0.2679157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.0004251166358310484, + "loss": 2.8366, + "step": 6363 + }, + { + "epoch": 0.2679578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.0004250925514399214, + "loss": 2.9144, + "step": 6364 + }, + { + "epoch": 0.268, + "grad_norm": 0.42578125, + "learning_rate": 0.00042506846385881374, + "loss": 3.5243, + "step": 6365 + }, + { + "epoch": 0.2680421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00042504437308816444, + "loss": 3.1334, + "step": 6366 + }, + { + "epoch": 0.2680842105263158, + "grad_norm": 0.392578125, + "learning_rate": 0.00042502027912841224, + "loss": 3.4249, + "step": 6367 + }, + { + "epoch": 0.26812631578947366, + "grad_norm": 0.4296875, + "learning_rate": 0.0004249961819799963, + "loss": 3.6981, + "step": 6368 + }, + { + "epoch": 0.26816842105263156, + "grad_norm": 0.4140625, + "learning_rate": 0.0004249720816433555, + "loss": 3.2701, + "step": 6369 + }, + { + "epoch": 0.26821052631578945, + "grad_norm": 0.431640625, + "learning_rate": 0.00042494797811892896, + "loss": 3.3865, + "step": 6370 + }, + { + "epoch": 0.26825263157894735, + "grad_norm": 0.41015625, + "learning_rate": 0.00042492387140715575, + "loss": 3.1325, + "step": 6371 + }, + { + "epoch": 0.26829473684210525, + "grad_norm": 0.40234375, + "learning_rate": 0.0004248997615084752, + "loss": 3.2783, + "step": 6372 + }, + { + "epoch": 0.26833684210526315, + "grad_norm": 0.421875, + "learning_rate": 0.00042487564842332644, + "loss": 3.6164, + "step": 6373 + }, + { + "epoch": 0.26837894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.0004248515321521489, + "loss": 2.3453, + "step": 6374 + }, + { + "epoch": 0.26842105263157895, + "grad_norm": 0.404296875, + "learning_rate": 0.00042482741269538185, + "loss": 3.4442, + "step": 6375 + }, + { + "epoch": 0.26846315789473685, + "grad_norm": 0.435546875, + "learning_rate": 0.0004248032900534647, + "loss": 2.8978, + "step": 6376 + }, + { + "epoch": 0.26850526315789475, + "grad_norm": 0.38671875, + "learning_rate": 0.0004247791642268371, + "loss": 3.0188, + "step": 6377 + }, + { + "epoch": 0.26854736842105265, + "grad_norm": 0.44921875, + "learning_rate": 0.00042475503521593837, + "loss": 3.1182, + "step": 6378 + }, + { + "epoch": 0.26858947368421054, + "grad_norm": 0.376953125, + "learning_rate": 0.0004247309030212084, + "loss": 3.238, + "step": 6379 + }, + { + "epoch": 0.26863157894736844, + "grad_norm": 0.408203125, + "learning_rate": 0.0004247067676430865, + "loss": 3.3033, + "step": 6380 + }, + { + "epoch": 0.26867368421052634, + "grad_norm": 0.390625, + "learning_rate": 0.0004246826290820126, + "loss": 3.1905, + "step": 6381 + }, + { + "epoch": 0.2687157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0004246584873384265, + "loss": 2.9784, + "step": 6382 + }, + { + "epoch": 0.2687578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.00042463434241276797, + "loss": 3.4526, + "step": 6383 + }, + { + "epoch": 0.2688, + "grad_norm": 0.39453125, + "learning_rate": 0.0004246101943054769, + "loss": 2.9956, + "step": 6384 + }, + { + "epoch": 0.2688421052631579, + "grad_norm": 0.54296875, + "learning_rate": 0.0004245860430169932, + "loss": 2.9979, + "step": 6385 + }, + { + "epoch": 0.2688842105263158, + "grad_norm": 0.46484375, + "learning_rate": 0.000424561888547757, + "loss": 3.0014, + "step": 6386 + }, + { + "epoch": 0.2689263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.0004245377308982082, + "loss": 2.8607, + "step": 6387 + }, + { + "epoch": 0.2689684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.0004245135700687871, + "loss": 3.2286, + "step": 6388 + }, + { + "epoch": 0.2690105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.0004244894060599338, + "loss": 3.1448, + "step": 6389 + }, + { + "epoch": 0.2690526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.00042446523887208857, + "loss": 3.2772, + "step": 6390 + }, + { + "epoch": 0.2690947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.00042444106850569153, + "loss": 2.9565, + "step": 6391 + }, + { + "epoch": 0.2691368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.0004244168949611833, + "loss": 3.6742, + "step": 6392 + }, + { + "epoch": 0.2691789473684211, + "grad_norm": 0.396484375, + "learning_rate": 0.0004243927182390042, + "loss": 3.1222, + "step": 6393 + }, + { + "epoch": 0.26922105263157897, + "grad_norm": 0.470703125, + "learning_rate": 0.00042436853833959453, + "loss": 2.6047, + "step": 6394 + }, + { + "epoch": 0.2692631578947368, + "grad_norm": 0.396484375, + "learning_rate": 0.00042434435526339504, + "loss": 3.123, + "step": 6395 + }, + { + "epoch": 0.2693052631578947, + "grad_norm": 0.458984375, + "learning_rate": 0.00042432016901084623, + "loss": 3.3929, + "step": 6396 + }, + { + "epoch": 0.2693473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.0004242959795823889, + "loss": 3.2623, + "step": 6397 + }, + { + "epoch": 0.2693894736842105, + "grad_norm": 0.4765625, + "learning_rate": 0.00042427178697846345, + "loss": 2.6597, + "step": 6398 + }, + { + "epoch": 0.2694315789473684, + "grad_norm": 0.4453125, + "learning_rate": 0.00042424759119951083, + "loss": 3.2708, + "step": 6399 + }, + { + "epoch": 0.2694736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.00042422339224597186, + "loss": 2.5968, + "step": 6400 + }, + { + "epoch": 0.2695157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0004241991901182873, + "loss": 3.6032, + "step": 6401 + }, + { + "epoch": 0.2695578947368421, + "grad_norm": 0.68359375, + "learning_rate": 0.0004241749848168983, + "loss": 3.1867, + "step": 6402 + }, + { + "epoch": 0.2696, + "grad_norm": 0.40625, + "learning_rate": 0.0004241507763422457, + "loss": 3.017, + "step": 6403 + }, + { + "epoch": 0.2696421052631579, + "grad_norm": 0.486328125, + "learning_rate": 0.0004241265646947706, + "loss": 3.219, + "step": 6404 + }, + { + "epoch": 0.2696842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.000424102349874914, + "loss": 3.4647, + "step": 6405 + }, + { + "epoch": 0.2697263157894737, + "grad_norm": 0.45703125, + "learning_rate": 0.00042407813188311715, + "loss": 2.9676, + "step": 6406 + }, + { + "epoch": 0.2697684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.0004240539107198213, + "loss": 3.4421, + "step": 6407 + }, + { + "epoch": 0.2698105263157895, + "grad_norm": 0.4453125, + "learning_rate": 0.0004240296863854677, + "loss": 3.0129, + "step": 6408 + }, + { + "epoch": 0.26985263157894734, + "grad_norm": 0.4453125, + "learning_rate": 0.0004240054588804977, + "loss": 3.1717, + "step": 6409 + }, + { + "epoch": 0.26989473684210524, + "grad_norm": 0.400390625, + "learning_rate": 0.00042398122820535265, + "loss": 3.5012, + "step": 6410 + }, + { + "epoch": 0.26993684210526314, + "grad_norm": 0.4453125, + "learning_rate": 0.0004239569943604741, + "loss": 3.3762, + "step": 6411 + }, + { + "epoch": 0.26997894736842104, + "grad_norm": 0.41796875, + "learning_rate": 0.0004239327573463034, + "loss": 3.393, + "step": 6412 + }, + { + "epoch": 0.27002105263157894, + "grad_norm": 0.466796875, + "learning_rate": 0.0004239085171632824, + "loss": 3.2879, + "step": 6413 + }, + { + "epoch": 0.27006315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.00042388427381185234, + "loss": 3.163, + "step": 6414 + }, + { + "epoch": 0.27010526315789474, + "grad_norm": 0.478515625, + "learning_rate": 0.0004238600272924553, + "loss": 3.435, + "step": 6415 + }, + { + "epoch": 0.27014736842105264, + "grad_norm": 0.458984375, + "learning_rate": 0.0004238357776055327, + "loss": 3.2586, + "step": 6416 + }, + { + "epoch": 0.27018947368421053, + "grad_norm": 0.482421875, + "learning_rate": 0.00042381152475152664, + "loss": 3.2817, + "step": 6417 + }, + { + "epoch": 0.27023157894736843, + "grad_norm": 0.42578125, + "learning_rate": 0.0004237872687308787, + "loss": 3.0169, + "step": 6418 + }, + { + "epoch": 0.27027368421052633, + "grad_norm": 0.421875, + "learning_rate": 0.0004237630095440309, + "loss": 3.4522, + "step": 6419 + }, + { + "epoch": 0.27031578947368423, + "grad_norm": 0.46484375, + "learning_rate": 0.00042373874719142527, + "loss": 3.6355, + "step": 6420 + }, + { + "epoch": 0.27035789473684213, + "grad_norm": 0.408203125, + "learning_rate": 0.0004237144816735038, + "loss": 3.6477, + "step": 6421 + }, + { + "epoch": 0.2704, + "grad_norm": 0.4140625, + "learning_rate": 0.00042369021299070857, + "loss": 3.39, + "step": 6422 + }, + { + "epoch": 0.2704421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.00042366594114348177, + "loss": 3.0867, + "step": 6423 + }, + { + "epoch": 0.27048421052631577, + "grad_norm": 0.42578125, + "learning_rate": 0.0004236416661322655, + "loss": 3.5644, + "step": 6424 + }, + { + "epoch": 0.27052631578947367, + "grad_norm": 0.412109375, + "learning_rate": 0.00042361738795750216, + "loss": 3.6193, + "step": 6425 + }, + { + "epoch": 0.27056842105263157, + "grad_norm": 0.52734375, + "learning_rate": 0.000423593106619634, + "loss": 3.4706, + "step": 6426 + }, + { + "epoch": 0.27061052631578947, + "grad_norm": 0.44921875, + "learning_rate": 0.00042356882211910335, + "loss": 3.6939, + "step": 6427 + }, + { + "epoch": 0.27065263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.00042354453445635275, + "loss": 3.3058, + "step": 6428 + }, + { + "epoch": 0.27069473684210527, + "grad_norm": 0.423828125, + "learning_rate": 0.0004235202436318246, + "loss": 3.2101, + "step": 6429 + }, + { + "epoch": 0.27073684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.0004234959496459615, + "loss": 3.69, + "step": 6430 + }, + { + "epoch": 0.27077894736842106, + "grad_norm": 0.45703125, + "learning_rate": 0.0004234716524992061, + "loss": 2.8015, + "step": 6431 + }, + { + "epoch": 0.27082105263157896, + "grad_norm": 0.44921875, + "learning_rate": 0.0004234473521920009, + "loss": 3.2596, + "step": 6432 + }, + { + "epoch": 0.27086315789473686, + "grad_norm": 0.45703125, + "learning_rate": 0.0004234230487247889, + "loss": 2.9416, + "step": 6433 + }, + { + "epoch": 0.27090526315789476, + "grad_norm": 0.462890625, + "learning_rate": 0.00042339874209801256, + "loss": 3.4046, + "step": 6434 + }, + { + "epoch": 0.27094736842105266, + "grad_norm": 0.423828125, + "learning_rate": 0.00042337443231211494, + "loss": 3.3399, + "step": 6435 + }, + { + "epoch": 0.2709894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.0004233501193675389, + "loss": 3.8201, + "step": 6436 + }, + { + "epoch": 0.2710315789473684, + "grad_norm": 0.400390625, + "learning_rate": 0.0004233258032647273, + "loss": 3.6096, + "step": 6437 + }, + { + "epoch": 0.2710736842105263, + "grad_norm": 0.39453125, + "learning_rate": 0.0004233014840041233, + "loss": 3.1883, + "step": 6438 + }, + { + "epoch": 0.2711157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.0004232771615861698, + "loss": 3.4065, + "step": 6439 + }, + { + "epoch": 0.2711578947368421, + "grad_norm": 0.5703125, + "learning_rate": 0.00042325283601131003, + "loss": 3.2293, + "step": 6440 + }, + { + "epoch": 0.2712, + "grad_norm": 0.404296875, + "learning_rate": 0.00042322850727998705, + "loss": 3.0446, + "step": 6441 + }, + { + "epoch": 0.2712421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.0004232041753926443, + "loss": 3.4257, + "step": 6442 + }, + { + "epoch": 0.2712842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.00042317984034972506, + "loss": 3.6864, + "step": 6443 + }, + { + "epoch": 0.2713263157894737, + "grad_norm": 0.404296875, + "learning_rate": 0.00042315550215167255, + "loss": 3.0599, + "step": 6444 + }, + { + "epoch": 0.2713684210526316, + "grad_norm": 0.453125, + "learning_rate": 0.0004231311607989302, + "loss": 3.3024, + "step": 6445 + }, + { + "epoch": 0.2714105263157895, + "grad_norm": 0.400390625, + "learning_rate": 0.00042310681629194156, + "loss": 2.4417, + "step": 6446 + }, + { + "epoch": 0.2714526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.0004230824686311501, + "loss": 3.0827, + "step": 6447 + }, + { + "epoch": 0.2714947368421053, + "grad_norm": 0.41796875, + "learning_rate": 0.00042305811781699946, + "loss": 3.2718, + "step": 6448 + }, + { + "epoch": 0.27153684210526313, + "grad_norm": 0.431640625, + "learning_rate": 0.0004230337638499332, + "loss": 3.2378, + "step": 6449 + }, + { + "epoch": 0.27157894736842103, + "grad_norm": 0.416015625, + "learning_rate": 0.000423009406730395, + "loss": 3.3503, + "step": 6450 + }, + { + "epoch": 0.27162105263157893, + "grad_norm": 0.451171875, + "learning_rate": 0.0004229850464588288, + "loss": 3.6389, + "step": 6451 + }, + { + "epoch": 0.27166315789473683, + "grad_norm": 0.51171875, + "learning_rate": 0.0004229606830356783, + "loss": 3.2197, + "step": 6452 + }, + { + "epoch": 0.2717052631578947, + "grad_norm": 0.41015625, + "learning_rate": 0.00042293631646138737, + "loss": 3.3798, + "step": 6453 + }, + { + "epoch": 0.2717473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.0004229119467363999, + "loss": 2.9206, + "step": 6454 + }, + { + "epoch": 0.2717894736842105, + "grad_norm": 0.44921875, + "learning_rate": 0.00042288757386115997, + "loss": 3.2362, + "step": 6455 + }, + { + "epoch": 0.2718315789473684, + "grad_norm": 0.462890625, + "learning_rate": 0.00042286319783611147, + "loss": 2.6049, + "step": 6456 + }, + { + "epoch": 0.2718736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.00042283881866169874, + "loss": 3.3568, + "step": 6457 + }, + { + "epoch": 0.2719157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.0004228144363383658, + "loss": 3.5674, + "step": 6458 + }, + { + "epoch": 0.2719578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.0004227900508665569, + "loss": 3.2902, + "step": 6459 + }, + { + "epoch": 0.272, + "grad_norm": 0.396484375, + "learning_rate": 0.0004227656622467162, + "loss": 3.3236, + "step": 6460 + }, + { + "epoch": 0.2720421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00042274127047928813, + "loss": 3.5077, + "step": 6461 + }, + { + "epoch": 0.27208421052631576, + "grad_norm": 0.419921875, + "learning_rate": 0.00042271687556471715, + "loss": 3.5554, + "step": 6462 + }, + { + "epoch": 0.27212631578947366, + "grad_norm": 0.515625, + "learning_rate": 0.00042269247750344755, + "loss": 2.8024, + "step": 6463 + }, + { + "epoch": 0.27216842105263156, + "grad_norm": 0.416015625, + "learning_rate": 0.00042266807629592396, + "loss": 2.8508, + "step": 6464 + }, + { + "epoch": 0.27221052631578946, + "grad_norm": 0.4140625, + "learning_rate": 0.0004226436719425909, + "loss": 3.3113, + "step": 6465 + }, + { + "epoch": 0.27225263157894736, + "grad_norm": 0.45703125, + "learning_rate": 0.000422619264443893, + "loss": 3.0102, + "step": 6466 + }, + { + "epoch": 0.27229473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.0004225948538002749, + "loss": 3.4644, + "step": 6467 + }, + { + "epoch": 0.27233684210526315, + "grad_norm": 0.4296875, + "learning_rate": 0.00042257044001218136, + "loss": 3.2368, + "step": 6468 + }, + { + "epoch": 0.27237894736842105, + "grad_norm": 0.4140625, + "learning_rate": 0.00042254602308005717, + "loss": 2.8964, + "step": 6469 + }, + { + "epoch": 0.27242105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00042252160300434717, + "loss": 3.5766, + "step": 6470 + }, + { + "epoch": 0.27246315789473685, + "grad_norm": 0.443359375, + "learning_rate": 0.00042249717978549625, + "loss": 3.3034, + "step": 6471 + }, + { + "epoch": 0.27250526315789475, + "grad_norm": 0.419921875, + "learning_rate": 0.00042247275342394944, + "loss": 3.0497, + "step": 6472 + }, + { + "epoch": 0.27254736842105265, + "grad_norm": 0.4140625, + "learning_rate": 0.0004224483239201516, + "loss": 3.6123, + "step": 6473 + }, + { + "epoch": 0.27258947368421055, + "grad_norm": 0.408203125, + "learning_rate": 0.000422423891274548, + "loss": 3.3051, + "step": 6474 + }, + { + "epoch": 0.27263157894736845, + "grad_norm": 0.435546875, + "learning_rate": 0.00042239945548758364, + "loss": 2.8966, + "step": 6475 + }, + { + "epoch": 0.2726736842105263, + "grad_norm": 0.57421875, + "learning_rate": 0.00042237501655970387, + "loss": 3.1048, + "step": 6476 + }, + { + "epoch": 0.2727157894736842, + "grad_norm": 0.404296875, + "learning_rate": 0.00042235057449135373, + "loss": 3.0976, + "step": 6477 + }, + { + "epoch": 0.2727578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00042232612928297863, + "loss": 3.3886, + "step": 6478 + }, + { + "epoch": 0.2728, + "grad_norm": 0.40625, + "learning_rate": 0.00042230168093502395, + "loss": 3.4853, + "step": 6479 + }, + { + "epoch": 0.2728421052631579, + "grad_norm": 0.396484375, + "learning_rate": 0.00042227722944793513, + "loss": 3.2212, + "step": 6480 + }, + { + "epoch": 0.2728842105263158, + "grad_norm": 0.39453125, + "learning_rate": 0.0004222527748221575, + "loss": 2.7216, + "step": 6481 + }, + { + "epoch": 0.2729263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.0004222283170581368, + "loss": 3.4158, + "step": 6482 + }, + { + "epoch": 0.2729684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.0004222038561563185, + "loss": 3.5475, + "step": 6483 + }, + { + "epoch": 0.2730105263157895, + "grad_norm": 0.4765625, + "learning_rate": 0.0004221793921171483, + "loss": 3.5681, + "step": 6484 + }, + { + "epoch": 0.2730526315789474, + "grad_norm": 0.41015625, + "learning_rate": 0.00042215492494107186, + "loss": 3.8476, + "step": 6485 + }, + { + "epoch": 0.2730947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.00042213045462853496, + "loss": 3.378, + "step": 6486 + }, + { + "epoch": 0.2731368421052632, + "grad_norm": 0.390625, + "learning_rate": 0.00042210598117998344, + "loss": 3.1606, + "step": 6487 + }, + { + "epoch": 0.2731789473684211, + "grad_norm": 0.390625, + "learning_rate": 0.000422081504595863, + "loss": 3.1243, + "step": 6488 + }, + { + "epoch": 0.2732210526315789, + "grad_norm": 0.408203125, + "learning_rate": 0.0004220570248766199, + "loss": 3.2083, + "step": 6489 + }, + { + "epoch": 0.2732631578947368, + "grad_norm": 0.390625, + "learning_rate": 0.00042203254202269997, + "loss": 2.8056, + "step": 6490 + }, + { + "epoch": 0.2733052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.0004220080560345492, + "loss": 3.3898, + "step": 6491 + }, + { + "epoch": 0.2733473684210526, + "grad_norm": 0.3984375, + "learning_rate": 0.00042198356691261376, + "loss": 2.8662, + "step": 6492 + }, + { + "epoch": 0.2733894736842105, + "grad_norm": 0.400390625, + "learning_rate": 0.00042195907465733984, + "loss": 3.4138, + "step": 6493 + }, + { + "epoch": 0.2734315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.00042193457926917354, + "loss": 3.4618, + "step": 6494 + }, + { + "epoch": 0.2734736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.0004219100807485613, + "loss": 3.1989, + "step": 6495 + }, + { + "epoch": 0.2735157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.0004218855790959493, + "loss": 2.4662, + "step": 6496 + }, + { + "epoch": 0.2735578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0004218610743117841, + "loss": 3.424, + "step": 6497 + }, + { + "epoch": 0.2736, + "grad_norm": 0.41796875, + "learning_rate": 0.000421836566396512, + "loss": 3.268, + "step": 6498 + }, + { + "epoch": 0.2736421052631579, + "grad_norm": 0.37890625, + "learning_rate": 0.00042181205535057957, + "loss": 3.4284, + "step": 6499 + }, + { + "epoch": 0.2736842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.00042178754117443336, + "loss": 3.2268, + "step": 6500 + }, + { + "epoch": 0.2737263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.0004217630238685199, + "loss": 3.1248, + "step": 6501 + }, + { + "epoch": 0.2737684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.0004217385034332861, + "loss": 3.4265, + "step": 6502 + }, + { + "epoch": 0.27381052631578945, + "grad_norm": 0.40234375, + "learning_rate": 0.0004217139798691785, + "loss": 3.4103, + "step": 6503 + }, + { + "epoch": 0.27385263157894735, + "grad_norm": 0.41015625, + "learning_rate": 0.000421689453176644, + "loss": 3.1926, + "step": 6504 + }, + { + "epoch": 0.27389473684210525, + "grad_norm": 0.4140625, + "learning_rate": 0.0004216649233561293, + "loss": 3.3586, + "step": 6505 + }, + { + "epoch": 0.27393684210526315, + "grad_norm": 0.400390625, + "learning_rate": 0.00042164039040808144, + "loss": 3.8455, + "step": 6506 + }, + { + "epoch": 0.27397894736842104, + "grad_norm": 0.40234375, + "learning_rate": 0.00042161585433294734, + "loss": 3.4343, + "step": 6507 + }, + { + "epoch": 0.27402105263157894, + "grad_norm": 0.40234375, + "learning_rate": 0.000421591315131174, + "loss": 3.1182, + "step": 6508 + }, + { + "epoch": 0.27406315789473684, + "grad_norm": 0.37890625, + "learning_rate": 0.0004215667728032084, + "loss": 3.1718, + "step": 6509 + }, + { + "epoch": 0.27410526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00042154222734949796, + "loss": 3.0234, + "step": 6510 + }, + { + "epoch": 0.27414736842105264, + "grad_norm": 0.42578125, + "learning_rate": 0.0004215176787704896, + "loss": 3.0397, + "step": 6511 + }, + { + "epoch": 0.27418947368421054, + "grad_norm": 0.38671875, + "learning_rate": 0.00042149312706663065, + "loss": 3.5663, + "step": 6512 + }, + { + "epoch": 0.27423157894736844, + "grad_norm": 0.421875, + "learning_rate": 0.00042146857223836854, + "loss": 3.6205, + "step": 6513 + }, + { + "epoch": 0.27427368421052634, + "grad_norm": 0.40625, + "learning_rate": 0.00042144401428615043, + "loss": 3.2493, + "step": 6514 + }, + { + "epoch": 0.27431578947368424, + "grad_norm": 0.416015625, + "learning_rate": 0.00042141945321042375, + "loss": 3.5602, + "step": 6515 + }, + { + "epoch": 0.2743578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00042139488901163613, + "loss": 3.4949, + "step": 6516 + }, + { + "epoch": 0.2744, + "grad_norm": 0.44921875, + "learning_rate": 0.0004213703216902349, + "loss": 3.2818, + "step": 6517 + }, + { + "epoch": 0.2744421052631579, + "grad_norm": 0.38671875, + "learning_rate": 0.0004213457512466679, + "loss": 3.3253, + "step": 6518 + }, + { + "epoch": 0.2744842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0004213211776813826, + "loss": 3.3786, + "step": 6519 + }, + { + "epoch": 0.2745263157894737, + "grad_norm": 0.3984375, + "learning_rate": 0.0004212966009948267, + "loss": 3.6269, + "step": 6520 + }, + { + "epoch": 0.2745684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.000421272021187448, + "loss": 3.4241, + "step": 6521 + }, + { + "epoch": 0.27461052631578947, + "grad_norm": 0.412109375, + "learning_rate": 0.0004212474382596943, + "loss": 3.2873, + "step": 6522 + }, + { + "epoch": 0.27465263157894737, + "grad_norm": 0.396484375, + "learning_rate": 0.0004212228522120135, + "loss": 3.386, + "step": 6523 + }, + { + "epoch": 0.27469473684210527, + "grad_norm": 0.439453125, + "learning_rate": 0.00042119826304485355, + "loss": 2.8141, + "step": 6524 + }, + { + "epoch": 0.27473684210526317, + "grad_norm": 0.431640625, + "learning_rate": 0.0004211736707586624, + "loss": 3.4275, + "step": 6525 + }, + { + "epoch": 0.27477894736842107, + "grad_norm": 0.462890625, + "learning_rate": 0.000421149075353888, + "loss": 3.5436, + "step": 6526 + }, + { + "epoch": 0.27482105263157897, + "grad_norm": 0.443359375, + "learning_rate": 0.00042112447683097866, + "loss": 3.1321, + "step": 6527 + }, + { + "epoch": 0.27486315789473686, + "grad_norm": 0.404296875, + "learning_rate": 0.00042109987519038227, + "loss": 3.5082, + "step": 6528 + }, + { + "epoch": 0.27490526315789476, + "grad_norm": 0.42578125, + "learning_rate": 0.00042107527043254726, + "loss": 3.2106, + "step": 6529 + }, + { + "epoch": 0.2749473684210526, + "grad_norm": 0.43359375, + "learning_rate": 0.00042105066255792185, + "loss": 3.3604, + "step": 6530 + }, + { + "epoch": 0.2749894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00042102605156695434, + "loss": 3.373, + "step": 6531 + }, + { + "epoch": 0.2750315789473684, + "grad_norm": 0.462890625, + "learning_rate": 0.0004210014374600931, + "loss": 3.4307, + "step": 6532 + }, + { + "epoch": 0.2750736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.0004209768202377866, + "loss": 2.9288, + "step": 6533 + }, + { + "epoch": 0.2751157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.00042095219990048326, + "loss": 3.4639, + "step": 6534 + }, + { + "epoch": 0.2751578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00042092757644863175, + "loss": 3.4761, + "step": 6535 + }, + { + "epoch": 0.2752, + "grad_norm": 0.416015625, + "learning_rate": 0.0004209029498826806, + "loss": 2.8818, + "step": 6536 + }, + { + "epoch": 0.2752421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0004208783202030786, + "loss": 3.2147, + "step": 6537 + }, + { + "epoch": 0.2752842105263158, + "grad_norm": 0.392578125, + "learning_rate": 0.0004208536874102743, + "loss": 3.3226, + "step": 6538 + }, + { + "epoch": 0.2753263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.0004208290515047166, + "loss": 3.3759, + "step": 6539 + }, + { + "epoch": 0.2753684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0004208044124868543, + "loss": 2.9062, + "step": 6540 + }, + { + "epoch": 0.2754105263157895, + "grad_norm": 0.376953125, + "learning_rate": 0.0004207797703571363, + "loss": 3.7027, + "step": 6541 + }, + { + "epoch": 0.2754526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00042075512511601144, + "loss": 3.3396, + "step": 6542 + }, + { + "epoch": 0.27549473684210524, + "grad_norm": 0.439453125, + "learning_rate": 0.000420730476763929, + "loss": 2.8917, + "step": 6543 + }, + { + "epoch": 0.27553684210526314, + "grad_norm": 0.427734375, + "learning_rate": 0.00042070582530133775, + "loss": 3.5631, + "step": 6544 + }, + { + "epoch": 0.27557894736842103, + "grad_norm": 0.443359375, + "learning_rate": 0.00042068117072868695, + "loss": 2.7966, + "step": 6545 + }, + { + "epoch": 0.27562105263157893, + "grad_norm": 0.400390625, + "learning_rate": 0.0004206565130464257, + "loss": 2.8822, + "step": 6546 + }, + { + "epoch": 0.27566315789473683, + "grad_norm": 0.39453125, + "learning_rate": 0.00042063185225500336, + "loss": 2.9495, + "step": 6547 + }, + { + "epoch": 0.27570526315789473, + "grad_norm": 0.423828125, + "learning_rate": 0.00042060718835486917, + "loss": 3.3082, + "step": 6548 + }, + { + "epoch": 0.27574736842105263, + "grad_norm": 0.376953125, + "learning_rate": 0.00042058252134647247, + "loss": 3.1263, + "step": 6549 + }, + { + "epoch": 0.27578947368421053, + "grad_norm": 0.404296875, + "learning_rate": 0.0004205578512302626, + "loss": 3.4752, + "step": 6550 + }, + { + "epoch": 0.2758315789473684, + "grad_norm": 0.404296875, + "learning_rate": 0.00042053317800668915, + "loss": 2.8122, + "step": 6551 + }, + { + "epoch": 0.2758736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.0004205085016762016, + "loss": 2.996, + "step": 6552 + }, + { + "epoch": 0.2759157894736842, + "grad_norm": 0.453125, + "learning_rate": 0.00042048382223924943, + "loss": 3.4017, + "step": 6553 + }, + { + "epoch": 0.2759578947368421, + "grad_norm": 0.388671875, + "learning_rate": 0.00042045913969628225, + "loss": 3.131, + "step": 6554 + }, + { + "epoch": 0.276, + "grad_norm": 0.408203125, + "learning_rate": 0.0004204344540477499, + "loss": 3.304, + "step": 6555 + }, + { + "epoch": 0.2760421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.000420409765294102, + "loss": 3.2075, + "step": 6556 + }, + { + "epoch": 0.27608421052631577, + "grad_norm": 0.447265625, + "learning_rate": 0.0004203850734357885, + "loss": 2.8884, + "step": 6557 + }, + { + "epoch": 0.27612631578947366, + "grad_norm": 0.4453125, + "learning_rate": 0.00042036037847325913, + "loss": 3.4836, + "step": 6558 + }, + { + "epoch": 0.27616842105263156, + "grad_norm": 0.41015625, + "learning_rate": 0.00042033568040696377, + "loss": 3.1663, + "step": 6559 + }, + { + "epoch": 0.27621052631578946, + "grad_norm": 0.41796875, + "learning_rate": 0.0004203109792373525, + "loss": 3.242, + "step": 6560 + }, + { + "epoch": 0.27625263157894736, + "grad_norm": 0.40234375, + "learning_rate": 0.00042028627496487527, + "loss": 3.6265, + "step": 6561 + }, + { + "epoch": 0.27629473684210526, + "grad_norm": 0.40625, + "learning_rate": 0.00042026156758998213, + "loss": 2.8325, + "step": 6562 + }, + { + "epoch": 0.27633684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.00042023685711312336, + "loss": 3.2276, + "step": 6563 + }, + { + "epoch": 0.27637894736842106, + "grad_norm": 0.455078125, + "learning_rate": 0.0004202121435347491, + "loss": 2.9697, + "step": 6564 + }, + { + "epoch": 0.27642105263157896, + "grad_norm": 0.421875, + "learning_rate": 0.00042018742685530954, + "loss": 3.2601, + "step": 6565 + }, + { + "epoch": 0.27646315789473686, + "grad_norm": 0.412109375, + "learning_rate": 0.000420162707075255, + "loss": 3.1864, + "step": 6566 + }, + { + "epoch": 0.27650526315789475, + "grad_norm": 0.427734375, + "learning_rate": 0.0004201379841950359, + "loss": 3.1265, + "step": 6567 + }, + { + "epoch": 0.27654736842105265, + "grad_norm": 0.36328125, + "learning_rate": 0.0004201132582151026, + "loss": 3.1121, + "step": 6568 + }, + { + "epoch": 0.27658947368421055, + "grad_norm": 0.400390625, + "learning_rate": 0.0004200885291359056, + "loss": 2.7161, + "step": 6569 + }, + { + "epoch": 0.2766315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.0004200637969578955, + "loss": 3.0314, + "step": 6570 + }, + { + "epoch": 0.2766736842105263, + "grad_norm": 0.376953125, + "learning_rate": 0.0004200390616815228, + "loss": 3.2215, + "step": 6571 + }, + { + "epoch": 0.2767157894736842, + "grad_norm": 0.3671875, + "learning_rate": 0.0004200143233072382, + "loss": 3.1522, + "step": 6572 + }, + { + "epoch": 0.2767578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0004199895818354924, + "loss": 3.1174, + "step": 6573 + }, + { + "epoch": 0.2768, + "grad_norm": 0.427734375, + "learning_rate": 0.00041996483726673613, + "loss": 3.0369, + "step": 6574 + }, + { + "epoch": 0.2768421052631579, + "grad_norm": 0.455078125, + "learning_rate": 0.00041994008960142014, + "loss": 3.1888, + "step": 6575 + }, + { + "epoch": 0.2768842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.00041991533883999554, + "loss": 3.5151, + "step": 6576 + }, + { + "epoch": 0.2769263157894737, + "grad_norm": 0.36328125, + "learning_rate": 0.000419890584982913, + "loss": 2.7248, + "step": 6577 + }, + { + "epoch": 0.2769684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.0004198658280306237, + "loss": 3.175, + "step": 6578 + }, + { + "epoch": 0.2770105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.00041984106798357857, + "loss": 3.4148, + "step": 6579 + }, + { + "epoch": 0.2770526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.0004198163048422287, + "loss": 3.0725, + "step": 6580 + }, + { + "epoch": 0.2770947368421053, + "grad_norm": 0.3984375, + "learning_rate": 0.00041979153860702533, + "loss": 3.5595, + "step": 6581 + }, + { + "epoch": 0.2771368421052632, + "grad_norm": 0.396484375, + "learning_rate": 0.0004197667692784196, + "loss": 2.9202, + "step": 6582 + }, + { + "epoch": 0.2771789473684211, + "grad_norm": 0.431640625, + "learning_rate": 0.00041974199685686274, + "loss": 3.2668, + "step": 6583 + }, + { + "epoch": 0.2772210526315789, + "grad_norm": 0.416015625, + "learning_rate": 0.00041971722134280623, + "loss": 3.5529, + "step": 6584 + }, + { + "epoch": 0.2772631578947368, + "grad_norm": 0.43359375, + "learning_rate": 0.0004196924427367013, + "loss": 3.6094, + "step": 6585 + }, + { + "epoch": 0.2773052631578947, + "grad_norm": 0.3984375, + "learning_rate": 0.00041966766103899945, + "loss": 3.4502, + "step": 6586 + }, + { + "epoch": 0.2773473684210526, + "grad_norm": 0.412109375, + "learning_rate": 0.0004196428762501522, + "loss": 3.1369, + "step": 6587 + }, + { + "epoch": 0.2773894736842105, + "grad_norm": 0.447265625, + "learning_rate": 0.00041961808837061103, + "loss": 3.4338, + "step": 6588 + }, + { + "epoch": 0.2774315789473684, + "grad_norm": 0.404296875, + "learning_rate": 0.00041959329740082766, + "loss": 3.1948, + "step": 6589 + }, + { + "epoch": 0.2774736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.0004195685033412536, + "loss": 3.252, + "step": 6590 + }, + { + "epoch": 0.2775157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.00041954370619234064, + "loss": 3.2458, + "step": 6591 + }, + { + "epoch": 0.2775578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.00041951890595454057, + "loss": 3.4346, + "step": 6592 + }, + { + "epoch": 0.2776, + "grad_norm": 0.419921875, + "learning_rate": 0.00041949410262830523, + "loss": 3.5173, + "step": 6593 + }, + { + "epoch": 0.2776421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.00041946929621408647, + "loss": 3.2384, + "step": 6594 + }, + { + "epoch": 0.2776842105263158, + "grad_norm": 0.53515625, + "learning_rate": 0.0004194444867123363, + "loss": 3.1336, + "step": 6595 + }, + { + "epoch": 0.2777263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00041941967412350656, + "loss": 3.3995, + "step": 6596 + }, + { + "epoch": 0.27776842105263155, + "grad_norm": 0.416015625, + "learning_rate": 0.0004193948584480495, + "loss": 3.5149, + "step": 6597 + }, + { + "epoch": 0.27781052631578945, + "grad_norm": 0.416015625, + "learning_rate": 0.0004193700396864172, + "loss": 3.7719, + "step": 6598 + }, + { + "epoch": 0.27785263157894735, + "grad_norm": 0.41015625, + "learning_rate": 0.0004193452178390617, + "loss": 3.4228, + "step": 6599 + }, + { + "epoch": 0.27789473684210525, + "grad_norm": 0.4140625, + "learning_rate": 0.0004193203929064353, + "loss": 2.8364, + "step": 6600 + }, + { + "epoch": 0.27793684210526315, + "grad_norm": 0.5, + "learning_rate": 0.00041929556488899027, + "loss": 2.9872, + "step": 6601 + }, + { + "epoch": 0.27797894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.00041927073378717893, + "loss": 3.131, + "step": 6602 + }, + { + "epoch": 0.27802105263157895, + "grad_norm": 0.453125, + "learning_rate": 0.0004192458996014539, + "loss": 3.5979, + "step": 6603 + }, + { + "epoch": 0.27806315789473685, + "grad_norm": 0.41015625, + "learning_rate": 0.0004192210623322673, + "loss": 3.0426, + "step": 6604 + }, + { + "epoch": 0.27810526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00041919622198007174, + "loss": 3.1596, + "step": 6605 + }, + { + "epoch": 0.27814736842105264, + "grad_norm": 0.404296875, + "learning_rate": 0.00041917137854531985, + "loss": 3.474, + "step": 6606 + }, + { + "epoch": 0.27818947368421054, + "grad_norm": 0.447265625, + "learning_rate": 0.0004191465320284642, + "loss": 3.2664, + "step": 6607 + }, + { + "epoch": 0.27823157894736844, + "grad_norm": 0.466796875, + "learning_rate": 0.00041912168242995754, + "loss": 3.2065, + "step": 6608 + }, + { + "epoch": 0.27827368421052634, + "grad_norm": 0.490234375, + "learning_rate": 0.00041909682975025243, + "loss": 3.314, + "step": 6609 + }, + { + "epoch": 0.2783157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.0004190719739898018, + "loss": 3.0488, + "step": 6610 + }, + { + "epoch": 0.2783578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00041904711514905845, + "loss": 3.3811, + "step": 6611 + }, + { + "epoch": 0.2784, + "grad_norm": 0.390625, + "learning_rate": 0.0004190222532284753, + "loss": 3.0741, + "step": 6612 + }, + { + "epoch": 0.2784421052631579, + "grad_norm": 0.458984375, + "learning_rate": 0.0004189973882285053, + "loss": 2.911, + "step": 6613 + }, + { + "epoch": 0.2784842105263158, + "grad_norm": 0.396484375, + "learning_rate": 0.0004189725201496015, + "loss": 2.9579, + "step": 6614 + }, + { + "epoch": 0.2785263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.0004189476489922168, + "loss": 3.0509, + "step": 6615 + }, + { + "epoch": 0.2785684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.0004189227747568045, + "loss": 3.468, + "step": 6616 + }, + { + "epoch": 0.2786105263157895, + "grad_norm": 0.388671875, + "learning_rate": 0.0004188978974438177, + "loss": 3.5568, + "step": 6617 + }, + { + "epoch": 0.2786526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.00041887301705370973, + "loss": 3.4185, + "step": 6618 + }, + { + "epoch": 0.2786947368421053, + "grad_norm": 0.45703125, + "learning_rate": 0.00041884813358693376, + "loss": 3.3872, + "step": 6619 + }, + { + "epoch": 0.27873684210526317, + "grad_norm": 0.416015625, + "learning_rate": 0.00041882324704394316, + "loss": 3.4621, + "step": 6620 + }, + { + "epoch": 0.27877894736842107, + "grad_norm": 0.416015625, + "learning_rate": 0.00041879835742519134, + "loss": 2.8777, + "step": 6621 + }, + { + "epoch": 0.27882105263157897, + "grad_norm": 0.40234375, + "learning_rate": 0.0004187734647311319, + "loss": 3.6372, + "step": 6622 + }, + { + "epoch": 0.27886315789473687, + "grad_norm": 0.388671875, + "learning_rate": 0.0004187485689622181, + "loss": 3.2462, + "step": 6623 + }, + { + "epoch": 0.2789052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.00041872367011890364, + "loss": 3.2563, + "step": 6624 + }, + { + "epoch": 0.2789473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.0004186987682016422, + "loss": 3.0996, + "step": 6625 + }, + { + "epoch": 0.2789894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.0004186738632108875, + "loss": 3.1638, + "step": 6626 + }, + { + "epoch": 0.2790315789473684, + "grad_norm": 0.39453125, + "learning_rate": 0.0004186489551470931, + "loss": 2.9084, + "step": 6627 + }, + { + "epoch": 0.2790736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.0004186240440107129, + "loss": 3.2046, + "step": 6628 + }, + { + "epoch": 0.2791157894736842, + "grad_norm": 0.392578125, + "learning_rate": 0.0004185991298022007, + "loss": 3.2739, + "step": 6629 + }, + { + "epoch": 0.2791578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0004185742125220106, + "loss": 3.1638, + "step": 6630 + }, + { + "epoch": 0.2792, + "grad_norm": 0.447265625, + "learning_rate": 0.00041854929217059625, + "loss": 3.2391, + "step": 6631 + }, + { + "epoch": 0.2792421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00041852436874841194, + "loss": 3.1495, + "step": 6632 + }, + { + "epoch": 0.2792842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0004184994422559116, + "loss": 3.1134, + "step": 6633 + }, + { + "epoch": 0.2793263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00041847451269354935, + "loss": 3.655, + "step": 6634 + }, + { + "epoch": 0.2793684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.00041844958006177947, + "loss": 3.4538, + "step": 6635 + }, + { + "epoch": 0.2794105263157895, + "grad_norm": 0.38671875, + "learning_rate": 0.0004184246443610562, + "loss": 3.3199, + "step": 6636 + }, + { + "epoch": 0.27945263157894734, + "grad_norm": 0.435546875, + "learning_rate": 0.00041839970559183373, + "loss": 3.0707, + "step": 6637 + }, + { + "epoch": 0.27949473684210524, + "grad_norm": 0.80859375, + "learning_rate": 0.0004183747637545666, + "loss": 2.748, + "step": 6638 + }, + { + "epoch": 0.27953684210526314, + "grad_norm": 0.4375, + "learning_rate": 0.000418349818849709, + "loss": 3.04, + "step": 6639 + }, + { + "epoch": 0.27957894736842104, + "grad_norm": 0.427734375, + "learning_rate": 0.0004183248708777155, + "loss": 3.2245, + "step": 6640 + }, + { + "epoch": 0.27962105263157894, + "grad_norm": 0.42578125, + "learning_rate": 0.0004182999198390407, + "loss": 3.211, + "step": 6641 + }, + { + "epoch": 0.27966315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.00041827496573413903, + "loss": 3.2164, + "step": 6642 + }, + { + "epoch": 0.27970526315789473, + "grad_norm": 0.451171875, + "learning_rate": 0.00041825000856346526, + "loss": 3.3138, + "step": 6643 + }, + { + "epoch": 0.27974736842105263, + "grad_norm": 0.412109375, + "learning_rate": 0.000418225048327474, + "loss": 3.0779, + "step": 6644 + }, + { + "epoch": 0.27978947368421053, + "grad_norm": 0.4140625, + "learning_rate": 0.00041820008502662, + "loss": 3.2993, + "step": 6645 + }, + { + "epoch": 0.27983157894736843, + "grad_norm": 0.404296875, + "learning_rate": 0.0004181751186613581, + "loss": 3.5527, + "step": 6646 + }, + { + "epoch": 0.27987368421052633, + "grad_norm": 0.37890625, + "learning_rate": 0.00041815014923214314, + "loss": 3.0518, + "step": 6647 + }, + { + "epoch": 0.27991578947368423, + "grad_norm": 0.4296875, + "learning_rate": 0.00041812517673942993, + "loss": 3.2773, + "step": 6648 + }, + { + "epoch": 0.27995789473684213, + "grad_norm": 0.431640625, + "learning_rate": 0.00041810020118367364, + "loss": 3.6576, + "step": 6649 + }, + { + "epoch": 0.28, + "grad_norm": 0.416015625, + "learning_rate": 0.0004180752225653292, + "loss": 3.8121, + "step": 6650 + }, + { + "epoch": 0.28004210526315787, + "grad_norm": 0.416015625, + "learning_rate": 0.00041805024088485167, + "loss": 3.587, + "step": 6651 + }, + { + "epoch": 0.28008421052631577, + "grad_norm": 0.423828125, + "learning_rate": 0.0004180252561426962, + "loss": 3.6611, + "step": 6652 + }, + { + "epoch": 0.28012631578947367, + "grad_norm": 0.41796875, + "learning_rate": 0.00041800026833931797, + "loss": 3.2335, + "step": 6653 + }, + { + "epoch": 0.28016842105263157, + "grad_norm": 0.458984375, + "learning_rate": 0.0004179752774751723, + "loss": 3.3893, + "step": 6654 + }, + { + "epoch": 0.28021052631578947, + "grad_norm": 0.412109375, + "learning_rate": 0.00041795028355071437, + "loss": 3.2301, + "step": 6655 + }, + { + "epoch": 0.28025263157894736, + "grad_norm": 0.42578125, + "learning_rate": 0.0004179252865663996, + "loss": 3.3729, + "step": 6656 + }, + { + "epoch": 0.28029473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.0004179002865226834, + "loss": 2.9862, + "step": 6657 + }, + { + "epoch": 0.28033684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.00041787528342002135, + "loss": 3.2229, + "step": 6658 + }, + { + "epoch": 0.28037894736842106, + "grad_norm": 0.416015625, + "learning_rate": 0.0004178502772588688, + "loss": 3.3188, + "step": 6659 + }, + { + "epoch": 0.28042105263157896, + "grad_norm": 0.419921875, + "learning_rate": 0.00041782526803968147, + "loss": 3.4063, + "step": 6660 + }, + { + "epoch": 0.28046315789473686, + "grad_norm": 0.40625, + "learning_rate": 0.00041780025576291493, + "loss": 3.6166, + "step": 6661 + }, + { + "epoch": 0.28050526315789476, + "grad_norm": 0.4140625, + "learning_rate": 0.00041777524042902484, + "loss": 3.4047, + "step": 6662 + }, + { + "epoch": 0.28054736842105266, + "grad_norm": 0.44921875, + "learning_rate": 0.00041775022203846703, + "loss": 3.3336, + "step": 6663 + }, + { + "epoch": 0.2805894736842105, + "grad_norm": 0.58984375, + "learning_rate": 0.0004177252005916972, + "loss": 3.5159, + "step": 6664 + }, + { + "epoch": 0.2806315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.0004177001760891714, + "loss": 3.2001, + "step": 6665 + }, + { + "epoch": 0.2806736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.0004176751485313454, + "loss": 3.1684, + "step": 6666 + }, + { + "epoch": 0.2807157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0004176501179186752, + "loss": 3.5735, + "step": 6667 + }, + { + "epoch": 0.2807578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00041762508425161683, + "loss": 3.0811, + "step": 6668 + }, + { + "epoch": 0.2808, + "grad_norm": 0.40625, + "learning_rate": 0.0004176000475306263, + "loss": 3.5004, + "step": 6669 + }, + { + "epoch": 0.2808421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.0004175750077561599, + "loss": 3.5581, + "step": 6670 + }, + { + "epoch": 0.2808842105263158, + "grad_norm": 0.384765625, + "learning_rate": 0.00041754996492867377, + "loss": 3.0918, + "step": 6671 + }, + { + "epoch": 0.2809263157894737, + "grad_norm": 0.80859375, + "learning_rate": 0.0004175249190486241, + "loss": 3.0459, + "step": 6672 + }, + { + "epoch": 0.2809684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.00041749987011646726, + "loss": 3.1027, + "step": 6673 + }, + { + "epoch": 0.2810105263157895, + "grad_norm": 0.376953125, + "learning_rate": 0.00041747481813265964, + "loss": 3.3682, + "step": 6674 + }, + { + "epoch": 0.2810526315789474, + "grad_norm": 0.4453125, + "learning_rate": 0.0004174497630976575, + "loss": 3.2241, + "step": 6675 + }, + { + "epoch": 0.2810947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.0004174247050119175, + "loss": 3.2557, + "step": 6676 + }, + { + "epoch": 0.2811368421052632, + "grad_norm": 0.42578125, + "learning_rate": 0.000417399643875896, + "loss": 3.0142, + "step": 6677 + }, + { + "epoch": 0.28117894736842103, + "grad_norm": 0.423828125, + "learning_rate": 0.0004173745796900498, + "loss": 3.4463, + "step": 6678 + }, + { + "epoch": 0.2812210526315789, + "grad_norm": 0.388671875, + "learning_rate": 0.0004173495124548353, + "loss": 3.312, + "step": 6679 + }, + { + "epoch": 0.2812631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.0004173244421707094, + "loss": 3.6851, + "step": 6680 + }, + { + "epoch": 0.2813052631578947, + "grad_norm": 0.40234375, + "learning_rate": 0.00041729936883812876, + "loss": 2.7317, + "step": 6681 + }, + { + "epoch": 0.2813473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.0004172742924575501, + "loss": 3.4062, + "step": 6682 + }, + { + "epoch": 0.2813894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00041724921302943043, + "loss": 3.6427, + "step": 6683 + }, + { + "epoch": 0.2814315789473684, + "grad_norm": 0.404296875, + "learning_rate": 0.00041722413055422667, + "loss": 3.5665, + "step": 6684 + }, + { + "epoch": 0.2814736842105263, + "grad_norm": 0.412109375, + "learning_rate": 0.0004171990450323956, + "loss": 3.418, + "step": 6685 + }, + { + "epoch": 0.2815157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.0004171739564643945, + "loss": 3.4383, + "step": 6686 + }, + { + "epoch": 0.2815578947368421, + "grad_norm": 0.48046875, + "learning_rate": 0.00041714886485068027, + "loss": 3.4488, + "step": 6687 + }, + { + "epoch": 0.2816, + "grad_norm": 0.40625, + "learning_rate": 0.0004171237701917101, + "loss": 3.001, + "step": 6688 + }, + { + "epoch": 0.2816421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0004170986724879413, + "loss": 3.5818, + "step": 6689 + }, + { + "epoch": 0.2816842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00041707357173983084, + "loss": 3.2684, + "step": 6690 + }, + { + "epoch": 0.28172631578947366, + "grad_norm": 0.416015625, + "learning_rate": 0.00041704846794783635, + "loss": 3.2806, + "step": 6691 + }, + { + "epoch": 0.28176842105263156, + "grad_norm": 0.427734375, + "learning_rate": 0.000417023361112415, + "loss": 3.0444, + "step": 6692 + }, + { + "epoch": 0.28181052631578946, + "grad_norm": 0.447265625, + "learning_rate": 0.0004169982512340242, + "loss": 3.3491, + "step": 6693 + }, + { + "epoch": 0.28185263157894735, + "grad_norm": 0.447265625, + "learning_rate": 0.00041697313831312165, + "loss": 3.3791, + "step": 6694 + }, + { + "epoch": 0.28189473684210525, + "grad_norm": 0.396484375, + "learning_rate": 0.0004169480223501646, + "loss": 3.1435, + "step": 6695 + }, + { + "epoch": 0.28193684210526315, + "grad_norm": 0.423828125, + "learning_rate": 0.0004169229033456107, + "loss": 3.6401, + "step": 6696 + }, + { + "epoch": 0.28197894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.00041689778129991767, + "loss": 3.3951, + "step": 6697 + }, + { + "epoch": 0.28202105263157895, + "grad_norm": 0.408203125, + "learning_rate": 0.00041687265621354313, + "loss": 3.2654, + "step": 6698 + }, + { + "epoch": 0.28206315789473685, + "grad_norm": 0.4140625, + "learning_rate": 0.0004168475280869449, + "loss": 3.5863, + "step": 6699 + }, + { + "epoch": 0.28210526315789475, + "grad_norm": 0.44921875, + "learning_rate": 0.0004168223969205808, + "loss": 2.6782, + "step": 6700 + }, + { + "epoch": 0.28214736842105265, + "grad_norm": 0.50390625, + "learning_rate": 0.00041679726271490853, + "loss": 3.1562, + "step": 6701 + }, + { + "epoch": 0.28218947368421055, + "grad_norm": 0.396484375, + "learning_rate": 0.00041677212547038613, + "loss": 3.4072, + "step": 6702 + }, + { + "epoch": 0.28223157894736844, + "grad_norm": 0.404296875, + "learning_rate": 0.00041674698518747154, + "loss": 2.9622, + "step": 6703 + }, + { + "epoch": 0.28227368421052634, + "grad_norm": 0.427734375, + "learning_rate": 0.00041672184186662285, + "loss": 3.2485, + "step": 6704 + }, + { + "epoch": 0.2823157894736842, + "grad_norm": 0.3828125, + "learning_rate": 0.0004166966955082981, + "loss": 3.4762, + "step": 6705 + }, + { + "epoch": 0.2823578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0004166715461129554, + "loss": 3.064, + "step": 6706 + }, + { + "epoch": 0.2824, + "grad_norm": 0.40234375, + "learning_rate": 0.00041664639368105293, + "loss": 3.3304, + "step": 6707 + }, + { + "epoch": 0.2824421052631579, + "grad_norm": 0.400390625, + "learning_rate": 0.00041662123821304905, + "loss": 3.1445, + "step": 6708 + }, + { + "epoch": 0.2824842105263158, + "grad_norm": 0.4140625, + "learning_rate": 0.0004165960797094019, + "loss": 2.8116, + "step": 6709 + }, + { + "epoch": 0.2825263157894737, + "grad_norm": 0.39453125, + "learning_rate": 0.0004165709181705699, + "loss": 3.5259, + "step": 6710 + }, + { + "epoch": 0.2825684210526316, + "grad_norm": 0.40234375, + "learning_rate": 0.0004165457535970115, + "loss": 3.1078, + "step": 6711 + }, + { + "epoch": 0.2826105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.00041652058598918527, + "loss": 3.404, + "step": 6712 + }, + { + "epoch": 0.2826526315789474, + "grad_norm": 0.408203125, + "learning_rate": 0.00041649541534754956, + "loss": 3.4321, + "step": 6713 + }, + { + "epoch": 0.2826947368421053, + "grad_norm": 0.4453125, + "learning_rate": 0.0004164702416725629, + "loss": 3.1983, + "step": 6714 + }, + { + "epoch": 0.2827368421052632, + "grad_norm": 0.443359375, + "learning_rate": 0.0004164450649646842, + "loss": 3.6488, + "step": 6715 + }, + { + "epoch": 0.2827789473684211, + "grad_norm": 0.388671875, + "learning_rate": 0.0004164198852243718, + "loss": 3.1648, + "step": 6716 + }, + { + "epoch": 0.282821052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00041639470245208467, + "loss": 3.7616, + "step": 6717 + }, + { + "epoch": 0.2828631578947368, + "grad_norm": 0.41015625, + "learning_rate": 0.00041636951664828165, + "loss": 3.5557, + "step": 6718 + }, + { + "epoch": 0.2829052631578947, + "grad_norm": 0.412109375, + "learning_rate": 0.00041634432781342145, + "loss": 3.2865, + "step": 6719 + }, + { + "epoch": 0.2829473684210526, + "grad_norm": 0.37890625, + "learning_rate": 0.00041631913594796313, + "loss": 3.4424, + "step": 6720 + }, + { + "epoch": 0.2829894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.0004162939410523655, + "loss": 3.6272, + "step": 6721 + }, + { + "epoch": 0.2830315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.0004162687431270876, + "loss": 2.9641, + "step": 6722 + }, + { + "epoch": 0.2830736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0004162435421725886, + "loss": 3.3319, + "step": 6723 + }, + { + "epoch": 0.2831157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00041621833818932766, + "loss": 3.195, + "step": 6724 + }, + { + "epoch": 0.2831578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00041619313117776383, + "loss": 3.195, + "step": 6725 + }, + { + "epoch": 0.2832, + "grad_norm": 0.41796875, + "learning_rate": 0.00041616792113835646, + "loss": 2.6857, + "step": 6726 + }, + { + "epoch": 0.2832421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00041614270807156475, + "loss": 3.6809, + "step": 6727 + }, + { + "epoch": 0.2832842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.00041611749197784814, + "loss": 3.3442, + "step": 6728 + }, + { + "epoch": 0.2833263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00041609227285766605, + "loss": 3.6784, + "step": 6729 + }, + { + "epoch": 0.2833684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.00041606705071147777, + "loss": 3.2473, + "step": 6730 + }, + { + "epoch": 0.28341052631578945, + "grad_norm": 0.435546875, + "learning_rate": 0.00041604182553974303, + "loss": 3.4388, + "step": 6731 + }, + { + "epoch": 0.28345263157894734, + "grad_norm": 0.42578125, + "learning_rate": 0.0004160165973429213, + "loss": 3.1442, + "step": 6732 + }, + { + "epoch": 0.28349473684210524, + "grad_norm": 0.3984375, + "learning_rate": 0.0004159913661214723, + "loss": 3.254, + "step": 6733 + }, + { + "epoch": 0.28353684210526314, + "grad_norm": 0.419921875, + "learning_rate": 0.0004159661318758556, + "loss": 3.3356, + "step": 6734 + }, + { + "epoch": 0.28357894736842104, + "grad_norm": 0.42578125, + "learning_rate": 0.00041594089460653095, + "loss": 3.3217, + "step": 6735 + }, + { + "epoch": 0.28362105263157894, + "grad_norm": 0.4296875, + "learning_rate": 0.00041591565431395815, + "loss": 3.5002, + "step": 6736 + }, + { + "epoch": 0.28366315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00041589041099859717, + "loss": 3.1628, + "step": 6737 + }, + { + "epoch": 0.28370526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.00041586516466090776, + "loss": 2.7701, + "step": 6738 + }, + { + "epoch": 0.28374736842105264, + "grad_norm": 0.419921875, + "learning_rate": 0.0004158399153013499, + "loss": 3.1499, + "step": 6739 + }, + { + "epoch": 0.28378947368421054, + "grad_norm": 0.404296875, + "learning_rate": 0.0004158146629203836, + "loss": 3.3434, + "step": 6740 + }, + { + "epoch": 0.28383157894736843, + "grad_norm": 0.3984375, + "learning_rate": 0.00041578940751846907, + "loss": 3.0107, + "step": 6741 + }, + { + "epoch": 0.28387368421052633, + "grad_norm": 0.416015625, + "learning_rate": 0.0004157641490960663, + "loss": 3.0127, + "step": 6742 + }, + { + "epoch": 0.28391578947368423, + "grad_norm": 0.427734375, + "learning_rate": 0.0004157388876536355, + "loss": 3.1251, + "step": 6743 + }, + { + "epoch": 0.28395789473684213, + "grad_norm": 0.431640625, + "learning_rate": 0.00041571362319163687, + "loss": 3.0175, + "step": 6744 + }, + { + "epoch": 0.284, + "grad_norm": 0.43359375, + "learning_rate": 0.00041568835571053074, + "loss": 3.3101, + "step": 6745 + }, + { + "epoch": 0.2840421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00041566308521077743, + "loss": 3.0066, + "step": 6746 + }, + { + "epoch": 0.2840842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.0004156378116928374, + "loss": 3.0327, + "step": 6747 + }, + { + "epoch": 0.28412631578947367, + "grad_norm": 0.3984375, + "learning_rate": 0.00041561253515717106, + "loss": 3.6476, + "step": 6748 + }, + { + "epoch": 0.28416842105263157, + "grad_norm": 0.390625, + "learning_rate": 0.0004155872556042389, + "loss": 3.353, + "step": 6749 + }, + { + "epoch": 0.28421052631578947, + "grad_norm": 0.3984375, + "learning_rate": 0.0004155619730345015, + "loss": 3.1492, + "step": 6750 + }, + { + "epoch": 0.28425263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.0004155366874484194, + "loss": 3.4902, + "step": 6751 + }, + { + "epoch": 0.28429473684210527, + "grad_norm": 0.41015625, + "learning_rate": 0.0004155113988464535, + "loss": 3.7805, + "step": 6752 + }, + { + "epoch": 0.28433684210526317, + "grad_norm": 0.462890625, + "learning_rate": 0.00041548610722906435, + "loss": 3.4252, + "step": 6753 + }, + { + "epoch": 0.28437894736842106, + "grad_norm": 0.419921875, + "learning_rate": 0.00041546081259671267, + "loss": 3.1236, + "step": 6754 + }, + { + "epoch": 0.28442105263157896, + "grad_norm": 0.4140625, + "learning_rate": 0.0004154355149498594, + "loss": 2.9728, + "step": 6755 + }, + { + "epoch": 0.28446315789473686, + "grad_norm": 0.4140625, + "learning_rate": 0.00041541021428896554, + "loss": 3.3853, + "step": 6756 + }, + { + "epoch": 0.28450526315789476, + "grad_norm": 0.48046875, + "learning_rate": 0.0004153849106144918, + "loss": 3.0118, + "step": 6757 + }, + { + "epoch": 0.2845473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.00041535960392689934, + "loss": 2.996, + "step": 6758 + }, + { + "epoch": 0.2845894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00041533429422664923, + "loss": 2.8705, + "step": 6759 + }, + { + "epoch": 0.2846315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.00041530898151420246, + "loss": 3.3637, + "step": 6760 + }, + { + "epoch": 0.2846736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.0004152836657900204, + "loss": 3.3062, + "step": 6761 + }, + { + "epoch": 0.2847157894736842, + "grad_norm": 0.392578125, + "learning_rate": 0.0004152583470545641, + "loss": 3.4685, + "step": 6762 + }, + { + "epoch": 0.2847578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.0004152330253082949, + "loss": 3.2851, + "step": 6763 + }, + { + "epoch": 0.2848, + "grad_norm": 0.400390625, + "learning_rate": 0.00041520770055167403, + "loss": 3.2389, + "step": 6764 + }, + { + "epoch": 0.2848421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00041518237278516303, + "loss": 3.4337, + "step": 6765 + }, + { + "epoch": 0.2848842105263158, + "grad_norm": 0.37890625, + "learning_rate": 0.00041515704200922334, + "loss": 2.9614, + "step": 6766 + }, + { + "epoch": 0.2849263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.0004151317082243163, + "loss": 3.1639, + "step": 6767 + }, + { + "epoch": 0.2849684210526316, + "grad_norm": 0.4453125, + "learning_rate": 0.00041510637143090355, + "loss": 2.8583, + "step": 6768 + }, + { + "epoch": 0.2850105263157895, + "grad_norm": 0.408203125, + "learning_rate": 0.00041508103162944676, + "loss": 3.576, + "step": 6769 + }, + { + "epoch": 0.2850526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.0004150556888204075, + "loss": 3.0098, + "step": 6770 + }, + { + "epoch": 0.2850947368421053, + "grad_norm": 0.388671875, + "learning_rate": 0.00041503034300424755, + "loss": 3.3303, + "step": 6771 + }, + { + "epoch": 0.28513684210526313, + "grad_norm": 0.42578125, + "learning_rate": 0.00041500499418142866, + "loss": 3.5926, + "step": 6772 + }, + { + "epoch": 0.28517894736842103, + "grad_norm": 0.416015625, + "learning_rate": 0.00041497964235241263, + "loss": 3.2274, + "step": 6773 + }, + { + "epoch": 0.28522105263157893, + "grad_norm": 0.40625, + "learning_rate": 0.0004149542875176614, + "loss": 3.4937, + "step": 6774 + }, + { + "epoch": 0.28526315789473683, + "grad_norm": 0.43359375, + "learning_rate": 0.00041492892967763686, + "loss": 3.526, + "step": 6775 + }, + { + "epoch": 0.28530526315789473, + "grad_norm": 0.421875, + "learning_rate": 0.0004149035688328009, + "loss": 3.3338, + "step": 6776 + }, + { + "epoch": 0.2853473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.00041487820498361573, + "loss": 3.4505, + "step": 6777 + }, + { + "epoch": 0.2853894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00041485283813054343, + "loss": 3.1628, + "step": 6778 + }, + { + "epoch": 0.2854315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00041482746827404605, + "loss": 3.3462, + "step": 6779 + }, + { + "epoch": 0.2854736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.00041480209541458593, + "loss": 3.436, + "step": 6780 + }, + { + "epoch": 0.2855157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.0004147767195526252, + "loss": 3.7156, + "step": 6781 + }, + { + "epoch": 0.2855578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.0004147513406886263, + "loss": 3.022, + "step": 6782 + }, + { + "epoch": 0.2856, + "grad_norm": 0.408203125, + "learning_rate": 0.0004147259588230515, + "loss": 3.1655, + "step": 6783 + }, + { + "epoch": 0.2856421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00041470057395636334, + "loss": 3.5808, + "step": 6784 + }, + { + "epoch": 0.28568421052631576, + "grad_norm": 0.46875, + "learning_rate": 0.0004146751860890242, + "loss": 3.3094, + "step": 6785 + }, + { + "epoch": 0.28572631578947366, + "grad_norm": 0.5625, + "learning_rate": 0.0004146497952214966, + "loss": 3.4299, + "step": 6786 + }, + { + "epoch": 0.28576842105263156, + "grad_norm": 0.40625, + "learning_rate": 0.0004146244013542433, + "loss": 3.2844, + "step": 6787 + }, + { + "epoch": 0.28581052631578946, + "grad_norm": 0.447265625, + "learning_rate": 0.0004145990044877268, + "loss": 3.4305, + "step": 6788 + }, + { + "epoch": 0.28585263157894736, + "grad_norm": 0.455078125, + "learning_rate": 0.00041457360462240976, + "loss": 3.4777, + "step": 6789 + }, + { + "epoch": 0.28589473684210526, + "grad_norm": 0.404296875, + "learning_rate": 0.0004145482017587551, + "loss": 3.348, + "step": 6790 + }, + { + "epoch": 0.28593684210526316, + "grad_norm": 0.400390625, + "learning_rate": 0.00041452279589722547, + "loss": 3.1765, + "step": 6791 + }, + { + "epoch": 0.28597894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.0004144973870382838, + "loss": 3.4061, + "step": 6792 + }, + { + "epoch": 0.28602105263157895, + "grad_norm": 0.49609375, + "learning_rate": 0.0004144719751823931, + "loss": 3.3084, + "step": 6793 + }, + { + "epoch": 0.28606315789473685, + "grad_norm": 0.404296875, + "learning_rate": 0.0004144465603300162, + "loss": 3.4772, + "step": 6794 + }, + { + "epoch": 0.28610526315789475, + "grad_norm": 0.44140625, + "learning_rate": 0.00041442114248161624, + "loss": 3.1667, + "step": 6795 + }, + { + "epoch": 0.28614736842105265, + "grad_norm": 0.41796875, + "learning_rate": 0.00041439572163765617, + "loss": 3.1921, + "step": 6796 + }, + { + "epoch": 0.28618947368421055, + "grad_norm": 0.4296875, + "learning_rate": 0.00041437029779859926, + "loss": 3.3992, + "step": 6797 + }, + { + "epoch": 0.28623157894736845, + "grad_norm": 0.416015625, + "learning_rate": 0.00041434487096490857, + "loss": 3.1245, + "step": 6798 + }, + { + "epoch": 0.2862736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.0004143194411370475, + "loss": 3.2488, + "step": 6799 + }, + { + "epoch": 0.2863157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0004142940083154793, + "loss": 3.2958, + "step": 6800 + }, + { + "epoch": 0.2863578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.0004142685725006673, + "loss": 2.9586, + "step": 6801 + }, + { + "epoch": 0.2864, + "grad_norm": 0.373046875, + "learning_rate": 0.00041424313369307485, + "loss": 3.1594, + "step": 6802 + }, + { + "epoch": 0.2864421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00041421769189316557, + "loss": 3.3788, + "step": 6803 + }, + { + "epoch": 0.2864842105263158, + "grad_norm": 0.40625, + "learning_rate": 0.0004141922471014028, + "loss": 2.9266, + "step": 6804 + }, + { + "epoch": 0.2865263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0004141667993182503, + "loss": 3.2032, + "step": 6805 + }, + { + "epoch": 0.2865684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.00041414134854417153, + "loss": 3.2129, + "step": 6806 + }, + { + "epoch": 0.2866105263157895, + "grad_norm": 0.474609375, + "learning_rate": 0.0004141158947796303, + "loss": 2.9108, + "step": 6807 + }, + { + "epoch": 0.2866526315789474, + "grad_norm": 0.39453125, + "learning_rate": 0.00041409043802509025, + "loss": 3.3863, + "step": 6808 + }, + { + "epoch": 0.2866947368421053, + "grad_norm": 0.384765625, + "learning_rate": 0.0004140649782810153, + "loss": 3.2424, + "step": 6809 + }, + { + "epoch": 0.2867368421052632, + "grad_norm": 0.408203125, + "learning_rate": 0.0004140395155478691, + "loss": 3.0535, + "step": 6810 + }, + { + "epoch": 0.2867789473684211, + "grad_norm": 0.40625, + "learning_rate": 0.00041401404982611566, + "loss": 3.386, + "step": 6811 + }, + { + "epoch": 0.2868210526315789, + "grad_norm": 0.40234375, + "learning_rate": 0.00041398858111621906, + "loss": 3.2655, + "step": 6812 + }, + { + "epoch": 0.2868631578947368, + "grad_norm": 0.451171875, + "learning_rate": 0.00041396310941864314, + "loss": 3.0905, + "step": 6813 + }, + { + "epoch": 0.2869052631578947, + "grad_norm": 0.455078125, + "learning_rate": 0.00041393763473385203, + "loss": 2.9902, + "step": 6814 + }, + { + "epoch": 0.2869473684210526, + "grad_norm": 0.39453125, + "learning_rate": 0.0004139121570623098, + "loss": 3.0494, + "step": 6815 + }, + { + "epoch": 0.2869894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.00041388667640448063, + "loss": 3.2856, + "step": 6816 + }, + { + "epoch": 0.2870315789473684, + "grad_norm": 0.439453125, + "learning_rate": 0.0004138611927608288, + "loss": 3.9361, + "step": 6817 + }, + { + "epoch": 0.2870736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.0004138357061318186, + "loss": 3.1721, + "step": 6818 + }, + { + "epoch": 0.2871157894736842, + "grad_norm": 0.396484375, + "learning_rate": 0.0004138102165179143, + "loss": 3.4302, + "step": 6819 + }, + { + "epoch": 0.2871578947368421, + "grad_norm": 0.59375, + "learning_rate": 0.0004137847239195803, + "loss": 3.3751, + "step": 6820 + }, + { + "epoch": 0.2872, + "grad_norm": 0.412109375, + "learning_rate": 0.0004137592283372812, + "loss": 3.5782, + "step": 6821 + }, + { + "epoch": 0.2872421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00041373372977148126, + "loss": 3.0822, + "step": 6822 + }, + { + "epoch": 0.2872842105263158, + "grad_norm": 0.388671875, + "learning_rate": 0.0004137082282226451, + "loss": 3.1254, + "step": 6823 + }, + { + "epoch": 0.2873263157894737, + "grad_norm": 0.455078125, + "learning_rate": 0.0004136827236912374, + "loss": 2.7952, + "step": 6824 + }, + { + "epoch": 0.2873684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.0004136572161777229, + "loss": 2.9388, + "step": 6825 + }, + { + "epoch": 0.28741052631578945, + "grad_norm": 0.453125, + "learning_rate": 0.00041363170568256606, + "loss": 2.7385, + "step": 6826 + }, + { + "epoch": 0.28745263157894735, + "grad_norm": 0.423828125, + "learning_rate": 0.0004136061922062319, + "loss": 3.0984, + "step": 6827 + }, + { + "epoch": 0.28749473684210525, + "grad_norm": 0.423828125, + "learning_rate": 0.0004135806757491851, + "loss": 3.4219, + "step": 6828 + }, + { + "epoch": 0.28753684210526315, + "grad_norm": 0.408203125, + "learning_rate": 0.00041355515631189064, + "loss": 3.4787, + "step": 6829 + }, + { + "epoch": 0.28757894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.0004135296338948134, + "loss": 3.7318, + "step": 6830 + }, + { + "epoch": 0.28762105263157894, + "grad_norm": 0.439453125, + "learning_rate": 0.0004135041084984183, + "loss": 3.0566, + "step": 6831 + }, + { + "epoch": 0.28766315789473684, + "grad_norm": 0.498046875, + "learning_rate": 0.00041347858012317053, + "loss": 3.1387, + "step": 6832 + }, + { + "epoch": 0.28770526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.00041345304876953506, + "loss": 3.4598, + "step": 6833 + }, + { + "epoch": 0.28774736842105264, + "grad_norm": 0.404296875, + "learning_rate": 0.0004134275144379771, + "loss": 3.725, + "step": 6834 + }, + { + "epoch": 0.28778947368421054, + "grad_norm": 0.453125, + "learning_rate": 0.00041340197712896187, + "loss": 3.1959, + "step": 6835 + }, + { + "epoch": 0.28783157894736844, + "grad_norm": 0.396484375, + "learning_rate": 0.00041337643684295457, + "loss": 3.2991, + "step": 6836 + }, + { + "epoch": 0.28787368421052634, + "grad_norm": 0.396484375, + "learning_rate": 0.00041335089358042053, + "loss": 2.7578, + "step": 6837 + }, + { + "epoch": 0.28791578947368424, + "grad_norm": 0.416015625, + "learning_rate": 0.00041332534734182515, + "loss": 3.3701, + "step": 6838 + }, + { + "epoch": 0.2879578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.0004132997981276339, + "loss": 3.5056, + "step": 6839 + }, + { + "epoch": 0.288, + "grad_norm": 0.38671875, + "learning_rate": 0.00041327424593831213, + "loss": 3.2825, + "step": 6840 + }, + { + "epoch": 0.2880421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.0004132486907743255, + "loss": 2.7677, + "step": 6841 + }, + { + "epoch": 0.2880842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.00041322313263613954, + "loss": 3.0814, + "step": 6842 + }, + { + "epoch": 0.2881263157894737, + "grad_norm": 0.46484375, + "learning_rate": 0.00041319757152421984, + "loss": 3.4905, + "step": 6843 + }, + { + "epoch": 0.2881684210526316, + "grad_norm": 0.40234375, + "learning_rate": 0.0004131720074390321, + "loss": 2.9604, + "step": 6844 + }, + { + "epoch": 0.2882105263157895, + "grad_norm": 0.380859375, + "learning_rate": 0.00041314644038104216, + "loss": 3.2489, + "step": 6845 + }, + { + "epoch": 0.28825263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.00041312087035071576, + "loss": 3.0028, + "step": 6846 + }, + { + "epoch": 0.28829473684210527, + "grad_norm": 0.4140625, + "learning_rate": 0.00041309529734851873, + "loss": 3.5113, + "step": 6847 + }, + { + "epoch": 0.28833684210526317, + "grad_norm": 0.4375, + "learning_rate": 0.00041306972137491706, + "loss": 3.3556, + "step": 6848 + }, + { + "epoch": 0.28837894736842107, + "grad_norm": 0.412109375, + "learning_rate": 0.00041304414243037663, + "loss": 3.4655, + "step": 6849 + }, + { + "epoch": 0.28842105263157897, + "grad_norm": 0.416015625, + "learning_rate": 0.00041301856051536353, + "loss": 3.2174, + "step": 6850 + }, + { + "epoch": 0.28846315789473687, + "grad_norm": 0.388671875, + "learning_rate": 0.0004129929756303437, + "loss": 3.2303, + "step": 6851 + }, + { + "epoch": 0.2885052631578947, + "grad_norm": 0.40625, + "learning_rate": 0.00041296738777578347, + "loss": 3.1464, + "step": 6852 + }, + { + "epoch": 0.2885473684210526, + "grad_norm": 0.3984375, + "learning_rate": 0.00041294179695214886, + "loss": 3.3757, + "step": 6853 + }, + { + "epoch": 0.2885894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.0004129162031599062, + "loss": 3.5455, + "step": 6854 + }, + { + "epoch": 0.2886315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.0004128906063995217, + "loss": 3.6344, + "step": 6855 + }, + { + "epoch": 0.2886736842105263, + "grad_norm": 0.388671875, + "learning_rate": 0.0004128650066714618, + "loss": 3.5887, + "step": 6856 + }, + { + "epoch": 0.2887157894736842, + "grad_norm": 0.40625, + "learning_rate": 0.0004128394039761928, + "loss": 3.4138, + "step": 6857 + }, + { + "epoch": 0.2887578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.0004128137983141811, + "loss": 3.4601, + "step": 6858 + }, + { + "epoch": 0.2888, + "grad_norm": 0.423828125, + "learning_rate": 0.0004127881896858934, + "loss": 2.9784, + "step": 6859 + }, + { + "epoch": 0.2888421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0004127625780917961, + "loss": 3.5096, + "step": 6860 + }, + { + "epoch": 0.2888842105263158, + "grad_norm": 0.388671875, + "learning_rate": 0.0004127369635323559, + "loss": 3.2296, + "step": 6861 + }, + { + "epoch": 0.2889263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.0004127113460080394, + "loss": 3.0172, + "step": 6862 + }, + { + "epoch": 0.2889684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.0004126857255193134, + "loss": 3.122, + "step": 6863 + }, + { + "epoch": 0.2890105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.0004126601020666446, + "loss": 3.637, + "step": 6864 + }, + { + "epoch": 0.2890526315789474, + "grad_norm": 0.41015625, + "learning_rate": 0.00041263447565049983, + "loss": 2.765, + "step": 6865 + }, + { + "epoch": 0.28909473684210524, + "grad_norm": 0.40234375, + "learning_rate": 0.00041260884627134604, + "loss": 3.2371, + "step": 6866 + }, + { + "epoch": 0.28913684210526314, + "grad_norm": 0.4453125, + "learning_rate": 0.00041258321392965015, + "loss": 3.334, + "step": 6867 + }, + { + "epoch": 0.28917894736842104, + "grad_norm": 0.416015625, + "learning_rate": 0.00041255757862587906, + "loss": 3.5513, + "step": 6868 + }, + { + "epoch": 0.28922105263157893, + "grad_norm": 0.4140625, + "learning_rate": 0.00041253194036049997, + "loss": 3.3936, + "step": 6869 + }, + { + "epoch": 0.28926315789473683, + "grad_norm": 0.44140625, + "learning_rate": 0.00041250629913397983, + "loss": 3.5201, + "step": 6870 + }, + { + "epoch": 0.28930526315789473, + "grad_norm": 0.40234375, + "learning_rate": 0.0004124806549467859, + "loss": 3.4831, + "step": 6871 + }, + { + "epoch": 0.28934736842105263, + "grad_norm": 0.400390625, + "learning_rate": 0.00041245500779938526, + "loss": 3.0633, + "step": 6872 + }, + { + "epoch": 0.28938947368421053, + "grad_norm": 0.4140625, + "learning_rate": 0.0004124293576922453, + "loss": 3.3907, + "step": 6873 + }, + { + "epoch": 0.28943157894736843, + "grad_norm": 0.50390625, + "learning_rate": 0.0004124037046258333, + "loss": 3.2553, + "step": 6874 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.00041237804860061667, + "loss": 3.5275, + "step": 6875 + }, + { + "epoch": 0.2895157894736842, + "grad_norm": 0.5234375, + "learning_rate": 0.00041235238961706273, + "loss": 3.2615, + "step": 6876 + }, + { + "epoch": 0.2895578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.000412326727675639, + "loss": 3.2144, + "step": 6877 + }, + { + "epoch": 0.2896, + "grad_norm": 0.42578125, + "learning_rate": 0.000412301062776813, + "loss": 3.5277, + "step": 6878 + }, + { + "epoch": 0.28964210526315787, + "grad_norm": 0.431640625, + "learning_rate": 0.0004122753949210524, + "loss": 3.5069, + "step": 6879 + }, + { + "epoch": 0.28968421052631577, + "grad_norm": 0.431640625, + "learning_rate": 0.0004122497241088247, + "loss": 3.2184, + "step": 6880 + }, + { + "epoch": 0.28972631578947367, + "grad_norm": 0.416015625, + "learning_rate": 0.00041222405034059773, + "loss": 3.0129, + "step": 6881 + }, + { + "epoch": 0.28976842105263156, + "grad_norm": 0.412109375, + "learning_rate": 0.0004121983736168391, + "loss": 3.505, + "step": 6882 + }, + { + "epoch": 0.28981052631578946, + "grad_norm": 0.392578125, + "learning_rate": 0.00041217269393801673, + "loss": 3.3256, + "step": 6883 + }, + { + "epoch": 0.28985263157894736, + "grad_norm": 0.41796875, + "learning_rate": 0.0004121470113045984, + "loss": 3.0588, + "step": 6884 + }, + { + "epoch": 0.28989473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.000412121325717052, + "loss": 3.4451, + "step": 6885 + }, + { + "epoch": 0.28993684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.0004120956371758456, + "loss": 3.3772, + "step": 6886 + }, + { + "epoch": 0.28997894736842106, + "grad_norm": 0.3828125, + "learning_rate": 0.0004120699456814471, + "loss": 2.8542, + "step": 6887 + }, + { + "epoch": 0.29002105263157896, + "grad_norm": 0.392578125, + "learning_rate": 0.00041204425123432466, + "loss": 3.1662, + "step": 6888 + }, + { + "epoch": 0.29006315789473686, + "grad_norm": 0.4296875, + "learning_rate": 0.0004120185538349463, + "loss": 2.5155, + "step": 6889 + }, + { + "epoch": 0.29010526315789475, + "grad_norm": 0.41796875, + "learning_rate": 0.0004119928534837803, + "loss": 3.1446, + "step": 6890 + }, + { + "epoch": 0.29014736842105265, + "grad_norm": 0.41015625, + "learning_rate": 0.00041196715018129476, + "loss": 2.975, + "step": 6891 + }, + { + "epoch": 0.29018947368421055, + "grad_norm": 0.421875, + "learning_rate": 0.0004119414439279581, + "loss": 2.6824, + "step": 6892 + }, + { + "epoch": 0.2902315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.00041191573472423865, + "loss": 3.1621, + "step": 6893 + }, + { + "epoch": 0.2902736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.0004118900225706047, + "loss": 3.1146, + "step": 6894 + }, + { + "epoch": 0.2903157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.0004118643074675248, + "loss": 3.0639, + "step": 6895 + }, + { + "epoch": 0.2903578947368421, + "grad_norm": 0.5078125, + "learning_rate": 0.00041183858941546737, + "loss": 3.3838, + "step": 6896 + }, + { + "epoch": 0.2904, + "grad_norm": 0.65234375, + "learning_rate": 0.00041181286841490103, + "loss": 3.3239, + "step": 6897 + }, + { + "epoch": 0.2904421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.0004117871444662943, + "loss": 3.047, + "step": 6898 + }, + { + "epoch": 0.2904842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00041176141757011587, + "loss": 3.2523, + "step": 6899 + }, + { + "epoch": 0.2905263157894737, + "grad_norm": 0.46875, + "learning_rate": 0.0004117356877268345, + "loss": 3.1833, + "step": 6900 + }, + { + "epoch": 0.2905684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.000411709954936919, + "loss": 3.4644, + "step": 6901 + }, + { + "epoch": 0.2906105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00041168421920083803, + "loss": 3.1531, + "step": 6902 + }, + { + "epoch": 0.2906526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.00041165848051906065, + "loss": 3.4381, + "step": 6903 + }, + { + "epoch": 0.2906947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00041163273889205555, + "loss": 3.4376, + "step": 6904 + }, + { + "epoch": 0.2907368421052632, + "grad_norm": 0.400390625, + "learning_rate": 0.0004116069943202919, + "loss": 3.5508, + "step": 6905 + }, + { + "epoch": 0.290778947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.00041158124680423883, + "loss": 3.1528, + "step": 6906 + }, + { + "epoch": 0.2908210526315789, + "grad_norm": 0.388671875, + "learning_rate": 0.00041155549634436516, + "loss": 3.0612, + "step": 6907 + }, + { + "epoch": 0.2908631578947368, + "grad_norm": 0.408203125, + "learning_rate": 0.0004115297429411402, + "loss": 3.3424, + "step": 6908 + }, + { + "epoch": 0.2909052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.0004115039865950332, + "loss": 3.5114, + "step": 6909 + }, + { + "epoch": 0.2909473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.0004114782273065132, + "loss": 2.8298, + "step": 6910 + }, + { + "epoch": 0.2909894736842105, + "grad_norm": 0.37890625, + "learning_rate": 0.00041145246507604964, + "loss": 2.9669, + "step": 6911 + }, + { + "epoch": 0.2910315789473684, + "grad_norm": 0.396484375, + "learning_rate": 0.0004114266999041119, + "loss": 3.4259, + "step": 6912 + }, + { + "epoch": 0.2910736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.0004114009317911693, + "loss": 3.1891, + "step": 6913 + }, + { + "epoch": 0.2911157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.0004113751607376915, + "loss": 3.2577, + "step": 6914 + }, + { + "epoch": 0.2911578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.0004113493867441478, + "loss": 2.7257, + "step": 6915 + }, + { + "epoch": 0.2912, + "grad_norm": 0.419921875, + "learning_rate": 0.0004113236098110078, + "loss": 3.3934, + "step": 6916 + }, + { + "epoch": 0.2912421052631579, + "grad_norm": 0.48046875, + "learning_rate": 0.00041129782993874124, + "loss": 3.0108, + "step": 6917 + }, + { + "epoch": 0.2912842105263158, + "grad_norm": 0.494140625, + "learning_rate": 0.00041127204712781767, + "loss": 2.7261, + "step": 6918 + }, + { + "epoch": 0.2913263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00041124626137870697, + "loss": 3.3889, + "step": 6919 + }, + { + "epoch": 0.29136842105263155, + "grad_norm": 0.421875, + "learning_rate": 0.0004112204726918788, + "loss": 3.0211, + "step": 6920 + }, + { + "epoch": 0.29141052631578945, + "grad_norm": 0.400390625, + "learning_rate": 0.00041119468106780305, + "loss": 3.3265, + "step": 6921 + }, + { + "epoch": 0.29145263157894735, + "grad_norm": 0.41796875, + "learning_rate": 0.0004111688865069496, + "loss": 3.4631, + "step": 6922 + }, + { + "epoch": 0.29149473684210525, + "grad_norm": 0.421875, + "learning_rate": 0.00041114308900978846, + "loss": 3.4626, + "step": 6923 + }, + { + "epoch": 0.29153684210526315, + "grad_norm": 0.38671875, + "learning_rate": 0.00041111728857678954, + "loss": 3.1028, + "step": 6924 + }, + { + "epoch": 0.29157894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.0004110914852084229, + "loss": 3.1169, + "step": 6925 + }, + { + "epoch": 0.29162105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00041106567890515866, + "loss": 2.7875, + "step": 6926 + }, + { + "epoch": 0.29166315789473685, + "grad_norm": 0.400390625, + "learning_rate": 0.00041103986966746703, + "loss": 3.6257, + "step": 6927 + }, + { + "epoch": 0.29170526315789475, + "grad_norm": 0.421875, + "learning_rate": 0.00041101405749581824, + "loss": 3.2642, + "step": 6928 + }, + { + "epoch": 0.29174736842105264, + "grad_norm": 0.400390625, + "learning_rate": 0.0004109882423906824, + "loss": 3.2128, + "step": 6929 + }, + { + "epoch": 0.29178947368421054, + "grad_norm": 0.392578125, + "learning_rate": 0.00041096242435253, + "loss": 3.1603, + "step": 6930 + }, + { + "epoch": 0.29183157894736844, + "grad_norm": 0.412109375, + "learning_rate": 0.00041093660338183135, + "loss": 3.3577, + "step": 6931 + }, + { + "epoch": 0.29187368421052634, + "grad_norm": 0.435546875, + "learning_rate": 0.0004109107794790569, + "loss": 3.1983, + "step": 6932 + }, + { + "epoch": 0.2919157894736842, + "grad_norm": 0.390625, + "learning_rate": 0.00041088495264467693, + "loss": 3.1853, + "step": 6933 + }, + { + "epoch": 0.2919578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0004108591228791624, + "loss": 3.4953, + "step": 6934 + }, + { + "epoch": 0.292, + "grad_norm": 0.376953125, + "learning_rate": 0.00041083329018298353, + "loss": 3.0693, + "step": 6935 + }, + { + "epoch": 0.2920421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0004108074545566111, + "loss": 3.6212, + "step": 6936 + }, + { + "epoch": 0.2920842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.00041078161600051577, + "loss": 3.2413, + "step": 6937 + }, + { + "epoch": 0.2921263157894737, + "grad_norm": 0.484375, + "learning_rate": 0.00041075577451516837, + "loss": 2.9161, + "step": 6938 + }, + { + "epoch": 0.2921684210526316, + "grad_norm": 0.388671875, + "learning_rate": 0.0004107299301010396, + "loss": 3.2237, + "step": 6939 + }, + { + "epoch": 0.2922105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.0004107040827586004, + "loss": 3.4368, + "step": 6940 + }, + { + "epoch": 0.2922526315789474, + "grad_norm": 0.3984375, + "learning_rate": 0.00041067823248832155, + "loss": 2.7992, + "step": 6941 + }, + { + "epoch": 0.2922947368421053, + "grad_norm": 0.40234375, + "learning_rate": 0.00041065237929067413, + "loss": 3.1245, + "step": 6942 + }, + { + "epoch": 0.2923368421052632, + "grad_norm": 0.453125, + "learning_rate": 0.00041062652316612916, + "loss": 2.8013, + "step": 6943 + }, + { + "epoch": 0.29237894736842107, + "grad_norm": 0.447265625, + "learning_rate": 0.0004106006641151576, + "loss": 3.4383, + "step": 6944 + }, + { + "epoch": 0.29242105263157897, + "grad_norm": 0.43359375, + "learning_rate": 0.0004105748021382306, + "loss": 3.3552, + "step": 6945 + }, + { + "epoch": 0.29246315789473687, + "grad_norm": 0.419921875, + "learning_rate": 0.00041054893723581944, + "loss": 3.3643, + "step": 6946 + }, + { + "epoch": 0.2925052631578947, + "grad_norm": 0.3984375, + "learning_rate": 0.00041052306940839536, + "loss": 3.4502, + "step": 6947 + }, + { + "epoch": 0.2925473684210526, + "grad_norm": 0.375, + "learning_rate": 0.0004104971986564294, + "loss": 3.549, + "step": 6948 + }, + { + "epoch": 0.2925894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.0004104713249803932, + "loss": 3.3577, + "step": 6949 + }, + { + "epoch": 0.2926315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.00041044544838075793, + "loss": 3.1391, + "step": 6950 + }, + { + "epoch": 0.2926736842105263, + "grad_norm": 0.39453125, + "learning_rate": 0.00041041956885799513, + "loss": 3.2707, + "step": 6951 + }, + { + "epoch": 0.2927157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.00041039368641257625, + "loss": 3.512, + "step": 6952 + }, + { + "epoch": 0.2927578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00041036780104497284, + "loss": 3.5691, + "step": 6953 + }, + { + "epoch": 0.2928, + "grad_norm": 0.4453125, + "learning_rate": 0.00041034191275565653, + "loss": 2.9653, + "step": 6954 + }, + { + "epoch": 0.2928421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.000410316021545099, + "loss": 3.4059, + "step": 6955 + }, + { + "epoch": 0.2928842105263158, + "grad_norm": 0.388671875, + "learning_rate": 0.0004102901274137719, + "loss": 3.0426, + "step": 6956 + }, + { + "epoch": 0.2929263157894737, + "grad_norm": 0.384765625, + "learning_rate": 0.000410264230362147, + "loss": 3.3646, + "step": 6957 + }, + { + "epoch": 0.2929684210526316, + "grad_norm": 0.388671875, + "learning_rate": 0.00041023833039069614, + "loss": 3.5184, + "step": 6958 + }, + { + "epoch": 0.2930105263157895, + "grad_norm": 0.388671875, + "learning_rate": 0.0004102124274998912, + "loss": 3.3137, + "step": 6959 + }, + { + "epoch": 0.29305263157894734, + "grad_norm": 0.427734375, + "learning_rate": 0.000410186521690204, + "loss": 3.3484, + "step": 6960 + }, + { + "epoch": 0.29309473684210524, + "grad_norm": 0.416015625, + "learning_rate": 0.0004101606129621066, + "loss": 3.2754, + "step": 6961 + }, + { + "epoch": 0.29313684210526314, + "grad_norm": 0.423828125, + "learning_rate": 0.0004101347013160711, + "loss": 3.5101, + "step": 6962 + }, + { + "epoch": 0.29317894736842104, + "grad_norm": 0.392578125, + "learning_rate": 0.0004101087867525694, + "loss": 3.5125, + "step": 6963 + }, + { + "epoch": 0.29322105263157894, + "grad_norm": 0.396484375, + "learning_rate": 0.0004100828692720738, + "loss": 3.3988, + "step": 6964 + }, + { + "epoch": 0.29326315789473684, + "grad_norm": 0.447265625, + "learning_rate": 0.0004100569488750563, + "loss": 3.069, + "step": 6965 + }, + { + "epoch": 0.29330526315789474, + "grad_norm": 0.439453125, + "learning_rate": 0.00041003102556198937, + "loss": 2.9953, + "step": 6966 + }, + { + "epoch": 0.29334736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.0004100050993333451, + "loss": 3.5592, + "step": 6967 + }, + { + "epoch": 0.29338947368421053, + "grad_norm": 0.44921875, + "learning_rate": 0.00040997917018959596, + "loss": 3.2103, + "step": 6968 + }, + { + "epoch": 0.29343157894736843, + "grad_norm": 0.3671875, + "learning_rate": 0.0004099532381312143, + "loss": 3.0738, + "step": 6969 + }, + { + "epoch": 0.29347368421052633, + "grad_norm": 0.412109375, + "learning_rate": 0.0004099273031586726, + "loss": 3.1552, + "step": 6970 + }, + { + "epoch": 0.29351578947368423, + "grad_norm": 0.3984375, + "learning_rate": 0.0004099013652724434, + "loss": 3.158, + "step": 6971 + }, + { + "epoch": 0.29355789473684213, + "grad_norm": 0.40234375, + "learning_rate": 0.00040987542447299904, + "loss": 3.0497, + "step": 6972 + }, + { + "epoch": 0.2936, + "grad_norm": 0.40625, + "learning_rate": 0.00040984948076081234, + "loss": 3.5721, + "step": 6973 + }, + { + "epoch": 0.29364210526315787, + "grad_norm": 0.388671875, + "learning_rate": 0.000409823534136356, + "loss": 3.2918, + "step": 6974 + }, + { + "epoch": 0.29368421052631577, + "grad_norm": 0.3984375, + "learning_rate": 0.0004097975846001026, + "loss": 3.4771, + "step": 6975 + }, + { + "epoch": 0.29372631578947367, + "grad_norm": 0.3671875, + "learning_rate": 0.000409771632152525, + "loss": 2.9753, + "step": 6976 + }, + { + "epoch": 0.29376842105263157, + "grad_norm": 0.40625, + "learning_rate": 0.0004097456767940959, + "loss": 3.3356, + "step": 6977 + }, + { + "epoch": 0.29381052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.0004097197185252883, + "loss": 2.9741, + "step": 6978 + }, + { + "epoch": 0.29385263157894737, + "grad_norm": 0.390625, + "learning_rate": 0.0004096937573465751, + "loss": 3.524, + "step": 6979 + }, + { + "epoch": 0.29389473684210526, + "grad_norm": 0.392578125, + "learning_rate": 0.0004096677932584293, + "loss": 2.8723, + "step": 6980 + }, + { + "epoch": 0.29393684210526316, + "grad_norm": 0.3984375, + "learning_rate": 0.0004096418262613239, + "loss": 3.4511, + "step": 6981 + }, + { + "epoch": 0.29397894736842106, + "grad_norm": 0.439453125, + "learning_rate": 0.00040961585635573193, + "loss": 3.2945, + "step": 6982 + }, + { + "epoch": 0.29402105263157896, + "grad_norm": 0.419921875, + "learning_rate": 0.0004095898835421267, + "loss": 3.1488, + "step": 6983 + }, + { + "epoch": 0.29406315789473686, + "grad_norm": 0.384765625, + "learning_rate": 0.0004095639078209813, + "loss": 2.9756, + "step": 6984 + }, + { + "epoch": 0.29410526315789476, + "grad_norm": 0.408203125, + "learning_rate": 0.00040953792919276886, + "loss": 3.2762, + "step": 6985 + }, + { + "epoch": 0.29414736842105266, + "grad_norm": 0.40625, + "learning_rate": 0.0004095119476579629, + "loss": 3.2267, + "step": 6986 + }, + { + "epoch": 0.2941894736842105, + "grad_norm": 0.400390625, + "learning_rate": 0.0004094859632170367, + "loss": 3.1944, + "step": 6987 + }, + { + "epoch": 0.2942315789473684, + "grad_norm": 0.388671875, + "learning_rate": 0.0004094599758704636, + "loss": 3.3935, + "step": 6988 + }, + { + "epoch": 0.2942736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.000409433985618717, + "loss": 3.4182, + "step": 6989 + }, + { + "epoch": 0.2943157894736842, + "grad_norm": 0.46875, + "learning_rate": 0.00040940799246227066, + "loss": 3.3397, + "step": 6990 + }, + { + "epoch": 0.2943578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0004093819964015979, + "loss": 3.0362, + "step": 6991 + }, + { + "epoch": 0.2944, + "grad_norm": 0.42578125, + "learning_rate": 0.00040935599743717243, + "loss": 3.0901, + "step": 6992 + }, + { + "epoch": 0.2944421052631579, + "grad_norm": 0.380859375, + "learning_rate": 0.00040932999556946795, + "loss": 3.0273, + "step": 6993 + }, + { + "epoch": 0.2944842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.0004093039907989582, + "loss": 3.5781, + "step": 6994 + }, + { + "epoch": 0.2945263157894737, + "grad_norm": 0.3984375, + "learning_rate": 0.00040927798312611686, + "loss": 3.161, + "step": 6995 + }, + { + "epoch": 0.2945684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.0004092519725514178, + "loss": 3.2262, + "step": 6996 + }, + { + "epoch": 0.2946105263157895, + "grad_norm": 0.384765625, + "learning_rate": 0.0004092259590753349, + "loss": 2.9021, + "step": 6997 + }, + { + "epoch": 0.2946526315789474, + "grad_norm": 0.40625, + "learning_rate": 0.00040919994269834214, + "loss": 3.24, + "step": 6998 + }, + { + "epoch": 0.2946947368421053, + "grad_norm": 0.400390625, + "learning_rate": 0.00040917392342091355, + "loss": 2.8868, + "step": 6999 + }, + { + "epoch": 0.29473684210526313, + "grad_norm": 0.400390625, + "learning_rate": 0.00040914790124352296, + "loss": 3.4138, + "step": 7000 + }, + { + "epoch": 0.29477894736842103, + "grad_norm": 0.47265625, + "learning_rate": 0.00040912187616664464, + "loss": 3.1524, + "step": 7001 + }, + { + "epoch": 0.29482105263157893, + "grad_norm": 0.408203125, + "learning_rate": 0.0004090958481907527, + "loss": 3.3403, + "step": 7002 + }, + { + "epoch": 0.2948631578947368, + "grad_norm": 0.43359375, + "learning_rate": 0.0004090698173163214, + "loss": 3.0669, + "step": 7003 + }, + { + "epoch": 0.2949052631578947, + "grad_norm": 0.404296875, + "learning_rate": 0.00040904378354382487, + "loss": 3.5379, + "step": 7004 + }, + { + "epoch": 0.2949473684210526, + "grad_norm": 0.392578125, + "learning_rate": 0.00040901774687373757, + "loss": 3.3879, + "step": 7005 + }, + { + "epoch": 0.2949894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00040899170730653364, + "loss": 2.6395, + "step": 7006 + }, + { + "epoch": 0.2950315789473684, + "grad_norm": 0.392578125, + "learning_rate": 0.0004089656648426876, + "loss": 3.1169, + "step": 7007 + }, + { + "epoch": 0.2950736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.000408939619482674, + "loss": 3.0944, + "step": 7008 + }, + { + "epoch": 0.2951157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.00040891357122696726, + "loss": 3.2274, + "step": 7009 + }, + { + "epoch": 0.2951578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.000408887520076042, + "loss": 3.4985, + "step": 7010 + }, + { + "epoch": 0.2952, + "grad_norm": 0.43359375, + "learning_rate": 0.00040886146603037277, + "loss": 3.0975, + "step": 7011 + }, + { + "epoch": 0.2952421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00040883540909043424, + "loss": 3.0399, + "step": 7012 + }, + { + "epoch": 0.2952842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.0004088093492567012, + "loss": 3.4971, + "step": 7013 + }, + { + "epoch": 0.29532631578947366, + "grad_norm": 0.392578125, + "learning_rate": 0.00040878328652964844, + "loss": 2.8487, + "step": 7014 + }, + { + "epoch": 0.29536842105263156, + "grad_norm": 0.40625, + "learning_rate": 0.00040875722090975077, + "loss": 3.0139, + "step": 7015 + }, + { + "epoch": 0.29541052631578946, + "grad_norm": 0.44140625, + "learning_rate": 0.00040873115239748305, + "loss": 3.2148, + "step": 7016 + }, + { + "epoch": 0.29545263157894736, + "grad_norm": 0.4296875, + "learning_rate": 0.00040870508099332027, + "loss": 3.2863, + "step": 7017 + }, + { + "epoch": 0.29549473684210525, + "grad_norm": 0.421875, + "learning_rate": 0.0004086790066977373, + "loss": 3.0548, + "step": 7018 + }, + { + "epoch": 0.29553684210526315, + "grad_norm": 0.435546875, + "learning_rate": 0.00040865292951120934, + "loss": 3.419, + "step": 7019 + }, + { + "epoch": 0.29557894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0004086268494342114, + "loss": 3.4219, + "step": 7020 + }, + { + "epoch": 0.29562105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.0004086007664672186, + "loss": 3.6585, + "step": 7021 + }, + { + "epoch": 0.29566315789473685, + "grad_norm": 0.400390625, + "learning_rate": 0.00040857468061070623, + "loss": 3.6051, + "step": 7022 + }, + { + "epoch": 0.29570526315789475, + "grad_norm": 0.40625, + "learning_rate": 0.0004085485918651495, + "loss": 3.0094, + "step": 7023 + }, + { + "epoch": 0.29574736842105265, + "grad_norm": 0.41796875, + "learning_rate": 0.00040852250023102373, + "loss": 2.9432, + "step": 7024 + }, + { + "epoch": 0.29578947368421055, + "grad_norm": 0.423828125, + "learning_rate": 0.00040849640570880417, + "loss": 3.7083, + "step": 7025 + }, + { + "epoch": 0.29583157894736845, + "grad_norm": 0.3984375, + "learning_rate": 0.00040847030829896635, + "loss": 3.3362, + "step": 7026 + }, + { + "epoch": 0.2958736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.0004084442080019858, + "loss": 3.6235, + "step": 7027 + }, + { + "epoch": 0.2959157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.0004084181048183379, + "loss": 3.5923, + "step": 7028 + }, + { + "epoch": 0.2959578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00040839199874849827, + "loss": 3.1826, + "step": 7029 + }, + { + "epoch": 0.296, + "grad_norm": 0.41015625, + "learning_rate": 0.00040836588979294256, + "loss": 3.3164, + "step": 7030 + }, + { + "epoch": 0.2960421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.0004083397779521463, + "loss": 3.3661, + "step": 7031 + }, + { + "epoch": 0.2960842105263158, + "grad_norm": 0.400390625, + "learning_rate": 0.00040831366322658535, + "loss": 3.2896, + "step": 7032 + }, + { + "epoch": 0.2961263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.0004082875456167355, + "loss": 2.8008, + "step": 7033 + }, + { + "epoch": 0.2961684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.00040826142512307253, + "loss": 3.3985, + "step": 7034 + }, + { + "epoch": 0.2962105263157895, + "grad_norm": 0.51953125, + "learning_rate": 0.00040823530174607237, + "loss": 3.4451, + "step": 7035 + }, + { + "epoch": 0.2962526315789474, + "grad_norm": 0.4140625, + "learning_rate": 0.00040820917548621094, + "loss": 3.4909, + "step": 7036 + }, + { + "epoch": 0.2962947368421053, + "grad_norm": 0.44140625, + "learning_rate": 0.0004081830463439642, + "loss": 3.4369, + "step": 7037 + }, + { + "epoch": 0.2963368421052632, + "grad_norm": 0.447265625, + "learning_rate": 0.0004081569143198083, + "loss": 2.883, + "step": 7038 + }, + { + "epoch": 0.2963789473684211, + "grad_norm": 0.431640625, + "learning_rate": 0.0004081307794142191, + "loss": 3.1317, + "step": 7039 + }, + { + "epoch": 0.296421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.000408104641627673, + "loss": 3.0456, + "step": 7040 + }, + { + "epoch": 0.2964631578947368, + "grad_norm": 0.4609375, + "learning_rate": 0.00040807850096064605, + "loss": 3.5733, + "step": 7041 + }, + { + "epoch": 0.2965052631578947, + "grad_norm": 0.412109375, + "learning_rate": 0.0004080523574136145, + "loss": 3.3384, + "step": 7042 + }, + { + "epoch": 0.2965473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.0004080262109870548, + "loss": 3.4938, + "step": 7043 + }, + { + "epoch": 0.2965894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.00040800006168144315, + "loss": 3.2096, + "step": 7044 + }, + { + "epoch": 0.2966315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.00040797390949725605, + "loss": 3.1908, + "step": 7045 + }, + { + "epoch": 0.2966736842105263, + "grad_norm": 0.466796875, + "learning_rate": 0.0004079477544349699, + "loss": 3.1697, + "step": 7046 + }, + { + "epoch": 0.2967157894736842, + "grad_norm": 0.51171875, + "learning_rate": 0.00040792159649506124, + "loss": 3.4531, + "step": 7047 + }, + { + "epoch": 0.2967578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.00040789543567800666, + "loss": 3.2872, + "step": 7048 + }, + { + "epoch": 0.2968, + "grad_norm": 0.42578125, + "learning_rate": 0.0004078692719842828, + "loss": 3.3932, + "step": 7049 + }, + { + "epoch": 0.2968421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.0004078431054143662, + "loss": 3.1714, + "step": 7050 + }, + { + "epoch": 0.2968842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00040781693596873375, + "loss": 3.0404, + "step": 7051 + }, + { + "epoch": 0.2969263157894737, + "grad_norm": 0.404296875, + "learning_rate": 0.0004077907636478622, + "loss": 3.2849, + "step": 7052 + }, + { + "epoch": 0.2969684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.0004077645884522282, + "loss": 3.2119, + "step": 7053 + }, + { + "epoch": 0.29701052631578945, + "grad_norm": 0.4765625, + "learning_rate": 0.00040773841038230886, + "loss": 2.911, + "step": 7054 + }, + { + "epoch": 0.29705263157894735, + "grad_norm": 0.42578125, + "learning_rate": 0.000407712229438581, + "loss": 3.5003, + "step": 7055 + }, + { + "epoch": 0.29709473684210524, + "grad_norm": 0.416015625, + "learning_rate": 0.0004076860456215216, + "loss": 2.9564, + "step": 7056 + }, + { + "epoch": 0.29713684210526314, + "grad_norm": 0.404296875, + "learning_rate": 0.00040765985893160775, + "loss": 3.0703, + "step": 7057 + }, + { + "epoch": 0.29717894736842104, + "grad_norm": 0.423828125, + "learning_rate": 0.00040763366936931645, + "loss": 2.8627, + "step": 7058 + }, + { + "epoch": 0.29722105263157894, + "grad_norm": 0.41796875, + "learning_rate": 0.000407607476935125, + "loss": 3.1157, + "step": 7059 + }, + { + "epoch": 0.29726315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.00040758128162951047, + "loss": 3.4056, + "step": 7060 + }, + { + "epoch": 0.29730526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.0004075550834529501, + "loss": 2.9523, + "step": 7061 + }, + { + "epoch": 0.29734736842105264, + "grad_norm": 0.427734375, + "learning_rate": 0.0004075288824059212, + "loss": 3.1129, + "step": 7062 + }, + { + "epoch": 0.29738947368421054, + "grad_norm": 0.390625, + "learning_rate": 0.00040750267848890125, + "loss": 3.1112, + "step": 7063 + }, + { + "epoch": 0.29743157894736844, + "grad_norm": 0.55859375, + "learning_rate": 0.0004074764717023676, + "loss": 3.4455, + "step": 7064 + }, + { + "epoch": 0.29747368421052633, + "grad_norm": 0.39453125, + "learning_rate": 0.0004074502620467975, + "loss": 2.9379, + "step": 7065 + }, + { + "epoch": 0.29751578947368423, + "grad_norm": 0.392578125, + "learning_rate": 0.00040742404952266866, + "loss": 3.4671, + "step": 7066 + }, + { + "epoch": 0.29755789473684213, + "grad_norm": 0.443359375, + "learning_rate": 0.0004073978341304587, + "loss": 3.0391, + "step": 7067 + }, + { + "epoch": 0.2976, + "grad_norm": 0.40234375, + "learning_rate": 0.0004073716158706451, + "loss": 3.2764, + "step": 7068 + }, + { + "epoch": 0.2976421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.0004073453947437055, + "loss": 3.1283, + "step": 7069 + }, + { + "epoch": 0.2976842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.0004073191707501177, + "loss": 3.2943, + "step": 7070 + }, + { + "epoch": 0.2977263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.00040729294389035955, + "loss": 3.3016, + "step": 7071 + }, + { + "epoch": 0.29776842105263157, + "grad_norm": 0.439453125, + "learning_rate": 0.0004072667141649087, + "loss": 3.4313, + "step": 7072 + }, + { + "epoch": 0.29781052631578947, + "grad_norm": 0.41015625, + "learning_rate": 0.00040724048157424306, + "loss": 3.7441, + "step": 7073 + }, + { + "epoch": 0.29785263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.0004072142461188406, + "loss": 2.9675, + "step": 7074 + }, + { + "epoch": 0.29789473684210527, + "grad_norm": 0.40234375, + "learning_rate": 0.00040718800779917933, + "loss": 3.2042, + "step": 7075 + }, + { + "epoch": 0.29793684210526317, + "grad_norm": 0.439453125, + "learning_rate": 0.0004071617666157372, + "loss": 3.3818, + "step": 7076 + }, + { + "epoch": 0.29797894736842107, + "grad_norm": 0.423828125, + "learning_rate": 0.00040713552256899246, + "loss": 3.4673, + "step": 7077 + }, + { + "epoch": 0.29802105263157896, + "grad_norm": 0.40234375, + "learning_rate": 0.00040710927565942303, + "loss": 3.3942, + "step": 7078 + }, + { + "epoch": 0.29806315789473686, + "grad_norm": 0.390625, + "learning_rate": 0.00040708302588750723, + "loss": 3.5458, + "step": 7079 + }, + { + "epoch": 0.29810526315789476, + "grad_norm": 0.39453125, + "learning_rate": 0.0004070567732537232, + "loss": 2.9831, + "step": 7080 + }, + { + "epoch": 0.2981473684210526, + "grad_norm": 0.412109375, + "learning_rate": 0.00040703051775854937, + "loss": 3.6183, + "step": 7081 + }, + { + "epoch": 0.2981894736842105, + "grad_norm": 0.478515625, + "learning_rate": 0.000407004259402464, + "loss": 3.3972, + "step": 7082 + }, + { + "epoch": 0.2982315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.0004069779981859455, + "loss": 3.2861, + "step": 7083 + }, + { + "epoch": 0.2982736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.00040695173410947223, + "loss": 3.3951, + "step": 7084 + }, + { + "epoch": 0.2983157894736842, + "grad_norm": 0.453125, + "learning_rate": 0.00040692546717352283, + "loss": 3.6335, + "step": 7085 + }, + { + "epoch": 0.2983578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.00040689919737857584, + "loss": 3.4825, + "step": 7086 + }, + { + "epoch": 0.2984, + "grad_norm": 0.423828125, + "learning_rate": 0.0004068729247251097, + "loss": 3.3585, + "step": 7087 + }, + { + "epoch": 0.2984421052631579, + "grad_norm": 0.546875, + "learning_rate": 0.00040684664921360333, + "loss": 3.1765, + "step": 7088 + }, + { + "epoch": 0.2984842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.00040682037084453527, + "loss": 3.5421, + "step": 7089 + }, + { + "epoch": 0.2985263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.00040679408961838426, + "loss": 3.2434, + "step": 7090 + }, + { + "epoch": 0.2985684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.0004067678055356292, + "loss": 3.2472, + "step": 7091 + }, + { + "epoch": 0.2986105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.0004067415185967489, + "loss": 3.1952, + "step": 7092 + }, + { + "epoch": 0.2986526315789474, + "grad_norm": 0.4453125, + "learning_rate": 0.00040671522880222227, + "loss": 3.2267, + "step": 7093 + }, + { + "epoch": 0.2986947368421053, + "grad_norm": 0.396484375, + "learning_rate": 0.0004066889361525283, + "loss": 3.7341, + "step": 7094 + }, + { + "epoch": 0.29873684210526313, + "grad_norm": 0.404296875, + "learning_rate": 0.00040666264064814604, + "loss": 3.3532, + "step": 7095 + }, + { + "epoch": 0.29877894736842103, + "grad_norm": 0.40234375, + "learning_rate": 0.0004066363422895546, + "loss": 3.6474, + "step": 7096 + }, + { + "epoch": 0.29882105263157893, + "grad_norm": 0.416015625, + "learning_rate": 0.000406610041077233, + "loss": 3.3863, + "step": 7097 + }, + { + "epoch": 0.29886315789473683, + "grad_norm": 0.427734375, + "learning_rate": 0.00040658373701166043, + "loss": 3.2587, + "step": 7098 + }, + { + "epoch": 0.29890526315789473, + "grad_norm": 0.38671875, + "learning_rate": 0.00040655743009331626, + "loss": 3.4349, + "step": 7099 + }, + { + "epoch": 0.29894736842105263, + "grad_norm": 0.484375, + "learning_rate": 0.00040653112032267946, + "loss": 2.6972, + "step": 7100 + }, + { + "epoch": 0.2989894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0004065048077002297, + "loss": 3.3591, + "step": 7101 + }, + { + "epoch": 0.2990315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.00040647849222644623, + "loss": 3.4455, + "step": 7102 + }, + { + "epoch": 0.2990736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.00040645217390180844, + "loss": 2.7638, + "step": 7103 + }, + { + "epoch": 0.2991157894736842, + "grad_norm": 0.38671875, + "learning_rate": 0.0004064258527267959, + "loss": 3.0703, + "step": 7104 + }, + { + "epoch": 0.2991578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00040639952870188815, + "loss": 3.5365, + "step": 7105 + }, + { + "epoch": 0.2992, + "grad_norm": 0.388671875, + "learning_rate": 0.00040637320182756466, + "loss": 3.0702, + "step": 7106 + }, + { + "epoch": 0.2992421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0004063468721043052, + "loss": 3.2313, + "step": 7107 + }, + { + "epoch": 0.29928421052631576, + "grad_norm": 0.412109375, + "learning_rate": 0.0004063205395325894, + "loss": 3.6548, + "step": 7108 + }, + { + "epoch": 0.29932631578947366, + "grad_norm": 0.39453125, + "learning_rate": 0.00040629420411289707, + "loss": 3.2564, + "step": 7109 + }, + { + "epoch": 0.29936842105263156, + "grad_norm": 0.40625, + "learning_rate": 0.0004062678658457081, + "loss": 3.2673, + "step": 7110 + }, + { + "epoch": 0.29941052631578946, + "grad_norm": 0.416015625, + "learning_rate": 0.00040624152473150207, + "loss": 3.1549, + "step": 7111 + }, + { + "epoch": 0.29945263157894736, + "grad_norm": 0.4140625, + "learning_rate": 0.000406215180770759, + "loss": 3.2754, + "step": 7112 + }, + { + "epoch": 0.29949473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.000406188833963959, + "loss": 3.4655, + "step": 7113 + }, + { + "epoch": 0.29953684210526316, + "grad_norm": 0.435546875, + "learning_rate": 0.0004061624843115819, + "loss": 3.1522, + "step": 7114 + }, + { + "epoch": 0.29957894736842106, + "grad_norm": 0.392578125, + "learning_rate": 0.0004061361318141078, + "loss": 3.1621, + "step": 7115 + }, + { + "epoch": 0.29962105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.00040610977647201693, + "loss": 3.0234, + "step": 7116 + }, + { + "epoch": 0.29966315789473685, + "grad_norm": 0.416015625, + "learning_rate": 0.00040608341828578926, + "loss": 3.4838, + "step": 7117 + }, + { + "epoch": 0.29970526315789475, + "grad_norm": 0.41796875, + "learning_rate": 0.00040605705725590513, + "loss": 3.2417, + "step": 7118 + }, + { + "epoch": 0.29974736842105265, + "grad_norm": 0.447265625, + "learning_rate": 0.00040603069338284483, + "loss": 3.6645, + "step": 7119 + }, + { + "epoch": 0.29978947368421055, + "grad_norm": 0.39453125, + "learning_rate": 0.00040600432666708846, + "loss": 3.2095, + "step": 7120 + }, + { + "epoch": 0.2998315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.00040597795710911664, + "loss": 3.4361, + "step": 7121 + }, + { + "epoch": 0.2998736842105263, + "grad_norm": 0.39453125, + "learning_rate": 0.0004059515847094098, + "loss": 3.1715, + "step": 7122 + }, + { + "epoch": 0.2999157894736842, + "grad_norm": 0.39453125, + "learning_rate": 0.00040592520946844824, + "loss": 3.204, + "step": 7123 + }, + { + "epoch": 0.2999578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0004058988313867126, + "loss": 3.4369, + "step": 7124 + }, + { + "epoch": 0.3, + "grad_norm": 0.412109375, + "learning_rate": 0.0004058724504646834, + "loss": 2.9328, + "step": 7125 + }, + { + "epoch": 0.3000421052631579, + "grad_norm": 0.4921875, + "learning_rate": 0.0004058460667028413, + "loss": 3.117, + "step": 7126 + }, + { + "epoch": 0.3000842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.0004058196801016669, + "loss": 3.5972, + "step": 7127 + }, + { + "epoch": 0.3001263157894737, + "grad_norm": 0.3984375, + "learning_rate": 0.0004057932906616412, + "loss": 3.9995, + "step": 7128 + }, + { + "epoch": 0.3001684210526316, + "grad_norm": 0.37890625, + "learning_rate": 0.00040576689838324465, + "loss": 3.2179, + "step": 7129 + }, + { + "epoch": 0.3002105263157895, + "grad_norm": 0.37109375, + "learning_rate": 0.0004057405032669582, + "loss": 2.4477, + "step": 7130 + }, + { + "epoch": 0.3002526315789474, + "grad_norm": 0.40234375, + "learning_rate": 0.00040571410531326294, + "loss": 3.2257, + "step": 7131 + }, + { + "epoch": 0.3002947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.00040568770452263945, + "loss": 3.3107, + "step": 7132 + }, + { + "epoch": 0.3003368421052632, + "grad_norm": 0.435546875, + "learning_rate": 0.000405661300895569, + "loss": 3.4885, + "step": 7133 + }, + { + "epoch": 0.3003789473684211, + "grad_norm": 0.421875, + "learning_rate": 0.00040563489443253253, + "loss": 2.8229, + "step": 7134 + }, + { + "epoch": 0.3004210526315789, + "grad_norm": 0.423828125, + "learning_rate": 0.00040560848513401115, + "loss": 3.3754, + "step": 7135 + }, + { + "epoch": 0.3004631578947368, + "grad_norm": 0.388671875, + "learning_rate": 0.000405582073000486, + "loss": 3.1276, + "step": 7136 + }, + { + "epoch": 0.3005052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0004055556580324383, + "loss": 2.8902, + "step": 7137 + }, + { + "epoch": 0.3005473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.0004055292402303492, + "loss": 3.3738, + "step": 7138 + }, + { + "epoch": 0.3005894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.0004055028195947002, + "loss": 3.2514, + "step": 7139 + }, + { + "epoch": 0.3006315789473684, + "grad_norm": 0.396484375, + "learning_rate": 0.0004054763961259724, + "loss": 2.9669, + "step": 7140 + }, + { + "epoch": 0.3006736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.00040544996982464746, + "loss": 2.9713, + "step": 7141 + }, + { + "epoch": 0.3007157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0004054235406912067, + "loss": 3.2531, + "step": 7142 + }, + { + "epoch": 0.3007578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.0004053971087261316, + "loss": 3.147, + "step": 7143 + }, + { + "epoch": 0.3008, + "grad_norm": 0.416015625, + "learning_rate": 0.0004053706739299038, + "loss": 3.5368, + "step": 7144 + }, + { + "epoch": 0.3008421052631579, + "grad_norm": 0.5, + "learning_rate": 0.00040534423630300486, + "loss": 3.1637, + "step": 7145 + }, + { + "epoch": 0.3008842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.00040531779584591644, + "loss": 3.4304, + "step": 7146 + }, + { + "epoch": 0.3009263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.0004052913525591203, + "loss": 3.2401, + "step": 7147 + }, + { + "epoch": 0.30096842105263155, + "grad_norm": 0.427734375, + "learning_rate": 0.0004052649064430981, + "loss": 3.2303, + "step": 7148 + }, + { + "epoch": 0.30101052631578945, + "grad_norm": 0.384765625, + "learning_rate": 0.00040523845749833185, + "loss": 3.2858, + "step": 7149 + }, + { + "epoch": 0.30105263157894735, + "grad_norm": 0.4140625, + "learning_rate": 0.00040521200572530327, + "loss": 3.3665, + "step": 7150 + }, + { + "epoch": 0.30109473684210525, + "grad_norm": 0.427734375, + "learning_rate": 0.00040518555112449426, + "loss": 3.6882, + "step": 7151 + }, + { + "epoch": 0.30113684210526315, + "grad_norm": 0.40234375, + "learning_rate": 0.00040515909369638685, + "loss": 3.1602, + "step": 7152 + }, + { + "epoch": 0.30117894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0004051326334414631, + "loss": 3.0372, + "step": 7153 + }, + { + "epoch": 0.30122105263157894, + "grad_norm": 0.421875, + "learning_rate": 0.00040510617036020497, + "loss": 3.3583, + "step": 7154 + }, + { + "epoch": 0.30126315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.00040507970445309474, + "loss": 3.4743, + "step": 7155 + }, + { + "epoch": 0.30130526315789474, + "grad_norm": 0.38671875, + "learning_rate": 0.00040505323572061447, + "loss": 2.8635, + "step": 7156 + }, + { + "epoch": 0.30134736842105264, + "grad_norm": 0.41796875, + "learning_rate": 0.00040502676416324647, + "loss": 3.1104, + "step": 7157 + }, + { + "epoch": 0.30138947368421054, + "grad_norm": 0.421875, + "learning_rate": 0.0004050002897814729, + "loss": 3.6362, + "step": 7158 + }, + { + "epoch": 0.30143157894736844, + "grad_norm": 0.41796875, + "learning_rate": 0.0004049738125757762, + "loss": 3.1378, + "step": 7159 + }, + { + "epoch": 0.30147368421052634, + "grad_norm": 1.09375, + "learning_rate": 0.00040494733254663873, + "loss": 3.0104, + "step": 7160 + }, + { + "epoch": 0.30151578947368424, + "grad_norm": 0.435546875, + "learning_rate": 0.0004049208496945429, + "loss": 3.0925, + "step": 7161 + }, + { + "epoch": 0.3015578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0004048943640199712, + "loss": 3.2423, + "step": 7162 + }, + { + "epoch": 0.3016, + "grad_norm": 0.4140625, + "learning_rate": 0.0004048678755234062, + "loss": 3.5937, + "step": 7163 + }, + { + "epoch": 0.3016421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.0004048413842053305, + "loss": 3.3515, + "step": 7164 + }, + { + "epoch": 0.3016842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00040481489006622675, + "loss": 3.0265, + "step": 7165 + }, + { + "epoch": 0.3017263157894737, + "grad_norm": 0.396484375, + "learning_rate": 0.0004047883931065775, + "loss": 3.1981, + "step": 7166 + }, + { + "epoch": 0.3017684210526316, + "grad_norm": 0.400390625, + "learning_rate": 0.0004047618933268656, + "loss": 2.9761, + "step": 7167 + }, + { + "epoch": 0.3018105263157895, + "grad_norm": 0.44140625, + "learning_rate": 0.00040473539072757386, + "loss": 3.0301, + "step": 7168 + }, + { + "epoch": 0.3018526315789474, + "grad_norm": 0.41015625, + "learning_rate": 0.00040470888530918516, + "loss": 2.8747, + "step": 7169 + }, + { + "epoch": 0.30189473684210527, + "grad_norm": 0.435546875, + "learning_rate": 0.0004046823770721823, + "loss": 3.4779, + "step": 7170 + }, + { + "epoch": 0.30193684210526317, + "grad_norm": 0.419921875, + "learning_rate": 0.0004046558660170483, + "loss": 2.9559, + "step": 7171 + }, + { + "epoch": 0.30197894736842107, + "grad_norm": 0.4140625, + "learning_rate": 0.0004046293521442661, + "loss": 3.3375, + "step": 7172 + }, + { + "epoch": 0.30202105263157897, + "grad_norm": 0.41796875, + "learning_rate": 0.00040460283545431884, + "loss": 3.4059, + "step": 7173 + }, + { + "epoch": 0.30206315789473687, + "grad_norm": 0.474609375, + "learning_rate": 0.00040457631594768953, + "loss": 2.9437, + "step": 7174 + }, + { + "epoch": 0.3021052631578947, + "grad_norm": 0.392578125, + "learning_rate": 0.00040454979362486134, + "loss": 3.3757, + "step": 7175 + }, + { + "epoch": 0.3021473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.0004045232684863175, + "loss": 3.1857, + "step": 7176 + }, + { + "epoch": 0.3021894736842105, + "grad_norm": 0.404296875, + "learning_rate": 0.00040449674053254133, + "loss": 3.1325, + "step": 7177 + }, + { + "epoch": 0.3022315789473684, + "grad_norm": 0.396484375, + "learning_rate": 0.00040447020976401603, + "loss": 3.4661, + "step": 7178 + }, + { + "epoch": 0.3022736842105263, + "grad_norm": 0.3984375, + "learning_rate": 0.00040444367618122504, + "loss": 3.0325, + "step": 7179 + }, + { + "epoch": 0.3023157894736842, + "grad_norm": 0.392578125, + "learning_rate": 0.00040441713978465164, + "loss": 3.2169, + "step": 7180 + }, + { + "epoch": 0.3023578947368421, + "grad_norm": 0.484375, + "learning_rate": 0.00040439060057477944, + "loss": 3.0947, + "step": 7181 + }, + { + "epoch": 0.3024, + "grad_norm": 0.435546875, + "learning_rate": 0.00040436405855209193, + "loss": 3.5381, + "step": 7182 + }, + { + "epoch": 0.3024421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.0004043375137170726, + "loss": 2.8737, + "step": 7183 + }, + { + "epoch": 0.3024842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.0004043109660702051, + "loss": 2.9908, + "step": 7184 + }, + { + "epoch": 0.3025263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00040428441561197315, + "loss": 3.5554, + "step": 7185 + }, + { + "epoch": 0.3025684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.00040425786234286047, + "loss": 3.256, + "step": 7186 + }, + { + "epoch": 0.3026105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.0004042313062633507, + "loss": 3.2908, + "step": 7187 + }, + { + "epoch": 0.3026526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.00040420474737392774, + "loss": 2.9989, + "step": 7188 + }, + { + "epoch": 0.30269473684210524, + "grad_norm": 0.462890625, + "learning_rate": 0.00040417818567507546, + "loss": 3.3374, + "step": 7189 + }, + { + "epoch": 0.30273684210526314, + "grad_norm": 0.5390625, + "learning_rate": 0.0004041516211672779, + "loss": 3.3622, + "step": 7190 + }, + { + "epoch": 0.30277894736842104, + "grad_norm": 0.400390625, + "learning_rate": 0.00040412505385101883, + "loss": 3.219, + "step": 7191 + }, + { + "epoch": 0.30282105263157894, + "grad_norm": 0.443359375, + "learning_rate": 0.00040409848372678235, + "loss": 2.3298, + "step": 7192 + }, + { + "epoch": 0.30286315789473683, + "grad_norm": 0.498046875, + "learning_rate": 0.00040407191079505265, + "loss": 3.6835, + "step": 7193 + }, + { + "epoch": 0.30290526315789473, + "grad_norm": 0.427734375, + "learning_rate": 0.00040404533505631365, + "loss": 3.3307, + "step": 7194 + }, + { + "epoch": 0.30294736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0004040187565110497, + "loss": 3.0551, + "step": 7195 + }, + { + "epoch": 0.30298947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.00040399217515974493, + "loss": 3.2392, + "step": 7196 + }, + { + "epoch": 0.30303157894736843, + "grad_norm": 0.451171875, + "learning_rate": 0.00040396559100288373, + "loss": 3.4133, + "step": 7197 + }, + { + "epoch": 0.30307368421052633, + "grad_norm": 0.42578125, + "learning_rate": 0.0004039390040409503, + "loss": 3.6795, + "step": 7198 + }, + { + "epoch": 0.3031157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.00040391241427442914, + "loss": 3.3116, + "step": 7199 + }, + { + "epoch": 0.3031578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0004038858217038046, + "loss": 3.4817, + "step": 7200 + }, + { + "epoch": 0.3032, + "grad_norm": 0.423828125, + "learning_rate": 0.00040385922632956117, + "loss": 3.3277, + "step": 7201 + }, + { + "epoch": 0.30324210526315787, + "grad_norm": 0.408203125, + "learning_rate": 0.00040383262815218345, + "loss": 3.8246, + "step": 7202 + }, + { + "epoch": 0.30328421052631577, + "grad_norm": 0.40234375, + "learning_rate": 0.00040380602717215595, + "loss": 3.267, + "step": 7203 + }, + { + "epoch": 0.30332631578947367, + "grad_norm": 0.419921875, + "learning_rate": 0.00040377942338996344, + "loss": 2.916, + "step": 7204 + }, + { + "epoch": 0.30336842105263156, + "grad_norm": 0.46484375, + "learning_rate": 0.00040375281680609044, + "loss": 2.9805, + "step": 7205 + }, + { + "epoch": 0.30341052631578946, + "grad_norm": 0.431640625, + "learning_rate": 0.00040372620742102185, + "loss": 3.043, + "step": 7206 + }, + { + "epoch": 0.30345263157894736, + "grad_norm": 0.447265625, + "learning_rate": 0.0004036995952352424, + "loss": 3.1854, + "step": 7207 + }, + { + "epoch": 0.30349473684210526, + "grad_norm": 0.400390625, + "learning_rate": 0.0004036729802492367, + "loss": 2.9862, + "step": 7208 + }, + { + "epoch": 0.30353684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.00040364636246349007, + "loss": 3.6425, + "step": 7209 + }, + { + "epoch": 0.30357894736842106, + "grad_norm": 0.458984375, + "learning_rate": 0.00040361974187848716, + "loss": 3.3296, + "step": 7210 + }, + { + "epoch": 0.30362105263157896, + "grad_norm": 0.4296875, + "learning_rate": 0.000403593118494713, + "loss": 3.2004, + "step": 7211 + }, + { + "epoch": 0.30366315789473686, + "grad_norm": 0.44140625, + "learning_rate": 0.00040356649231265283, + "loss": 2.9588, + "step": 7212 + }, + { + "epoch": 0.30370526315789476, + "grad_norm": 0.3984375, + "learning_rate": 0.00040353986333279145, + "loss": 3.1981, + "step": 7213 + }, + { + "epoch": 0.30374736842105265, + "grad_norm": 0.439453125, + "learning_rate": 0.00040351323155561424, + "loss": 3.204, + "step": 7214 + }, + { + "epoch": 0.30378947368421055, + "grad_norm": 0.41015625, + "learning_rate": 0.00040348659698160634, + "loss": 3.2757, + "step": 7215 + }, + { + "epoch": 0.3038315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.000403459959611253, + "loss": 3.4009, + "step": 7216 + }, + { + "epoch": 0.3038736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.00040343331944503946, + "loss": 3.3942, + "step": 7217 + }, + { + "epoch": 0.3039157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0004034066764834511, + "loss": 3.4926, + "step": 7218 + }, + { + "epoch": 0.3039578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.0004033800307269734, + "loss": 3.1336, + "step": 7219 + }, + { + "epoch": 0.304, + "grad_norm": 0.5859375, + "learning_rate": 0.0004033533821760917, + "loss": 3.2857, + "step": 7220 + }, + { + "epoch": 0.3040421052631579, + "grad_norm": 0.453125, + "learning_rate": 0.00040332673083129156, + "loss": 2.7415, + "step": 7221 + }, + { + "epoch": 0.3040842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.0004033000766930585, + "loss": 3.2251, + "step": 7222 + }, + { + "epoch": 0.3041263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00040327341976187825, + "loss": 3.1824, + "step": 7223 + }, + { + "epoch": 0.3041684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.00040324676003823635, + "loss": 3.2979, + "step": 7224 + }, + { + "epoch": 0.3042105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.0004032200975226185, + "loss": 3.3774, + "step": 7225 + }, + { + "epoch": 0.3042526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00040319343221551055, + "loss": 3.4369, + "step": 7226 + }, + { + "epoch": 0.3042947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.00040316676411739826, + "loss": 2.9629, + "step": 7227 + }, + { + "epoch": 0.3043368421052632, + "grad_norm": 0.392578125, + "learning_rate": 0.00040314009322876744, + "loss": 3.4383, + "step": 7228 + }, + { + "epoch": 0.304378947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00040311341955010405, + "loss": 3.3273, + "step": 7229 + }, + { + "epoch": 0.3044210526315789, + "grad_norm": 0.3984375, + "learning_rate": 0.0004030867430818941, + "loss": 3.1951, + "step": 7230 + }, + { + "epoch": 0.3044631578947368, + "grad_norm": 0.408203125, + "learning_rate": 0.00040306006382462354, + "loss": 3.4609, + "step": 7231 + }, + { + "epoch": 0.3045052631578947, + "grad_norm": 0.46484375, + "learning_rate": 0.00040303338177877853, + "loss": 3.4164, + "step": 7232 + }, + { + "epoch": 0.3045473684210526, + "grad_norm": 0.412109375, + "learning_rate": 0.00040300669694484507, + "loss": 2.8759, + "step": 7233 + }, + { + "epoch": 0.3045894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.0004029800093233093, + "loss": 3.2779, + "step": 7234 + }, + { + "epoch": 0.3046315789473684, + "grad_norm": 0.4609375, + "learning_rate": 0.00040295331891465754, + "loss": 3.1125, + "step": 7235 + }, + { + "epoch": 0.3046736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.000402926625719376, + "loss": 3.0441, + "step": 7236 + }, + { + "epoch": 0.3047157894736842, + "grad_norm": 0.404296875, + "learning_rate": 0.0004028999297379511, + "loss": 3.3236, + "step": 7237 + }, + { + "epoch": 0.3047578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0004028732309708691, + "loss": 3.4255, + "step": 7238 + }, + { + "epoch": 0.3048, + "grad_norm": 0.42578125, + "learning_rate": 0.0004028465294186164, + "loss": 3.0669, + "step": 7239 + }, + { + "epoch": 0.3048421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.0004028198250816796, + "loss": 3.6884, + "step": 7240 + }, + { + "epoch": 0.3048842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.000402793117960545, + "loss": 3.4626, + "step": 7241 + }, + { + "epoch": 0.3049263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0004027664080556994, + "loss": 3.215, + "step": 7242 + }, + { + "epoch": 0.30496842105263156, + "grad_norm": 0.4375, + "learning_rate": 0.00040273969536762947, + "loss": 3.3726, + "step": 7243 + }, + { + "epoch": 0.30501052631578945, + "grad_norm": 0.41796875, + "learning_rate": 0.00040271297989682156, + "loss": 3.3472, + "step": 7244 + }, + { + "epoch": 0.30505263157894735, + "grad_norm": 0.408203125, + "learning_rate": 0.00040268626164376267, + "loss": 3.553, + "step": 7245 + }, + { + "epoch": 0.30509473684210525, + "grad_norm": 0.392578125, + "learning_rate": 0.0004026595406089395, + "loss": 3.4587, + "step": 7246 + }, + { + "epoch": 0.30513684210526315, + "grad_norm": 0.470703125, + "learning_rate": 0.0004026328167928388, + "loss": 2.9574, + "step": 7247 + }, + { + "epoch": 0.30517894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00040260609019594757, + "loss": 3.205, + "step": 7248 + }, + { + "epoch": 0.30522105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.00040257936081875267, + "loss": 2.8901, + "step": 7249 + }, + { + "epoch": 0.30526315789473685, + "grad_norm": 0.400390625, + "learning_rate": 0.00040255262866174113, + "loss": 3.8291, + "step": 7250 + }, + { + "epoch": 0.30530526315789475, + "grad_norm": 0.439453125, + "learning_rate": 0.00040252589372539993, + "loss": 3.3451, + "step": 7251 + }, + { + "epoch": 0.30534736842105265, + "grad_norm": 0.447265625, + "learning_rate": 0.0004024991560102161, + "loss": 2.8464, + "step": 7252 + }, + { + "epoch": 0.30538947368421054, + "grad_norm": 0.427734375, + "learning_rate": 0.00040247241551667686, + "loss": 3.1878, + "step": 7253 + }, + { + "epoch": 0.30543157894736844, + "grad_norm": 0.4140625, + "learning_rate": 0.0004024456722452694, + "loss": 3.3616, + "step": 7254 + }, + { + "epoch": 0.30547368421052634, + "grad_norm": 0.396484375, + "learning_rate": 0.0004024189261964808, + "loss": 3.2905, + "step": 7255 + }, + { + "epoch": 0.3055157894736842, + "grad_norm": 0.40625, + "learning_rate": 0.00040239217737079845, + "loss": 3.0141, + "step": 7256 + }, + { + "epoch": 0.3055578947368421, + "grad_norm": 0.404296875, + "learning_rate": 0.0004023654257687098, + "loss": 3.0116, + "step": 7257 + }, + { + "epoch": 0.3056, + "grad_norm": 0.447265625, + "learning_rate": 0.00040233867139070206, + "loss": 2.9015, + "step": 7258 + }, + { + "epoch": 0.3056421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00040231191423726275, + "loss": 3.3728, + "step": 7259 + }, + { + "epoch": 0.3056842105263158, + "grad_norm": 0.5859375, + "learning_rate": 0.00040228515430887926, + "loss": 3.584, + "step": 7260 + }, + { + "epoch": 0.3057263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0004022583916060392, + "loss": 3.2033, + "step": 7261 + }, + { + "epoch": 0.3057684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.00040223162612923013, + "loss": 3.2497, + "step": 7262 + }, + { + "epoch": 0.3058105263157895, + "grad_norm": 0.56640625, + "learning_rate": 0.00040220485787893973, + "loss": 3.3595, + "step": 7263 + }, + { + "epoch": 0.3058526315789474, + "grad_norm": 0.470703125, + "learning_rate": 0.0004021780868556556, + "loss": 3.1713, + "step": 7264 + }, + { + "epoch": 0.3058947368421053, + "grad_norm": 0.46484375, + "learning_rate": 0.00040215131305986563, + "loss": 3.533, + "step": 7265 + }, + { + "epoch": 0.3059368421052632, + "grad_norm": 0.458984375, + "learning_rate": 0.00040212453649205745, + "loss": 3.5698, + "step": 7266 + }, + { + "epoch": 0.3059789473684211, + "grad_norm": 0.396484375, + "learning_rate": 0.00040209775715271896, + "loss": 3.1792, + "step": 7267 + }, + { + "epoch": 0.30602105263157897, + "grad_norm": 0.412109375, + "learning_rate": 0.000402070975042338, + "loss": 3.3752, + "step": 7268 + }, + { + "epoch": 0.3060631578947368, + "grad_norm": 0.45703125, + "learning_rate": 0.0004020441901614026, + "loss": 3.1684, + "step": 7269 + }, + { + "epoch": 0.3061052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.0004020174025104006, + "loss": 3.2718, + "step": 7270 + }, + { + "epoch": 0.3061473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.00040199061208982036, + "loss": 2.7433, + "step": 7271 + }, + { + "epoch": 0.3061894736842105, + "grad_norm": 0.404296875, + "learning_rate": 0.00040196381890014954, + "loss": 3.2036, + "step": 7272 + }, + { + "epoch": 0.3062315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00040193702294187664, + "loss": 3.2174, + "step": 7273 + }, + { + "epoch": 0.3062736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.0004019102242154896, + "loss": 3.0607, + "step": 7274 + }, + { + "epoch": 0.3063157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.0004018834227214767, + "loss": 3.1852, + "step": 7275 + }, + { + "epoch": 0.3063578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0004018566184603264, + "loss": 2.9697, + "step": 7276 + }, + { + "epoch": 0.3064, + "grad_norm": 0.44921875, + "learning_rate": 0.00040182981143252694, + "loss": 2.9302, + "step": 7277 + }, + { + "epoch": 0.3064421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.0004018030016385667, + "loss": 3.4449, + "step": 7278 + }, + { + "epoch": 0.3064842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.0004017761890789341, + "loss": 3.0799, + "step": 7279 + }, + { + "epoch": 0.3065263157894737, + "grad_norm": 0.384765625, + "learning_rate": 0.0004017493737541177, + "loss": 2.6804, + "step": 7280 + }, + { + "epoch": 0.3065684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00040172255566460604, + "loss": 2.8409, + "step": 7281 + }, + { + "epoch": 0.3066105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.0004016957348108875, + "loss": 3.3747, + "step": 7282 + }, + { + "epoch": 0.30665263157894734, + "grad_norm": 0.4453125, + "learning_rate": 0.00040166891119345103, + "loss": 3.7477, + "step": 7283 + }, + { + "epoch": 0.30669473684210524, + "grad_norm": 0.39453125, + "learning_rate": 0.0004016420848127853, + "loss": 2.9036, + "step": 7284 + }, + { + "epoch": 0.30673684210526314, + "grad_norm": 0.3984375, + "learning_rate": 0.00040161525566937887, + "loss": 3.4695, + "step": 7285 + }, + { + "epoch": 0.30677894736842104, + "grad_norm": 0.458984375, + "learning_rate": 0.0004015884237637206, + "loss": 3.512, + "step": 7286 + }, + { + "epoch": 0.30682105263157894, + "grad_norm": 0.42578125, + "learning_rate": 0.00040156158909629934, + "loss": 3.3636, + "step": 7287 + }, + { + "epoch": 0.30686315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.0004015347516676041, + "loss": 3.222, + "step": 7288 + }, + { + "epoch": 0.30690526315789474, + "grad_norm": 0.40234375, + "learning_rate": 0.0004015079114781236, + "loss": 3.3031, + "step": 7289 + }, + { + "epoch": 0.30694736842105264, + "grad_norm": 0.578125, + "learning_rate": 0.0004014810685283471, + "loss": 3.0917, + "step": 7290 + }, + { + "epoch": 0.30698947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.0004014542228187634, + "loss": 3.6075, + "step": 7291 + }, + { + "epoch": 0.30703157894736843, + "grad_norm": 0.416015625, + "learning_rate": 0.0004014273743498618, + "loss": 2.7728, + "step": 7292 + }, + { + "epoch": 0.30707368421052633, + "grad_norm": 0.423828125, + "learning_rate": 0.00040140052312213133, + "loss": 3.0342, + "step": 7293 + }, + { + "epoch": 0.30711578947368423, + "grad_norm": 0.419921875, + "learning_rate": 0.00040137366913606124, + "loss": 3.2024, + "step": 7294 + }, + { + "epoch": 0.30715789473684213, + "grad_norm": 0.427734375, + "learning_rate": 0.00040134681239214066, + "loss": 3.0137, + "step": 7295 + }, + { + "epoch": 0.3072, + "grad_norm": 0.408203125, + "learning_rate": 0.00040131995289085907, + "loss": 3.5906, + "step": 7296 + }, + { + "epoch": 0.30724210526315787, + "grad_norm": 0.421875, + "learning_rate": 0.0004012930906327057, + "loss": 3.3314, + "step": 7297 + }, + { + "epoch": 0.30728421052631577, + "grad_norm": 0.435546875, + "learning_rate": 0.00040126622561817, + "loss": 3.1571, + "step": 7298 + }, + { + "epoch": 0.30732631578947367, + "grad_norm": 0.408203125, + "learning_rate": 0.0004012393578477414, + "loss": 2.8867, + "step": 7299 + }, + { + "epoch": 0.30736842105263157, + "grad_norm": 0.3984375, + "learning_rate": 0.0004012124873219094, + "loss": 3.1355, + "step": 7300 + }, + { + "epoch": 0.30741052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0004011856140411635, + "loss": 3.3088, + "step": 7301 + }, + { + "epoch": 0.30745263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.00040115873800599343, + "loss": 3.3906, + "step": 7302 + }, + { + "epoch": 0.30749473684210527, + "grad_norm": 0.423828125, + "learning_rate": 0.00040113185921688866, + "loss": 3.3675, + "step": 7303 + }, + { + "epoch": 0.30753684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.00040110497767433906, + "loss": 3.2245, + "step": 7304 + }, + { + "epoch": 0.30757894736842106, + "grad_norm": 0.3984375, + "learning_rate": 0.0004010780933788343, + "loss": 3.3727, + "step": 7305 + }, + { + "epoch": 0.30762105263157896, + "grad_norm": 0.43359375, + "learning_rate": 0.0004010512063308642, + "loss": 2.8313, + "step": 7306 + }, + { + "epoch": 0.30766315789473686, + "grad_norm": 0.40625, + "learning_rate": 0.0004010243165309186, + "loss": 3.0548, + "step": 7307 + }, + { + "epoch": 0.30770526315789476, + "grad_norm": 0.439453125, + "learning_rate": 0.00040099742397948737, + "loss": 2.9205, + "step": 7308 + }, + { + "epoch": 0.30774736842105266, + "grad_norm": 0.408203125, + "learning_rate": 0.00040097052867706047, + "loss": 2.7559, + "step": 7309 + }, + { + "epoch": 0.3077894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.00040094363062412796, + "loss": 3.5637, + "step": 7310 + }, + { + "epoch": 0.3078315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00040091672982117975, + "loss": 3.051, + "step": 7311 + }, + { + "epoch": 0.3078736842105263, + "grad_norm": 0.5078125, + "learning_rate": 0.00040088982626870616, + "loss": 2.5523, + "step": 7312 + }, + { + "epoch": 0.3079157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00040086291996719724, + "loss": 3.0535, + "step": 7313 + }, + { + "epoch": 0.3079578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0004008360109171432, + "loss": 3.2792, + "step": 7314 + }, + { + "epoch": 0.308, + "grad_norm": 0.435546875, + "learning_rate": 0.00040080909911903407, + "loss": 2.986, + "step": 7315 + }, + { + "epoch": 0.3080421052631579, + "grad_norm": 0.3828125, + "learning_rate": 0.0004007821845733605, + "loss": 3.0661, + "step": 7316 + }, + { + "epoch": 0.3080842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.00040075526728061266, + "loss": 2.871, + "step": 7317 + }, + { + "epoch": 0.3081263157894737, + "grad_norm": 0.3828125, + "learning_rate": 0.000400728347241281, + "loss": 3.1113, + "step": 7318 + }, + { + "epoch": 0.3081684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.0004007014244558559, + "loss": 3.4014, + "step": 7319 + }, + { + "epoch": 0.3082105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.000400674498924828, + "loss": 3.5296, + "step": 7320 + }, + { + "epoch": 0.3082526315789474, + "grad_norm": 0.59375, + "learning_rate": 0.0004006475706486877, + "loss": 2.8545, + "step": 7321 + }, + { + "epoch": 0.3082947368421053, + "grad_norm": 0.466796875, + "learning_rate": 0.0004006206396279257, + "loss": 3.2672, + "step": 7322 + }, + { + "epoch": 0.30833684210526313, + "grad_norm": 0.46484375, + "learning_rate": 0.00040059370586303255, + "loss": 3.1239, + "step": 7323 + }, + { + "epoch": 0.30837894736842103, + "grad_norm": 0.462890625, + "learning_rate": 0.0004005667693544991, + "loss": 3.2319, + "step": 7324 + }, + { + "epoch": 0.30842105263157893, + "grad_norm": 0.4375, + "learning_rate": 0.00040053983010281596, + "loss": 3.4342, + "step": 7325 + }, + { + "epoch": 0.30846315789473683, + "grad_norm": 0.416015625, + "learning_rate": 0.000400512888108474, + "loss": 3.5286, + "step": 7326 + }, + { + "epoch": 0.3085052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.0004004859433719641, + "loss": 3.2997, + "step": 7327 + }, + { + "epoch": 0.3085473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.0004004589958937771, + "loss": 3.1088, + "step": 7328 + }, + { + "epoch": 0.3085894736842105, + "grad_norm": 0.96875, + "learning_rate": 0.0004004320456744039, + "loss": 3.0759, + "step": 7329 + }, + { + "epoch": 0.3086315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.0004004050927143358, + "loss": 2.9664, + "step": 7330 + }, + { + "epoch": 0.3086736842105263, + "grad_norm": 0.46875, + "learning_rate": 0.00040037813701406346, + "loss": 3.0985, + "step": 7331 + }, + { + "epoch": 0.3087157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00040035117857407823, + "loss": 3.1973, + "step": 7332 + }, + { + "epoch": 0.3087578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0004003242173948711, + "loss": 2.8775, + "step": 7333 + }, + { + "epoch": 0.3088, + "grad_norm": 0.3828125, + "learning_rate": 0.0004002972534769334, + "loss": 3.2265, + "step": 7334 + }, + { + "epoch": 0.3088421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00040027028682075626, + "loss": 3.2896, + "step": 7335 + }, + { + "epoch": 0.3088842105263158, + "grad_norm": 0.439453125, + "learning_rate": 0.00040024331742683113, + "loss": 3.3278, + "step": 7336 + }, + { + "epoch": 0.30892631578947366, + "grad_norm": 0.41796875, + "learning_rate": 0.0004002163452956493, + "loss": 3.1903, + "step": 7337 + }, + { + "epoch": 0.30896842105263156, + "grad_norm": 0.396484375, + "learning_rate": 0.0004001893704277021, + "loss": 3.2786, + "step": 7338 + }, + { + "epoch": 0.30901052631578946, + "grad_norm": 0.3984375, + "learning_rate": 0.0004001623928234811, + "loss": 3.4059, + "step": 7339 + }, + { + "epoch": 0.30905263157894736, + "grad_norm": 0.416015625, + "learning_rate": 0.0004001354124834777, + "loss": 3.2033, + "step": 7340 + }, + { + "epoch": 0.30909473684210526, + "grad_norm": 0.451171875, + "learning_rate": 0.0004001084294081835, + "loss": 3.5038, + "step": 7341 + }, + { + "epoch": 0.30913684210526315, + "grad_norm": 0.435546875, + "learning_rate": 0.00040008144359809, + "loss": 3.5789, + "step": 7342 + }, + { + "epoch": 0.30917894736842105, + "grad_norm": 0.40234375, + "learning_rate": 0.000400054455053689, + "loss": 3.352, + "step": 7343 + }, + { + "epoch": 0.30922105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.0004000274637754722, + "loss": 2.9182, + "step": 7344 + }, + { + "epoch": 0.30926315789473685, + "grad_norm": 0.41796875, + "learning_rate": 0.0004000004697639312, + "loss": 2.9284, + "step": 7345 + }, + { + "epoch": 0.30930526315789475, + "grad_norm": 0.462890625, + "learning_rate": 0.0003999734730195579, + "loss": 3.6692, + "step": 7346 + }, + { + "epoch": 0.30934736842105265, + "grad_norm": 0.5390625, + "learning_rate": 0.0003999464735428442, + "loss": 3.4437, + "step": 7347 + }, + { + "epoch": 0.30938947368421055, + "grad_norm": 0.47265625, + "learning_rate": 0.00039991947133428187, + "loss": 3.6116, + "step": 7348 + }, + { + "epoch": 0.30943157894736845, + "grad_norm": 0.41796875, + "learning_rate": 0.0003998924663943629, + "loss": 3.4609, + "step": 7349 + }, + { + "epoch": 0.3094736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.0003998654587235793, + "loss": 3.1128, + "step": 7350 + }, + { + "epoch": 0.3095157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.0003998384483224232, + "loss": 3.2029, + "step": 7351 + }, + { + "epoch": 0.3095578947368421, + "grad_norm": 0.390625, + "learning_rate": 0.00039981143519138664, + "loss": 3.0567, + "step": 7352 + }, + { + "epoch": 0.3096, + "grad_norm": 0.412109375, + "learning_rate": 0.00039978441933096164, + "loss": 3.2865, + "step": 7353 + }, + { + "epoch": 0.3096421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0003997574007416406, + "loss": 3.7253, + "step": 7354 + }, + { + "epoch": 0.3096842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.0003997303794239157, + "loss": 2.9425, + "step": 7355 + }, + { + "epoch": 0.3097263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00039970335537827906, + "loss": 3.2557, + "step": 7356 + }, + { + "epoch": 0.3097684210526316, + "grad_norm": 0.45703125, + "learning_rate": 0.0003996763286052233, + "loss": 3.7003, + "step": 7357 + }, + { + "epoch": 0.3098105263157895, + "grad_norm": 0.400390625, + "learning_rate": 0.00039964929910524065, + "loss": 3.0516, + "step": 7358 + }, + { + "epoch": 0.3098526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.00039962226687882365, + "loss": 2.9853, + "step": 7359 + }, + { + "epoch": 0.3098947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.0003995952319264647, + "loss": 3.5054, + "step": 7360 + }, + { + "epoch": 0.3099368421052632, + "grad_norm": 0.41015625, + "learning_rate": 0.00039956819424865643, + "loss": 3.1239, + "step": 7361 + }, + { + "epoch": 0.3099789473684211, + "grad_norm": 0.40625, + "learning_rate": 0.0003995411538458913, + "loss": 3.2333, + "step": 7362 + }, + { + "epoch": 0.310021052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.00039951411071866217, + "loss": 3.22, + "step": 7363 + }, + { + "epoch": 0.3100631578947368, + "grad_norm": 0.388671875, + "learning_rate": 0.0003994870648674616, + "loss": 2.7908, + "step": 7364 + }, + { + "epoch": 0.3101052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00039946001629278226, + "loss": 3.411, + "step": 7365 + }, + { + "epoch": 0.3101473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.00039943296499511715, + "loss": 3.3301, + "step": 7366 + }, + { + "epoch": 0.3101894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.0003994059109749589, + "loss": 2.9942, + "step": 7367 + }, + { + "epoch": 0.3102315789473684, + "grad_norm": 0.39453125, + "learning_rate": 0.00039937885423280064, + "loss": 3.3597, + "step": 7368 + }, + { + "epoch": 0.3102736842105263, + "grad_norm": 0.44140625, + "learning_rate": 0.000399351794769135, + "loss": 3.2265, + "step": 7369 + }, + { + "epoch": 0.3103157894736842, + "grad_norm": 0.40234375, + "learning_rate": 0.0003993247325844552, + "loss": 3.1726, + "step": 7370 + }, + { + "epoch": 0.3103578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.00039929766767925423, + "loss": 3.0721, + "step": 7371 + }, + { + "epoch": 0.3104, + "grad_norm": 0.486328125, + "learning_rate": 0.00039927060005402515, + "loss": 3.5639, + "step": 7372 + }, + { + "epoch": 0.3104421052631579, + "grad_norm": 0.400390625, + "learning_rate": 0.0003992435297092611, + "loss": 3.814, + "step": 7373 + }, + { + "epoch": 0.3104842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00039921645664545537, + "loss": 2.988, + "step": 7374 + }, + { + "epoch": 0.3105263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.0003991893808631011, + "loss": 3.612, + "step": 7375 + }, + { + "epoch": 0.3105684210526316, + "grad_norm": 0.53515625, + "learning_rate": 0.0003991623023626916, + "loss": 3.248, + "step": 7376 + }, + { + "epoch": 0.31061052631578945, + "grad_norm": 0.4140625, + "learning_rate": 0.00039913522114472016, + "loss": 3.1642, + "step": 7377 + }, + { + "epoch": 0.31065263157894735, + "grad_norm": 0.39453125, + "learning_rate": 0.00039910813720968026, + "loss": 3.2375, + "step": 7378 + }, + { + "epoch": 0.31069473684210525, + "grad_norm": 0.4140625, + "learning_rate": 0.0003990810505580653, + "loss": 3.0129, + "step": 7379 + }, + { + "epoch": 0.31073684210526314, + "grad_norm": 0.44140625, + "learning_rate": 0.00039905396119036874, + "loss": 2.8426, + "step": 7380 + }, + { + "epoch": 0.31077894736842104, + "grad_norm": 0.427734375, + "learning_rate": 0.0003990268691070841, + "loss": 3.0598, + "step": 7381 + }, + { + "epoch": 0.31082105263157894, + "grad_norm": 0.412109375, + "learning_rate": 0.0003989997743087051, + "loss": 3.1521, + "step": 7382 + }, + { + "epoch": 0.31086315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.00039897267679572524, + "loss": 2.8502, + "step": 7383 + }, + { + "epoch": 0.31090526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.0003989455765686382, + "loss": 3.3394, + "step": 7384 + }, + { + "epoch": 0.31094736842105264, + "grad_norm": 0.4453125, + "learning_rate": 0.0003989184736279378, + "loss": 2.8559, + "step": 7385 + }, + { + "epoch": 0.31098947368421054, + "grad_norm": 0.40625, + "learning_rate": 0.00039889136797411777, + "loss": 3.401, + "step": 7386 + }, + { + "epoch": 0.31103157894736844, + "grad_norm": 0.380859375, + "learning_rate": 0.00039886425960767197, + "loss": 3.4051, + "step": 7387 + }, + { + "epoch": 0.31107368421052634, + "grad_norm": 0.41015625, + "learning_rate": 0.0003988371485290943, + "loss": 3.0646, + "step": 7388 + }, + { + "epoch": 0.31111578947368423, + "grad_norm": 0.400390625, + "learning_rate": 0.00039881003473887853, + "loss": 3.3619, + "step": 7389 + }, + { + "epoch": 0.3111578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.0003987829182375189, + "loss": 2.5672, + "step": 7390 + }, + { + "epoch": 0.3112, + "grad_norm": 0.44140625, + "learning_rate": 0.0003987557990255093, + "loss": 3.7055, + "step": 7391 + }, + { + "epoch": 0.3112421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.0003987286771033438, + "loss": 3.2788, + "step": 7392 + }, + { + "epoch": 0.3112842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.00039870155247151656, + "loss": 3.3632, + "step": 7393 + }, + { + "epoch": 0.3113263157894737, + "grad_norm": 0.396484375, + "learning_rate": 0.0003986744251305218, + "loss": 3.2673, + "step": 7394 + }, + { + "epoch": 0.31136842105263157, + "grad_norm": 0.404296875, + "learning_rate": 0.00039864729508085354, + "loss": 3.1612, + "step": 7395 + }, + { + "epoch": 0.31141052631578947, + "grad_norm": 0.40625, + "learning_rate": 0.00039862016232300636, + "loss": 3.0306, + "step": 7396 + }, + { + "epoch": 0.31145263157894737, + "grad_norm": 0.3984375, + "learning_rate": 0.00039859302685747434, + "loss": 3.0856, + "step": 7397 + }, + { + "epoch": 0.31149473684210527, + "grad_norm": 0.43359375, + "learning_rate": 0.00039856588868475207, + "loss": 3.2754, + "step": 7398 + }, + { + "epoch": 0.31153684210526317, + "grad_norm": 0.39453125, + "learning_rate": 0.0003985387478053338, + "loss": 3.7474, + "step": 7399 + }, + { + "epoch": 0.31157894736842107, + "grad_norm": 0.408203125, + "learning_rate": 0.00039851160421971413, + "loss": 3.342, + "step": 7400 + }, + { + "epoch": 0.31162105263157897, + "grad_norm": 0.419921875, + "learning_rate": 0.0003984844579283875, + "loss": 3.2166, + "step": 7401 + }, + { + "epoch": 0.31166315789473686, + "grad_norm": 0.392578125, + "learning_rate": 0.00039845730893184843, + "loss": 3.2499, + "step": 7402 + }, + { + "epoch": 0.31170526315789476, + "grad_norm": 0.451171875, + "learning_rate": 0.00039843015723059174, + "loss": 3.4656, + "step": 7403 + }, + { + "epoch": 0.3117473684210526, + "grad_norm": 0.40625, + "learning_rate": 0.00039840300282511186, + "loss": 3.3947, + "step": 7404 + }, + { + "epoch": 0.3117894736842105, + "grad_norm": 0.384765625, + "learning_rate": 0.0003983758457159037, + "loss": 3.3461, + "step": 7405 + }, + { + "epoch": 0.3118315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.00039834868590346204, + "loss": 3.441, + "step": 7406 + }, + { + "epoch": 0.3118736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.00039832152338828154, + "loss": 3.3021, + "step": 7407 + }, + { + "epoch": 0.3119157894736842, + "grad_norm": 0.46484375, + "learning_rate": 0.0003982943581708571, + "loss": 3.3159, + "step": 7408 + }, + { + "epoch": 0.3119578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00039826719025168377, + "loss": 2.7332, + "step": 7409 + }, + { + "epoch": 0.312, + "grad_norm": 0.4296875, + "learning_rate": 0.00039824001963125643, + "loss": 3.6423, + "step": 7410 + }, + { + "epoch": 0.3120421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00039821284631007015, + "loss": 3.083, + "step": 7411 + }, + { + "epoch": 0.3120842105263158, + "grad_norm": 0.4609375, + "learning_rate": 0.00039818567028861987, + "loss": 2.8152, + "step": 7412 + }, + { + "epoch": 0.3121263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.00039815849156740084, + "loss": 3.3076, + "step": 7413 + }, + { + "epoch": 0.3121684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.0003981313101469082, + "loss": 3.4314, + "step": 7414 + }, + { + "epoch": 0.3122105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.0003981041260276371, + "loss": 3.4, + "step": 7415 + }, + { + "epoch": 0.3122526315789474, + "grad_norm": 0.41015625, + "learning_rate": 0.00039807693921008284, + "loss": 3.4535, + "step": 7416 + }, + { + "epoch": 0.31229473684210524, + "grad_norm": 0.419921875, + "learning_rate": 0.0003980497496947407, + "loss": 3.1372, + "step": 7417 + }, + { + "epoch": 0.31233684210526313, + "grad_norm": 0.431640625, + "learning_rate": 0.0003980225574821061, + "loss": 3.1125, + "step": 7418 + }, + { + "epoch": 0.31237894736842103, + "grad_norm": 0.423828125, + "learning_rate": 0.0003979953625726744, + "loss": 2.9954, + "step": 7419 + }, + { + "epoch": 0.31242105263157893, + "grad_norm": 0.427734375, + "learning_rate": 0.0003979681649669411, + "loss": 3.1749, + "step": 7420 + }, + { + "epoch": 0.31246315789473683, + "grad_norm": 0.38671875, + "learning_rate": 0.00039794096466540176, + "loss": 3.4737, + "step": 7421 + }, + { + "epoch": 0.31250526315789473, + "grad_norm": 0.4453125, + "learning_rate": 0.00039791376166855175, + "loss": 3.7119, + "step": 7422 + }, + { + "epoch": 0.31254736842105263, + "grad_norm": 0.392578125, + "learning_rate": 0.00039788655597688684, + "loss": 3.6328, + "step": 7423 + }, + { + "epoch": 0.31258947368421053, + "grad_norm": 0.42578125, + "learning_rate": 0.0003978593475909026, + "loss": 2.9337, + "step": 7424 + }, + { + "epoch": 0.3126315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.00039783213651109484, + "loss": 3.0813, + "step": 7425 + }, + { + "epoch": 0.3126736842105263, + "grad_norm": 0.39453125, + "learning_rate": 0.0003978049227379592, + "loss": 3.4933, + "step": 7426 + }, + { + "epoch": 0.3127157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0003977777062719916, + "loss": 3.1789, + "step": 7427 + }, + { + "epoch": 0.3127578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.00039775048711368776, + "loss": 3.3626, + "step": 7428 + }, + { + "epoch": 0.3128, + "grad_norm": 0.4375, + "learning_rate": 0.00039772326526354366, + "loss": 3.0785, + "step": 7429 + }, + { + "epoch": 0.3128421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00039769604072205524, + "loss": 3.2577, + "step": 7430 + }, + { + "epoch": 0.31288421052631576, + "grad_norm": 0.39453125, + "learning_rate": 0.0003976688134897185, + "loss": 3.2389, + "step": 7431 + }, + { + "epoch": 0.31292631578947366, + "grad_norm": 0.423828125, + "learning_rate": 0.0003976415835670295, + "loss": 3.4281, + "step": 7432 + }, + { + "epoch": 0.31296842105263156, + "grad_norm": 0.419921875, + "learning_rate": 0.0003976143509544843, + "loss": 3.4499, + "step": 7433 + }, + { + "epoch": 0.31301052631578946, + "grad_norm": 0.412109375, + "learning_rate": 0.00039758711565257907, + "loss": 3.5175, + "step": 7434 + }, + { + "epoch": 0.31305263157894736, + "grad_norm": 0.3984375, + "learning_rate": 0.00039755987766181, + "loss": 3.2529, + "step": 7435 + }, + { + "epoch": 0.31309473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.0003975326369826733, + "loss": 3.315, + "step": 7436 + }, + { + "epoch": 0.31313684210526316, + "grad_norm": 0.3828125, + "learning_rate": 0.00039750539361566527, + "loss": 3.2597, + "step": 7437 + }, + { + "epoch": 0.31317894736842106, + "grad_norm": 0.390625, + "learning_rate": 0.00039747814756128236, + "loss": 3.2667, + "step": 7438 + }, + { + "epoch": 0.31322105263157896, + "grad_norm": 0.396484375, + "learning_rate": 0.00039745089882002084, + "loss": 3.2693, + "step": 7439 + }, + { + "epoch": 0.31326315789473685, + "grad_norm": 0.376953125, + "learning_rate": 0.0003974236473923772, + "loss": 2.9445, + "step": 7440 + }, + { + "epoch": 0.31330526315789475, + "grad_norm": 0.41796875, + "learning_rate": 0.00039739639327884797, + "loss": 3.3583, + "step": 7441 + }, + { + "epoch": 0.31334736842105265, + "grad_norm": 0.546875, + "learning_rate": 0.0003973691364799296, + "loss": 2.86, + "step": 7442 + }, + { + "epoch": 0.31338947368421055, + "grad_norm": 0.4140625, + "learning_rate": 0.0003973418769961187, + "loss": 3.4056, + "step": 7443 + }, + { + "epoch": 0.3134315789473684, + "grad_norm": 0.4140625, + "learning_rate": 0.0003973146148279119, + "loss": 3.0453, + "step": 7444 + }, + { + "epoch": 0.3134736842105263, + "grad_norm": 0.39453125, + "learning_rate": 0.00039728734997580595, + "loss": 3.0676, + "step": 7445 + }, + { + "epoch": 0.3135157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00039726008244029756, + "loss": 3.1237, + "step": 7446 + }, + { + "epoch": 0.3135578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00039723281222188347, + "loss": 3.267, + "step": 7447 + }, + { + "epoch": 0.3136, + "grad_norm": 0.421875, + "learning_rate": 0.0003972055393210605, + "loss": 3.3411, + "step": 7448 + }, + { + "epoch": 0.3136421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00039717826373832554, + "loss": 3.1061, + "step": 7449 + }, + { + "epoch": 0.3136842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.0003971509854741756, + "loss": 3.0621, + "step": 7450 + }, + { + "epoch": 0.3137263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00039712370452910757, + "loss": 3.3853, + "step": 7451 + }, + { + "epoch": 0.3137684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.00039709642090361854, + "loss": 3.3365, + "step": 7452 + }, + { + "epoch": 0.3138105263157895, + "grad_norm": 0.484375, + "learning_rate": 0.00039706913459820547, + "loss": 2.7026, + "step": 7453 + }, + { + "epoch": 0.3138526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.0003970418456133657, + "loss": 3.0366, + "step": 7454 + }, + { + "epoch": 0.3138947368421053, + "grad_norm": 0.46484375, + "learning_rate": 0.00039701455394959616, + "loss": 3.2023, + "step": 7455 + }, + { + "epoch": 0.3139368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.00039698725960739424, + "loss": 3.1295, + "step": 7456 + }, + { + "epoch": 0.3139789473684211, + "grad_norm": 0.400390625, + "learning_rate": 0.0003969599625872571, + "loss": 3.4282, + "step": 7457 + }, + { + "epoch": 0.3140210526315789, + "grad_norm": 0.408203125, + "learning_rate": 0.0003969326628896822, + "loss": 3.2976, + "step": 7458 + }, + { + "epoch": 0.3140631578947368, + "grad_norm": 0.3984375, + "learning_rate": 0.00039690536051516673, + "loss": 3.347, + "step": 7459 + }, + { + "epoch": 0.3141052631578947, + "grad_norm": 0.44921875, + "learning_rate": 0.00039687805546420824, + "loss": 3.0647, + "step": 7460 + }, + { + "epoch": 0.3141473684210526, + "grad_norm": 0.49609375, + "learning_rate": 0.0003968507477373041, + "loss": 3.1937, + "step": 7461 + }, + { + "epoch": 0.3141894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.000396823437334952, + "loss": 3.225, + "step": 7462 + }, + { + "epoch": 0.3142315789473684, + "grad_norm": 0.419921875, + "learning_rate": 0.0003967961242576493, + "loss": 3.1896, + "step": 7463 + }, + { + "epoch": 0.3142736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.00039676880850589367, + "loss": 3.4536, + "step": 7464 + }, + { + "epoch": 0.3143157894736842, + "grad_norm": 0.3984375, + "learning_rate": 0.00039674149008018286, + "loss": 2.8948, + "step": 7465 + }, + { + "epoch": 0.3143578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00039671416898101445, + "loss": 3.2017, + "step": 7466 + }, + { + "epoch": 0.3144, + "grad_norm": 0.421875, + "learning_rate": 0.0003966868452088863, + "loss": 3.6467, + "step": 7467 + }, + { + "epoch": 0.3144421052631579, + "grad_norm": 0.474609375, + "learning_rate": 0.00039665951876429617, + "loss": 3.3365, + "step": 7468 + }, + { + "epoch": 0.3144842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00039663218964774196, + "loss": 3.6083, + "step": 7469 + }, + { + "epoch": 0.3145263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.0003966048578597215, + "loss": 3.2057, + "step": 7470 + }, + { + "epoch": 0.31456842105263155, + "grad_norm": 0.41015625, + "learning_rate": 0.0003965775234007328, + "loss": 3.3493, + "step": 7471 + }, + { + "epoch": 0.31461052631578945, + "grad_norm": 0.45703125, + "learning_rate": 0.0003965501862712738, + "loss": 3.1587, + "step": 7472 + }, + { + "epoch": 0.31465263157894735, + "grad_norm": 0.4296875, + "learning_rate": 0.00039652284647184266, + "loss": 3.6313, + "step": 7473 + }, + { + "epoch": 0.31469473684210525, + "grad_norm": 0.40625, + "learning_rate": 0.00039649550400293744, + "loss": 2.87, + "step": 7474 + }, + { + "epoch": 0.31473684210526315, + "grad_norm": 0.4296875, + "learning_rate": 0.00039646815886505623, + "loss": 3.2297, + "step": 7475 + }, + { + "epoch": 0.31477894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.0003964408110586972, + "loss": 3.3323, + "step": 7476 + }, + { + "epoch": 0.31482105263157895, + "grad_norm": 0.404296875, + "learning_rate": 0.00039641346058435866, + "loss": 2.9341, + "step": 7477 + }, + { + "epoch": 0.31486315789473684, + "grad_norm": 0.384765625, + "learning_rate": 0.000396386107442539, + "loss": 2.8989, + "step": 7478 + }, + { + "epoch": 0.31490526315789474, + "grad_norm": 0.404296875, + "learning_rate": 0.00039635875163373635, + "loss": 3.4092, + "step": 7479 + }, + { + "epoch": 0.31494736842105264, + "grad_norm": 0.4765625, + "learning_rate": 0.0003963313931584493, + "loss": 2.8028, + "step": 7480 + }, + { + "epoch": 0.31498947368421054, + "grad_norm": 0.4296875, + "learning_rate": 0.0003963040320171761, + "loss": 3.0914, + "step": 7481 + }, + { + "epoch": 0.31503157894736844, + "grad_norm": 0.396484375, + "learning_rate": 0.0003962766682104154, + "loss": 2.9201, + "step": 7482 + }, + { + "epoch": 0.31507368421052634, + "grad_norm": 0.3984375, + "learning_rate": 0.0003962493017386657, + "loss": 3.1492, + "step": 7483 + }, + { + "epoch": 0.31511578947368424, + "grad_norm": 0.427734375, + "learning_rate": 0.00039622193260242545, + "loss": 3.2404, + "step": 7484 + }, + { + "epoch": 0.3151578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.0003961945608021935, + "loss": 3.5027, + "step": 7485 + }, + { + "epoch": 0.3152, + "grad_norm": 0.400390625, + "learning_rate": 0.00039616718633846837, + "loss": 3.4893, + "step": 7486 + }, + { + "epoch": 0.3152421052631579, + "grad_norm": 0.392578125, + "learning_rate": 0.0003961398092117489, + "loss": 3.7984, + "step": 7487 + }, + { + "epoch": 0.3152842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.0003961124294225338, + "loss": 3.0178, + "step": 7488 + }, + { + "epoch": 0.3153263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00039608504697132187, + "loss": 3.2586, + "step": 7489 + }, + { + "epoch": 0.3153684210526316, + "grad_norm": 0.392578125, + "learning_rate": 0.00039605766185861203, + "loss": 3.5667, + "step": 7490 + }, + { + "epoch": 0.3154105263157895, + "grad_norm": 0.62109375, + "learning_rate": 0.0003960302740849032, + "loss": 3.3659, + "step": 7491 + }, + { + "epoch": 0.3154526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.00039600288365069435, + "loss": 2.9802, + "step": 7492 + }, + { + "epoch": 0.3154947368421053, + "grad_norm": 0.3984375, + "learning_rate": 0.0003959754905564845, + "loss": 2.848, + "step": 7493 + }, + { + "epoch": 0.31553684210526317, + "grad_norm": 0.423828125, + "learning_rate": 0.0003959480948027728, + "loss": 3.2274, + "step": 7494 + }, + { + "epoch": 0.31557894736842107, + "grad_norm": 0.4296875, + "learning_rate": 0.0003959206963900582, + "loss": 3.1172, + "step": 7495 + }, + { + "epoch": 0.31562105263157897, + "grad_norm": 0.435546875, + "learning_rate": 0.00039589329531884005, + "loss": 2.912, + "step": 7496 + }, + { + "epoch": 0.31566315789473687, + "grad_norm": 0.419921875, + "learning_rate": 0.00039586589158961737, + "loss": 2.893, + "step": 7497 + }, + { + "epoch": 0.3157052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.0003958384852028896, + "loss": 3.0973, + "step": 7498 + }, + { + "epoch": 0.3157473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.000395811076159156, + "loss": 3.3533, + "step": 7499 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.00039578366445891587, + "loss": 3.2017, + "step": 7500 + }, + { + "epoch": 0.3158315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.0003957562501026687, + "loss": 2.975, + "step": 7501 + }, + { + "epoch": 0.3158736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.0003957288330909139, + "loss": 3.3691, + "step": 7502 + }, + { + "epoch": 0.3159157894736842, + "grad_norm": 0.388671875, + "learning_rate": 0.000395701413424151, + "loss": 3.3199, + "step": 7503 + }, + { + "epoch": 0.3159578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.0003956739911028795, + "loss": 3.4767, + "step": 7504 + }, + { + "epoch": 0.316, + "grad_norm": 0.423828125, + "learning_rate": 0.00039564656612759905, + "loss": 2.9177, + "step": 7505 + }, + { + "epoch": 0.3160421052631579, + "grad_norm": 0.3984375, + "learning_rate": 0.00039561913849880926, + "loss": 3.4656, + "step": 7506 + }, + { + "epoch": 0.3160842105263158, + "grad_norm": 0.5390625, + "learning_rate": 0.00039559170821701, + "loss": 3.5141, + "step": 7507 + }, + { + "epoch": 0.3161263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.0003955642752827007, + "loss": 3.2666, + "step": 7508 + }, + { + "epoch": 0.3161684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.0003955368396963814, + "loss": 3.5638, + "step": 7509 + }, + { + "epoch": 0.3162105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.0003955094014585519, + "loss": 3.117, + "step": 7510 + }, + { + "epoch": 0.31625263157894734, + "grad_norm": 0.443359375, + "learning_rate": 0.0003954819605697121, + "loss": 3.1942, + "step": 7511 + }, + { + "epoch": 0.31629473684210524, + "grad_norm": 0.462890625, + "learning_rate": 0.0003954545170303619, + "loss": 3.2696, + "step": 7512 + }, + { + "epoch": 0.31633684210526314, + "grad_norm": 0.390625, + "learning_rate": 0.00039542707084100126, + "loss": 3.4786, + "step": 7513 + }, + { + "epoch": 0.31637894736842104, + "grad_norm": 0.37890625, + "learning_rate": 0.0003953996220021303, + "loss": 2.8893, + "step": 7514 + }, + { + "epoch": 0.31642105263157894, + "grad_norm": 0.427734375, + "learning_rate": 0.00039537217051424905, + "loss": 3.3121, + "step": 7515 + }, + { + "epoch": 0.31646315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00039534471637785764, + "loss": 2.9904, + "step": 7516 + }, + { + "epoch": 0.31650526315789473, + "grad_norm": 0.4140625, + "learning_rate": 0.0003953172595934563, + "loss": 3.3074, + "step": 7517 + }, + { + "epoch": 0.31654736842105263, + "grad_norm": 0.44921875, + "learning_rate": 0.0003952898001615452, + "loss": 3.2015, + "step": 7518 + }, + { + "epoch": 0.31658947368421053, + "grad_norm": 0.4453125, + "learning_rate": 0.0003952623380826247, + "loss": 3.3429, + "step": 7519 + }, + { + "epoch": 0.31663157894736843, + "grad_norm": 0.451171875, + "learning_rate": 0.000395234873357195, + "loss": 3.2838, + "step": 7520 + }, + { + "epoch": 0.31667368421052633, + "grad_norm": 0.40234375, + "learning_rate": 0.0003952074059857566, + "loss": 3.5184, + "step": 7521 + }, + { + "epoch": 0.31671578947368423, + "grad_norm": 0.419921875, + "learning_rate": 0.00039517993596880984, + "loss": 3.2192, + "step": 7522 + }, + { + "epoch": 0.3167578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.0003951524633068553, + "loss": 3.4803, + "step": 7523 + }, + { + "epoch": 0.3168, + "grad_norm": 0.41015625, + "learning_rate": 0.00039512498800039336, + "loss": 3.1692, + "step": 7524 + }, + { + "epoch": 0.31684210526315787, + "grad_norm": 0.419921875, + "learning_rate": 0.0003950975100499247, + "loss": 3.5659, + "step": 7525 + }, + { + "epoch": 0.31688421052631577, + "grad_norm": 0.39453125, + "learning_rate": 0.00039507002945594986, + "loss": 3.2774, + "step": 7526 + }, + { + "epoch": 0.31692631578947367, + "grad_norm": 0.443359375, + "learning_rate": 0.0003950425462189695, + "loss": 3.4328, + "step": 7527 + }, + { + "epoch": 0.31696842105263157, + "grad_norm": 0.4296875, + "learning_rate": 0.0003950150603394844, + "loss": 3.3204, + "step": 7528 + }, + { + "epoch": 0.31701052631578946, + "grad_norm": 0.396484375, + "learning_rate": 0.0003949875718179953, + "loss": 3.3494, + "step": 7529 + }, + { + "epoch": 0.31705263157894736, + "grad_norm": 0.61328125, + "learning_rate": 0.000394960080655003, + "loss": 3.1209, + "step": 7530 + }, + { + "epoch": 0.31709473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.0003949325868510083, + "loss": 3.1695, + "step": 7531 + }, + { + "epoch": 0.31713684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.0003949050904065122, + "loss": 3.3405, + "step": 7532 + }, + { + "epoch": 0.31717894736842106, + "grad_norm": 0.453125, + "learning_rate": 0.0003948775913220156, + "loss": 3.1241, + "step": 7533 + }, + { + "epoch": 0.31722105263157896, + "grad_norm": 0.353515625, + "learning_rate": 0.00039485008959801957, + "loss": 2.4633, + "step": 7534 + }, + { + "epoch": 0.31726315789473686, + "grad_norm": 0.41015625, + "learning_rate": 0.000394822585235025, + "loss": 3.1037, + "step": 7535 + }, + { + "epoch": 0.31730526315789476, + "grad_norm": 0.83984375, + "learning_rate": 0.0003947950782335332, + "loss": 3.2732, + "step": 7536 + }, + { + "epoch": 0.31734736842105266, + "grad_norm": 0.4140625, + "learning_rate": 0.0003947675685940451, + "loss": 3.2676, + "step": 7537 + }, + { + "epoch": 0.3173894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.000394740056317062, + "loss": 3.2375, + "step": 7538 + }, + { + "epoch": 0.3174315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.00039471254140308515, + "loss": 3.2616, + "step": 7539 + }, + { + "epoch": 0.3174736842105263, + "grad_norm": 0.5, + "learning_rate": 0.0003946850238526158, + "loss": 3.5061, + "step": 7540 + }, + { + "epoch": 0.3175157894736842, + "grad_norm": 0.52734375, + "learning_rate": 0.00039465750366615536, + "loss": 2.7974, + "step": 7541 + }, + { + "epoch": 0.3175578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0003946299808442052, + "loss": 3.2561, + "step": 7542 + }, + { + "epoch": 0.3176, + "grad_norm": 0.416015625, + "learning_rate": 0.0003946024553872667, + "loss": 3.3596, + "step": 7543 + }, + { + "epoch": 0.3176421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0003945749272958413, + "loss": 3.1859, + "step": 7544 + }, + { + "epoch": 0.3176842105263158, + "grad_norm": 0.39453125, + "learning_rate": 0.00039454739657043066, + "loss": 3.5208, + "step": 7545 + }, + { + "epoch": 0.3177263157894737, + "grad_norm": 0.47265625, + "learning_rate": 0.00039451986321153624, + "loss": 2.8656, + "step": 7546 + }, + { + "epoch": 0.3177684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.00039449232721965976, + "loss": 2.42, + "step": 7547 + }, + { + "epoch": 0.3178105263157895, + "grad_norm": 0.470703125, + "learning_rate": 0.0003944647885953029, + "loss": 3.3134, + "step": 7548 + }, + { + "epoch": 0.3178526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.00039443724733896726, + "loss": 3.2705, + "step": 7549 + }, + { + "epoch": 0.3178947368421053, + "grad_norm": 0.443359375, + "learning_rate": 0.00039440970345115466, + "loss": 3.0717, + "step": 7550 + }, + { + "epoch": 0.3179368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.0003943821569323669, + "loss": 3.0024, + "step": 7551 + }, + { + "epoch": 0.317978947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00039435460778310593, + "loss": 3.1211, + "step": 7552 + }, + { + "epoch": 0.3180210526315789, + "grad_norm": 0.392578125, + "learning_rate": 0.0003943270560038737, + "loss": 3.5537, + "step": 7553 + }, + { + "epoch": 0.3180631578947368, + "grad_norm": 0.443359375, + "learning_rate": 0.000394299501595172, + "loss": 2.7536, + "step": 7554 + }, + { + "epoch": 0.3181052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.0003942719445575029, + "loss": 3.2498, + "step": 7555 + }, + { + "epoch": 0.3181473684210526, + "grad_norm": 0.96484375, + "learning_rate": 0.0003942443848913686, + "loss": 3.2498, + "step": 7556 + }, + { + "epoch": 0.3181894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.000394216822597271, + "loss": 3.1386, + "step": 7557 + }, + { + "epoch": 0.3182315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.00039418925767571234, + "loss": 3.1969, + "step": 7558 + }, + { + "epoch": 0.3182736842105263, + "grad_norm": 0.447265625, + "learning_rate": 0.00039416169012719483, + "loss": 3.5012, + "step": 7559 + }, + { + "epoch": 0.3183157894736842, + "grad_norm": 0.466796875, + "learning_rate": 0.0003941341199522207, + "loss": 3.3976, + "step": 7560 + }, + { + "epoch": 0.3183578947368421, + "grad_norm": 0.50390625, + "learning_rate": 0.0003941065471512923, + "loss": 2.9703, + "step": 7561 + }, + { + "epoch": 0.3184, + "grad_norm": 0.412109375, + "learning_rate": 0.0003940789717249119, + "loss": 3.506, + "step": 7562 + }, + { + "epoch": 0.3184421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.0003940513936735819, + "loss": 3.428, + "step": 7563 + }, + { + "epoch": 0.3184842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.0003940238129978048, + "loss": 3.6986, + "step": 7564 + }, + { + "epoch": 0.31852631578947366, + "grad_norm": 0.416015625, + "learning_rate": 0.000393996229698083, + "loss": 3.0628, + "step": 7565 + }, + { + "epoch": 0.31856842105263156, + "grad_norm": 0.515625, + "learning_rate": 0.00039396864377491914, + "loss": 3.2429, + "step": 7566 + }, + { + "epoch": 0.31861052631578946, + "grad_norm": 0.43359375, + "learning_rate": 0.00039394105522881574, + "loss": 2.7927, + "step": 7567 + }, + { + "epoch": 0.31865263157894735, + "grad_norm": 0.400390625, + "learning_rate": 0.0003939134640602754, + "loss": 3.3551, + "step": 7568 + }, + { + "epoch": 0.31869473684210525, + "grad_norm": 0.451171875, + "learning_rate": 0.00039388587026980084, + "loss": 2.9891, + "step": 7569 + }, + { + "epoch": 0.31873684210526315, + "grad_norm": 0.4453125, + "learning_rate": 0.0003938582738578948, + "loss": 3.0966, + "step": 7570 + }, + { + "epoch": 0.31877894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.00039383067482506, + "loss": 3.1353, + "step": 7571 + }, + { + "epoch": 0.31882105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.0003938030731717993, + "loss": 2.9529, + "step": 7572 + }, + { + "epoch": 0.31886315789473685, + "grad_norm": 0.578125, + "learning_rate": 0.00039377546889861564, + "loss": 3.1503, + "step": 7573 + }, + { + "epoch": 0.31890526315789475, + "grad_norm": 0.4609375, + "learning_rate": 0.0003937478620060119, + "loss": 2.9756, + "step": 7574 + }, + { + "epoch": 0.31894736842105265, + "grad_norm": 0.412109375, + "learning_rate": 0.00039372025249449083, + "loss": 3.0516, + "step": 7575 + }, + { + "epoch": 0.31898947368421054, + "grad_norm": 0.4140625, + "learning_rate": 0.00039369264036455577, + "loss": 3.0545, + "step": 7576 + }, + { + "epoch": 0.31903157894736844, + "grad_norm": 0.41796875, + "learning_rate": 0.00039366502561670957, + "loss": 3.3041, + "step": 7577 + }, + { + "epoch": 0.31907368421052634, + "grad_norm": 0.46484375, + "learning_rate": 0.00039363740825145544, + "loss": 3.0833, + "step": 7578 + }, + { + "epoch": 0.3191157894736842, + "grad_norm": 0.384765625, + "learning_rate": 0.0003936097882692965, + "loss": 2.8939, + "step": 7579 + }, + { + "epoch": 0.3191578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0003935821656707359, + "loss": 3.4183, + "step": 7580 + }, + { + "epoch": 0.3192, + "grad_norm": 0.400390625, + "learning_rate": 0.000393554540456277, + "loss": 2.8895, + "step": 7581 + }, + { + "epoch": 0.3192421052631579, + "grad_norm": 0.39453125, + "learning_rate": 0.00039352691262642305, + "loss": 3.1806, + "step": 7582 + }, + { + "epoch": 0.3192842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.00039349928218167734, + "loss": 3.1961, + "step": 7583 + }, + { + "epoch": 0.3193263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00039347164912254337, + "loss": 3.0577, + "step": 7584 + }, + { + "epoch": 0.3193684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.0003934440134495244, + "loss": 3.0253, + "step": 7585 + }, + { + "epoch": 0.3194105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00039341637516312426, + "loss": 3.0755, + "step": 7586 + }, + { + "epoch": 0.3194526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.0003933887342638461, + "loss": 2.944, + "step": 7587 + }, + { + "epoch": 0.3194947368421053, + "grad_norm": 0.40234375, + "learning_rate": 0.00039336109075219374, + "loss": 3.5343, + "step": 7588 + }, + { + "epoch": 0.3195368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.00039333344462867074, + "loss": 3.4872, + "step": 7589 + }, + { + "epoch": 0.3195789473684211, + "grad_norm": 0.412109375, + "learning_rate": 0.0003933057958937808, + "loss": 2.8422, + "step": 7590 + }, + { + "epoch": 0.319621052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00039327814454802757, + "loss": 3.1027, + "step": 7591 + }, + { + "epoch": 0.3196631578947368, + "grad_norm": 0.4140625, + "learning_rate": 0.00039325049059191496, + "loss": 3.4847, + "step": 7592 + }, + { + "epoch": 0.3197052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.00039322283402594665, + "loss": 3.0165, + "step": 7593 + }, + { + "epoch": 0.3197473684210526, + "grad_norm": 0.462890625, + "learning_rate": 0.00039319517485062665, + "loss": 3.0557, + "step": 7594 + }, + { + "epoch": 0.3197894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.0003931675130664588, + "loss": 3.4603, + "step": 7595 + }, + { + "epoch": 0.3198315789473684, + "grad_norm": 0.439453125, + "learning_rate": 0.00039313984867394707, + "loss": 3.3797, + "step": 7596 + }, + { + "epoch": 0.3198736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00039311218167359553, + "loss": 3.4195, + "step": 7597 + }, + { + "epoch": 0.3199157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0003930845120659081, + "loss": 3.2018, + "step": 7598 + }, + { + "epoch": 0.3199578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.0003930568398513891, + "loss": 3.0816, + "step": 7599 + }, + { + "epoch": 0.32, + "grad_norm": 0.421875, + "learning_rate": 0.00039302916503054243, + "loss": 3.8771, + "step": 7600 + }, + { + "epoch": 0.3200421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.00039300148760387245, + "loss": 3.2862, + "step": 7601 + }, + { + "epoch": 0.3200842105263158, + "grad_norm": 0.50390625, + "learning_rate": 0.00039297380757188346, + "loss": 3.3038, + "step": 7602 + }, + { + "epoch": 0.3201263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.0003929461249350796, + "loss": 3.0297, + "step": 7603 + }, + { + "epoch": 0.3201684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.00039291843969396534, + "loss": 2.7953, + "step": 7604 + }, + { + "epoch": 0.3202105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.000392890751849045, + "loss": 3.5401, + "step": 7605 + }, + { + "epoch": 0.32025263157894734, + "grad_norm": 0.419921875, + "learning_rate": 0.00039286306140082314, + "loss": 2.865, + "step": 7606 + }, + { + "epoch": 0.32029473684210524, + "grad_norm": 0.4375, + "learning_rate": 0.0003928353683498041, + "loss": 3.4754, + "step": 7607 + }, + { + "epoch": 0.32033684210526314, + "grad_norm": 0.3984375, + "learning_rate": 0.0003928076726964925, + "loss": 2.7858, + "step": 7608 + }, + { + "epoch": 0.32037894736842104, + "grad_norm": 0.447265625, + "learning_rate": 0.0003927799744413928, + "loss": 3.1655, + "step": 7609 + }, + { + "epoch": 0.32042105263157894, + "grad_norm": 0.4296875, + "learning_rate": 0.00039275227358500985, + "loss": 3.4781, + "step": 7610 + }, + { + "epoch": 0.32046315789473684, + "grad_norm": 0.546875, + "learning_rate": 0.00039272457012784815, + "loss": 3.0972, + "step": 7611 + }, + { + "epoch": 0.32050526315789474, + "grad_norm": 0.4765625, + "learning_rate": 0.00039269686407041244, + "loss": 2.9018, + "step": 7612 + }, + { + "epoch": 0.32054736842105264, + "grad_norm": 0.494140625, + "learning_rate": 0.0003926691554132076, + "loss": 3.0494, + "step": 7613 + }, + { + "epoch": 0.32058947368421054, + "grad_norm": 0.4296875, + "learning_rate": 0.0003926414441567383, + "loss": 2.8116, + "step": 7614 + }, + { + "epoch": 0.32063157894736843, + "grad_norm": 0.5078125, + "learning_rate": 0.0003926137303015095, + "loss": 2.6566, + "step": 7615 + }, + { + "epoch": 0.32067368421052633, + "grad_norm": 0.4375, + "learning_rate": 0.0003925860138480262, + "loss": 3.2842, + "step": 7616 + }, + { + "epoch": 0.32071578947368423, + "grad_norm": 0.439453125, + "learning_rate": 0.0003925582947967932, + "loss": 2.9873, + "step": 7617 + }, + { + "epoch": 0.32075789473684213, + "grad_norm": 0.5078125, + "learning_rate": 0.0003925305731483154, + "loss": 3.2454, + "step": 7618 + }, + { + "epoch": 0.3208, + "grad_norm": 0.408203125, + "learning_rate": 0.00039250284890309824, + "loss": 2.9796, + "step": 7619 + }, + { + "epoch": 0.3208421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00039247512206164653, + "loss": 3.0595, + "step": 7620 + }, + { + "epoch": 0.32088421052631577, + "grad_norm": 0.4140625, + "learning_rate": 0.00039244739262446547, + "loss": 3.2905, + "step": 7621 + }, + { + "epoch": 0.32092631578947367, + "grad_norm": 0.421875, + "learning_rate": 0.00039241966059206035, + "loss": 2.975, + "step": 7622 + }, + { + "epoch": 0.32096842105263157, + "grad_norm": 0.412109375, + "learning_rate": 0.0003923919259649363, + "loss": 2.9943, + "step": 7623 + }, + { + "epoch": 0.32101052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.00039236418874359874, + "loss": 3.705, + "step": 7624 + }, + { + "epoch": 0.32105263157894737, + "grad_norm": 0.400390625, + "learning_rate": 0.0003923364489285529, + "loss": 3.6752, + "step": 7625 + }, + { + "epoch": 0.32109473684210527, + "grad_norm": 0.455078125, + "learning_rate": 0.00039230870652030413, + "loss": 3.4704, + "step": 7626 + }, + { + "epoch": 0.32113684210526316, + "grad_norm": 0.462890625, + "learning_rate": 0.000392280961519358, + "loss": 3.2319, + "step": 7627 + }, + { + "epoch": 0.32117894736842106, + "grad_norm": 0.408203125, + "learning_rate": 0.00039225321392621985, + "loss": 3.4493, + "step": 7628 + }, + { + "epoch": 0.32122105263157896, + "grad_norm": 0.451171875, + "learning_rate": 0.00039222546374139533, + "loss": 3.6995, + "step": 7629 + }, + { + "epoch": 0.32126315789473686, + "grad_norm": 0.38671875, + "learning_rate": 0.00039219771096539, + "loss": 3.4226, + "step": 7630 + }, + { + "epoch": 0.32130526315789476, + "grad_norm": 0.41796875, + "learning_rate": 0.0003921699555987095, + "loss": 3.0865, + "step": 7631 + }, + { + "epoch": 0.32134736842105266, + "grad_norm": 0.431640625, + "learning_rate": 0.0003921421976418593, + "loss": 2.878, + "step": 7632 + }, + { + "epoch": 0.3213894736842105, + "grad_norm": 0.396484375, + "learning_rate": 0.0003921144370953453, + "loss": 3.0306, + "step": 7633 + }, + { + "epoch": 0.3214315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.0003920866739596733, + "loss": 3.1394, + "step": 7634 + }, + { + "epoch": 0.3214736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00039205890823534905, + "loss": 2.985, + "step": 7635 + }, + { + "epoch": 0.3215157894736842, + "grad_norm": 0.40625, + "learning_rate": 0.00039203113992287845, + "loss": 3.3118, + "step": 7636 + }, + { + "epoch": 0.3215578947368421, + "grad_norm": 0.478515625, + "learning_rate": 0.00039200336902276724, + "loss": 2.7652, + "step": 7637 + }, + { + "epoch": 0.3216, + "grad_norm": 0.439453125, + "learning_rate": 0.00039197559553552153, + "loss": 3.1406, + "step": 7638 + }, + { + "epoch": 0.3216421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00039194781946164727, + "loss": 3.4233, + "step": 7639 + }, + { + "epoch": 0.3216842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00039192004080165055, + "loss": 3.0441, + "step": 7640 + }, + { + "epoch": 0.3217263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.0003918922595560375, + "loss": 3.2039, + "step": 7641 + }, + { + "epoch": 0.3217684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.00039186447572531414, + "loss": 3.0943, + "step": 7642 + }, + { + "epoch": 0.3218105263157895, + "grad_norm": 0.455078125, + "learning_rate": 0.00039183668930998665, + "loss": 3.3078, + "step": 7643 + }, + { + "epoch": 0.3218526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.00039180890031056145, + "loss": 2.9149, + "step": 7644 + }, + { + "epoch": 0.3218947368421053, + "grad_norm": 0.41015625, + "learning_rate": 0.00039178110872754455, + "loss": 3.505, + "step": 7645 + }, + { + "epoch": 0.32193684210526313, + "grad_norm": 0.41015625, + "learning_rate": 0.0003917533145614426, + "loss": 3.4221, + "step": 7646 + }, + { + "epoch": 0.32197894736842103, + "grad_norm": 0.408203125, + "learning_rate": 0.00039172551781276174, + "loss": 3.1722, + "step": 7647 + }, + { + "epoch": 0.32202105263157893, + "grad_norm": 0.4140625, + "learning_rate": 0.00039169771848200847, + "loss": 3.4358, + "step": 7648 + }, + { + "epoch": 0.32206315789473683, + "grad_norm": 0.396484375, + "learning_rate": 0.00039166991656968927, + "loss": 3.3386, + "step": 7649 + }, + { + "epoch": 0.32210526315789473, + "grad_norm": 0.515625, + "learning_rate": 0.0003916421120763106, + "loss": 2.9449, + "step": 7650 + }, + { + "epoch": 0.3221473684210526, + "grad_norm": 0.3984375, + "learning_rate": 0.0003916143050023792, + "loss": 2.8812, + "step": 7651 + }, + { + "epoch": 0.3221894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.0003915864953484014, + "loss": 2.9171, + "step": 7652 + }, + { + "epoch": 0.3222315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.000391558683114884, + "loss": 2.9394, + "step": 7653 + }, + { + "epoch": 0.3222736842105263, + "grad_norm": 0.47265625, + "learning_rate": 0.0003915308683023338, + "loss": 3.077, + "step": 7654 + }, + { + "epoch": 0.3223157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.0003915030509112575, + "loss": 3.0394, + "step": 7655 + }, + { + "epoch": 0.3223578947368421, + "grad_norm": 0.470703125, + "learning_rate": 0.0003914752309421618, + "loss": 3.1122, + "step": 7656 + }, + { + "epoch": 0.3224, + "grad_norm": 0.408203125, + "learning_rate": 0.00039144740839555366, + "loss": 3.0605, + "step": 7657 + }, + { + "epoch": 0.3224421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0003914195832719399, + "loss": 2.955, + "step": 7658 + }, + { + "epoch": 0.32248421052631576, + "grad_norm": 0.76171875, + "learning_rate": 0.0003913917555718275, + "loss": 3.3906, + "step": 7659 + }, + { + "epoch": 0.32252631578947366, + "grad_norm": 0.41015625, + "learning_rate": 0.00039136392529572344, + "loss": 3.1478, + "step": 7660 + }, + { + "epoch": 0.32256842105263156, + "grad_norm": 0.41796875, + "learning_rate": 0.0003913360924441348, + "loss": 3.1572, + "step": 7661 + }, + { + "epoch": 0.32261052631578946, + "grad_norm": 0.40625, + "learning_rate": 0.0003913082570175686, + "loss": 3.3406, + "step": 7662 + }, + { + "epoch": 0.32265263157894736, + "grad_norm": 0.431640625, + "learning_rate": 0.000391280419016532, + "loss": 2.9968, + "step": 7663 + }, + { + "epoch": 0.32269473684210526, + "grad_norm": 0.51171875, + "learning_rate": 0.0003912525784415321, + "loss": 2.9329, + "step": 7664 + }, + { + "epoch": 0.32273684210526316, + "grad_norm": 0.486328125, + "learning_rate": 0.00039122473529307623, + "loss": 3.1511, + "step": 7665 + }, + { + "epoch": 0.32277894736842105, + "grad_norm": 0.4140625, + "learning_rate": 0.0003911968895716716, + "loss": 3.4112, + "step": 7666 + }, + { + "epoch": 0.32282105263157895, + "grad_norm": 0.50390625, + "learning_rate": 0.0003911690412778256, + "loss": 2.9805, + "step": 7667 + }, + { + "epoch": 0.32286315789473685, + "grad_norm": 0.4609375, + "learning_rate": 0.0003911411904120454, + "loss": 2.9821, + "step": 7668 + }, + { + "epoch": 0.32290526315789475, + "grad_norm": 0.451171875, + "learning_rate": 0.0003911133369748387, + "loss": 3.7423, + "step": 7669 + }, + { + "epoch": 0.32294736842105265, + "grad_norm": 0.408203125, + "learning_rate": 0.00039108548096671275, + "loss": 3.0658, + "step": 7670 + }, + { + "epoch": 0.32298947368421055, + "grad_norm": 0.400390625, + "learning_rate": 0.0003910576223881751, + "loss": 3.228, + "step": 7671 + }, + { + "epoch": 0.32303157894736845, + "grad_norm": 0.423828125, + "learning_rate": 0.0003910297612397333, + "loss": 3.0798, + "step": 7672 + }, + { + "epoch": 0.3230736842105263, + "grad_norm": 0.447265625, + "learning_rate": 0.0003910018975218949, + "loss": 3.1546, + "step": 7673 + }, + { + "epoch": 0.3231157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00039097403123516765, + "loss": 2.8845, + "step": 7674 + }, + { + "epoch": 0.3231578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.0003909461623800592, + "loss": 3.659, + "step": 7675 + }, + { + "epoch": 0.3232, + "grad_norm": 0.408203125, + "learning_rate": 0.00039091829095707725, + "loss": 3.4917, + "step": 7676 + }, + { + "epoch": 0.3232421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00039089041696672966, + "loss": 2.8891, + "step": 7677 + }, + { + "epoch": 0.3232842105263158, + "grad_norm": 0.50390625, + "learning_rate": 0.00039086254040952416, + "loss": 2.9514, + "step": 7678 + }, + { + "epoch": 0.3233263157894737, + "grad_norm": 0.3984375, + "learning_rate": 0.0003908346612859687, + "loss": 3.2723, + "step": 7679 + }, + { + "epoch": 0.3233684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.00039080677959657116, + "loss": 3.4022, + "step": 7680 + }, + { + "epoch": 0.3234105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.0003907788953418395, + "loss": 3.0917, + "step": 7681 + }, + { + "epoch": 0.3234526315789474, + "grad_norm": 0.447265625, + "learning_rate": 0.0003907510085222819, + "loss": 3.415, + "step": 7682 + }, + { + "epoch": 0.3234947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.00039072311913840627, + "loss": 3.4375, + "step": 7683 + }, + { + "epoch": 0.3235368421052632, + "grad_norm": 0.3984375, + "learning_rate": 0.0003906952271907207, + "loss": 2.8949, + "step": 7684 + }, + { + "epoch": 0.3235789473684211, + "grad_norm": 0.4375, + "learning_rate": 0.00039066733267973335, + "loss": 3.3628, + "step": 7685 + }, + { + "epoch": 0.3236210526315789, + "grad_norm": 0.412109375, + "learning_rate": 0.0003906394356059526, + "loss": 3.5041, + "step": 7686 + }, + { + "epoch": 0.3236631578947368, + "grad_norm": 0.400390625, + "learning_rate": 0.00039061153596988655, + "loss": 3.4094, + "step": 7687 + }, + { + "epoch": 0.3237052631578947, + "grad_norm": 0.40234375, + "learning_rate": 0.00039058363377204343, + "loss": 3.0015, + "step": 7688 + }, + { + "epoch": 0.3237473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.0003905557290129318, + "loss": 3.2188, + "step": 7689 + }, + { + "epoch": 0.3237894736842105, + "grad_norm": 0.447265625, + "learning_rate": 0.0003905278216930599, + "loss": 2.8507, + "step": 7690 + }, + { + "epoch": 0.3238315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.0003904999118129361, + "loss": 3.6317, + "step": 7691 + }, + { + "epoch": 0.3238736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.0003904719993730691, + "loss": 3.0733, + "step": 7692 + }, + { + "epoch": 0.3239157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0003904440843739673, + "loss": 3.3828, + "step": 7693 + }, + { + "epoch": 0.3239578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0003904161668161393, + "loss": 3.2669, + "step": 7694 + }, + { + "epoch": 0.324, + "grad_norm": 0.435546875, + "learning_rate": 0.0003903882467000937, + "loss": 3.0747, + "step": 7695 + }, + { + "epoch": 0.3240421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00039036032402633926, + "loss": 2.8318, + "step": 7696 + }, + { + "epoch": 0.3240842105263158, + "grad_norm": 0.4140625, + "learning_rate": 0.00039033239879538454, + "loss": 3.4097, + "step": 7697 + }, + { + "epoch": 0.3241263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00039030447100773846, + "loss": 3.3199, + "step": 7698 + }, + { + "epoch": 0.3241684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.0003902765406639097, + "loss": 3.1288, + "step": 7699 + }, + { + "epoch": 0.32421052631578945, + "grad_norm": 0.435546875, + "learning_rate": 0.00039024860776440726, + "loss": 3.034, + "step": 7700 + }, + { + "epoch": 0.32425263157894735, + "grad_norm": 0.42578125, + "learning_rate": 0.0003902206723097399, + "loss": 3.1077, + "step": 7701 + }, + { + "epoch": 0.32429473684210525, + "grad_norm": 0.423828125, + "learning_rate": 0.0003901927343004167, + "loss": 3.368, + "step": 7702 + }, + { + "epoch": 0.32433684210526315, + "grad_norm": 0.408203125, + "learning_rate": 0.0003901647937369466, + "loss": 3.3997, + "step": 7703 + }, + { + "epoch": 0.32437894736842104, + "grad_norm": 0.423828125, + "learning_rate": 0.00039013685061983864, + "loss": 3.368, + "step": 7704 + }, + { + "epoch": 0.32442105263157894, + "grad_norm": 0.396484375, + "learning_rate": 0.0003901089049496019, + "loss": 3.0184, + "step": 7705 + }, + { + "epoch": 0.32446315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.00039008095672674547, + "loss": 3.3627, + "step": 7706 + }, + { + "epoch": 0.32450526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.00039005300595177874, + "loss": 3.0908, + "step": 7707 + }, + { + "epoch": 0.32454736842105264, + "grad_norm": 0.48046875, + "learning_rate": 0.00039002505262521073, + "loss": 2.915, + "step": 7708 + }, + { + "epoch": 0.32458947368421054, + "grad_norm": 0.478515625, + "learning_rate": 0.0003899970967475508, + "loss": 3.5016, + "step": 7709 + }, + { + "epoch": 0.32463157894736844, + "grad_norm": 0.39453125, + "learning_rate": 0.00038996913831930826, + "loss": 2.8844, + "step": 7710 + }, + { + "epoch": 0.32467368421052634, + "grad_norm": 0.40234375, + "learning_rate": 0.00038994117734099246, + "loss": 3.3524, + "step": 7711 + }, + { + "epoch": 0.32471578947368424, + "grad_norm": 0.4375, + "learning_rate": 0.00038991321381311276, + "loss": 3.0339, + "step": 7712 + }, + { + "epoch": 0.3247578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.0003898852477361787, + "loss": 3.4029, + "step": 7713 + }, + { + "epoch": 0.3248, + "grad_norm": 0.4140625, + "learning_rate": 0.0003898572791106999, + "loss": 3.4315, + "step": 7714 + }, + { + "epoch": 0.3248421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00038982930793718574, + "loss": 3.4829, + "step": 7715 + }, + { + "epoch": 0.3248842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0003898013342161459, + "loss": 2.808, + "step": 7716 + }, + { + "epoch": 0.3249263157894737, + "grad_norm": 0.4765625, + "learning_rate": 0.00038977335794809, + "loss": 3.1473, + "step": 7717 + }, + { + "epoch": 0.3249684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.00038974537913352773, + "loss": 3.7466, + "step": 7718 + }, + { + "epoch": 0.32501052631578947, + "grad_norm": 0.400390625, + "learning_rate": 0.0003897173977729689, + "loss": 3.0478, + "step": 7719 + }, + { + "epoch": 0.32505263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00038968941386692313, + "loss": 2.9991, + "step": 7720 + }, + { + "epoch": 0.32509473684210527, + "grad_norm": 0.4375, + "learning_rate": 0.0003896614274159005, + "loss": 3.4582, + "step": 7721 + }, + { + "epoch": 0.32513684210526317, + "grad_norm": 0.3984375, + "learning_rate": 0.00038963343842041067, + "loss": 3.3717, + "step": 7722 + }, + { + "epoch": 0.32517894736842107, + "grad_norm": 0.40625, + "learning_rate": 0.0003896054468809637, + "loss": 3.4539, + "step": 7723 + }, + { + "epoch": 0.32522105263157897, + "grad_norm": 0.40625, + "learning_rate": 0.00038957745279806955, + "loss": 3.4444, + "step": 7724 + }, + { + "epoch": 0.32526315789473687, + "grad_norm": 0.44140625, + "learning_rate": 0.00038954945617223816, + "loss": 3.6441, + "step": 7725 + }, + { + "epoch": 0.32530526315789476, + "grad_norm": 0.447265625, + "learning_rate": 0.00038952145700397957, + "loss": 3.6744, + "step": 7726 + }, + { + "epoch": 0.3253473684210526, + "grad_norm": 0.41015625, + "learning_rate": 0.0003894934552938041, + "loss": 3.1167, + "step": 7727 + }, + { + "epoch": 0.3253894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00038946545104222166, + "loss": 3.1487, + "step": 7728 + }, + { + "epoch": 0.3254315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00038943744424974264, + "loss": 3.4988, + "step": 7729 + }, + { + "epoch": 0.3254736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.0003894094349168772, + "loss": 3.2542, + "step": 7730 + }, + { + "epoch": 0.3255157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00038938142304413564, + "loss": 3.2545, + "step": 7731 + }, + { + "epoch": 0.3255578947368421, + "grad_norm": 0.3984375, + "learning_rate": 0.00038935340863202833, + "loss": 2.8935, + "step": 7732 + }, + { + "epoch": 0.3256, + "grad_norm": 0.375, + "learning_rate": 0.0003893253916810656, + "loss": 2.9665, + "step": 7733 + }, + { + "epoch": 0.3256421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.000389297372191758, + "loss": 3.6583, + "step": 7734 + }, + { + "epoch": 0.3256842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.000389269350164616, + "loss": 3.5827, + "step": 7735 + }, + { + "epoch": 0.3257263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0003892413256001499, + "loss": 3.5032, + "step": 7736 + }, + { + "epoch": 0.3257684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.0003892132984988706, + "loss": 3.0255, + "step": 7737 + }, + { + "epoch": 0.3258105263157895, + "grad_norm": 0.3984375, + "learning_rate": 0.00038918526886128855, + "loss": 2.721, + "step": 7738 + }, + { + "epoch": 0.3258526315789474, + "grad_norm": 0.41015625, + "learning_rate": 0.0003891572366879144, + "loss": 3.2272, + "step": 7739 + }, + { + "epoch": 0.32589473684210524, + "grad_norm": 0.40625, + "learning_rate": 0.00038912920197925883, + "loss": 3.1767, + "step": 7740 + }, + { + "epoch": 0.32593684210526314, + "grad_norm": 0.4296875, + "learning_rate": 0.0003891011647358328, + "loss": 3.3291, + "step": 7741 + }, + { + "epoch": 0.32597894736842103, + "grad_norm": 0.416015625, + "learning_rate": 0.00038907312495814695, + "loss": 3.6203, + "step": 7742 + }, + { + "epoch": 0.32602105263157893, + "grad_norm": 0.38671875, + "learning_rate": 0.00038904508264671215, + "loss": 3.0731, + "step": 7743 + }, + { + "epoch": 0.32606315789473683, + "grad_norm": 0.41015625, + "learning_rate": 0.0003890170378020393, + "loss": 3.066, + "step": 7744 + }, + { + "epoch": 0.32610526315789473, + "grad_norm": 0.390625, + "learning_rate": 0.00038898899042463943, + "loss": 3.0193, + "step": 7745 + }, + { + "epoch": 0.32614736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.00038896094051502333, + "loss": 3.1845, + "step": 7746 + }, + { + "epoch": 0.32618947368421053, + "grad_norm": 0.39453125, + "learning_rate": 0.00038893288807370223, + "loss": 3.029, + "step": 7747 + }, + { + "epoch": 0.32623157894736843, + "grad_norm": 0.3984375, + "learning_rate": 0.0003889048331011872, + "loss": 3.0007, + "step": 7748 + }, + { + "epoch": 0.3262736842105263, + "grad_norm": 0.388671875, + "learning_rate": 0.0003888767755979893, + "loss": 3.3586, + "step": 7749 + }, + { + "epoch": 0.3263157894736842, + "grad_norm": 0.47265625, + "learning_rate": 0.00038884871556461967, + "loss": 3.8711, + "step": 7750 + }, + { + "epoch": 0.3263578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.00038882065300158966, + "loss": 3.376, + "step": 7751 + }, + { + "epoch": 0.3264, + "grad_norm": 0.375, + "learning_rate": 0.0003887925879094104, + "loss": 2.9198, + "step": 7752 + }, + { + "epoch": 0.3264421052631579, + "grad_norm": 0.38671875, + "learning_rate": 0.00038876452028859323, + "loss": 3.5429, + "step": 7753 + }, + { + "epoch": 0.32648421052631577, + "grad_norm": 0.423828125, + "learning_rate": 0.0003887364501396496, + "loss": 3.2168, + "step": 7754 + }, + { + "epoch": 0.32652631578947366, + "grad_norm": 0.41796875, + "learning_rate": 0.0003887083774630908, + "loss": 3.1433, + "step": 7755 + }, + { + "epoch": 0.32656842105263156, + "grad_norm": 0.47265625, + "learning_rate": 0.0003886803022594284, + "loss": 3.4417, + "step": 7756 + }, + { + "epoch": 0.32661052631578946, + "grad_norm": 0.41796875, + "learning_rate": 0.0003886522245291738, + "loss": 3.4743, + "step": 7757 + }, + { + "epoch": 0.32665263157894736, + "grad_norm": 0.42578125, + "learning_rate": 0.00038862414427283864, + "loss": 3.2501, + "step": 7758 + }, + { + "epoch": 0.32669473684210526, + "grad_norm": 0.412109375, + "learning_rate": 0.0003885960614909344, + "loss": 3.3404, + "step": 7759 + }, + { + "epoch": 0.32673684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.0003885679761839728, + "loss": 3.5173, + "step": 7760 + }, + { + "epoch": 0.32677894736842106, + "grad_norm": 0.388671875, + "learning_rate": 0.0003885398883524654, + "loss": 3.2097, + "step": 7761 + }, + { + "epoch": 0.32682105263157896, + "grad_norm": 0.484375, + "learning_rate": 0.0003885117979969241, + "loss": 3.3873, + "step": 7762 + }, + { + "epoch": 0.32686315789473686, + "grad_norm": 0.408203125, + "learning_rate": 0.0003884837051178606, + "loss": 3.3623, + "step": 7763 + }, + { + "epoch": 0.32690526315789475, + "grad_norm": 0.392578125, + "learning_rate": 0.0003884556097157866, + "loss": 3.2641, + "step": 7764 + }, + { + "epoch": 0.32694736842105265, + "grad_norm": 0.40625, + "learning_rate": 0.00038842751179121417, + "loss": 3.0428, + "step": 7765 + }, + { + "epoch": 0.32698947368421055, + "grad_norm": 0.408203125, + "learning_rate": 0.000388399411344655, + "loss": 3.6526, + "step": 7766 + }, + { + "epoch": 0.3270315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.00038837130837662126, + "loss": 3.047, + "step": 7767 + }, + { + "epoch": 0.3270736842105263, + "grad_norm": 0.3984375, + "learning_rate": 0.0003883432028876248, + "loss": 3.466, + "step": 7768 + }, + { + "epoch": 0.3271157894736842, + "grad_norm": 0.390625, + "learning_rate": 0.0003883150948781778, + "loss": 3.0885, + "step": 7769 + }, + { + "epoch": 0.3271578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00038828698434879226, + "loss": 3.2987, + "step": 7770 + }, + { + "epoch": 0.3272, + "grad_norm": 0.384765625, + "learning_rate": 0.0003882588712999804, + "loss": 3.1211, + "step": 7771 + }, + { + "epoch": 0.3272421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.0003882307557322543, + "loss": 3.4063, + "step": 7772 + }, + { + "epoch": 0.3272842105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.0003882026376461262, + "loss": 3.3777, + "step": 7773 + }, + { + "epoch": 0.3273263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.0003881745170421085, + "loss": 3.039, + "step": 7774 + }, + { + "epoch": 0.3273684210526316, + "grad_norm": 0.38671875, + "learning_rate": 0.0003881463939207134, + "loss": 3.3054, + "step": 7775 + }, + { + "epoch": 0.3274105263157895, + "grad_norm": 0.396484375, + "learning_rate": 0.00038811826828245334, + "loss": 2.9524, + "step": 7776 + }, + { + "epoch": 0.3274526315789474, + "grad_norm": 0.4140625, + "learning_rate": 0.00038809014012784065, + "loss": 3.3036, + "step": 7777 + }, + { + "epoch": 0.3274947368421053, + "grad_norm": 0.3984375, + "learning_rate": 0.00038806200945738794, + "loss": 3.5261, + "step": 7778 + }, + { + "epoch": 0.3275368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.00038803387627160766, + "loss": 3.4628, + "step": 7779 + }, + { + "epoch": 0.327578947368421, + "grad_norm": 0.474609375, + "learning_rate": 0.00038800574057101225, + "loss": 3.4118, + "step": 7780 + }, + { + "epoch": 0.3276210526315789, + "grad_norm": 0.3984375, + "learning_rate": 0.00038797760235611433, + "loss": 3.1013, + "step": 7781 + }, + { + "epoch": 0.3276631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.0003879494616274268, + "loss": 3.0355, + "step": 7782 + }, + { + "epoch": 0.3277052631578947, + "grad_norm": 0.474609375, + "learning_rate": 0.0003879213183854621, + "loss": 3.2491, + "step": 7783 + }, + { + "epoch": 0.3277473684210526, + "grad_norm": 0.404296875, + "learning_rate": 0.000387893172630733, + "loss": 2.858, + "step": 7784 + }, + { + "epoch": 0.3277894736842105, + "grad_norm": 0.3984375, + "learning_rate": 0.00038786502436375227, + "loss": 3.2516, + "step": 7785 + }, + { + "epoch": 0.3278315789473684, + "grad_norm": 0.392578125, + "learning_rate": 0.00038783687358503284, + "loss": 3.4861, + "step": 7786 + }, + { + "epoch": 0.3278736842105263, + "grad_norm": 0.4453125, + "learning_rate": 0.00038780872029508753, + "loss": 3.0299, + "step": 7787 + }, + { + "epoch": 0.3279157894736842, + "grad_norm": 0.3984375, + "learning_rate": 0.00038778056449442923, + "loss": 3.409, + "step": 7788 + }, + { + "epoch": 0.3279578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0003877524061835709, + "loss": 3.2539, + "step": 7789 + }, + { + "epoch": 0.328, + "grad_norm": 0.404296875, + "learning_rate": 0.0003877242453630256, + "loss": 3.2238, + "step": 7790 + }, + { + "epoch": 0.3280421052631579, + "grad_norm": 0.3828125, + "learning_rate": 0.0003876960820333064, + "loss": 3.291, + "step": 7791 + }, + { + "epoch": 0.3280842105263158, + "grad_norm": 0.51171875, + "learning_rate": 0.0003876679161949264, + "loss": 3.3712, + "step": 7792 + }, + { + "epoch": 0.3281263157894737, + "grad_norm": 0.447265625, + "learning_rate": 0.00038763974784839867, + "loss": 3.2013, + "step": 7793 + }, + { + "epoch": 0.32816842105263155, + "grad_norm": 0.416015625, + "learning_rate": 0.00038761157699423646, + "loss": 3.6054, + "step": 7794 + }, + { + "epoch": 0.32821052631578945, + "grad_norm": 0.416015625, + "learning_rate": 0.000387583403632953, + "loss": 3.2329, + "step": 7795 + }, + { + "epoch": 0.32825263157894735, + "grad_norm": 0.421875, + "learning_rate": 0.00038755522776506157, + "loss": 3.1909, + "step": 7796 + }, + { + "epoch": 0.32829473684210525, + "grad_norm": 0.4609375, + "learning_rate": 0.0003875270493910755, + "loss": 2.8601, + "step": 7797 + }, + { + "epoch": 0.32833684210526315, + "grad_norm": 0.41796875, + "learning_rate": 0.00038749886851150827, + "loss": 3.0517, + "step": 7798 + }, + { + "epoch": 0.32837894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00038747068512687315, + "loss": 2.7676, + "step": 7799 + }, + { + "epoch": 0.32842105263157895, + "grad_norm": 0.44921875, + "learning_rate": 0.00038744249923768363, + "loss": 2.859, + "step": 7800 + }, + { + "epoch": 0.32846315789473685, + "grad_norm": 0.408203125, + "learning_rate": 0.00038741431084445335, + "loss": 3.3355, + "step": 7801 + }, + { + "epoch": 0.32850526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.0003873861199476957, + "loss": 3.6095, + "step": 7802 + }, + { + "epoch": 0.32854736842105264, + "grad_norm": 0.404296875, + "learning_rate": 0.0003873579265479244, + "loss": 3.6201, + "step": 7803 + }, + { + "epoch": 0.32858947368421054, + "grad_norm": 0.390625, + "learning_rate": 0.0003873297306456531, + "loss": 3.3249, + "step": 7804 + }, + { + "epoch": 0.32863157894736844, + "grad_norm": 0.408203125, + "learning_rate": 0.00038730153224139543, + "loss": 3.1805, + "step": 7805 + }, + { + "epoch": 0.32867368421052634, + "grad_norm": 0.400390625, + "learning_rate": 0.0003872733313356651, + "loss": 3.3706, + "step": 7806 + }, + { + "epoch": 0.3287157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.00038724512792897604, + "loss": 3.2081, + "step": 7807 + }, + { + "epoch": 0.3287578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.000387216922021842, + "loss": 3.0704, + "step": 7808 + }, + { + "epoch": 0.3288, + "grad_norm": 0.39453125, + "learning_rate": 0.00038718871361477687, + "loss": 3.2854, + "step": 7809 + }, + { + "epoch": 0.3288421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.0003871605027082945, + "loss": 3.2346, + "step": 7810 + }, + { + "epoch": 0.3288842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.000387132289302909, + "loss": 3.0638, + "step": 7811 + }, + { + "epoch": 0.3289263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.00038710407339913436, + "loss": 3.1617, + "step": 7812 + }, + { + "epoch": 0.3289684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.00038707585499748445, + "loss": 3.3601, + "step": 7813 + }, + { + "epoch": 0.3290105263157895, + "grad_norm": 0.3828125, + "learning_rate": 0.00038704763409847366, + "loss": 2.9455, + "step": 7814 + }, + { + "epoch": 0.3290526315789474, + "grad_norm": 0.408203125, + "learning_rate": 0.00038701941070261585, + "loss": 3.2572, + "step": 7815 + }, + { + "epoch": 0.3290947368421053, + "grad_norm": 0.4140625, + "learning_rate": 0.0003869911848104255, + "loss": 2.8311, + "step": 7816 + }, + { + "epoch": 0.32913684210526317, + "grad_norm": 0.42578125, + "learning_rate": 0.0003869629564224167, + "loss": 3.1114, + "step": 7817 + }, + { + "epoch": 0.32917894736842107, + "grad_norm": 0.40234375, + "learning_rate": 0.0003869347255391037, + "loss": 3.3113, + "step": 7818 + }, + { + "epoch": 0.32922105263157897, + "grad_norm": 0.37890625, + "learning_rate": 0.00038690649216100093, + "loss": 3.0194, + "step": 7819 + }, + { + "epoch": 0.32926315789473687, + "grad_norm": 0.404296875, + "learning_rate": 0.0003868782562886227, + "loss": 3.262, + "step": 7820 + }, + { + "epoch": 0.3293052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.00038685001792248354, + "loss": 3.1593, + "step": 7821 + }, + { + "epoch": 0.3293473684210526, + "grad_norm": 0.419921875, + "learning_rate": 0.0003868217770630978, + "loss": 3.2642, + "step": 7822 + }, + { + "epoch": 0.3293894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.00038679353371098005, + "loss": 3.112, + "step": 7823 + }, + { + "epoch": 0.3294315789473684, + "grad_norm": 0.439453125, + "learning_rate": 0.00038676528786664476, + "loss": 3.4191, + "step": 7824 + }, + { + "epoch": 0.3294736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.00038673703953060677, + "loss": 3.7351, + "step": 7825 + }, + { + "epoch": 0.3295157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.0003867087887033804, + "loss": 2.8767, + "step": 7826 + }, + { + "epoch": 0.3295578947368421, + "grad_norm": 0.470703125, + "learning_rate": 0.00038668053538548066, + "loss": 3.0665, + "step": 7827 + }, + { + "epoch": 0.3296, + "grad_norm": 0.419921875, + "learning_rate": 0.0003866522795774221, + "loss": 3.1165, + "step": 7828 + }, + { + "epoch": 0.3296421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00038662402127971965, + "loss": 3.0275, + "step": 7829 + }, + { + "epoch": 0.3296842105263158, + "grad_norm": 0.40625, + "learning_rate": 0.000386595760492888, + "loss": 2.9383, + "step": 7830 + }, + { + "epoch": 0.3297263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00038656749721744207, + "loss": 3.2514, + "step": 7831 + }, + { + "epoch": 0.3297684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.0003865392314538968, + "loss": 3.1872, + "step": 7832 + }, + { + "epoch": 0.3298105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.0003865109632027671, + "loss": 3.6949, + "step": 7833 + }, + { + "epoch": 0.32985263157894734, + "grad_norm": 0.40625, + "learning_rate": 0.00038648269246456817, + "loss": 3.4407, + "step": 7834 + }, + { + "epoch": 0.32989473684210524, + "grad_norm": 0.412109375, + "learning_rate": 0.00038645441923981495, + "loss": 3.2819, + "step": 7835 + }, + { + "epoch": 0.32993684210526314, + "grad_norm": 0.42578125, + "learning_rate": 0.0003864261435290224, + "loss": 3.0382, + "step": 7836 + }, + { + "epoch": 0.32997894736842104, + "grad_norm": 0.4296875, + "learning_rate": 0.00038639786533270596, + "loss": 3.3755, + "step": 7837 + }, + { + "epoch": 0.33002105263157894, + "grad_norm": 0.40234375, + "learning_rate": 0.0003863695846513806, + "loss": 3.015, + "step": 7838 + }, + { + "epoch": 0.33006315789473684, + "grad_norm": 0.390625, + "learning_rate": 0.0003863413014855617, + "loss": 3.1719, + "step": 7839 + }, + { + "epoch": 0.33010526315789473, + "grad_norm": 0.416015625, + "learning_rate": 0.0003863130158357644, + "loss": 3.3593, + "step": 7840 + }, + { + "epoch": 0.33014736842105263, + "grad_norm": 0.388671875, + "learning_rate": 0.00038628472770250413, + "loss": 3.3057, + "step": 7841 + }, + { + "epoch": 0.33018947368421053, + "grad_norm": 0.427734375, + "learning_rate": 0.0003862564370862962, + "loss": 3.3171, + "step": 7842 + }, + { + "epoch": 0.33023157894736843, + "grad_norm": 0.412109375, + "learning_rate": 0.0003862281439876562, + "loss": 3.0689, + "step": 7843 + }, + { + "epoch": 0.33027368421052633, + "grad_norm": 0.41015625, + "learning_rate": 0.0003861998484070994, + "loss": 3.2629, + "step": 7844 + }, + { + "epoch": 0.33031578947368423, + "grad_norm": 0.43359375, + "learning_rate": 0.0003861715503451414, + "loss": 3.1689, + "step": 7845 + }, + { + "epoch": 0.33035789473684213, + "grad_norm": 0.400390625, + "learning_rate": 0.00038614324980229775, + "loss": 3.2722, + "step": 7846 + }, + { + "epoch": 0.3304, + "grad_norm": 0.412109375, + "learning_rate": 0.00038611494677908396, + "loss": 3.3857, + "step": 7847 + }, + { + "epoch": 0.33044210526315787, + "grad_norm": 0.4140625, + "learning_rate": 0.0003860866412760159, + "loss": 3.5801, + "step": 7848 + }, + { + "epoch": 0.33048421052631577, + "grad_norm": 0.4140625, + "learning_rate": 0.0003860583332936091, + "loss": 2.9785, + "step": 7849 + }, + { + "epoch": 0.33052631578947367, + "grad_norm": 0.486328125, + "learning_rate": 0.0003860300228323793, + "loss": 3.1409, + "step": 7850 + }, + { + "epoch": 0.33056842105263157, + "grad_norm": 0.41796875, + "learning_rate": 0.0003860017098928424, + "loss": 3.2595, + "step": 7851 + }, + { + "epoch": 0.33061052631578947, + "grad_norm": 0.400390625, + "learning_rate": 0.00038597339447551406, + "loss": 3.5387, + "step": 7852 + }, + { + "epoch": 0.33065263157894736, + "grad_norm": 0.421875, + "learning_rate": 0.0003859450765809103, + "loss": 3.525, + "step": 7853 + }, + { + "epoch": 0.33069473684210526, + "grad_norm": 0.412109375, + "learning_rate": 0.0003859167562095468, + "loss": 3.3199, + "step": 7854 + }, + { + "epoch": 0.33073684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.0003858884333619399, + "loss": 3.624, + "step": 7855 + }, + { + "epoch": 0.33077894736842106, + "grad_norm": 0.404296875, + "learning_rate": 0.00038586010803860527, + "loss": 3.1144, + "step": 7856 + }, + { + "epoch": 0.33082105263157896, + "grad_norm": 0.40234375, + "learning_rate": 0.00038583178024005914, + "loss": 3.1542, + "step": 7857 + }, + { + "epoch": 0.33086315789473686, + "grad_norm": 0.40234375, + "learning_rate": 0.00038580344996681765, + "loss": 3.1113, + "step": 7858 + }, + { + "epoch": 0.33090526315789476, + "grad_norm": 0.412109375, + "learning_rate": 0.0003857751172193968, + "loss": 3.1701, + "step": 7859 + }, + { + "epoch": 0.33094736842105266, + "grad_norm": 0.416015625, + "learning_rate": 0.0003857467819983128, + "loss": 3.322, + "step": 7860 + }, + { + "epoch": 0.3309894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.000385718444304082, + "loss": 3.1112, + "step": 7861 + }, + { + "epoch": 0.3310315789473684, + "grad_norm": 0.396484375, + "learning_rate": 0.0003856901041372206, + "loss": 3.3565, + "step": 7862 + }, + { + "epoch": 0.3310736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.00038566176149824487, + "loss": 3.5305, + "step": 7863 + }, + { + "epoch": 0.3311157894736842, + "grad_norm": 0.400390625, + "learning_rate": 0.00038563341638767133, + "loss": 3.2326, + "step": 7864 + }, + { + "epoch": 0.3311578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.0003856050688060162, + "loss": 3.0828, + "step": 7865 + }, + { + "epoch": 0.3312, + "grad_norm": 0.419921875, + "learning_rate": 0.00038557671875379617, + "loss": 3.2994, + "step": 7866 + }, + { + "epoch": 0.3312421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0003855483662315275, + "loss": 3.2313, + "step": 7867 + }, + { + "epoch": 0.3312842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.0003855200112397269, + "loss": 3.2541, + "step": 7868 + }, + { + "epoch": 0.3313263157894737, + "grad_norm": 0.388671875, + "learning_rate": 0.00038549165377891096, + "loss": 3.1876, + "step": 7869 + }, + { + "epoch": 0.3313684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.0003854632938495963, + "loss": 3.2066, + "step": 7870 + }, + { + "epoch": 0.3314105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00038543493145229955, + "loss": 3.2407, + "step": 7871 + }, + { + "epoch": 0.3314526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.0003854065665875375, + "loss": 3.6896, + "step": 7872 + }, + { + "epoch": 0.3314947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.00038537819925582687, + "loss": 3.7763, + "step": 7873 + }, + { + "epoch": 0.3315368421052632, + "grad_norm": 0.40234375, + "learning_rate": 0.0003853498294576845, + "loss": 3.0685, + "step": 7874 + }, + { + "epoch": 0.33157894736842103, + "grad_norm": 0.416015625, + "learning_rate": 0.0003853214571936273, + "loss": 3.2299, + "step": 7875 + }, + { + "epoch": 0.3316210526315789, + "grad_norm": 0.39453125, + "learning_rate": 0.00038529308246417214, + "loss": 3.1194, + "step": 7876 + }, + { + "epoch": 0.3316631578947368, + "grad_norm": 0.474609375, + "learning_rate": 0.000385264705269836, + "loss": 3.1279, + "step": 7877 + }, + { + "epoch": 0.3317052631578947, + "grad_norm": 0.400390625, + "learning_rate": 0.0003852363256111359, + "loss": 3.0307, + "step": 7878 + }, + { + "epoch": 0.3317473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.00038520794348858873, + "loss": 2.9669, + "step": 7879 + }, + { + "epoch": 0.3317894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.00038517955890271174, + "loss": 3.2877, + "step": 7880 + }, + { + "epoch": 0.3318315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.000385151171854022, + "loss": 2.7874, + "step": 7881 + }, + { + "epoch": 0.3318736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.0003851227823430367, + "loss": 3.2639, + "step": 7882 + }, + { + "epoch": 0.3319157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.00038509439037027316, + "loss": 3.5954, + "step": 7883 + }, + { + "epoch": 0.3319578947368421, + "grad_norm": 0.5703125, + "learning_rate": 0.0003850659959362485, + "loss": 3.3279, + "step": 7884 + }, + { + "epoch": 0.332, + "grad_norm": 0.423828125, + "learning_rate": 0.00038503759904148007, + "loss": 3.2124, + "step": 7885 + }, + { + "epoch": 0.3320421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.0003850091996864853, + "loss": 2.8286, + "step": 7886 + }, + { + "epoch": 0.3320842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.00038498079787178144, + "loss": 2.8199, + "step": 7887 + }, + { + "epoch": 0.33212631578947366, + "grad_norm": 0.39453125, + "learning_rate": 0.00038495239359788615, + "loss": 2.9207, + "step": 7888 + }, + { + "epoch": 0.33216842105263156, + "grad_norm": 0.408203125, + "learning_rate": 0.0003849239868653167, + "loss": 3.2967, + "step": 7889 + }, + { + "epoch": 0.33221052631578946, + "grad_norm": 0.44921875, + "learning_rate": 0.00038489557767459084, + "loss": 3.1647, + "step": 7890 + }, + { + "epoch": 0.33225263157894735, + "grad_norm": 0.419921875, + "learning_rate": 0.00038486716602622597, + "loss": 3.1773, + "step": 7891 + }, + { + "epoch": 0.33229473684210525, + "grad_norm": 0.421875, + "learning_rate": 0.00038483875192073984, + "loss": 3.3369, + "step": 7892 + }, + { + "epoch": 0.33233684210526315, + "grad_norm": 0.40625, + "learning_rate": 0.00038481033535865005, + "loss": 3.281, + "step": 7893 + }, + { + "epoch": 0.33237894736842105, + "grad_norm": 0.3828125, + "learning_rate": 0.00038478191634047437, + "loss": 2.9946, + "step": 7894 + }, + { + "epoch": 0.33242105263157895, + "grad_norm": 0.392578125, + "learning_rate": 0.0003847534948667306, + "loss": 3.1804, + "step": 7895 + }, + { + "epoch": 0.33246315789473685, + "grad_norm": 0.4296875, + "learning_rate": 0.0003847250709379364, + "loss": 3.1189, + "step": 7896 + }, + { + "epoch": 0.33250526315789475, + "grad_norm": 0.37890625, + "learning_rate": 0.00038469664455460974, + "loss": 2.8684, + "step": 7897 + }, + { + "epoch": 0.33254736842105265, + "grad_norm": 0.40625, + "learning_rate": 0.0003846682157172684, + "loss": 3.0656, + "step": 7898 + }, + { + "epoch": 0.33258947368421055, + "grad_norm": 0.427734375, + "learning_rate": 0.0003846397844264305, + "loss": 3.344, + "step": 7899 + }, + { + "epoch": 0.33263157894736844, + "grad_norm": 0.431640625, + "learning_rate": 0.0003846113506826139, + "loss": 3.1041, + "step": 7900 + }, + { + "epoch": 0.3326736842105263, + "grad_norm": 0.380859375, + "learning_rate": 0.0003845829144863366, + "loss": 3.076, + "step": 7901 + }, + { + "epoch": 0.3327157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00038455447583811674, + "loss": 3.3739, + "step": 7902 + }, + { + "epoch": 0.3327578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.00038452603473847246, + "loss": 2.945, + "step": 7903 + }, + { + "epoch": 0.3328, + "grad_norm": 0.42578125, + "learning_rate": 0.00038449759118792194, + "loss": 3.4231, + "step": 7904 + }, + { + "epoch": 0.3328421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.0003844691451869832, + "loss": 3.5847, + "step": 7905 + }, + { + "epoch": 0.3328842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0003844406967361748, + "loss": 3.1225, + "step": 7906 + }, + { + "epoch": 0.3329263157894737, + "grad_norm": 0.392578125, + "learning_rate": 0.0003844122458360146, + "loss": 2.8601, + "step": 7907 + }, + { + "epoch": 0.3329684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.0003843837924870215, + "loss": 3.444, + "step": 7908 + }, + { + "epoch": 0.3330105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.00038435533668971345, + "loss": 2.9577, + "step": 7909 + }, + { + "epoch": 0.3330526315789474, + "grad_norm": 0.390625, + "learning_rate": 0.00038432687844460904, + "loss": 3.0916, + "step": 7910 + }, + { + "epoch": 0.3330947368421053, + "grad_norm": 0.41015625, + "learning_rate": 0.00038429841775222673, + "loss": 3.3113, + "step": 7911 + }, + { + "epoch": 0.3331368421052632, + "grad_norm": 0.41796875, + "learning_rate": 0.0003842699546130851, + "loss": 3.1777, + "step": 7912 + }, + { + "epoch": 0.3331789473684211, + "grad_norm": 0.43359375, + "learning_rate": 0.0003842414890277026, + "loss": 3.0431, + "step": 7913 + }, + { + "epoch": 0.333221052631579, + "grad_norm": 0.392578125, + "learning_rate": 0.00038421302099659785, + "loss": 2.9283, + "step": 7914 + }, + { + "epoch": 0.3332631578947368, + "grad_norm": 0.4140625, + "learning_rate": 0.00038418455052028965, + "loss": 2.7833, + "step": 7915 + }, + { + "epoch": 0.3333052631578947, + "grad_norm": 0.39453125, + "learning_rate": 0.00038415607759929654, + "loss": 3.1989, + "step": 7916 + }, + { + "epoch": 0.3333473684210526, + "grad_norm": 0.482421875, + "learning_rate": 0.00038412760223413735, + "loss": 2.9709, + "step": 7917 + }, + { + "epoch": 0.3333894736842105, + "grad_norm": 0.40234375, + "learning_rate": 0.00038409912442533093, + "loss": 3.1927, + "step": 7918 + }, + { + "epoch": 0.3334315789473684, + "grad_norm": 0.400390625, + "learning_rate": 0.00038407064417339586, + "loss": 3.1928, + "step": 7919 + }, + { + "epoch": 0.3334736842105263, + "grad_norm": 0.44921875, + "learning_rate": 0.0003840421614788513, + "loss": 3.7893, + "step": 7920 + }, + { + "epoch": 0.3335157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00038401367634221595, + "loss": 3.0991, + "step": 7921 + }, + { + "epoch": 0.3335578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0003839851887640089, + "loss": 3.0931, + "step": 7922 + }, + { + "epoch": 0.3336, + "grad_norm": 0.431640625, + "learning_rate": 0.00038395669874474915, + "loss": 3.2374, + "step": 7923 + }, + { + "epoch": 0.3336421052631579, + "grad_norm": 0.51953125, + "learning_rate": 0.0003839282062849557, + "loss": 2.9823, + "step": 7924 + }, + { + "epoch": 0.3336842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.0003838997113851478, + "loss": 3.1536, + "step": 7925 + }, + { + "epoch": 0.3337263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00038387121404584446, + "loss": 3.3822, + "step": 7926 + }, + { + "epoch": 0.3337684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.0003838427142675649, + "loss": 3.5139, + "step": 7927 + }, + { + "epoch": 0.33381052631578945, + "grad_norm": 0.404296875, + "learning_rate": 0.00038381421205082824, + "loss": 3.0201, + "step": 7928 + }, + { + "epoch": 0.33385263157894735, + "grad_norm": 0.400390625, + "learning_rate": 0.0003837857073961539, + "loss": 2.9081, + "step": 7929 + }, + { + "epoch": 0.33389473684210524, + "grad_norm": 0.39453125, + "learning_rate": 0.0003837572003040612, + "loss": 3.28, + "step": 7930 + }, + { + "epoch": 0.33393684210526314, + "grad_norm": 0.408203125, + "learning_rate": 0.0003837286907750694, + "loss": 3.0508, + "step": 7931 + }, + { + "epoch": 0.33397894736842104, + "grad_norm": 0.40625, + "learning_rate": 0.0003837001788096981, + "loss": 3.2872, + "step": 7932 + }, + { + "epoch": 0.33402105263157894, + "grad_norm": 0.431640625, + "learning_rate": 0.0003836716644084666, + "loss": 3.4463, + "step": 7933 + }, + { + "epoch": 0.33406315789473684, + "grad_norm": 0.447265625, + "learning_rate": 0.00038364314757189435, + "loss": 3.0854, + "step": 7934 + }, + { + "epoch": 0.33410526315789474, + "grad_norm": 0.4375, + "learning_rate": 0.0003836146283005011, + "loss": 3.3409, + "step": 7935 + }, + { + "epoch": 0.33414736842105264, + "grad_norm": 0.45703125, + "learning_rate": 0.0003835861065948062, + "loss": 3.2731, + "step": 7936 + }, + { + "epoch": 0.33418947368421054, + "grad_norm": 0.40234375, + "learning_rate": 0.00038355758245532944, + "loss": 3.2119, + "step": 7937 + }, + { + "epoch": 0.33423157894736844, + "grad_norm": 0.421875, + "learning_rate": 0.00038352905588259046, + "loss": 3.1405, + "step": 7938 + }, + { + "epoch": 0.33427368421052633, + "grad_norm": 0.41796875, + "learning_rate": 0.00038350052687710894, + "loss": 3.2394, + "step": 7939 + }, + { + "epoch": 0.33431578947368423, + "grad_norm": 0.412109375, + "learning_rate": 0.0003834719954394047, + "loss": 3.677, + "step": 7940 + }, + { + "epoch": 0.33435789473684213, + "grad_norm": 0.4140625, + "learning_rate": 0.0003834434615699975, + "loss": 3.4682, + "step": 7941 + }, + { + "epoch": 0.3344, + "grad_norm": 0.404296875, + "learning_rate": 0.00038341492526940726, + "loss": 2.7045, + "step": 7942 + }, + { + "epoch": 0.3344421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00038338638653815385, + "loss": 2.6402, + "step": 7943 + }, + { + "epoch": 0.3344842105263158, + "grad_norm": 0.48828125, + "learning_rate": 0.00038335784537675717, + "loss": 3.0987, + "step": 7944 + }, + { + "epoch": 0.33452631578947367, + "grad_norm": 0.435546875, + "learning_rate": 0.0003833293017857372, + "loss": 2.8756, + "step": 7945 + }, + { + "epoch": 0.33456842105263157, + "grad_norm": 0.52734375, + "learning_rate": 0.0003833007557656141, + "loss": 2.7931, + "step": 7946 + }, + { + "epoch": 0.33461052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.00038327220731690777, + "loss": 3.2178, + "step": 7947 + }, + { + "epoch": 0.33465263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.0003832436564401384, + "loss": 3.0454, + "step": 7948 + }, + { + "epoch": 0.33469473684210527, + "grad_norm": 0.3984375, + "learning_rate": 0.00038321510313582617, + "loss": 3.3407, + "step": 7949 + }, + { + "epoch": 0.33473684210526317, + "grad_norm": 0.373046875, + "learning_rate": 0.00038318654740449135, + "loss": 2.9371, + "step": 7950 + }, + { + "epoch": 0.33477894736842106, + "grad_norm": 0.44921875, + "learning_rate": 0.0003831579892466541, + "loss": 3.4842, + "step": 7951 + }, + { + "epoch": 0.33482105263157896, + "grad_norm": 0.39453125, + "learning_rate": 0.00038312942866283473, + "loss": 2.8166, + "step": 7952 + }, + { + "epoch": 0.33486315789473686, + "grad_norm": 0.419921875, + "learning_rate": 0.0003831008656535536, + "loss": 3.7003, + "step": 7953 + }, + { + "epoch": 0.33490526315789476, + "grad_norm": 0.392578125, + "learning_rate": 0.00038307230021933104, + "loss": 3.7203, + "step": 7954 + }, + { + "epoch": 0.3349473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.00038304373236068757, + "loss": 3.1975, + "step": 7955 + }, + { + "epoch": 0.3349894736842105, + "grad_norm": 0.39453125, + "learning_rate": 0.00038301516207814355, + "loss": 3.1756, + "step": 7956 + }, + { + "epoch": 0.3350315789473684, + "grad_norm": 0.447265625, + "learning_rate": 0.00038298658937221963, + "loss": 3.4375, + "step": 7957 + }, + { + "epoch": 0.3350736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.0003829580142434364, + "loss": 3.2679, + "step": 7958 + }, + { + "epoch": 0.3351157894736842, + "grad_norm": 0.40625, + "learning_rate": 0.00038292943669231415, + "loss": 3.1619, + "step": 7959 + }, + { + "epoch": 0.3351578947368421, + "grad_norm": 0.404296875, + "learning_rate": 0.0003829008567193739, + "loss": 3.1673, + "step": 7960 + }, + { + "epoch": 0.3352, + "grad_norm": 0.41796875, + "learning_rate": 0.00038287227432513615, + "loss": 3.1471, + "step": 7961 + }, + { + "epoch": 0.3352421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.0003828436895101217, + "loss": 3.4299, + "step": 7962 + }, + { + "epoch": 0.3352842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0003828151022748513, + "loss": 2.8468, + "step": 7963 + }, + { + "epoch": 0.3353263157894737, + "grad_norm": 0.39453125, + "learning_rate": 0.00038278651261984575, + "loss": 3.1144, + "step": 7964 + }, + { + "epoch": 0.3353684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.00038275792054562595, + "loss": 3.4049, + "step": 7965 + }, + { + "epoch": 0.3354105263157895, + "grad_norm": 0.396484375, + "learning_rate": 0.00038272932605271293, + "loss": 3.4588, + "step": 7966 + }, + { + "epoch": 0.3354526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.0003827007291416275, + "loss": 2.7715, + "step": 7967 + }, + { + "epoch": 0.3354947368421053, + "grad_norm": 0.44921875, + "learning_rate": 0.00038267212981289055, + "loss": 3.1617, + "step": 7968 + }, + { + "epoch": 0.33553684210526313, + "grad_norm": 0.400390625, + "learning_rate": 0.0003826435280670235, + "loss": 3.3855, + "step": 7969 + }, + { + "epoch": 0.33557894736842103, + "grad_norm": 0.458984375, + "learning_rate": 0.0003826149239045471, + "loss": 2.8377, + "step": 7970 + }, + { + "epoch": 0.33562105263157893, + "grad_norm": 0.431640625, + "learning_rate": 0.0003825863173259826, + "loss": 3.4018, + "step": 7971 + }, + { + "epoch": 0.33566315789473683, + "grad_norm": 0.400390625, + "learning_rate": 0.00038255770833185124, + "loss": 3.1767, + "step": 7972 + }, + { + "epoch": 0.33570526315789473, + "grad_norm": 0.427734375, + "learning_rate": 0.00038252909692267415, + "loss": 3.5168, + "step": 7973 + }, + { + "epoch": 0.3357473684210526, + "grad_norm": 0.458984375, + "learning_rate": 0.0003825004830989726, + "loss": 2.9353, + "step": 7974 + }, + { + "epoch": 0.3357894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.0003824718668612679, + "loss": 3.5127, + "step": 7975 + }, + { + "epoch": 0.3358315789473684, + "grad_norm": 0.48046875, + "learning_rate": 0.00038244324821008155, + "loss": 2.5587, + "step": 7976 + }, + { + "epoch": 0.3358736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.00038241462714593477, + "loss": 3.0238, + "step": 7977 + }, + { + "epoch": 0.3359157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.0003823860036693491, + "loss": 2.9694, + "step": 7978 + }, + { + "epoch": 0.3359578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0003823573777808459, + "loss": 3.1992, + "step": 7979 + }, + { + "epoch": 0.336, + "grad_norm": 0.3984375, + "learning_rate": 0.00038232874948094684, + "loss": 2.9677, + "step": 7980 + }, + { + "epoch": 0.3360421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00038230011877017346, + "loss": 3.1137, + "step": 7981 + }, + { + "epoch": 0.33608421052631576, + "grad_norm": 0.65625, + "learning_rate": 0.00038227148564904734, + "loss": 2.6302, + "step": 7982 + }, + { + "epoch": 0.33612631578947366, + "grad_norm": 0.421875, + "learning_rate": 0.00038224285011809016, + "loss": 3.0678, + "step": 7983 + }, + { + "epoch": 0.33616842105263156, + "grad_norm": 0.42578125, + "learning_rate": 0.00038221421217782373, + "loss": 3.454, + "step": 7984 + }, + { + "epoch": 0.33621052631578946, + "grad_norm": 0.419921875, + "learning_rate": 0.00038218557182876956, + "loss": 3.6481, + "step": 7985 + }, + { + "epoch": 0.33625263157894736, + "grad_norm": 0.419921875, + "learning_rate": 0.0003821569290714497, + "loss": 2.7855, + "step": 7986 + }, + { + "epoch": 0.33629473684210526, + "grad_norm": 0.419921875, + "learning_rate": 0.0003821282839063858, + "loss": 3.2587, + "step": 7987 + }, + { + "epoch": 0.33633684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.0003820996363340997, + "loss": 3.5867, + "step": 7988 + }, + { + "epoch": 0.33637894736842106, + "grad_norm": 0.404296875, + "learning_rate": 0.0003820709863551136, + "loss": 3.2944, + "step": 7989 + }, + { + "epoch": 0.33642105263157895, + "grad_norm": 0.3828125, + "learning_rate": 0.00038204233396994927, + "loss": 3.0407, + "step": 7990 + }, + { + "epoch": 0.33646315789473685, + "grad_norm": 0.45703125, + "learning_rate": 0.0003820136791791288, + "loss": 3.3878, + "step": 7991 + }, + { + "epoch": 0.33650526315789475, + "grad_norm": 0.40625, + "learning_rate": 0.00038198502198317407, + "loss": 3.6311, + "step": 7992 + }, + { + "epoch": 0.33654736842105265, + "grad_norm": 0.41015625, + "learning_rate": 0.00038195636238260746, + "loss": 3.1132, + "step": 7993 + }, + { + "epoch": 0.33658947368421055, + "grad_norm": 0.56640625, + "learning_rate": 0.00038192770037795095, + "loss": 3.2524, + "step": 7994 + }, + { + "epoch": 0.33663157894736845, + "grad_norm": 0.392578125, + "learning_rate": 0.00038189903596972665, + "loss": 3.2628, + "step": 7995 + }, + { + "epoch": 0.3366736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.000381870369158457, + "loss": 2.9233, + "step": 7996 + }, + { + "epoch": 0.3367157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.000381841699944664, + "loss": 3.0799, + "step": 7997 + }, + { + "epoch": 0.3367578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.00038181302832887034, + "loss": 2.9335, + "step": 7998 + }, + { + "epoch": 0.3368, + "grad_norm": 0.4140625, + "learning_rate": 0.0003817843543115981, + "loss": 3.471, + "step": 7999 + }, + { + "epoch": 0.3368421052631579, + "grad_norm": 0.396484375, + "learning_rate": 0.00038175567789336975, + "loss": 3.5798, + "step": 8000 + }, + { + "epoch": 0.3368842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00038172699907470777, + "loss": 3.1683, + "step": 8001 + }, + { + "epoch": 0.3369263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00038169831785613456, + "loss": 3.1716, + "step": 8002 + }, + { + "epoch": 0.3369684210526316, + "grad_norm": 0.447265625, + "learning_rate": 0.0003816696342381728, + "loss": 2.9892, + "step": 8003 + }, + { + "epoch": 0.3370105263157895, + "grad_norm": 0.4140625, + "learning_rate": 0.000381640948221345, + "loss": 3.5864, + "step": 8004 + }, + { + "epoch": 0.3370526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.0003816122598061739, + "loss": 3.3006, + "step": 8005 + }, + { + "epoch": 0.3370947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.0003815835689931819, + "loss": 3.3315, + "step": 8006 + }, + { + "epoch": 0.3371368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.00038155487578289194, + "loss": 2.9677, + "step": 8007 + }, + { + "epoch": 0.3371789473684211, + "grad_norm": 0.40625, + "learning_rate": 0.00038152618017582677, + "loss": 3.1989, + "step": 8008 + }, + { + "epoch": 0.3372210526315789, + "grad_norm": 0.416015625, + "learning_rate": 0.000381497482172509, + "loss": 3.5271, + "step": 8009 + }, + { + "epoch": 0.3372631578947368, + "grad_norm": 0.99609375, + "learning_rate": 0.00038146878177346176, + "loss": 3.646, + "step": 8010 + }, + { + "epoch": 0.3373052631578947, + "grad_norm": 0.455078125, + "learning_rate": 0.00038144007897920776, + "loss": 3.4276, + "step": 8011 + }, + { + "epoch": 0.3373473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.0003814113737902699, + "loss": 3.2396, + "step": 8012 + }, + { + "epoch": 0.3373894736842105, + "grad_norm": 0.484375, + "learning_rate": 0.00038138266620717123, + "loss": 3.2712, + "step": 8013 + }, + { + "epoch": 0.3374315789473684, + "grad_norm": 0.490234375, + "learning_rate": 0.00038135395623043475, + "loss": 3.5348, + "step": 8014 + }, + { + "epoch": 0.3374736842105263, + "grad_norm": 0.453125, + "learning_rate": 0.00038132524386058343, + "loss": 3.2445, + "step": 8015 + }, + { + "epoch": 0.3375157894736842, + "grad_norm": 0.46875, + "learning_rate": 0.0003812965290981406, + "loss": 3.8382, + "step": 8016 + }, + { + "epoch": 0.3375578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00038126781194362927, + "loss": 3.4179, + "step": 8017 + }, + { + "epoch": 0.3376, + "grad_norm": 0.439453125, + "learning_rate": 0.00038123909239757257, + "loss": 3.0286, + "step": 8018 + }, + { + "epoch": 0.3376421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00038121037046049387, + "loss": 3.3881, + "step": 8019 + }, + { + "epoch": 0.3376842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0003811816461329164, + "loss": 3.3889, + "step": 8020 + }, + { + "epoch": 0.3377263157894737, + "grad_norm": 0.51171875, + "learning_rate": 0.00038115291941536345, + "loss": 3.3465, + "step": 8021 + }, + { + "epoch": 0.3377684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.0003811241903083583, + "loss": 3.3145, + "step": 8022 + }, + { + "epoch": 0.33781052631578945, + "grad_norm": 0.388671875, + "learning_rate": 0.0003810954588124246, + "loss": 3.1014, + "step": 8023 + }, + { + "epoch": 0.33785263157894735, + "grad_norm": 0.419921875, + "learning_rate": 0.0003810667249280856, + "loss": 3.0904, + "step": 8024 + }, + { + "epoch": 0.33789473684210525, + "grad_norm": 0.412109375, + "learning_rate": 0.00038103798865586494, + "loss": 3.2609, + "step": 8025 + }, + { + "epoch": 0.33793684210526315, + "grad_norm": 0.427734375, + "learning_rate": 0.000381009249996286, + "loss": 3.4993, + "step": 8026 + }, + { + "epoch": 0.33797894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.00038098050894987254, + "loss": 3.0005, + "step": 8027 + }, + { + "epoch": 0.33802105263157894, + "grad_norm": 0.40625, + "learning_rate": 0.0003809517655171481, + "loss": 3.5184, + "step": 8028 + }, + { + "epoch": 0.33806315789473684, + "grad_norm": 0.392578125, + "learning_rate": 0.0003809230196986362, + "loss": 3.2835, + "step": 8029 + }, + { + "epoch": 0.33810526315789474, + "grad_norm": 0.404296875, + "learning_rate": 0.0003808942714948609, + "loss": 3.1913, + "step": 8030 + }, + { + "epoch": 0.33814736842105264, + "grad_norm": 0.5703125, + "learning_rate": 0.00038086552090634565, + "loss": 2.9306, + "step": 8031 + }, + { + "epoch": 0.33818947368421054, + "grad_norm": 0.412109375, + "learning_rate": 0.0003808367679336144, + "loss": 3.0143, + "step": 8032 + }, + { + "epoch": 0.33823157894736844, + "grad_norm": 0.48046875, + "learning_rate": 0.000380808012577191, + "loss": 3.4699, + "step": 8033 + }, + { + "epoch": 0.33827368421052634, + "grad_norm": 0.423828125, + "learning_rate": 0.00038077925483759925, + "loss": 3.6256, + "step": 8034 + }, + { + "epoch": 0.33831578947368424, + "grad_norm": 0.404296875, + "learning_rate": 0.00038075049471536314, + "loss": 3.0378, + "step": 8035 + }, + { + "epoch": 0.3383578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0003807217322110067, + "loss": 3.5336, + "step": 8036 + }, + { + "epoch": 0.3384, + "grad_norm": 0.421875, + "learning_rate": 0.0003806929673250539, + "loss": 3.3073, + "step": 8037 + }, + { + "epoch": 0.3384421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.0003806642000580287, + "loss": 3.0693, + "step": 8038 + }, + { + "epoch": 0.3384842105263158, + "grad_norm": 0.40625, + "learning_rate": 0.00038063543041045537, + "loss": 3.4275, + "step": 8039 + }, + { + "epoch": 0.3385263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.0003806066583828579, + "loss": 3.2804, + "step": 8040 + }, + { + "epoch": 0.3385684210526316, + "grad_norm": 0.51171875, + "learning_rate": 0.00038057788397576064, + "loss": 3.4944, + "step": 8041 + }, + { + "epoch": 0.3386105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.0003805491071896878, + "loss": 3.4908, + "step": 8042 + }, + { + "epoch": 0.33865263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.0003805203280251636, + "loss": 3.0786, + "step": 8043 + }, + { + "epoch": 0.33869473684210527, + "grad_norm": 0.453125, + "learning_rate": 0.0003804915464827123, + "loss": 2.5287, + "step": 8044 + }, + { + "epoch": 0.33873684210526317, + "grad_norm": 0.447265625, + "learning_rate": 0.0003804627625628584, + "loss": 3.5034, + "step": 8045 + }, + { + "epoch": 0.33877894736842107, + "grad_norm": 0.412109375, + "learning_rate": 0.0003804339762661263, + "loss": 3.3386, + "step": 8046 + }, + { + "epoch": 0.33882105263157897, + "grad_norm": 0.41015625, + "learning_rate": 0.00038040518759304033, + "loss": 2.8087, + "step": 8047 + }, + { + "epoch": 0.33886315789473687, + "grad_norm": 0.40234375, + "learning_rate": 0.00038037639654412504, + "loss": 2.4735, + "step": 8048 + }, + { + "epoch": 0.3389052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.000380347603119905, + "loss": 3.1707, + "step": 8049 + }, + { + "epoch": 0.3389473684210526, + "grad_norm": 0.419921875, + "learning_rate": 0.00038031880732090483, + "loss": 3.3938, + "step": 8050 + }, + { + "epoch": 0.3389894736842105, + "grad_norm": 0.396484375, + "learning_rate": 0.0003802900091476491, + "loss": 3.4361, + "step": 8051 + }, + { + "epoch": 0.3390315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.0003802612086006625, + "loss": 3.3268, + "step": 8052 + }, + { + "epoch": 0.3390736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.0003802324056804698, + "loss": 3.0745, + "step": 8053 + }, + { + "epoch": 0.3391157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0003802036003875956, + "loss": 2.6664, + "step": 8054 + }, + { + "epoch": 0.3391578947368421, + "grad_norm": 0.458984375, + "learning_rate": 0.0003801747927225647, + "loss": 3.253, + "step": 8055 + }, + { + "epoch": 0.3392, + "grad_norm": 0.515625, + "learning_rate": 0.0003801459826859022, + "loss": 3.0784, + "step": 8056 + }, + { + "epoch": 0.3392421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00038011717027813265, + "loss": 3.2767, + "step": 8057 + }, + { + "epoch": 0.3392842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00038008835549978125, + "loss": 2.9422, + "step": 8058 + }, + { + "epoch": 0.3393263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.0003800595383513728, + "loss": 3.1485, + "step": 8059 + }, + { + "epoch": 0.3393684210526316, + "grad_norm": 0.48046875, + "learning_rate": 0.00038003071883343245, + "loss": 3.4343, + "step": 8060 + }, + { + "epoch": 0.3394105263157895, + "grad_norm": 0.40234375, + "learning_rate": 0.0003800018969464851, + "loss": 3.411, + "step": 8061 + }, + { + "epoch": 0.3394526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00037997307269105595, + "loss": 3.0111, + "step": 8062 + }, + { + "epoch": 0.33949473684210524, + "grad_norm": 0.41015625, + "learning_rate": 0.00037994424606767015, + "loss": 3.5503, + "step": 8063 + }, + { + "epoch": 0.33953684210526314, + "grad_norm": 0.42578125, + "learning_rate": 0.00037991541707685286, + "loss": 3.5176, + "step": 8064 + }, + { + "epoch": 0.33957894736842104, + "grad_norm": 0.419921875, + "learning_rate": 0.00037988658571912925, + "loss": 3.3148, + "step": 8065 + }, + { + "epoch": 0.33962105263157893, + "grad_norm": 0.427734375, + "learning_rate": 0.0003798577519950248, + "loss": 3.5518, + "step": 8066 + }, + { + "epoch": 0.33966315789473683, + "grad_norm": 0.431640625, + "learning_rate": 0.0003798289159050646, + "loss": 3.29, + "step": 8067 + }, + { + "epoch": 0.33970526315789473, + "grad_norm": 0.41796875, + "learning_rate": 0.0003798000774497741, + "loss": 3.1019, + "step": 8068 + }, + { + "epoch": 0.33974736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.0003797712366296786, + "loss": 3.7446, + "step": 8069 + }, + { + "epoch": 0.33978947368421053, + "grad_norm": 0.416015625, + "learning_rate": 0.0003797423934453038, + "loss": 3.6007, + "step": 8070 + }, + { + "epoch": 0.33983157894736843, + "grad_norm": 0.408203125, + "learning_rate": 0.000379713547897175, + "loss": 2.9921, + "step": 8071 + }, + { + "epoch": 0.33987368421052633, + "grad_norm": 0.486328125, + "learning_rate": 0.0003796846999858177, + "loss": 3.3085, + "step": 8072 + }, + { + "epoch": 0.3399157894736842, + "grad_norm": 0.490234375, + "learning_rate": 0.0003796558497117576, + "loss": 3.713, + "step": 8073 + }, + { + "epoch": 0.3399578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0003796269970755202, + "loss": 3.4899, + "step": 8074 + }, + { + "epoch": 0.34, + "grad_norm": 0.41796875, + "learning_rate": 0.0003795981420776313, + "loss": 3.4302, + "step": 8075 + }, + { + "epoch": 0.34004210526315787, + "grad_norm": 0.4140625, + "learning_rate": 0.0003795692847186164, + "loss": 3.4794, + "step": 8076 + }, + { + "epoch": 0.34008421052631577, + "grad_norm": 0.5234375, + "learning_rate": 0.0003795404249990015, + "loss": 2.8114, + "step": 8077 + }, + { + "epoch": 0.34012631578947367, + "grad_norm": 0.4375, + "learning_rate": 0.0003795115629193122, + "loss": 3.7891, + "step": 8078 + }, + { + "epoch": 0.34016842105263156, + "grad_norm": 0.62109375, + "learning_rate": 0.00037948269848007445, + "loss": 2.8361, + "step": 8079 + }, + { + "epoch": 0.34021052631578946, + "grad_norm": 0.400390625, + "learning_rate": 0.000379453831681814, + "loss": 3.2785, + "step": 8080 + }, + { + "epoch": 0.34025263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.00037942496252505684, + "loss": 3.0125, + "step": 8081 + }, + { + "epoch": 0.34029473684210526, + "grad_norm": 0.462890625, + "learning_rate": 0.0003793960910103289, + "loss": 3.2825, + "step": 8082 + }, + { + "epoch": 0.34033684210526316, + "grad_norm": 0.45703125, + "learning_rate": 0.0003793672171381562, + "loss": 3.2323, + "step": 8083 + }, + { + "epoch": 0.34037894736842106, + "grad_norm": 0.435546875, + "learning_rate": 0.0003793383409090649, + "loss": 3.3577, + "step": 8084 + }, + { + "epoch": 0.34042105263157896, + "grad_norm": 0.5546875, + "learning_rate": 0.00037930946232358104, + "loss": 2.7474, + "step": 8085 + }, + { + "epoch": 0.34046315789473686, + "grad_norm": 0.4375, + "learning_rate": 0.0003792805813822307, + "loss": 3.2967, + "step": 8086 + }, + { + "epoch": 0.34050526315789476, + "grad_norm": 0.515625, + "learning_rate": 0.00037925169808553994, + "loss": 3.592, + "step": 8087 + }, + { + "epoch": 0.34054736842105265, + "grad_norm": 0.45703125, + "learning_rate": 0.0003792228124340352, + "loss": 3.4521, + "step": 8088 + }, + { + "epoch": 0.34058947368421055, + "grad_norm": 0.55078125, + "learning_rate": 0.0003791939244282425, + "loss": 2.6262, + "step": 8089 + }, + { + "epoch": 0.3406315789473684, + "grad_norm": 0.455078125, + "learning_rate": 0.0003791650340686884, + "loss": 3.4331, + "step": 8090 + }, + { + "epoch": 0.3406736842105263, + "grad_norm": 0.5, + "learning_rate": 0.0003791361413558992, + "loss": 3.0454, + "step": 8091 + }, + { + "epoch": 0.3407157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.00037910724629040116, + "loss": 3.0273, + "step": 8092 + }, + { + "epoch": 0.3407578947368421, + "grad_norm": 0.4765625, + "learning_rate": 0.00037907834887272084, + "loss": 3.1323, + "step": 8093 + }, + { + "epoch": 0.3408, + "grad_norm": 0.419921875, + "learning_rate": 0.0003790494491033847, + "loss": 3.2584, + "step": 8094 + }, + { + "epoch": 0.3408421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.0003790205469829191, + "loss": 3.1852, + "step": 8095 + }, + { + "epoch": 0.3408842105263158, + "grad_norm": 0.47265625, + "learning_rate": 0.00037899164251185076, + "loss": 3.5723, + "step": 8096 + }, + { + "epoch": 0.3409263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00037896273569070627, + "loss": 3.3689, + "step": 8097 + }, + { + "epoch": 0.3409684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.0003789338265200123, + "loss": 3.3757, + "step": 8098 + }, + { + "epoch": 0.3410105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.0003789049150002954, + "loss": 2.8035, + "step": 8099 + }, + { + "epoch": 0.3410526315789474, + "grad_norm": 0.478515625, + "learning_rate": 0.00037887600113208254, + "loss": 3.5801, + "step": 8100 + }, + { + "epoch": 0.3410947368421053, + "grad_norm": 0.484375, + "learning_rate": 0.00037884708491590024, + "loss": 3.2076, + "step": 8101 + }, + { + "epoch": 0.3411368421052632, + "grad_norm": 0.474609375, + "learning_rate": 0.00037881816635227555, + "loss": 3.1833, + "step": 8102 + }, + { + "epoch": 0.341178947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00037878924544173517, + "loss": 3.5282, + "step": 8103 + }, + { + "epoch": 0.3412210526315789, + "grad_norm": 0.39453125, + "learning_rate": 0.00037876032218480603, + "loss": 3.2198, + "step": 8104 + }, + { + "epoch": 0.3412631578947368, + "grad_norm": 0.44140625, + "learning_rate": 0.00037873139658201516, + "loss": 3.7004, + "step": 8105 + }, + { + "epoch": 0.3413052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0003787024686338895, + "loss": 3.5955, + "step": 8106 + }, + { + "epoch": 0.3413473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.000378673538340956, + "loss": 3.4038, + "step": 8107 + }, + { + "epoch": 0.3413894736842105, + "grad_norm": 0.396484375, + "learning_rate": 0.0003786446057037419, + "loss": 3.0353, + "step": 8108 + }, + { + "epoch": 0.3414315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.00037861567072277415, + "loss": 3.0447, + "step": 8109 + }, + { + "epoch": 0.3414736842105263, + "grad_norm": 0.412109375, + "learning_rate": 0.0003785867333985801, + "loss": 2.8875, + "step": 8110 + }, + { + "epoch": 0.3415157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0003785577937316868, + "loss": 3.1532, + "step": 8111 + }, + { + "epoch": 0.3415578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00037852885172262157, + "loss": 3.4033, + "step": 8112 + }, + { + "epoch": 0.3416, + "grad_norm": 0.439453125, + "learning_rate": 0.0003784999073719116, + "loss": 3.2983, + "step": 8113 + }, + { + "epoch": 0.3416421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0003784709606800844, + "loss": 3.2931, + "step": 8114 + }, + { + "epoch": 0.3416842105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.00037844201164766725, + "loss": 3.1403, + "step": 8115 + }, + { + "epoch": 0.3417263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.0003784130602751875, + "loss": 3.439, + "step": 8116 + }, + { + "epoch": 0.34176842105263155, + "grad_norm": 0.41015625, + "learning_rate": 0.0003783841065631727, + "loss": 3.2401, + "step": 8117 + }, + { + "epoch": 0.34181052631578945, + "grad_norm": 0.44921875, + "learning_rate": 0.0003783551505121503, + "loss": 2.7191, + "step": 8118 + }, + { + "epoch": 0.34185263157894735, + "grad_norm": 0.380859375, + "learning_rate": 0.0003783261921226478, + "loss": 2.6075, + "step": 8119 + }, + { + "epoch": 0.34189473684210525, + "grad_norm": 0.439453125, + "learning_rate": 0.00037829723139519303, + "loss": 3.1234, + "step": 8120 + }, + { + "epoch": 0.34193684210526315, + "grad_norm": 0.427734375, + "learning_rate": 0.00037826826833031334, + "loss": 3.4139, + "step": 8121 + }, + { + "epoch": 0.34197894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.00037823930292853647, + "loss": 2.9458, + "step": 8122 + }, + { + "epoch": 0.34202105263157895, + "grad_norm": 0.3984375, + "learning_rate": 0.0003782103351903903, + "loss": 3.4771, + "step": 8123 + }, + { + "epoch": 0.34206315789473685, + "grad_norm": 0.412109375, + "learning_rate": 0.00037818136511640234, + "loss": 3.4509, + "step": 8124 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 0.41796875, + "learning_rate": 0.00037815239270710054, + "loss": 2.9436, + "step": 8125 + }, + { + "epoch": 0.34214736842105264, + "grad_norm": 0.515625, + "learning_rate": 0.00037812341796301274, + "loss": 3.1038, + "step": 8126 + }, + { + "epoch": 0.34218947368421054, + "grad_norm": 0.390625, + "learning_rate": 0.0003780944408846668, + "loss": 3.1535, + "step": 8127 + }, + { + "epoch": 0.34223157894736844, + "grad_norm": 0.431640625, + "learning_rate": 0.0003780654614725907, + "loss": 2.9944, + "step": 8128 + }, + { + "epoch": 0.34227368421052634, + "grad_norm": 0.416015625, + "learning_rate": 0.00037803647972731234, + "loss": 3.2479, + "step": 8129 + }, + { + "epoch": 0.3423157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.00037800749564935964, + "loss": 3.2011, + "step": 8130 + }, + { + "epoch": 0.3423578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.0003779785092392609, + "loss": 3.1058, + "step": 8131 + }, + { + "epoch": 0.3424, + "grad_norm": 0.43359375, + "learning_rate": 0.00037794952049754403, + "loss": 3.0025, + "step": 8132 + }, + { + "epoch": 0.3424421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.0003779205294247373, + "loss": 3.4586, + "step": 8133 + }, + { + "epoch": 0.3424842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.0003778915360213688, + "loss": 3.331, + "step": 8134 + }, + { + "epoch": 0.3425263157894737, + "grad_norm": 0.4765625, + "learning_rate": 0.00037786254028796673, + "loss": 3.1038, + "step": 8135 + }, + { + "epoch": 0.3425684210526316, + "grad_norm": 0.451171875, + "learning_rate": 0.00037783354222505945, + "loss": 3.464, + "step": 8136 + }, + { + "epoch": 0.3426105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.00037780454183317525, + "loss": 3.7711, + "step": 8137 + }, + { + "epoch": 0.3426526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00037777553911284243, + "loss": 2.8272, + "step": 8138 + }, + { + "epoch": 0.3426947368421053, + "grad_norm": 0.71484375, + "learning_rate": 0.0003777465340645894, + "loss": 3.0339, + "step": 8139 + }, + { + "epoch": 0.3427368421052632, + "grad_norm": 0.453125, + "learning_rate": 0.00037771752668894464, + "loss": 3.0848, + "step": 8140 + }, + { + "epoch": 0.34277894736842107, + "grad_norm": 0.419921875, + "learning_rate": 0.00037768851698643656, + "loss": 3.2503, + "step": 8141 + }, + { + "epoch": 0.34282105263157897, + "grad_norm": 0.421875, + "learning_rate": 0.00037765950495759383, + "loss": 3.013, + "step": 8142 + }, + { + "epoch": 0.34286315789473687, + "grad_norm": 0.443359375, + "learning_rate": 0.00037763049060294477, + "loss": 3.1282, + "step": 8143 + }, + { + "epoch": 0.3429052631578947, + "grad_norm": 0.462890625, + "learning_rate": 0.00037760147392301827, + "loss": 3.4879, + "step": 8144 + }, + { + "epoch": 0.3429473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.0003775724549183428, + "loss": 3.2463, + "step": 8145 + }, + { + "epoch": 0.3429894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.0003775434335894471, + "loss": 3.3159, + "step": 8146 + }, + { + "epoch": 0.3430315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.00037751440993685987, + "loss": 3.196, + "step": 8147 + }, + { + "epoch": 0.3430736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00037748538396110986, + "loss": 3.4593, + "step": 8148 + }, + { + "epoch": 0.3431157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.000377456355662726, + "loss": 3.2124, + "step": 8149 + }, + { + "epoch": 0.3431578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0003774273250422371, + "loss": 3.2354, + "step": 8150 + }, + { + "epoch": 0.3432, + "grad_norm": 0.44921875, + "learning_rate": 0.00037739829210017205, + "loss": 2.7043, + "step": 8151 + }, + { + "epoch": 0.3432421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.0003773692568370598, + "loss": 3.1422, + "step": 8152 + }, + { + "epoch": 0.3432842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.0003773402192534294, + "loss": 3.5105, + "step": 8153 + }, + { + "epoch": 0.3433263157894737, + "grad_norm": 0.59765625, + "learning_rate": 0.00037731117934980976, + "loss": 3.1692, + "step": 8154 + }, + { + "epoch": 0.3433684210526316, + "grad_norm": 0.3984375, + "learning_rate": 0.0003772821371267301, + "loss": 3.3828, + "step": 8155 + }, + { + "epoch": 0.3434105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.00037725309258471927, + "loss": 3.2335, + "step": 8156 + }, + { + "epoch": 0.34345263157894734, + "grad_norm": 0.396484375, + "learning_rate": 0.0003772240457243067, + "loss": 3.0447, + "step": 8157 + }, + { + "epoch": 0.34349473684210524, + "grad_norm": 0.443359375, + "learning_rate": 0.00037719499654602156, + "loss": 3.0578, + "step": 8158 + }, + { + "epoch": 0.34353684210526314, + "grad_norm": 0.474609375, + "learning_rate": 0.00037716594505039303, + "loss": 3.1863, + "step": 8159 + }, + { + "epoch": 0.34357894736842104, + "grad_norm": 0.41796875, + "learning_rate": 0.0003771368912379504, + "loss": 2.9676, + "step": 8160 + }, + { + "epoch": 0.34362105263157894, + "grad_norm": 0.419921875, + "learning_rate": 0.00037710783510922294, + "loss": 3.1447, + "step": 8161 + }, + { + "epoch": 0.34366315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.0003770787766647401, + "loss": 3.7328, + "step": 8162 + }, + { + "epoch": 0.34370526315789474, + "grad_norm": 0.5078125, + "learning_rate": 0.0003770497159050312, + "loss": 3.2592, + "step": 8163 + }, + { + "epoch": 0.34374736842105263, + "grad_norm": 0.486328125, + "learning_rate": 0.0003770206528306258, + "loss": 3.2207, + "step": 8164 + }, + { + "epoch": 0.34378947368421053, + "grad_norm": 0.4296875, + "learning_rate": 0.00037699158744205336, + "loss": 3.0336, + "step": 8165 + }, + { + "epoch": 0.34383157894736843, + "grad_norm": 0.41015625, + "learning_rate": 0.00037696251973984334, + "loss": 3.2014, + "step": 8166 + }, + { + "epoch": 0.34387368421052633, + "grad_norm": 0.447265625, + "learning_rate": 0.0003769334497245255, + "loss": 3.1273, + "step": 8167 + }, + { + "epoch": 0.34391578947368423, + "grad_norm": 0.494140625, + "learning_rate": 0.00037690437739662926, + "loss": 2.9255, + "step": 8168 + }, + { + "epoch": 0.34395789473684213, + "grad_norm": 0.41015625, + "learning_rate": 0.00037687530275668435, + "loss": 3.6337, + "step": 8169 + }, + { + "epoch": 0.344, + "grad_norm": 0.4375, + "learning_rate": 0.00037684622580522057, + "loss": 3.3003, + "step": 8170 + }, + { + "epoch": 0.34404210526315787, + "grad_norm": 0.42578125, + "learning_rate": 0.00037681714654276755, + "loss": 2.8783, + "step": 8171 + }, + { + "epoch": 0.34408421052631577, + "grad_norm": 0.419921875, + "learning_rate": 0.0003767880649698552, + "loss": 3.4545, + "step": 8172 + }, + { + "epoch": 0.34412631578947367, + "grad_norm": 0.453125, + "learning_rate": 0.00037675898108701316, + "loss": 3.3453, + "step": 8173 + }, + { + "epoch": 0.34416842105263157, + "grad_norm": 0.423828125, + "learning_rate": 0.0003767298948947714, + "loss": 3.3267, + "step": 8174 + }, + { + "epoch": 0.34421052631578947, + "grad_norm": 0.4140625, + "learning_rate": 0.00037670080639366, + "loss": 3.0539, + "step": 8175 + }, + { + "epoch": 0.34425263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.0003766717155842087, + "loss": 3.6165, + "step": 8176 + }, + { + "epoch": 0.34429473684210526, + "grad_norm": 0.40625, + "learning_rate": 0.00037664262246694756, + "loss": 3.092, + "step": 8177 + }, + { + "epoch": 0.34433684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.00037661352704240666, + "loss": 3.4342, + "step": 8178 + }, + { + "epoch": 0.34437894736842106, + "grad_norm": 0.408203125, + "learning_rate": 0.000376584429311116, + "loss": 3.1952, + "step": 8179 + }, + { + "epoch": 0.34442105263157896, + "grad_norm": 0.3984375, + "learning_rate": 0.00037655532927360593, + "loss": 3.1271, + "step": 8180 + }, + { + "epoch": 0.34446315789473686, + "grad_norm": 0.5390625, + "learning_rate": 0.00037652622693040634, + "loss": 3.3971, + "step": 8181 + }, + { + "epoch": 0.34450526315789476, + "grad_norm": 0.416015625, + "learning_rate": 0.0003764971222820476, + "loss": 3.4204, + "step": 8182 + }, + { + "epoch": 0.34454736842105266, + "grad_norm": 0.41015625, + "learning_rate": 0.00037646801532905983, + "loss": 2.9938, + "step": 8183 + }, + { + "epoch": 0.3445894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.00037643890607197346, + "loss": 3.0937, + "step": 8184 + }, + { + "epoch": 0.3446315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.0003764097945113188, + "loss": 3.317, + "step": 8185 + }, + { + "epoch": 0.3446736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0003763806806476262, + "loss": 2.7222, + "step": 8186 + }, + { + "epoch": 0.3447157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00037635156448142615, + "loss": 3.2029, + "step": 8187 + }, + { + "epoch": 0.3447578947368421, + "grad_norm": 0.453125, + "learning_rate": 0.000376322446013249, + "loss": 3.2374, + "step": 8188 + }, + { + "epoch": 0.3448, + "grad_norm": 0.4296875, + "learning_rate": 0.0003762933252436253, + "loss": 3.0058, + "step": 8189 + }, + { + "epoch": 0.3448421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.0003762642021730855, + "loss": 3.2653, + "step": 8190 + }, + { + "epoch": 0.3448842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.0003762350768021604, + "loss": 3.2787, + "step": 8191 + }, + { + "epoch": 0.3449263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.00037620594913138054, + "loss": 3.2269, + "step": 8192 + }, + { + "epoch": 0.3449684210526316, + "grad_norm": 0.44140625, + "learning_rate": 0.0003761768191612766, + "loss": 3.5187, + "step": 8193 + }, + { + "epoch": 0.3450105263157895, + "grad_norm": 0.404296875, + "learning_rate": 0.0003761476868923792, + "loss": 3.3041, + "step": 8194 + }, + { + "epoch": 0.3450526315789474, + "grad_norm": 0.4375, + "learning_rate": 0.0003761185523252192, + "loss": 3.3635, + "step": 8195 + }, + { + "epoch": 0.3450947368421053, + "grad_norm": 0.404296875, + "learning_rate": 0.00037608941546032735, + "loss": 3.1972, + "step": 8196 + }, + { + "epoch": 0.34513684210526313, + "grad_norm": 0.427734375, + "learning_rate": 0.00037606027629823446, + "loss": 3.2351, + "step": 8197 + }, + { + "epoch": 0.34517894736842103, + "grad_norm": 0.416015625, + "learning_rate": 0.0003760311348394715, + "loss": 3.1247, + "step": 8198 + }, + { + "epoch": 0.34522105263157893, + "grad_norm": 0.419921875, + "learning_rate": 0.00037600199108456933, + "loss": 3.2359, + "step": 8199 + }, + { + "epoch": 0.3452631578947368, + "grad_norm": 0.396484375, + "learning_rate": 0.0003759728450340589, + "loss": 3.2204, + "step": 8200 + }, + { + "epoch": 0.3453052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.0003759436966884713, + "loss": 3.2993, + "step": 8201 + }, + { + "epoch": 0.3453473684210526, + "grad_norm": 0.435546875, + "learning_rate": 0.00037591454604833755, + "loss": 2.8041, + "step": 8202 + }, + { + "epoch": 0.3453894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.00037588539311418856, + "loss": 2.9183, + "step": 8203 + }, + { + "epoch": 0.3454315789473684, + "grad_norm": 0.4140625, + "learning_rate": 0.00037585623788655576, + "loss": 3.321, + "step": 8204 + }, + { + "epoch": 0.3454736842105263, + "grad_norm": 0.375, + "learning_rate": 0.00037582708036597014, + "loss": 3.1191, + "step": 8205 + }, + { + "epoch": 0.3455157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00037579792055296294, + "loss": 3.0857, + "step": 8206 + }, + { + "epoch": 0.3455578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.00037576875844806543, + "loss": 3.4273, + "step": 8207 + }, + { + "epoch": 0.3456, + "grad_norm": 0.52734375, + "learning_rate": 0.0003757395940518089, + "loss": 2.9021, + "step": 8208 + }, + { + "epoch": 0.3456421052631579, + "grad_norm": 0.50390625, + "learning_rate": 0.00037571042736472474, + "loss": 2.8855, + "step": 8209 + }, + { + "epoch": 0.3456842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00037568125838734423, + "loss": 3.1106, + "step": 8210 + }, + { + "epoch": 0.34572631578947366, + "grad_norm": 0.466796875, + "learning_rate": 0.00037565208712019894, + "loss": 3.9363, + "step": 8211 + }, + { + "epoch": 0.34576842105263156, + "grad_norm": 0.423828125, + "learning_rate": 0.0003756229135638203, + "loss": 3.0832, + "step": 8212 + }, + { + "epoch": 0.34581052631578946, + "grad_norm": 0.392578125, + "learning_rate": 0.00037559373771873974, + "loss": 3.4189, + "step": 8213 + }, + { + "epoch": 0.34585263157894736, + "grad_norm": 0.4296875, + "learning_rate": 0.0003755645595854888, + "loss": 3.1372, + "step": 8214 + }, + { + "epoch": 0.34589473684210525, + "grad_norm": 0.40625, + "learning_rate": 0.0003755353791645991, + "loss": 3.4186, + "step": 8215 + }, + { + "epoch": 0.34593684210526315, + "grad_norm": 0.4140625, + "learning_rate": 0.00037550619645660235, + "loss": 3.3381, + "step": 8216 + }, + { + "epoch": 0.34597894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.00037547701146203004, + "loss": 3.512, + "step": 8217 + }, + { + "epoch": 0.34602105263157895, + "grad_norm": 0.400390625, + "learning_rate": 0.0003754478241814142, + "loss": 3.0702, + "step": 8218 + }, + { + "epoch": 0.34606315789473685, + "grad_norm": 0.427734375, + "learning_rate": 0.0003754186346152863, + "loss": 3.1449, + "step": 8219 + }, + { + "epoch": 0.34610526315789475, + "grad_norm": 0.42578125, + "learning_rate": 0.0003753894427641783, + "loss": 2.7087, + "step": 8220 + }, + { + "epoch": 0.34614736842105265, + "grad_norm": 0.474609375, + "learning_rate": 0.0003753602486286219, + "loss": 3.1418, + "step": 8221 + }, + { + "epoch": 0.34618947368421055, + "grad_norm": 0.419921875, + "learning_rate": 0.0003753310522091491, + "loss": 3.4326, + "step": 8222 + }, + { + "epoch": 0.34623157894736845, + "grad_norm": 0.412109375, + "learning_rate": 0.00037530185350629177, + "loss": 3.5793, + "step": 8223 + }, + { + "epoch": 0.3462736842105263, + "grad_norm": 0.470703125, + "learning_rate": 0.00037527265252058185, + "loss": 3.2095, + "step": 8224 + }, + { + "epoch": 0.3463157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00037524344925255143, + "loss": 3.3471, + "step": 8225 + }, + { + "epoch": 0.3463578947368421, + "grad_norm": 0.3984375, + "learning_rate": 0.00037521424370273254, + "loss": 3.1191, + "step": 8226 + }, + { + "epoch": 0.3464, + "grad_norm": 0.404296875, + "learning_rate": 0.00037518503587165725, + "loss": 3.2251, + "step": 8227 + }, + { + "epoch": 0.3464421052631579, + "grad_norm": 0.578125, + "learning_rate": 0.00037515582575985774, + "loss": 3.1622, + "step": 8228 + }, + { + "epoch": 0.3464842105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.00037512661336786603, + "loss": 3.0325, + "step": 8229 + }, + { + "epoch": 0.3465263157894737, + "grad_norm": 0.40234375, + "learning_rate": 0.0003750973986962145, + "loss": 3.5389, + "step": 8230 + }, + { + "epoch": 0.3465684210526316, + "grad_norm": 0.400390625, + "learning_rate": 0.0003750681817454353, + "loss": 3.4414, + "step": 8231 + }, + { + "epoch": 0.3466105263157895, + "grad_norm": 0.408203125, + "learning_rate": 0.00037503896251606084, + "loss": 3.6645, + "step": 8232 + }, + { + "epoch": 0.3466526315789474, + "grad_norm": 0.38671875, + "learning_rate": 0.00037500974100862337, + "loss": 3.123, + "step": 8233 + }, + { + "epoch": 0.3466947368421053, + "grad_norm": 0.7109375, + "learning_rate": 0.0003749805172236553, + "loss": 3.0322, + "step": 8234 + }, + { + "epoch": 0.3467368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.0003749512911616891, + "loss": 3.1078, + "step": 8235 + }, + { + "epoch": 0.3467789473684211, + "grad_norm": 0.416015625, + "learning_rate": 0.00037492206282325715, + "loss": 2.5913, + "step": 8236 + }, + { + "epoch": 0.346821052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0003748928322088919, + "loss": 3.2595, + "step": 8237 + }, + { + "epoch": 0.3468631578947368, + "grad_norm": 0.390625, + "learning_rate": 0.00037486359931912614, + "loss": 3.1914, + "step": 8238 + }, + { + "epoch": 0.3469052631578947, + "grad_norm": 0.4140625, + "learning_rate": 0.00037483436415449226, + "loss": 3.6113, + "step": 8239 + }, + { + "epoch": 0.3469473684210526, + "grad_norm": 0.390625, + "learning_rate": 0.0003748051267155229, + "loss": 3.1891, + "step": 8240 + }, + { + "epoch": 0.3469894736842105, + "grad_norm": 0.44921875, + "learning_rate": 0.0003747758870027508, + "loss": 3.5032, + "step": 8241 + }, + { + "epoch": 0.3470315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.00037474664501670863, + "loss": 3.4103, + "step": 8242 + }, + { + "epoch": 0.3470736842105263, + "grad_norm": 0.61328125, + "learning_rate": 0.00037471740075792917, + "loss": 3.3254, + "step": 8243 + }, + { + "epoch": 0.3471157894736842, + "grad_norm": 0.458984375, + "learning_rate": 0.0003746881542269452, + "loss": 3.7788, + "step": 8244 + }, + { + "epoch": 0.3471578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.0003746589054242895, + "loss": 2.7412, + "step": 8245 + }, + { + "epoch": 0.3472, + "grad_norm": 0.431640625, + "learning_rate": 0.00037462965435049514, + "loss": 3.1441, + "step": 8246 + }, + { + "epoch": 0.3472421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00037460040100609486, + "loss": 3.5463, + "step": 8247 + }, + { + "epoch": 0.3472842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.0003745711453916217, + "loss": 2.8335, + "step": 8248 + }, + { + "epoch": 0.3473263157894737, + "grad_norm": 0.455078125, + "learning_rate": 0.0003745418875076085, + "loss": 3.301, + "step": 8249 + }, + { + "epoch": 0.3473684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.0003745126273545886, + "loss": 3.4761, + "step": 8250 + }, + { + "epoch": 0.34741052631578945, + "grad_norm": 0.421875, + "learning_rate": 0.0003744833649330948, + "loss": 3.5165, + "step": 8251 + }, + { + "epoch": 0.34745263157894735, + "grad_norm": 0.46484375, + "learning_rate": 0.00037445410024366046, + "loss": 3.5215, + "step": 8252 + }, + { + "epoch": 0.34749473684210525, + "grad_norm": 0.439453125, + "learning_rate": 0.0003744248332868186, + "loss": 3.6126, + "step": 8253 + }, + { + "epoch": 0.34753684210526314, + "grad_norm": 0.412109375, + "learning_rate": 0.0003743955640631025, + "loss": 3.192, + "step": 8254 + }, + { + "epoch": 0.34757894736842104, + "grad_norm": 0.41015625, + "learning_rate": 0.0003743662925730453, + "loss": 3.1128, + "step": 8255 + }, + { + "epoch": 0.34762105263157894, + "grad_norm": 0.41796875, + "learning_rate": 0.00037433701881718054, + "loss": 3.1386, + "step": 8256 + }, + { + "epoch": 0.34766315789473684, + "grad_norm": 0.470703125, + "learning_rate": 0.0003743077427960412, + "loss": 2.9618, + "step": 8257 + }, + { + "epoch": 0.34770526315789474, + "grad_norm": 0.3984375, + "learning_rate": 0.00037427846451016083, + "loss": 2.8275, + "step": 8258 + }, + { + "epoch": 0.34774736842105264, + "grad_norm": 0.40625, + "learning_rate": 0.00037424918396007303, + "loss": 3.6072, + "step": 8259 + }, + { + "epoch": 0.34778947368421054, + "grad_norm": 0.408203125, + "learning_rate": 0.000374219901146311, + "loss": 3.3121, + "step": 8260 + }, + { + "epoch": 0.34783157894736844, + "grad_norm": 0.408203125, + "learning_rate": 0.0003741906160694083, + "loss": 3.3312, + "step": 8261 + }, + { + "epoch": 0.34787368421052633, + "grad_norm": 0.4375, + "learning_rate": 0.0003741613287298985, + "loss": 3.446, + "step": 8262 + }, + { + "epoch": 0.34791578947368423, + "grad_norm": 0.421875, + "learning_rate": 0.00037413203912831523, + "loss": 3.1421, + "step": 8263 + }, + { + "epoch": 0.34795789473684213, + "grad_norm": 0.455078125, + "learning_rate": 0.000374102747265192, + "loss": 3.2528, + "step": 8264 + }, + { + "epoch": 0.348, + "grad_norm": 0.43359375, + "learning_rate": 0.00037407345314106254, + "loss": 3.2314, + "step": 8265 + }, + { + "epoch": 0.3480421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00037404415675646054, + "loss": 3.4725, + "step": 8266 + }, + { + "epoch": 0.3480842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.0003740148581119197, + "loss": 3.4624, + "step": 8267 + }, + { + "epoch": 0.3481263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00037398555720797395, + "loss": 3.3025, + "step": 8268 + }, + { + "epoch": 0.34816842105263157, + "grad_norm": 0.404296875, + "learning_rate": 0.000373956254045157, + "loss": 3.2174, + "step": 8269 + }, + { + "epoch": 0.34821052631578947, + "grad_norm": 0.44140625, + "learning_rate": 0.0003739269486240027, + "loss": 3.4145, + "step": 8270 + }, + { + "epoch": 0.34825263157894737, + "grad_norm": 0.388671875, + "learning_rate": 0.0003738976409450451, + "loss": 3.0555, + "step": 8271 + }, + { + "epoch": 0.34829473684210527, + "grad_norm": 0.412109375, + "learning_rate": 0.000373868331008818, + "loss": 3.0283, + "step": 8272 + }, + { + "epoch": 0.34833684210526317, + "grad_norm": 0.4296875, + "learning_rate": 0.0003738390188158554, + "loss": 2.9652, + "step": 8273 + }, + { + "epoch": 0.34837894736842107, + "grad_norm": 0.435546875, + "learning_rate": 0.0003738097043666914, + "loss": 3.2567, + "step": 8274 + }, + { + "epoch": 0.34842105263157896, + "grad_norm": 0.443359375, + "learning_rate": 0.00037378038766186007, + "loss": 3.0372, + "step": 8275 + }, + { + "epoch": 0.34846315789473686, + "grad_norm": 0.83984375, + "learning_rate": 0.0003737510687018956, + "loss": 3.2202, + "step": 8276 + }, + { + "epoch": 0.34850526315789476, + "grad_norm": 0.50390625, + "learning_rate": 0.00037372174748733194, + "loss": 3.2648, + "step": 8277 + }, + { + "epoch": 0.3485473684210526, + "grad_norm": 0.46484375, + "learning_rate": 0.0003736924240187034, + "loss": 3.5579, + "step": 8278 + }, + { + "epoch": 0.3485894736842105, + "grad_norm": 0.45703125, + "learning_rate": 0.00037366309829654434, + "loss": 3.5527, + "step": 8279 + }, + { + "epoch": 0.3486315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.0003736337703213888, + "loss": 3.4104, + "step": 8280 + }, + { + "epoch": 0.3486736842105263, + "grad_norm": 0.466796875, + "learning_rate": 0.00037360444009377136, + "loss": 2.9484, + "step": 8281 + }, + { + "epoch": 0.3487157894736842, + "grad_norm": 0.40234375, + "learning_rate": 0.00037357510761422614, + "loss": 2.8831, + "step": 8282 + }, + { + "epoch": 0.3487578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.0003735457728832877, + "loss": 3.7802, + "step": 8283 + }, + { + "epoch": 0.3488, + "grad_norm": 0.431640625, + "learning_rate": 0.0003735164359014904, + "loss": 3.0854, + "step": 8284 + }, + { + "epoch": 0.3488421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0003734870966693688, + "loss": 3.2821, + "step": 8285 + }, + { + "epoch": 0.3488842105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.0003734577551874574, + "loss": 2.9781, + "step": 8286 + }, + { + "epoch": 0.3489263157894737, + "grad_norm": 0.451171875, + "learning_rate": 0.0003734284114562908, + "loss": 3.2406, + "step": 8287 + }, + { + "epoch": 0.3489684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.0003733990654764035, + "loss": 3.2524, + "step": 8288 + }, + { + "epoch": 0.3490105263157895, + "grad_norm": 0.404296875, + "learning_rate": 0.00037336971724833025, + "loss": 3.0693, + "step": 8289 + }, + { + "epoch": 0.3490526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.0003733403667726056, + "loss": 2.9165, + "step": 8290 + }, + { + "epoch": 0.34909473684210524, + "grad_norm": 0.421875, + "learning_rate": 0.00037331101404976453, + "loss": 3.3348, + "step": 8291 + }, + { + "epoch": 0.34913684210526313, + "grad_norm": 0.421875, + "learning_rate": 0.00037328165908034157, + "loss": 3.3673, + "step": 8292 + }, + { + "epoch": 0.34917894736842103, + "grad_norm": 0.408203125, + "learning_rate": 0.00037325230186487166, + "loss": 3.8094, + "step": 8293 + }, + { + "epoch": 0.34922105263157893, + "grad_norm": 0.416015625, + "learning_rate": 0.00037322294240388967, + "loss": 3.2573, + "step": 8294 + }, + { + "epoch": 0.34926315789473683, + "grad_norm": 0.404296875, + "learning_rate": 0.0003731935806979304, + "loss": 3.4634, + "step": 8295 + }, + { + "epoch": 0.34930526315789473, + "grad_norm": 0.400390625, + "learning_rate": 0.0003731642167475289, + "loss": 3.2582, + "step": 8296 + }, + { + "epoch": 0.34934736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.00037313485055322004, + "loss": 3.0152, + "step": 8297 + }, + { + "epoch": 0.3493894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.00037310548211553883, + "loss": 3.5773, + "step": 8298 + }, + { + "epoch": 0.3494315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.0003730761114350204, + "loss": 3.3615, + "step": 8299 + }, + { + "epoch": 0.3494736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.00037304673851219983, + "loss": 3.5529, + "step": 8300 + }, + { + "epoch": 0.3495157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.0003730173633476123, + "loss": 3.2005, + "step": 8301 + }, + { + "epoch": 0.3495578947368421, + "grad_norm": 0.390625, + "learning_rate": 0.00037298798594179297, + "loss": 2.7605, + "step": 8302 + }, + { + "epoch": 0.3496, + "grad_norm": 0.43359375, + "learning_rate": 0.000372958606295277, + "loss": 2.8665, + "step": 8303 + }, + { + "epoch": 0.3496421052631579, + "grad_norm": 0.390625, + "learning_rate": 0.00037292922440859965, + "loss": 2.8548, + "step": 8304 + }, + { + "epoch": 0.34968421052631576, + "grad_norm": 0.419921875, + "learning_rate": 0.0003728998402822963, + "loss": 3.2386, + "step": 8305 + }, + { + "epoch": 0.34972631578947366, + "grad_norm": 0.43359375, + "learning_rate": 0.00037287045391690236, + "loss": 3.6093, + "step": 8306 + }, + { + "epoch": 0.34976842105263156, + "grad_norm": 0.421875, + "learning_rate": 0.00037284106531295297, + "loss": 3.1839, + "step": 8307 + }, + { + "epoch": 0.34981052631578946, + "grad_norm": 0.3984375, + "learning_rate": 0.00037281167447098375, + "loss": 3.069, + "step": 8308 + }, + { + "epoch": 0.34985263157894736, + "grad_norm": 0.412109375, + "learning_rate": 0.00037278228139153014, + "loss": 3.0544, + "step": 8309 + }, + { + "epoch": 0.34989473684210526, + "grad_norm": 0.40625, + "learning_rate": 0.0003727528860751276, + "loss": 3.4984, + "step": 8310 + }, + { + "epoch": 0.34993684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.0003727234885223117, + "loss": 3.7331, + "step": 8311 + }, + { + "epoch": 0.34997894736842106, + "grad_norm": 0.3984375, + "learning_rate": 0.0003726940887336181, + "loss": 3.3205, + "step": 8312 + }, + { + "epoch": 0.35002105263157895, + "grad_norm": 0.404296875, + "learning_rate": 0.0003726646867095823, + "loss": 2.9202, + "step": 8313 + }, + { + "epoch": 0.35006315789473685, + "grad_norm": 0.4140625, + "learning_rate": 0.00037263528245074006, + "loss": 3.1256, + "step": 8314 + }, + { + "epoch": 0.35010526315789475, + "grad_norm": 0.388671875, + "learning_rate": 0.00037260587595762705, + "loss": 3.3829, + "step": 8315 + }, + { + "epoch": 0.35014736842105265, + "grad_norm": 0.4140625, + "learning_rate": 0.00037257646723077903, + "loss": 3.1338, + "step": 8316 + }, + { + "epoch": 0.35018947368421055, + "grad_norm": 0.40625, + "learning_rate": 0.0003725470562707318, + "loss": 3.2075, + "step": 8317 + }, + { + "epoch": 0.3502315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00037251764307802117, + "loss": 3.2942, + "step": 8318 + }, + { + "epoch": 0.3502736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.00037248822765318306, + "loss": 3.216, + "step": 8319 + }, + { + "epoch": 0.3503157894736842, + "grad_norm": 0.453125, + "learning_rate": 0.0003724588099967534, + "loss": 3.4296, + "step": 8320 + }, + { + "epoch": 0.3503578947368421, + "grad_norm": 0.390625, + "learning_rate": 0.00037242939010926815, + "loss": 3.5463, + "step": 8321 + }, + { + "epoch": 0.3504, + "grad_norm": 0.412109375, + "learning_rate": 0.00037239996799126314, + "loss": 3.132, + "step": 8322 + }, + { + "epoch": 0.3504421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.0003723705436432746, + "loss": 2.8653, + "step": 8323 + }, + { + "epoch": 0.3504842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.00037234111706583846, + "loss": 3.2856, + "step": 8324 + }, + { + "epoch": 0.3505263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00037231168825949096, + "loss": 3.4909, + "step": 8325 + }, + { + "epoch": 0.3505684210526316, + "grad_norm": 0.458984375, + "learning_rate": 0.00037228225722476816, + "loss": 3.0142, + "step": 8326 + }, + { + "epoch": 0.3506105263157895, + "grad_norm": 0.3984375, + "learning_rate": 0.0003722528239622064, + "loss": 3.0661, + "step": 8327 + }, + { + "epoch": 0.3506526315789474, + "grad_norm": 0.458984375, + "learning_rate": 0.0003722233884723418, + "loss": 3.415, + "step": 8328 + }, + { + "epoch": 0.3506947368421053, + "grad_norm": 0.478515625, + "learning_rate": 0.0003721939507557106, + "loss": 2.7923, + "step": 8329 + }, + { + "epoch": 0.3507368421052632, + "grad_norm": 0.451171875, + "learning_rate": 0.0003721645108128493, + "loss": 3.3168, + "step": 8330 + }, + { + "epoch": 0.3507789473684211, + "grad_norm": 0.427734375, + "learning_rate": 0.000372135068644294, + "loss": 3.3632, + "step": 8331 + }, + { + "epoch": 0.3508210526315789, + "grad_norm": 0.39453125, + "learning_rate": 0.00037210562425058136, + "loss": 3.4724, + "step": 8332 + }, + { + "epoch": 0.3508631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.00037207617763224777, + "loss": 3.2023, + "step": 8333 + }, + { + "epoch": 0.3509052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00037204672878982955, + "loss": 3.1863, + "step": 8334 + }, + { + "epoch": 0.3509473684210526, + "grad_norm": 0.451171875, + "learning_rate": 0.00037201727772386337, + "loss": 3.0408, + "step": 8335 + }, + { + "epoch": 0.3509894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00037198782443488576, + "loss": 3.4997, + "step": 8336 + }, + { + "epoch": 0.3510315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.00037195836892343334, + "loss": 3.1214, + "step": 8337 + }, + { + "epoch": 0.3510736842105263, + "grad_norm": 0.38671875, + "learning_rate": 0.00037192891119004263, + "loss": 3.0433, + "step": 8338 + }, + { + "epoch": 0.3511157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.00037189945123525047, + "loss": 3.0337, + "step": 8339 + }, + { + "epoch": 0.3511578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0003718699890595936, + "loss": 3.0498, + "step": 8340 + }, + { + "epoch": 0.3512, + "grad_norm": 0.4453125, + "learning_rate": 0.0003718405246636087, + "loss": 3.2395, + "step": 8341 + }, + { + "epoch": 0.3512421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00037181105804783254, + "loss": 3.5704, + "step": 8342 + }, + { + "epoch": 0.3512842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.00037178158921280204, + "loss": 3.1893, + "step": 8343 + }, + { + "epoch": 0.3513263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00037175211815905414, + "loss": 3.1545, + "step": 8344 + }, + { + "epoch": 0.35136842105263155, + "grad_norm": 0.400390625, + "learning_rate": 0.0003717226448871256, + "loss": 2.911, + "step": 8345 + }, + { + "epoch": 0.35141052631578945, + "grad_norm": 0.39453125, + "learning_rate": 0.00037169316939755353, + "loss": 3.1644, + "step": 8346 + }, + { + "epoch": 0.35145263157894735, + "grad_norm": 0.419921875, + "learning_rate": 0.00037166369169087493, + "loss": 3.1562, + "step": 8347 + }, + { + "epoch": 0.35149473684210525, + "grad_norm": 0.416015625, + "learning_rate": 0.0003716342117676268, + "loss": 3.1829, + "step": 8348 + }, + { + "epoch": 0.35153684210526315, + "grad_norm": 0.404296875, + "learning_rate": 0.00037160472962834623, + "loss": 2.7158, + "step": 8349 + }, + { + "epoch": 0.35157894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.00037157524527357035, + "loss": 3.2903, + "step": 8350 + }, + { + "epoch": 0.35162105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.0003715457587038364, + "loss": 2.6682, + "step": 8351 + }, + { + "epoch": 0.35166315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.00037151626991968145, + "loss": 3.49, + "step": 8352 + }, + { + "epoch": 0.35170526315789474, + "grad_norm": 0.408203125, + "learning_rate": 0.0003714867789216429, + "loss": 3.1354, + "step": 8353 + }, + { + "epoch": 0.35174736842105264, + "grad_norm": 0.423828125, + "learning_rate": 0.000371457285710258, + "loss": 3.2262, + "step": 8354 + }, + { + "epoch": 0.35178947368421054, + "grad_norm": 0.423828125, + "learning_rate": 0.00037142779028606404, + "loss": 3.6194, + "step": 8355 + }, + { + "epoch": 0.35183157894736844, + "grad_norm": 0.455078125, + "learning_rate": 0.0003713982926495984, + "loss": 3.1915, + "step": 8356 + }, + { + "epoch": 0.35187368421052634, + "grad_norm": 0.41796875, + "learning_rate": 0.00037136879280139854, + "loss": 3.194, + "step": 8357 + }, + { + "epoch": 0.35191578947368424, + "grad_norm": 0.396484375, + "learning_rate": 0.0003713392907420018, + "loss": 3.0096, + "step": 8358 + }, + { + "epoch": 0.3519578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.0003713097864719458, + "loss": 3.6196, + "step": 8359 + }, + { + "epoch": 0.352, + "grad_norm": 0.400390625, + "learning_rate": 0.000371280279991768, + "loss": 3.5267, + "step": 8360 + }, + { + "epoch": 0.3520421052631579, + "grad_norm": 0.71875, + "learning_rate": 0.00037125077130200604, + "loss": 2.8173, + "step": 8361 + }, + { + "epoch": 0.3520842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00037122126040319746, + "loss": 3.1809, + "step": 8362 + }, + { + "epoch": 0.3521263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00037119174729588, + "loss": 3.4177, + "step": 8363 + }, + { + "epoch": 0.3521684210526316, + "grad_norm": 0.388671875, + "learning_rate": 0.0003711622319805913, + "loss": 3.1437, + "step": 8364 + }, + { + "epoch": 0.3522105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.000371132714457869, + "loss": 3.3474, + "step": 8365 + }, + { + "epoch": 0.3522526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.000371103194728251, + "loss": 3.1933, + "step": 8366 + }, + { + "epoch": 0.35229473684210527, + "grad_norm": 0.42578125, + "learning_rate": 0.00037107367279227505, + "loss": 3.6506, + "step": 8367 + }, + { + "epoch": 0.35233684210526317, + "grad_norm": 0.41796875, + "learning_rate": 0.0003710441486504791, + "loss": 3.3752, + "step": 8368 + }, + { + "epoch": 0.35237894736842107, + "grad_norm": 0.40234375, + "learning_rate": 0.000371014622303401, + "loss": 3.3134, + "step": 8369 + }, + { + "epoch": 0.35242105263157897, + "grad_norm": 0.427734375, + "learning_rate": 0.0003709850937515786, + "loss": 2.9721, + "step": 8370 + }, + { + "epoch": 0.35246315789473687, + "grad_norm": 0.423828125, + "learning_rate": 0.00037095556299555, + "loss": 3.2375, + "step": 8371 + }, + { + "epoch": 0.3525052631578947, + "grad_norm": 0.451171875, + "learning_rate": 0.00037092603003585314, + "loss": 3.3239, + "step": 8372 + }, + { + "epoch": 0.3525473684210526, + "grad_norm": 0.3984375, + "learning_rate": 0.00037089649487302614, + "loss": 3.3319, + "step": 8373 + }, + { + "epoch": 0.3525894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.00037086695750760697, + "loss": 3.2538, + "step": 8374 + }, + { + "epoch": 0.3526315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00037083741794013384, + "loss": 3.1107, + "step": 8375 + }, + { + "epoch": 0.3526736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.000370807876171145, + "loss": 3.5319, + "step": 8376 + }, + { + "epoch": 0.3527157894736842, + "grad_norm": 0.40234375, + "learning_rate": 0.00037077833220117856, + "loss": 3.1483, + "step": 8377 + }, + { + "epoch": 0.3527578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00037074878603077287, + "loss": 3.3937, + "step": 8378 + }, + { + "epoch": 0.3528, + "grad_norm": 0.443359375, + "learning_rate": 0.0003707192376604661, + "loss": 3.1436, + "step": 8379 + }, + { + "epoch": 0.3528421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.0003706896870907967, + "loss": 3.5844, + "step": 8380 + }, + { + "epoch": 0.3528842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.000370660134322303, + "loss": 2.9009, + "step": 8381 + }, + { + "epoch": 0.3529263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.0003706305793555234, + "loss": 3.2249, + "step": 8382 + }, + { + "epoch": 0.3529684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.0003706010221909964, + "loss": 3.1035, + "step": 8383 + }, + { + "epoch": 0.3530105263157895, + "grad_norm": 0.408203125, + "learning_rate": 0.00037057146282926047, + "loss": 3.6417, + "step": 8384 + }, + { + "epoch": 0.3530526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.0003705419012708541, + "loss": 3.0527, + "step": 8385 + }, + { + "epoch": 0.35309473684210524, + "grad_norm": 0.423828125, + "learning_rate": 0.00037051233751631604, + "loss": 3.4006, + "step": 8386 + }, + { + "epoch": 0.35313684210526314, + "grad_norm": 0.42578125, + "learning_rate": 0.0003704827715661847, + "loss": 3.4922, + "step": 8387 + }, + { + "epoch": 0.35317894736842104, + "grad_norm": 0.400390625, + "learning_rate": 0.0003704532034209989, + "loss": 3.0543, + "step": 8388 + }, + { + "epoch": 0.35322105263157894, + "grad_norm": 0.419921875, + "learning_rate": 0.0003704236330812972, + "loss": 3.1797, + "step": 8389 + }, + { + "epoch": 0.35326315789473683, + "grad_norm": 0.4296875, + "learning_rate": 0.0003703940605476184, + "loss": 3.3814, + "step": 8390 + }, + { + "epoch": 0.35330526315789473, + "grad_norm": 0.40234375, + "learning_rate": 0.00037036448582050127, + "loss": 3.3005, + "step": 8391 + }, + { + "epoch": 0.35334736842105263, + "grad_norm": 0.4921875, + "learning_rate": 0.00037033490890048466, + "loss": 3.2047, + "step": 8392 + }, + { + "epoch": 0.35338947368421053, + "grad_norm": 0.439453125, + "learning_rate": 0.00037030532978810733, + "loss": 3.2599, + "step": 8393 + }, + { + "epoch": 0.35343157894736843, + "grad_norm": 0.421875, + "learning_rate": 0.0003702757484839083, + "loss": 3.3488, + "step": 8394 + }, + { + "epoch": 0.35347368421052633, + "grad_norm": 0.42578125, + "learning_rate": 0.0003702461649884264, + "loss": 3.0243, + "step": 8395 + }, + { + "epoch": 0.3535157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.00037021657930220075, + "loss": 2.9283, + "step": 8396 + }, + { + "epoch": 0.3535578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.00037018699142577025, + "loss": 2.8585, + "step": 8397 + }, + { + "epoch": 0.3536, + "grad_norm": 0.5078125, + "learning_rate": 0.000370157401359674, + "loss": 3.5259, + "step": 8398 + }, + { + "epoch": 0.35364210526315787, + "grad_norm": 0.470703125, + "learning_rate": 0.00037012780910445096, + "loss": 2.9404, + "step": 8399 + }, + { + "epoch": 0.35368421052631577, + "grad_norm": 0.453125, + "learning_rate": 0.00037009821466064047, + "loss": 3.357, + "step": 8400 + }, + { + "epoch": 0.35372631578947367, + "grad_norm": 0.443359375, + "learning_rate": 0.00037006861802878156, + "loss": 3.5274, + "step": 8401 + }, + { + "epoch": 0.35376842105263157, + "grad_norm": 0.40625, + "learning_rate": 0.00037003901920941353, + "loss": 3.3709, + "step": 8402 + }, + { + "epoch": 0.35381052631578946, + "grad_norm": 0.431640625, + "learning_rate": 0.00037000941820307566, + "loss": 3.5608, + "step": 8403 + }, + { + "epoch": 0.35385263157894736, + "grad_norm": 0.416015625, + "learning_rate": 0.00036997981501030717, + "loss": 3.244, + "step": 8404 + }, + { + "epoch": 0.35389473684210526, + "grad_norm": 0.41015625, + "learning_rate": 0.0003699502096316474, + "loss": 3.2515, + "step": 8405 + }, + { + "epoch": 0.35393684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.0003699206020676358, + "loss": 2.8718, + "step": 8406 + }, + { + "epoch": 0.35397894736842106, + "grad_norm": 0.462890625, + "learning_rate": 0.00036989099231881165, + "loss": 3.0641, + "step": 8407 + }, + { + "epoch": 0.35402105263157896, + "grad_norm": 0.4296875, + "learning_rate": 0.00036986138038571455, + "loss": 3.5158, + "step": 8408 + }, + { + "epoch": 0.35406315789473686, + "grad_norm": 0.447265625, + "learning_rate": 0.0003698317662688839, + "loss": 3.3388, + "step": 8409 + }, + { + "epoch": 0.35410526315789476, + "grad_norm": 0.419921875, + "learning_rate": 0.00036980214996885924, + "loss": 3.3412, + "step": 8410 + }, + { + "epoch": 0.35414736842105266, + "grad_norm": 0.4375, + "learning_rate": 0.00036977253148618024, + "loss": 2.7781, + "step": 8411 + }, + { + "epoch": 0.35418947368421055, + "grad_norm": 0.451171875, + "learning_rate": 0.00036974291082138636, + "loss": 3.35, + "step": 8412 + }, + { + "epoch": 0.3542315789473684, + "grad_norm": 0.4140625, + "learning_rate": 0.00036971328797501734, + "loss": 3.5997, + "step": 8413 + }, + { + "epoch": 0.3542736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.000369683662947613, + "loss": 3.3039, + "step": 8414 + }, + { + "epoch": 0.3543157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.0003696540357397128, + "loss": 2.9617, + "step": 8415 + }, + { + "epoch": 0.3543578947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.0003696244063518567, + "loss": 3.252, + "step": 8416 + }, + { + "epoch": 0.3544, + "grad_norm": 0.400390625, + "learning_rate": 0.00036959477478458447, + "loss": 2.8422, + "step": 8417 + }, + { + "epoch": 0.3544421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00036956514103843595, + "loss": 3.2049, + "step": 8418 + }, + { + "epoch": 0.3544842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0003695355051139511, + "loss": 3.1824, + "step": 8419 + }, + { + "epoch": 0.3545263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00036950586701166977, + "loss": 3.3621, + "step": 8420 + }, + { + "epoch": 0.3545684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0003694762267321319, + "loss": 2.9948, + "step": 8421 + }, + { + "epoch": 0.3546105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.0003694465842758777, + "loss": 3.5632, + "step": 8422 + }, + { + "epoch": 0.3546526315789474, + "grad_norm": 0.388671875, + "learning_rate": 0.000369416939643447, + "loss": 3.1322, + "step": 8423 + }, + { + "epoch": 0.3546947368421053, + "grad_norm": 0.4375, + "learning_rate": 0.0003693872928353799, + "loss": 3.2696, + "step": 8424 + }, + { + "epoch": 0.3547368421052632, + "grad_norm": 0.5234375, + "learning_rate": 0.00036935764385221667, + "loss": 3.1635, + "step": 8425 + }, + { + "epoch": 0.354778947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0003693279926944974, + "loss": 3.3804, + "step": 8426 + }, + { + "epoch": 0.3548210526315789, + "grad_norm": 0.40234375, + "learning_rate": 0.00036929833936276223, + "loss": 3.3277, + "step": 8427 + }, + { + "epoch": 0.3548631578947368, + "grad_norm": 0.43359375, + "learning_rate": 0.0003692686838575516, + "loss": 3.0877, + "step": 8428 + }, + { + "epoch": 0.3549052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00036923902617940564, + "loss": 3.3003, + "step": 8429 + }, + { + "epoch": 0.3549473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.0003692093663288647, + "loss": 3.5651, + "step": 8430 + }, + { + "epoch": 0.3549894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.0003691797043064692, + "loss": 3.1963, + "step": 8431 + }, + { + "epoch": 0.3550315789473684, + "grad_norm": 0.39453125, + "learning_rate": 0.00036915004011275947, + "loss": 3.2812, + "step": 8432 + }, + { + "epoch": 0.3550736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.00036912037374827605, + "loss": 3.3763, + "step": 8433 + }, + { + "epoch": 0.3551157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00036909070521355935, + "loss": 3.1488, + "step": 8434 + }, + { + "epoch": 0.3551578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00036906103450914994, + "loss": 3.2133, + "step": 8435 + }, + { + "epoch": 0.3552, + "grad_norm": 0.40234375, + "learning_rate": 0.0003690313616355884, + "loss": 2.9893, + "step": 8436 + }, + { + "epoch": 0.3552421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00036900168659341535, + "loss": 3.2021, + "step": 8437 + }, + { + "epoch": 0.3552842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00036897200938317134, + "loss": 3.0011, + "step": 8438 + }, + { + "epoch": 0.35532631578947366, + "grad_norm": 0.43359375, + "learning_rate": 0.00036894233000539707, + "loss": 3.5855, + "step": 8439 + }, + { + "epoch": 0.35536842105263156, + "grad_norm": 0.41796875, + "learning_rate": 0.00036891264846063333, + "loss": 3.3401, + "step": 8440 + }, + { + "epoch": 0.35541052631578945, + "grad_norm": 0.447265625, + "learning_rate": 0.0003688829647494208, + "loss": 3.3315, + "step": 8441 + }, + { + "epoch": 0.35545263157894735, + "grad_norm": 0.453125, + "learning_rate": 0.0003688532788723004, + "loss": 3.0723, + "step": 8442 + }, + { + "epoch": 0.35549473684210525, + "grad_norm": 0.46484375, + "learning_rate": 0.0003688235908298129, + "loss": 3.2475, + "step": 8443 + }, + { + "epoch": 0.35553684210526315, + "grad_norm": 0.421875, + "learning_rate": 0.0003687939006224992, + "loss": 3.7703, + "step": 8444 + }, + { + "epoch": 0.35557894736842105, + "grad_norm": 0.39453125, + "learning_rate": 0.0003687642082509002, + "loss": 3.2372, + "step": 8445 + }, + { + "epoch": 0.35562105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.0003687345137155568, + "loss": 3.3663, + "step": 8446 + }, + { + "epoch": 0.35566315789473685, + "grad_norm": 0.41015625, + "learning_rate": 0.0003687048170170101, + "loss": 3.4374, + "step": 8447 + }, + { + "epoch": 0.35570526315789475, + "grad_norm": 0.50390625, + "learning_rate": 0.00036867511815580117, + "loss": 2.7673, + "step": 8448 + }, + { + "epoch": 0.35574736842105265, + "grad_norm": 0.447265625, + "learning_rate": 0.000368645417132471, + "loss": 3.7187, + "step": 8449 + }, + { + "epoch": 0.35578947368421054, + "grad_norm": 0.4296875, + "learning_rate": 0.00036861571394756066, + "loss": 3.5667, + "step": 8450 + }, + { + "epoch": 0.35583157894736844, + "grad_norm": 0.44140625, + "learning_rate": 0.0003685860086016115, + "loss": 3.3765, + "step": 8451 + }, + { + "epoch": 0.35587368421052634, + "grad_norm": 0.4765625, + "learning_rate": 0.0003685563010951645, + "loss": 3.4038, + "step": 8452 + }, + { + "epoch": 0.3559157894736842, + "grad_norm": 0.478515625, + "learning_rate": 0.000368526591428761, + "loss": 3.4084, + "step": 8453 + }, + { + "epoch": 0.3559578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0003684968796029423, + "loss": 3.5268, + "step": 8454 + }, + { + "epoch": 0.356, + "grad_norm": 0.396484375, + "learning_rate": 0.00036846716561824967, + "loss": 3.0478, + "step": 8455 + }, + { + "epoch": 0.3560421052631579, + "grad_norm": 0.39453125, + "learning_rate": 0.0003684374494752245, + "loss": 2.9799, + "step": 8456 + }, + { + "epoch": 0.3560842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.0003684077311744082, + "loss": 3.1679, + "step": 8457 + }, + { + "epoch": 0.3561263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00036837801071634203, + "loss": 2.7548, + "step": 8458 + }, + { + "epoch": 0.3561684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.0003683482881015676, + "loss": 3.3332, + "step": 8459 + }, + { + "epoch": 0.3562105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.00036831856333062647, + "loss": 3.5327, + "step": 8460 + }, + { + "epoch": 0.3562526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.0003682888364040602, + "loss": 3.1463, + "step": 8461 + }, + { + "epoch": 0.3562947368421053, + "grad_norm": 0.482421875, + "learning_rate": 0.00036825910732241025, + "loss": 3.1507, + "step": 8462 + }, + { + "epoch": 0.3563368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.0003682293760862183, + "loss": 3.032, + "step": 8463 + }, + { + "epoch": 0.3563789473684211, + "grad_norm": 0.408203125, + "learning_rate": 0.00036819964269602605, + "loss": 3.0423, + "step": 8464 + }, + { + "epoch": 0.35642105263157897, + "grad_norm": 0.40625, + "learning_rate": 0.0003681699071523752, + "loss": 3.1754, + "step": 8465 + }, + { + "epoch": 0.3564631578947368, + "grad_norm": 0.392578125, + "learning_rate": 0.00036814016945580744, + "loss": 2.7483, + "step": 8466 + }, + { + "epoch": 0.3565052631578947, + "grad_norm": 0.4453125, + "learning_rate": 0.0003681104296068646, + "loss": 3.6548, + "step": 8467 + }, + { + "epoch": 0.3565473684210526, + "grad_norm": 0.46484375, + "learning_rate": 0.0003680806876060886, + "loss": 3.4123, + "step": 8468 + }, + { + "epoch": 0.3565894736842105, + "grad_norm": 0.380859375, + "learning_rate": 0.0003680509434540211, + "loss": 2.6754, + "step": 8469 + }, + { + "epoch": 0.3566315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.00036802119715120417, + "loss": 3.3347, + "step": 8470 + }, + { + "epoch": 0.3566736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.00036799144869817976, + "loss": 3.4992, + "step": 8471 + }, + { + "epoch": 0.3567157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00036796169809548976, + "loss": 3.6774, + "step": 8472 + }, + { + "epoch": 0.3567578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.00036793194534367606, + "loss": 3.2564, + "step": 8473 + }, + { + "epoch": 0.3568, + "grad_norm": 0.455078125, + "learning_rate": 0.0003679021904432811, + "loss": 3.3799, + "step": 8474 + }, + { + "epoch": 0.3568421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.0003678724333948466, + "loss": 2.9776, + "step": 8475 + }, + { + "epoch": 0.3568842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.0003678426741989149, + "loss": 3.2577, + "step": 8476 + }, + { + "epoch": 0.3569263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.00036781291285602815, + "loss": 3.5254, + "step": 8477 + }, + { + "epoch": 0.3569684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.00036778314936672857, + "loss": 2.9002, + "step": 8478 + }, + { + "epoch": 0.3570105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.00036775338373155837, + "loss": 3.5185, + "step": 8479 + }, + { + "epoch": 0.35705263157894734, + "grad_norm": 0.427734375, + "learning_rate": 0.00036772361595105994, + "loss": 3.5643, + "step": 8480 + }, + { + "epoch": 0.35709473684210524, + "grad_norm": 0.4453125, + "learning_rate": 0.00036769384602577547, + "loss": 3.0884, + "step": 8481 + }, + { + "epoch": 0.35713684210526314, + "grad_norm": 0.42578125, + "learning_rate": 0.00036766407395624745, + "loss": 3.3613, + "step": 8482 + }, + { + "epoch": 0.35717894736842104, + "grad_norm": 0.4296875, + "learning_rate": 0.0003676342997430182, + "loss": 3.1784, + "step": 8483 + }, + { + "epoch": 0.35722105263157894, + "grad_norm": 0.412109375, + "learning_rate": 0.00036760452338663026, + "loss": 3.2515, + "step": 8484 + }, + { + "epoch": 0.35726315789473684, + "grad_norm": 0.59765625, + "learning_rate": 0.0003675747448876261, + "loss": 2.7707, + "step": 8485 + }, + { + "epoch": 0.35730526315789474, + "grad_norm": 0.5234375, + "learning_rate": 0.00036754496424654814, + "loss": 3.3514, + "step": 8486 + }, + { + "epoch": 0.35734736842105264, + "grad_norm": 0.4375, + "learning_rate": 0.00036751518146393915, + "loss": 3.2161, + "step": 8487 + }, + { + "epoch": 0.35738947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.00036748539654034166, + "loss": 3.5402, + "step": 8488 + }, + { + "epoch": 0.35743157894736843, + "grad_norm": 0.412109375, + "learning_rate": 0.0003674556094762982, + "loss": 2.8124, + "step": 8489 + }, + { + "epoch": 0.35747368421052633, + "grad_norm": 0.453125, + "learning_rate": 0.00036742582027235156, + "loss": 3.3315, + "step": 8490 + }, + { + "epoch": 0.35751578947368423, + "grad_norm": 0.4453125, + "learning_rate": 0.00036739602892904447, + "loss": 3.1274, + "step": 8491 + }, + { + "epoch": 0.35755789473684213, + "grad_norm": 0.455078125, + "learning_rate": 0.0003673662354469196, + "loss": 3.6463, + "step": 8492 + }, + { + "epoch": 0.3576, + "grad_norm": 0.435546875, + "learning_rate": 0.0003673364398265199, + "loss": 3.4198, + "step": 8493 + }, + { + "epoch": 0.3576421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.00036730664206838806, + "loss": 3.3445, + "step": 8494 + }, + { + "epoch": 0.35768421052631577, + "grad_norm": 0.396484375, + "learning_rate": 0.00036727684217306705, + "loss": 3.308, + "step": 8495 + }, + { + "epoch": 0.35772631578947367, + "grad_norm": 0.419921875, + "learning_rate": 0.00036724704014109977, + "loss": 3.0323, + "step": 8496 + }, + { + "epoch": 0.35776842105263157, + "grad_norm": 0.41015625, + "learning_rate": 0.00036721723597302923, + "loss": 3.1548, + "step": 8497 + }, + { + "epoch": 0.35781052631578947, + "grad_norm": 0.404296875, + "learning_rate": 0.0003671874296693983, + "loss": 3.1609, + "step": 8498 + }, + { + "epoch": 0.35785263157894737, + "grad_norm": 0.45703125, + "learning_rate": 0.00036715762123075017, + "loss": 3.5152, + "step": 8499 + }, + { + "epoch": 0.35789473684210527, + "grad_norm": 0.421875, + "learning_rate": 0.0003671278106576278, + "loss": 3.2645, + "step": 8500 + }, + { + "epoch": 0.35793684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.0003670979979505743, + "loss": 2.6878, + "step": 8501 + }, + { + "epoch": 0.35797894736842106, + "grad_norm": 0.431640625, + "learning_rate": 0.0003670681831101329, + "loss": 2.7624, + "step": 8502 + }, + { + "epoch": 0.35802105263157896, + "grad_norm": 0.408203125, + "learning_rate": 0.00036703836613684673, + "loss": 3.2892, + "step": 8503 + }, + { + "epoch": 0.35806315789473686, + "grad_norm": 0.462890625, + "learning_rate": 0.0003670085470312591, + "loss": 3.0266, + "step": 8504 + }, + { + "epoch": 0.35810526315789476, + "grad_norm": 0.44921875, + "learning_rate": 0.00036697872579391315, + "loss": 3.1614, + "step": 8505 + }, + { + "epoch": 0.35814736842105266, + "grad_norm": 0.39453125, + "learning_rate": 0.0003669489024253523, + "loss": 3.4023, + "step": 8506 + }, + { + "epoch": 0.3581894736842105, + "grad_norm": 0.3984375, + "learning_rate": 0.0003669190769261199, + "loss": 3.4111, + "step": 8507 + }, + { + "epoch": 0.3582315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.0003668892492967592, + "loss": 2.6688, + "step": 8508 + }, + { + "epoch": 0.3582736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.00036685941953781376, + "loss": 3.307, + "step": 8509 + }, + { + "epoch": 0.3583157894736842, + "grad_norm": 0.455078125, + "learning_rate": 0.000366829587649827, + "loss": 3.0182, + "step": 8510 + }, + { + "epoch": 0.3583578947368421, + "grad_norm": 0.498046875, + "learning_rate": 0.0003667997536333424, + "loss": 2.8687, + "step": 8511 + }, + { + "epoch": 0.3584, + "grad_norm": 0.421875, + "learning_rate": 0.0003667699174889036, + "loss": 3.5888, + "step": 8512 + }, + { + "epoch": 0.3584421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.000366740079217054, + "loss": 3.4365, + "step": 8513 + }, + { + "epoch": 0.3584842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.00036671023881833736, + "loss": 3.4626, + "step": 8514 + }, + { + "epoch": 0.3585263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00036668039629329736, + "loss": 3.0771, + "step": 8515 + }, + { + "epoch": 0.3585684210526316, + "grad_norm": 0.39453125, + "learning_rate": 0.00036665055164247757, + "loss": 3.2052, + "step": 8516 + }, + { + "epoch": 0.3586105263157895, + "grad_norm": 0.484375, + "learning_rate": 0.0003666207048664218, + "loss": 3.2578, + "step": 8517 + }, + { + "epoch": 0.3586526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.00036659085596567375, + "loss": 3.449, + "step": 8518 + }, + { + "epoch": 0.3586947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.0003665610049407774, + "loss": 3.1248, + "step": 8519 + }, + { + "epoch": 0.35873684210526313, + "grad_norm": 0.421875, + "learning_rate": 0.0003665311517922764, + "loss": 2.9644, + "step": 8520 + }, + { + "epoch": 0.35877894736842103, + "grad_norm": 0.40234375, + "learning_rate": 0.0003665012965207148, + "loss": 3.2378, + "step": 8521 + }, + { + "epoch": 0.35882105263157893, + "grad_norm": 0.43359375, + "learning_rate": 0.0003664714391266364, + "loss": 3.2095, + "step": 8522 + }, + { + "epoch": 0.35886315789473683, + "grad_norm": 0.3984375, + "learning_rate": 0.00036644157961058534, + "loss": 3.2669, + "step": 8523 + }, + { + "epoch": 0.3589052631578947, + "grad_norm": 0.4140625, + "learning_rate": 0.0003664117179731055, + "loss": 3.2812, + "step": 8524 + }, + { + "epoch": 0.3589473684210526, + "grad_norm": 0.380859375, + "learning_rate": 0.00036638185421474087, + "loss": 3.2559, + "step": 8525 + }, + { + "epoch": 0.3589894736842105, + "grad_norm": 0.3984375, + "learning_rate": 0.0003663519883360356, + "loss": 3.3712, + "step": 8526 + }, + { + "epoch": 0.3590315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.0003663221203375337, + "loss": 3.3206, + "step": 8527 + }, + { + "epoch": 0.3590736842105263, + "grad_norm": 0.486328125, + "learning_rate": 0.00036629225021977957, + "loss": 3.0226, + "step": 8528 + }, + { + "epoch": 0.3591157894736842, + "grad_norm": 0.40625, + "learning_rate": 0.0003662623779833173, + "loss": 3.3128, + "step": 8529 + }, + { + "epoch": 0.3591578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0003662325036286911, + "loss": 3.3421, + "step": 8530 + }, + { + "epoch": 0.3592, + "grad_norm": 0.4375, + "learning_rate": 0.0003662026271564452, + "loss": 3.3016, + "step": 8531 + }, + { + "epoch": 0.3592421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00036617274856712403, + "loss": 3.5453, + "step": 8532 + }, + { + "epoch": 0.3592842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.00036614286786127174, + "loss": 3.1034, + "step": 8533 + }, + { + "epoch": 0.35932631578947366, + "grad_norm": 0.41015625, + "learning_rate": 0.00036611298503943303, + "loss": 3.2479, + "step": 8534 + }, + { + "epoch": 0.35936842105263156, + "grad_norm": 0.40625, + "learning_rate": 0.000366083100102152, + "loss": 2.9535, + "step": 8535 + }, + { + "epoch": 0.35941052631578946, + "grad_norm": 0.384765625, + "learning_rate": 0.0003660532130499734, + "loss": 3.0495, + "step": 8536 + }, + { + "epoch": 0.35945263157894736, + "grad_norm": 0.416015625, + "learning_rate": 0.0003660233238834416, + "loss": 3.1214, + "step": 8537 + }, + { + "epoch": 0.35949473684210526, + "grad_norm": 0.43359375, + "learning_rate": 0.00036599343260310113, + "loss": 3.5953, + "step": 8538 + }, + { + "epoch": 0.35953684210526315, + "grad_norm": 0.392578125, + "learning_rate": 0.0003659635392094967, + "loss": 3.1289, + "step": 8539 + }, + { + "epoch": 0.35957894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00036593364370317267, + "loss": 2.8361, + "step": 8540 + }, + { + "epoch": 0.35962105263157895, + "grad_norm": 0.396484375, + "learning_rate": 0.00036590374608467396, + "loss": 2.8669, + "step": 8541 + }, + { + "epoch": 0.35966315789473685, + "grad_norm": 0.412109375, + "learning_rate": 0.00036587384635454516, + "loss": 3.1655, + "step": 8542 + }, + { + "epoch": 0.35970526315789475, + "grad_norm": 0.40625, + "learning_rate": 0.000365843944513331, + "loss": 3.2912, + "step": 8543 + }, + { + "epoch": 0.35974736842105265, + "grad_norm": 0.42578125, + "learning_rate": 0.00036581404056157635, + "loss": 3.2754, + "step": 8544 + }, + { + "epoch": 0.35978947368421055, + "grad_norm": 0.41796875, + "learning_rate": 0.00036578413449982596, + "loss": 2.8622, + "step": 8545 + }, + { + "epoch": 0.35983157894736845, + "grad_norm": 0.484375, + "learning_rate": 0.0003657542263286246, + "loss": 3.07, + "step": 8546 + }, + { + "epoch": 0.3598736842105263, + "grad_norm": 0.396484375, + "learning_rate": 0.00036572431604851727, + "loss": 3.2908, + "step": 8547 + }, + { + "epoch": 0.3599157894736842, + "grad_norm": 0.39453125, + "learning_rate": 0.0003656944036600489, + "loss": 3.5344, + "step": 8548 + }, + { + "epoch": 0.3599578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0003656644891637644, + "loss": 3.3022, + "step": 8549 + }, + { + "epoch": 0.36, + "grad_norm": 0.396484375, + "learning_rate": 0.00036563457256020887, + "loss": 3.5644, + "step": 8550 + }, + { + "epoch": 0.3600421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00036560465384992717, + "loss": 3.1371, + "step": 8551 + }, + { + "epoch": 0.3600842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0003655747330334646, + "loss": 3.301, + "step": 8552 + }, + { + "epoch": 0.3601263157894737, + "grad_norm": 0.4609375, + "learning_rate": 0.00036554481011136615, + "loss": 3.2776, + "step": 8553 + }, + { + "epoch": 0.3601684210526316, + "grad_norm": 0.396484375, + "learning_rate": 0.00036551488508417706, + "loss": 3.3209, + "step": 8554 + }, + { + "epoch": 0.3602105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.00036548495795244254, + "loss": 3.0257, + "step": 8555 + }, + { + "epoch": 0.3602526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.00036545502871670766, + "loss": 3.1573, + "step": 8556 + }, + { + "epoch": 0.3602947368421053, + "grad_norm": 0.447265625, + "learning_rate": 0.0003654250973775179, + "loss": 3.4982, + "step": 8557 + }, + { + "epoch": 0.3603368421052632, + "grad_norm": 0.404296875, + "learning_rate": 0.00036539516393541843, + "loss": 2.9723, + "step": 8558 + }, + { + "epoch": 0.3603789473684211, + "grad_norm": 0.4140625, + "learning_rate": 0.0003653652283909547, + "loss": 3.3682, + "step": 8559 + }, + { + "epoch": 0.3604210526315789, + "grad_norm": 0.46484375, + "learning_rate": 0.000365335290744672, + "loss": 3.3689, + "step": 8560 + }, + { + "epoch": 0.3604631578947368, + "grad_norm": 0.37890625, + "learning_rate": 0.00036530535099711577, + "loss": 3.1767, + "step": 8561 + }, + { + "epoch": 0.3605052631578947, + "grad_norm": 0.376953125, + "learning_rate": 0.0003652754091488316, + "loss": 3.1023, + "step": 8562 + }, + { + "epoch": 0.3605473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.00036524546520036493, + "loss": 3.2475, + "step": 8563 + }, + { + "epoch": 0.3605894736842105, + "grad_norm": 1.078125, + "learning_rate": 0.00036521551915226133, + "loss": 2.9145, + "step": 8564 + }, + { + "epoch": 0.3606315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.0003651855710050663, + "loss": 3.2941, + "step": 8565 + }, + { + "epoch": 0.3606736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.00036515562075932544, + "loss": 3.3197, + "step": 8566 + }, + { + "epoch": 0.3607157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00036512566841558454, + "loss": 3.1512, + "step": 8567 + }, + { + "epoch": 0.3607578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.0003650957139743892, + "loss": 3.0299, + "step": 8568 + }, + { + "epoch": 0.3608, + "grad_norm": 0.40234375, + "learning_rate": 0.00036506575743628516, + "loss": 3.2048, + "step": 8569 + }, + { + "epoch": 0.3608421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00036503579880181825, + "loss": 3.1144, + "step": 8570 + }, + { + "epoch": 0.3608842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.0003650058380715343, + "loss": 3.009, + "step": 8571 + }, + { + "epoch": 0.3609263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.000364975875245979, + "loss": 3.3897, + "step": 8572 + }, + { + "epoch": 0.3609684210526316, + "grad_norm": 0.466796875, + "learning_rate": 0.0003649459103256985, + "loss": 2.8768, + "step": 8573 + }, + { + "epoch": 0.36101052631578945, + "grad_norm": 0.41015625, + "learning_rate": 0.0003649159433112384, + "loss": 3.0635, + "step": 8574 + }, + { + "epoch": 0.36105263157894735, + "grad_norm": 0.443359375, + "learning_rate": 0.0003648859742031449, + "loss": 2.724, + "step": 8575 + }, + { + "epoch": 0.36109473684210525, + "grad_norm": 0.40234375, + "learning_rate": 0.00036485600300196396, + "loss": 3.3569, + "step": 8576 + }, + { + "epoch": 0.36113684210526314, + "grad_norm": 0.41015625, + "learning_rate": 0.0003648260297082415, + "loss": 3.0637, + "step": 8577 + }, + { + "epoch": 0.36117894736842104, + "grad_norm": 0.44921875, + "learning_rate": 0.0003647960543225238, + "loss": 3.51, + "step": 8578 + }, + { + "epoch": 0.36122105263157894, + "grad_norm": 0.42578125, + "learning_rate": 0.0003647660768453569, + "loss": 3.3347, + "step": 8579 + }, + { + "epoch": 0.36126315789473684, + "grad_norm": 0.55078125, + "learning_rate": 0.0003647360972772868, + "loss": 3.1537, + "step": 8580 + }, + { + "epoch": 0.36130526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.00036470611561885993, + "loss": 3.0917, + "step": 8581 + }, + { + "epoch": 0.36134736842105264, + "grad_norm": 0.4296875, + "learning_rate": 0.0003646761318706223, + "loss": 3.0826, + "step": 8582 + }, + { + "epoch": 0.36138947368421054, + "grad_norm": 0.404296875, + "learning_rate": 0.00036464614603312043, + "loss": 3.2376, + "step": 8583 + }, + { + "epoch": 0.36143157894736844, + "grad_norm": 0.4296875, + "learning_rate": 0.00036461615810690035, + "loss": 3.056, + "step": 8584 + }, + { + "epoch": 0.36147368421052634, + "grad_norm": 0.4453125, + "learning_rate": 0.0003645861680925086, + "loss": 3.3235, + "step": 8585 + }, + { + "epoch": 0.36151578947368423, + "grad_norm": 0.43359375, + "learning_rate": 0.0003645561759904915, + "loss": 3.1803, + "step": 8586 + }, + { + "epoch": 0.3615578947368421, + "grad_norm": 0.65234375, + "learning_rate": 0.0003645261818013954, + "loss": 2.6219, + "step": 8587 + }, + { + "epoch": 0.3616, + "grad_norm": 0.3984375, + "learning_rate": 0.0003644961855257669, + "loss": 3.3653, + "step": 8588 + }, + { + "epoch": 0.3616421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.0003644661871641524, + "loss": 3.1121, + "step": 8589 + }, + { + "epoch": 0.3616842105263158, + "grad_norm": 0.4140625, + "learning_rate": 0.0003644361867170985, + "loss": 3.4479, + "step": 8590 + }, + { + "epoch": 0.3617263157894737, + "grad_norm": 0.58203125, + "learning_rate": 0.0003644061841851517, + "loss": 3.5041, + "step": 8591 + }, + { + "epoch": 0.3617684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.0003643761795688587, + "loss": 3.4588, + "step": 8592 + }, + { + "epoch": 0.36181052631578947, + "grad_norm": 0.447265625, + "learning_rate": 0.00036434617286876605, + "loss": 3.0729, + "step": 8593 + }, + { + "epoch": 0.36185263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.0003643161640854204, + "loss": 2.9089, + "step": 8594 + }, + { + "epoch": 0.36189473684210527, + "grad_norm": 0.431640625, + "learning_rate": 0.00036428615321936863, + "loss": 3.2881, + "step": 8595 + }, + { + "epoch": 0.36193684210526317, + "grad_norm": 0.482421875, + "learning_rate": 0.0003642561402711575, + "loss": 3.2396, + "step": 8596 + }, + { + "epoch": 0.36197894736842107, + "grad_norm": 0.4375, + "learning_rate": 0.0003642261252413337, + "loss": 3.0016, + "step": 8597 + }, + { + "epoch": 0.36202105263157897, + "grad_norm": 0.396484375, + "learning_rate": 0.00036419610813044406, + "loss": 3.0481, + "step": 8598 + }, + { + "epoch": 0.36206315789473686, + "grad_norm": 0.404296875, + "learning_rate": 0.0003641660889390356, + "loss": 3.0796, + "step": 8599 + }, + { + "epoch": 0.36210526315789476, + "grad_norm": 0.431640625, + "learning_rate": 0.00036413606766765505, + "loss": 3.2641, + "step": 8600 + }, + { + "epoch": 0.3621473684210526, + "grad_norm": 0.48046875, + "learning_rate": 0.0003641060443168494, + "loss": 3.4444, + "step": 8601 + }, + { + "epoch": 0.3621894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.0003640760188871657, + "loss": 3.1555, + "step": 8602 + }, + { + "epoch": 0.3622315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.00036404599137915104, + "loss": 3.2479, + "step": 8603 + }, + { + "epoch": 0.3622736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00036401596179335246, + "loss": 3.2287, + "step": 8604 + }, + { + "epoch": 0.3623157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.000363985930130317, + "loss": 3.3387, + "step": 8605 + }, + { + "epoch": 0.3623578947368421, + "grad_norm": 0.462890625, + "learning_rate": 0.0003639558963905918, + "loss": 3.393, + "step": 8606 + }, + { + "epoch": 0.3624, + "grad_norm": 0.388671875, + "learning_rate": 0.00036392586057472397, + "loss": 2.7533, + "step": 8607 + }, + { + "epoch": 0.3624421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0003638958226832609, + "loss": 2.9388, + "step": 8608 + }, + { + "epoch": 0.3624842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.0003638657827167498, + "loss": 3.2171, + "step": 8609 + }, + { + "epoch": 0.3625263157894737, + "grad_norm": 0.392578125, + "learning_rate": 0.0003638357406757379, + "loss": 2.8198, + "step": 8610 + }, + { + "epoch": 0.3625684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.0003638056965607725, + "loss": 2.9649, + "step": 8611 + }, + { + "epoch": 0.3626105263157895, + "grad_norm": 0.4140625, + "learning_rate": 0.0003637756503724011, + "loss": 3.4412, + "step": 8612 + }, + { + "epoch": 0.3626526315789474, + "grad_norm": 0.40234375, + "learning_rate": 0.00036374560211117104, + "loss": 2.9699, + "step": 8613 + }, + { + "epoch": 0.36269473684210524, + "grad_norm": 0.44140625, + "learning_rate": 0.0003637155517776297, + "loss": 3.3216, + "step": 8614 + }, + { + "epoch": 0.36273684210526314, + "grad_norm": 0.421875, + "learning_rate": 0.00036368549937232456, + "loss": 3.3272, + "step": 8615 + }, + { + "epoch": 0.36277894736842103, + "grad_norm": 0.4296875, + "learning_rate": 0.0003636554448958033, + "loss": 2.8462, + "step": 8616 + }, + { + "epoch": 0.36282105263157893, + "grad_norm": 0.3984375, + "learning_rate": 0.0003636253883486134, + "loss": 3.3656, + "step": 8617 + }, + { + "epoch": 0.36286315789473683, + "grad_norm": 0.423828125, + "learning_rate": 0.00036359532973130235, + "loss": 3.0729, + "step": 8618 + }, + { + "epoch": 0.36290526315789473, + "grad_norm": 0.392578125, + "learning_rate": 0.0003635652690444178, + "loss": 3.1648, + "step": 8619 + }, + { + "epoch": 0.36294736842105263, + "grad_norm": 0.388671875, + "learning_rate": 0.0003635352062885076, + "loss": 2.1842, + "step": 8620 + }, + { + "epoch": 0.36298947368421053, + "grad_norm": 0.427734375, + "learning_rate": 0.0003635051414641193, + "loss": 3.2677, + "step": 8621 + }, + { + "epoch": 0.3630315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.0003634750745718007, + "loss": 3.2, + "step": 8622 + }, + { + "epoch": 0.3630736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00036344500561209954, + "loss": 3.2294, + "step": 8623 + }, + { + "epoch": 0.3631157894736842, + "grad_norm": 0.39453125, + "learning_rate": 0.00036341493458556367, + "loss": 3.1805, + "step": 8624 + }, + { + "epoch": 0.3631578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00036338486149274104, + "loss": 3.5624, + "step": 8625 + }, + { + "epoch": 0.3632, + "grad_norm": 0.416015625, + "learning_rate": 0.00036335478633417933, + "loss": 3.3642, + "step": 8626 + }, + { + "epoch": 0.3632421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0003633247091104266, + "loss": 2.8678, + "step": 8627 + }, + { + "epoch": 0.36328421052631577, + "grad_norm": 0.43359375, + "learning_rate": 0.00036329462982203085, + "loss": 3.5098, + "step": 8628 + }, + { + "epoch": 0.36332631578947366, + "grad_norm": 0.412109375, + "learning_rate": 0.0003632645484695401, + "loss": 2.7913, + "step": 8629 + }, + { + "epoch": 0.36336842105263156, + "grad_norm": 0.39453125, + "learning_rate": 0.00036323446505350237, + "loss": 3.3404, + "step": 8630 + }, + { + "epoch": 0.36341052631578946, + "grad_norm": 0.4921875, + "learning_rate": 0.0003632043795744657, + "loss": 2.7892, + "step": 8631 + }, + { + "epoch": 0.36345263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.0003631742920329783, + "loss": 3.4458, + "step": 8632 + }, + { + "epoch": 0.36349473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.0003631442024295882, + "loss": 2.7049, + "step": 8633 + }, + { + "epoch": 0.36353684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.00036311411076484363, + "loss": 2.7395, + "step": 8634 + }, + { + "epoch": 0.36357894736842106, + "grad_norm": 0.40625, + "learning_rate": 0.00036308401703929295, + "loss": 2.9941, + "step": 8635 + }, + { + "epoch": 0.36362105263157896, + "grad_norm": 0.41796875, + "learning_rate": 0.0003630539212534843, + "loss": 3.2984, + "step": 8636 + }, + { + "epoch": 0.36366315789473685, + "grad_norm": 0.439453125, + "learning_rate": 0.0003630238234079661, + "loss": 2.9861, + "step": 8637 + }, + { + "epoch": 0.36370526315789475, + "grad_norm": 0.44921875, + "learning_rate": 0.00036299372350328656, + "loss": 2.9727, + "step": 8638 + }, + { + "epoch": 0.36374736842105265, + "grad_norm": 0.404296875, + "learning_rate": 0.00036296362153999417, + "loss": 2.8809, + "step": 8639 + }, + { + "epoch": 0.36378947368421055, + "grad_norm": 0.388671875, + "learning_rate": 0.00036293351751863734, + "loss": 2.8221, + "step": 8640 + }, + { + "epoch": 0.3638315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.00036290341143976445, + "loss": 3.0221, + "step": 8641 + }, + { + "epoch": 0.3638736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.0003628733033039241, + "loss": 3.253, + "step": 8642 + }, + { + "epoch": 0.3639157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.00036284319311166485, + "loss": 3.3358, + "step": 8643 + }, + { + "epoch": 0.3639578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00036281308086353513, + "loss": 3.2004, + "step": 8644 + }, + { + "epoch": 0.364, + "grad_norm": 0.392578125, + "learning_rate": 0.0003627829665600836, + "loss": 3.321, + "step": 8645 + }, + { + "epoch": 0.3640421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.00036275285020185904, + "loss": 3.2841, + "step": 8646 + }, + { + "epoch": 0.3640842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00036272273178940994, + "loss": 2.6696, + "step": 8647 + }, + { + "epoch": 0.3641263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.00036269261132328514, + "loss": 2.9027, + "step": 8648 + }, + { + "epoch": 0.3641684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.0003626624888040334, + "loss": 3.3783, + "step": 8649 + }, + { + "epoch": 0.3642105263157895, + "grad_norm": 0.40234375, + "learning_rate": 0.0003626323642322035, + "loss": 3.3822, + "step": 8650 + }, + { + "epoch": 0.3642526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.0003626022376083442, + "loss": 2.758, + "step": 8651 + }, + { + "epoch": 0.3642947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.0003625721089330044, + "loss": 3.4959, + "step": 8652 + }, + { + "epoch": 0.3643368421052632, + "grad_norm": 0.39453125, + "learning_rate": 0.00036254197820673314, + "loss": 3.3613, + "step": 8653 + }, + { + "epoch": 0.3643789473684211, + "grad_norm": 0.453125, + "learning_rate": 0.0003625118454300791, + "loss": 3.1931, + "step": 8654 + }, + { + "epoch": 0.3644210526315789, + "grad_norm": 0.41015625, + "learning_rate": 0.0003624817106035916, + "loss": 3.2664, + "step": 8655 + }, + { + "epoch": 0.3644631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.00036245157372781945, + "loss": 3.4686, + "step": 8656 + }, + { + "epoch": 0.3645052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.0003624214348033118, + "loss": 2.9575, + "step": 8657 + }, + { + "epoch": 0.3645473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00036239129383061765, + "loss": 3.1362, + "step": 8658 + }, + { + "epoch": 0.3645894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.0003623611508102862, + "loss": 3.1704, + "step": 8659 + }, + { + "epoch": 0.3646315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.00036233100574286655, + "loss": 3.2556, + "step": 8660 + }, + { + "epoch": 0.3646736842105263, + "grad_norm": 0.412109375, + "learning_rate": 0.000362300858628908, + "loss": 3.0411, + "step": 8661 + }, + { + "epoch": 0.3647157894736842, + "grad_norm": 0.45703125, + "learning_rate": 0.0003622707094689597, + "loss": 3.3357, + "step": 8662 + }, + { + "epoch": 0.3647578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.000362240558263571, + "loss": 2.7502, + "step": 8663 + }, + { + "epoch": 0.3648, + "grad_norm": 0.462890625, + "learning_rate": 0.00036221040501329126, + "loss": 3.0725, + "step": 8664 + }, + { + "epoch": 0.3648421052631579, + "grad_norm": 0.48046875, + "learning_rate": 0.00036218024971866977, + "loss": 3.4346, + "step": 8665 + }, + { + "epoch": 0.3648842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.00036215009238025595, + "loss": 3.086, + "step": 8666 + }, + { + "epoch": 0.3649263157894737, + "grad_norm": 0.470703125, + "learning_rate": 0.00036211993299859916, + "loss": 2.894, + "step": 8667 + }, + { + "epoch": 0.36496842105263155, + "grad_norm": 0.453125, + "learning_rate": 0.00036208977157424895, + "loss": 3.3428, + "step": 8668 + }, + { + "epoch": 0.36501052631578945, + "grad_norm": 0.40625, + "learning_rate": 0.0003620596081077548, + "loss": 3.0775, + "step": 8669 + }, + { + "epoch": 0.36505263157894735, + "grad_norm": 0.408203125, + "learning_rate": 0.00036202944259966627, + "loss": 3.5118, + "step": 8670 + }, + { + "epoch": 0.36509473684210525, + "grad_norm": 0.396484375, + "learning_rate": 0.00036199927505053286, + "loss": 3.1989, + "step": 8671 + }, + { + "epoch": 0.36513684210526315, + "grad_norm": 0.423828125, + "learning_rate": 0.00036196910546090435, + "loss": 3.3286, + "step": 8672 + }, + { + "epoch": 0.36517894736842105, + "grad_norm": 0.45703125, + "learning_rate": 0.00036193893383133026, + "loss": 3.5424, + "step": 8673 + }, + { + "epoch": 0.36522105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.00036190876016236034, + "loss": 3.1608, + "step": 8674 + }, + { + "epoch": 0.36526315789473685, + "grad_norm": 0.431640625, + "learning_rate": 0.00036187858445454427, + "loss": 3.3762, + "step": 8675 + }, + { + "epoch": 0.36530526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.0003618484067084318, + "loss": 3.5689, + "step": 8676 + }, + { + "epoch": 0.36534736842105264, + "grad_norm": 0.416015625, + "learning_rate": 0.0003618182269245729, + "loss": 3.0055, + "step": 8677 + }, + { + "epoch": 0.36538947368421054, + "grad_norm": 0.4140625, + "learning_rate": 0.00036178804510351715, + "loss": 3.6636, + "step": 8678 + }, + { + "epoch": 0.36543157894736844, + "grad_norm": 0.408203125, + "learning_rate": 0.0003617578612458146, + "loss": 2.9881, + "step": 8679 + }, + { + "epoch": 0.36547368421052634, + "grad_norm": 0.4140625, + "learning_rate": 0.0003617276753520152, + "loss": 3.3805, + "step": 8680 + }, + { + "epoch": 0.3655157894736842, + "grad_norm": 0.404296875, + "learning_rate": 0.0003616974874226687, + "loss": 3.2967, + "step": 8681 + }, + { + "epoch": 0.3655578947368421, + "grad_norm": 0.47265625, + "learning_rate": 0.0003616672974583253, + "loss": 3.2124, + "step": 8682 + }, + { + "epoch": 0.3656, + "grad_norm": 0.4375, + "learning_rate": 0.000361637105459535, + "loss": 3.0296, + "step": 8683 + }, + { + "epoch": 0.3656421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.0003616069114268478, + "loss": 3.2064, + "step": 8684 + }, + { + "epoch": 0.3656842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.0003615767153608137, + "loss": 3.2912, + "step": 8685 + }, + { + "epoch": 0.3657263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.00036154651726198295, + "loss": 3.4403, + "step": 8686 + }, + { + "epoch": 0.3657684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.00036151631713090583, + "loss": 3.2468, + "step": 8687 + }, + { + "epoch": 0.3658105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.00036148611496813235, + "loss": 3.5002, + "step": 8688 + }, + { + "epoch": 0.3658526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.00036145591077421283, + "loss": 3.6422, + "step": 8689 + }, + { + "epoch": 0.3658947368421053, + "grad_norm": 0.400390625, + "learning_rate": 0.0003614257045496976, + "loss": 3.3135, + "step": 8690 + }, + { + "epoch": 0.36593684210526317, + "grad_norm": 0.416015625, + "learning_rate": 0.000361395496295137, + "loss": 3.0861, + "step": 8691 + }, + { + "epoch": 0.36597894736842107, + "grad_norm": 0.400390625, + "learning_rate": 0.0003613652860110813, + "loss": 2.8707, + "step": 8692 + }, + { + "epoch": 0.36602105263157897, + "grad_norm": 0.439453125, + "learning_rate": 0.000361335073698081, + "loss": 3.4729, + "step": 8693 + }, + { + "epoch": 0.36606315789473687, + "grad_norm": 0.4140625, + "learning_rate": 0.00036130485935668635, + "loss": 2.9446, + "step": 8694 + }, + { + "epoch": 0.3661052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.0003612746429874479, + "loss": 3.2595, + "step": 8695 + }, + { + "epoch": 0.3661473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.0003612444245909164, + "loss": 3.2128, + "step": 8696 + }, + { + "epoch": 0.3661894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.00036121420416764205, + "loss": 3.0934, + "step": 8697 + }, + { + "epoch": 0.3662315789473684, + "grad_norm": 0.482421875, + "learning_rate": 0.00036118398171817567, + "loss": 3.0088, + "step": 8698 + }, + { + "epoch": 0.3662736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.00036115375724306775, + "loss": 3.1936, + "step": 8699 + }, + { + "epoch": 0.3663157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.00036112353074286896, + "loss": 3.3108, + "step": 8700 + }, + { + "epoch": 0.3663578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.0003610933022181299, + "loss": 2.928, + "step": 8701 + }, + { + "epoch": 0.3664, + "grad_norm": 0.439453125, + "learning_rate": 0.00036106307166940154, + "loss": 3.4794, + "step": 8702 + }, + { + "epoch": 0.3664421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0003610328390972344, + "loss": 3.6009, + "step": 8703 + }, + { + "epoch": 0.3664842105263158, + "grad_norm": 0.470703125, + "learning_rate": 0.00036100260450217946, + "loss": 3.2725, + "step": 8704 + }, + { + "epoch": 0.3665263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.0003609723678847875, + "loss": 3.0208, + "step": 8705 + }, + { + "epoch": 0.3665684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.0003609421292456094, + "loss": 3.5445, + "step": 8706 + }, + { + "epoch": 0.3666105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.00036091188858519604, + "loss": 3.4127, + "step": 8707 + }, + { + "epoch": 0.36665263157894734, + "grad_norm": 0.453125, + "learning_rate": 0.00036088164590409834, + "loss": 3.227, + "step": 8708 + }, + { + "epoch": 0.36669473684210524, + "grad_norm": 0.40625, + "learning_rate": 0.00036085140120286735, + "loss": 3.3638, + "step": 8709 + }, + { + "epoch": 0.36673684210526314, + "grad_norm": 0.419921875, + "learning_rate": 0.0003608211544820541, + "loss": 3.5464, + "step": 8710 + }, + { + "epoch": 0.36677894736842104, + "grad_norm": 0.466796875, + "learning_rate": 0.0003607909057422096, + "loss": 3.2785, + "step": 8711 + }, + { + "epoch": 0.36682105263157894, + "grad_norm": 0.443359375, + "learning_rate": 0.00036076065498388495, + "loss": 2.791, + "step": 8712 + }, + { + "epoch": 0.36686315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.0003607304022076314, + "loss": 3.5848, + "step": 8713 + }, + { + "epoch": 0.36690526315789473, + "grad_norm": 0.404296875, + "learning_rate": 0.00036070014741399994, + "loss": 3.0988, + "step": 8714 + }, + { + "epoch": 0.36694736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.00036066989060354185, + "loss": 3.0988, + "step": 8715 + }, + { + "epoch": 0.36698947368421053, + "grad_norm": 0.4375, + "learning_rate": 0.00036063963177680844, + "loss": 3.0324, + "step": 8716 + }, + { + "epoch": 0.36703157894736843, + "grad_norm": 0.423828125, + "learning_rate": 0.00036060937093435085, + "loss": 3.4222, + "step": 8717 + }, + { + "epoch": 0.36707368421052633, + "grad_norm": 0.45703125, + "learning_rate": 0.00036057910807672056, + "loss": 3.2787, + "step": 8718 + }, + { + "epoch": 0.36711578947368423, + "grad_norm": 0.419921875, + "learning_rate": 0.00036054884320446884, + "loss": 3.0296, + "step": 8719 + }, + { + "epoch": 0.3671578947368421, + "grad_norm": 0.453125, + "learning_rate": 0.0003605185763181471, + "loss": 2.952, + "step": 8720 + }, + { + "epoch": 0.3672, + "grad_norm": 0.396484375, + "learning_rate": 0.00036048830741830675, + "loss": 3.1313, + "step": 8721 + }, + { + "epoch": 0.36724210526315787, + "grad_norm": 0.4296875, + "learning_rate": 0.0003604580365054991, + "loss": 3.4035, + "step": 8722 + }, + { + "epoch": 0.36728421052631577, + "grad_norm": 0.44140625, + "learning_rate": 0.000360427763580276, + "loss": 3.351, + "step": 8723 + }, + { + "epoch": 0.36732631578947367, + "grad_norm": 0.41015625, + "learning_rate": 0.0003603974886431888, + "loss": 3.2386, + "step": 8724 + }, + { + "epoch": 0.36736842105263157, + "grad_norm": 0.42578125, + "learning_rate": 0.000360367211694789, + "loss": 3.19, + "step": 8725 + }, + { + "epoch": 0.36741052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.00036033693273562826, + "loss": 3.3061, + "step": 8726 + }, + { + "epoch": 0.36745263157894736, + "grad_norm": 0.39453125, + "learning_rate": 0.0003603066517662583, + "loss": 2.9887, + "step": 8727 + }, + { + "epoch": 0.36749473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.0003602763687872307, + "loss": 3.2417, + "step": 8728 + }, + { + "epoch": 0.36753684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0003602460837990972, + "loss": 2.8331, + "step": 8729 + }, + { + "epoch": 0.36757894736842106, + "grad_norm": 0.427734375, + "learning_rate": 0.0003602157968024097, + "loss": 3.1964, + "step": 8730 + }, + { + "epoch": 0.36762105263157896, + "grad_norm": 0.431640625, + "learning_rate": 0.0003601855077977198, + "loss": 3.0692, + "step": 8731 + }, + { + "epoch": 0.36766315789473686, + "grad_norm": 0.408203125, + "learning_rate": 0.0003601552167855794, + "loss": 3.0501, + "step": 8732 + }, + { + "epoch": 0.36770526315789476, + "grad_norm": 0.4375, + "learning_rate": 0.0003601249237665405, + "loss": 3.4019, + "step": 8733 + }, + { + "epoch": 0.36774736842105266, + "grad_norm": 0.421875, + "learning_rate": 0.0003600946287411548, + "loss": 3.2833, + "step": 8734 + }, + { + "epoch": 0.3677894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.0003600643317099742, + "loss": 3.4603, + "step": 8735 + }, + { + "epoch": 0.3678315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.00036003403267355095, + "loss": 3.5506, + "step": 8736 + }, + { + "epoch": 0.3678736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.00036000373163243683, + "loss": 3.0973, + "step": 8737 + }, + { + "epoch": 0.3679157894736842, + "grad_norm": 0.46875, + "learning_rate": 0.000359973428587184, + "loss": 2.8814, + "step": 8738 + }, + { + "epoch": 0.3679578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.0003599431235383446, + "loss": 3.447, + "step": 8739 + }, + { + "epoch": 0.368, + "grad_norm": 0.5546875, + "learning_rate": 0.00035991281648647055, + "loss": 3.1102, + "step": 8740 + }, + { + "epoch": 0.3680421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00035988250743211413, + "loss": 3.0448, + "step": 8741 + }, + { + "epoch": 0.3680842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00035985219637582757, + "loss": 3.1204, + "step": 8742 + }, + { + "epoch": 0.3681263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.00035982188331816305, + "loss": 2.7306, + "step": 8743 + }, + { + "epoch": 0.3681684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00035979156825967286, + "loss": 3.6199, + "step": 8744 + }, + { + "epoch": 0.3682105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.0003597612512009093, + "loss": 3.3402, + "step": 8745 + }, + { + "epoch": 0.3682526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.0003597309321424247, + "loss": 3.5146, + "step": 8746 + }, + { + "epoch": 0.3682947368421053, + "grad_norm": 0.416015625, + "learning_rate": 0.0003597006110847714, + "loss": 3.0812, + "step": 8747 + }, + { + "epoch": 0.3683368421052632, + "grad_norm": 0.439453125, + "learning_rate": 0.00035967028802850196, + "loss": 3.102, + "step": 8748 + }, + { + "epoch": 0.36837894736842103, + "grad_norm": 0.408203125, + "learning_rate": 0.00035963996297416864, + "loss": 3.3346, + "step": 8749 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 0.44140625, + "learning_rate": 0.000359609635922324, + "loss": 2.6832, + "step": 8750 + }, + { + "epoch": 0.3684631578947368, + "grad_norm": 0.4453125, + "learning_rate": 0.0003595793068735207, + "loss": 2.7353, + "step": 8751 + }, + { + "epoch": 0.3685052631578947, + "grad_norm": 0.416015625, + "learning_rate": 0.0003595489758283111, + "loss": 3.399, + "step": 8752 + }, + { + "epoch": 0.3685473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.0003595186427872479, + "loss": 3.1513, + "step": 8753 + }, + { + "epoch": 0.3685894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.0003594883077508836, + "loss": 3.0264, + "step": 8754 + }, + { + "epoch": 0.3686315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.0003594579707197711, + "loss": 3.2465, + "step": 8755 + }, + { + "epoch": 0.3686736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.00035942763169446296, + "loss": 3.7956, + "step": 8756 + }, + { + "epoch": 0.3687157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0003593972906755119, + "loss": 2.888, + "step": 8757 + }, + { + "epoch": 0.3687578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0003593669476634708, + "loss": 3.3127, + "step": 8758 + }, + { + "epoch": 0.3688, + "grad_norm": 0.451171875, + "learning_rate": 0.0003593366026588924, + "loss": 3.0411, + "step": 8759 + }, + { + "epoch": 0.3688421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.00035930625566232957, + "loss": 3.2916, + "step": 8760 + }, + { + "epoch": 0.3688842105263158, + "grad_norm": 0.458984375, + "learning_rate": 0.0003592759066743351, + "loss": 2.8771, + "step": 8761 + }, + { + "epoch": 0.36892631578947366, + "grad_norm": 0.421875, + "learning_rate": 0.0003592455556954621, + "loss": 3.7729, + "step": 8762 + }, + { + "epoch": 0.36896842105263156, + "grad_norm": 0.40234375, + "learning_rate": 0.0003592152027262634, + "loss": 3.3187, + "step": 8763 + }, + { + "epoch": 0.36901052631578946, + "grad_norm": 0.435546875, + "learning_rate": 0.00035918484776729204, + "loss": 3.5149, + "step": 8764 + }, + { + "epoch": 0.36905263157894735, + "grad_norm": 0.408203125, + "learning_rate": 0.00035915449081910106, + "loss": 3.5412, + "step": 8765 + }, + { + "epoch": 0.36909473684210525, + "grad_norm": 0.416015625, + "learning_rate": 0.00035912413188224346, + "loss": 3.0853, + "step": 8766 + }, + { + "epoch": 0.36913684210526315, + "grad_norm": 0.435546875, + "learning_rate": 0.00035909377095727247, + "loss": 3.4828, + "step": 8767 + }, + { + "epoch": 0.36917894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00035906340804474113, + "loss": 3.1695, + "step": 8768 + }, + { + "epoch": 0.36922105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00035903304314520256, + "loss": 2.5928, + "step": 8769 + }, + { + "epoch": 0.36926315789473685, + "grad_norm": 0.431640625, + "learning_rate": 0.00035900267625921006, + "loss": 3.471, + "step": 8770 + }, + { + "epoch": 0.36930526315789475, + "grad_norm": 0.455078125, + "learning_rate": 0.000358972307387317, + "loss": 2.885, + "step": 8771 + }, + { + "epoch": 0.36934736842105265, + "grad_norm": 0.5859375, + "learning_rate": 0.0003589419365300764, + "loss": 3.4296, + "step": 8772 + }, + { + "epoch": 0.36938947368421055, + "grad_norm": 0.80078125, + "learning_rate": 0.0003589115636880418, + "loss": 3.275, + "step": 8773 + }, + { + "epoch": 0.36943157894736844, + "grad_norm": 0.42578125, + "learning_rate": 0.00035888118886176647, + "loss": 3.2415, + "step": 8774 + }, + { + "epoch": 0.36947368421052634, + "grad_norm": 0.46875, + "learning_rate": 0.00035885081205180375, + "loss": 2.7681, + "step": 8775 + }, + { + "epoch": 0.3695157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.00035882043325870716, + "loss": 3.1312, + "step": 8776 + }, + { + "epoch": 0.3695578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00035879005248303013, + "loss": 3.0068, + "step": 8777 + }, + { + "epoch": 0.3696, + "grad_norm": 0.49609375, + "learning_rate": 0.00035875966972532614, + "loss": 2.8719, + "step": 8778 + }, + { + "epoch": 0.3696421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00035872928498614884, + "loss": 3.0963, + "step": 8779 + }, + { + "epoch": 0.3696842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00035869889826605167, + "loss": 2.9659, + "step": 8780 + }, + { + "epoch": 0.3697263157894737, + "grad_norm": 0.451171875, + "learning_rate": 0.00035866850956558827, + "loss": 3.1153, + "step": 8781 + }, + { + "epoch": 0.3697684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.0003586381188853123, + "loss": 2.7501, + "step": 8782 + }, + { + "epoch": 0.3698105263157895, + "grad_norm": 0.451171875, + "learning_rate": 0.0003586077262257775, + "loss": 2.8393, + "step": 8783 + }, + { + "epoch": 0.3698526315789474, + "grad_norm": 0.55859375, + "learning_rate": 0.00035857733158753754, + "loss": 2.7448, + "step": 8784 + }, + { + "epoch": 0.3698947368421053, + "grad_norm": 0.478515625, + "learning_rate": 0.0003585469349711461, + "loss": 3.0044, + "step": 8785 + }, + { + "epoch": 0.3699368421052632, + "grad_norm": 0.42578125, + "learning_rate": 0.0003585165363771571, + "loss": 3.1144, + "step": 8786 + }, + { + "epoch": 0.3699789473684211, + "grad_norm": 0.3828125, + "learning_rate": 0.00035848613580612425, + "loss": 2.8756, + "step": 8787 + }, + { + "epoch": 0.370021052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.0003584557332586016, + "loss": 3.1545, + "step": 8788 + }, + { + "epoch": 0.3700631578947368, + "grad_norm": 0.462890625, + "learning_rate": 0.0003584253287351428, + "loss": 3.4403, + "step": 8789 + }, + { + "epoch": 0.3701052631578947, + "grad_norm": 0.44140625, + "learning_rate": 0.0003583949222363019, + "loss": 3.062, + "step": 8790 + }, + { + "epoch": 0.3701473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.000358364513762633, + "loss": 3.5378, + "step": 8791 + }, + { + "epoch": 0.3701894736842105, + "grad_norm": 0.478515625, + "learning_rate": 0.00035833410331468984, + "loss": 2.6829, + "step": 8792 + }, + { + "epoch": 0.3702315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.00035830369089302674, + "loss": 3.058, + "step": 8793 + }, + { + "epoch": 0.3702736842105263, + "grad_norm": 0.396484375, + "learning_rate": 0.00035827327649819754, + "loss": 2.8752, + "step": 8794 + }, + { + "epoch": 0.3703157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.0003582428601307565, + "loss": 3.3184, + "step": 8795 + }, + { + "epoch": 0.3703578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.0003582124417912576, + "loss": 3.3844, + "step": 8796 + }, + { + "epoch": 0.3704, + "grad_norm": 0.404296875, + "learning_rate": 0.0003581820214802553, + "loss": 2.8874, + "step": 8797 + }, + { + "epoch": 0.3704421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.0003581515991983036, + "loss": 2.8709, + "step": 8798 + }, + { + "epoch": 0.3704842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.00035812117494595695, + "loss": 2.9632, + "step": 8799 + }, + { + "epoch": 0.3705263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.00035809074872376943, + "loss": 3.374, + "step": 8800 + }, + { + "epoch": 0.3705684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.00035806032053229553, + "loss": 3.2579, + "step": 8801 + }, + { + "epoch": 0.3706105263157895, + "grad_norm": 0.400390625, + "learning_rate": 0.0003580298903720895, + "loss": 3.5015, + "step": 8802 + }, + { + "epoch": 0.37065263157894734, + "grad_norm": 0.44921875, + "learning_rate": 0.00035799945824370585, + "loss": 3.026, + "step": 8803 + }, + { + "epoch": 0.37069473684210524, + "grad_norm": 0.408203125, + "learning_rate": 0.0003579690241476989, + "loss": 3.1322, + "step": 8804 + }, + { + "epoch": 0.37073684210526314, + "grad_norm": 0.416015625, + "learning_rate": 0.0003579385880846232, + "loss": 3.4497, + "step": 8805 + }, + { + "epoch": 0.37077894736842104, + "grad_norm": 0.41796875, + "learning_rate": 0.0003579081500550333, + "loss": 2.8693, + "step": 8806 + }, + { + "epoch": 0.37082105263157894, + "grad_norm": 0.4765625, + "learning_rate": 0.0003578777100594837, + "loss": 3.2827, + "step": 8807 + }, + { + "epoch": 0.37086315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.00035784726809852895, + "loss": 2.7358, + "step": 8808 + }, + { + "epoch": 0.37090526315789474, + "grad_norm": 0.40234375, + "learning_rate": 0.00035781682417272367, + "loss": 3.3884, + "step": 8809 + }, + { + "epoch": 0.37094736842105264, + "grad_norm": 0.408203125, + "learning_rate": 0.00035778637828262253, + "loss": 3.4693, + "step": 8810 + }, + { + "epoch": 0.37098947368421054, + "grad_norm": 0.42578125, + "learning_rate": 0.00035775593042878026, + "loss": 3.2111, + "step": 8811 + }, + { + "epoch": 0.37103157894736843, + "grad_norm": 0.4296875, + "learning_rate": 0.0003577254806117515, + "loss": 3.6081, + "step": 8812 + }, + { + "epoch": 0.37107368421052633, + "grad_norm": 0.404296875, + "learning_rate": 0.00035769502883209106, + "loss": 3.1627, + "step": 8813 + }, + { + "epoch": 0.37111578947368423, + "grad_norm": 0.421875, + "learning_rate": 0.00035766457509035375, + "loss": 3.1394, + "step": 8814 + }, + { + "epoch": 0.37115789473684213, + "grad_norm": 0.515625, + "learning_rate": 0.00035763411938709446, + "loss": 3.4173, + "step": 8815 + }, + { + "epoch": 0.3712, + "grad_norm": 0.431640625, + "learning_rate": 0.00035760366172286786, + "loss": 3.6149, + "step": 8816 + }, + { + "epoch": 0.3712421052631579, + "grad_norm": 0.3984375, + "learning_rate": 0.00035757320209822897, + "loss": 3.2791, + "step": 8817 + }, + { + "epoch": 0.37128421052631577, + "grad_norm": 0.54296875, + "learning_rate": 0.0003575427405137328, + "loss": 3.0291, + "step": 8818 + }, + { + "epoch": 0.37132631578947367, + "grad_norm": 0.490234375, + "learning_rate": 0.0003575122769699343, + "loss": 2.8175, + "step": 8819 + }, + { + "epoch": 0.37136842105263157, + "grad_norm": 0.404296875, + "learning_rate": 0.0003574818114673883, + "loss": 3.1578, + "step": 8820 + }, + { + "epoch": 0.37141052631578947, + "grad_norm": 0.46875, + "learning_rate": 0.00035745134400665005, + "loss": 3.0817, + "step": 8821 + }, + { + "epoch": 0.37145263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.0003574208745882745, + "loss": 3.3079, + "step": 8822 + }, + { + "epoch": 0.37149473684210527, + "grad_norm": 0.4296875, + "learning_rate": 0.0003573904032128169, + "loss": 3.479, + "step": 8823 + }, + { + "epoch": 0.37153684210526317, + "grad_norm": 0.408203125, + "learning_rate": 0.00035735992988083235, + "loss": 3.4404, + "step": 8824 + }, + { + "epoch": 0.37157894736842106, + "grad_norm": 0.439453125, + "learning_rate": 0.00035732945459287596, + "loss": 2.8156, + "step": 8825 + }, + { + "epoch": 0.37162105263157896, + "grad_norm": 0.427734375, + "learning_rate": 0.00035729897734950307, + "loss": 2.9612, + "step": 8826 + }, + { + "epoch": 0.37166315789473686, + "grad_norm": 0.416015625, + "learning_rate": 0.0003572684981512689, + "loss": 3.1853, + "step": 8827 + }, + { + "epoch": 0.37170526315789476, + "grad_norm": 0.431640625, + "learning_rate": 0.0003572380169987286, + "loss": 3.2269, + "step": 8828 + }, + { + "epoch": 0.3717473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.00035720753389243776, + "loss": 2.9896, + "step": 8829 + }, + { + "epoch": 0.3717894736842105, + "grad_norm": 0.384765625, + "learning_rate": 0.00035717704883295154, + "loss": 2.8069, + "step": 8830 + }, + { + "epoch": 0.3718315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.00035714656182082546, + "loss": 3.3079, + "step": 8831 + }, + { + "epoch": 0.3718736842105263, + "grad_norm": 0.400390625, + "learning_rate": 0.0003571160728566149, + "loss": 3.0712, + "step": 8832 + }, + { + "epoch": 0.3719157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00035708558194087536, + "loss": 2.9352, + "step": 8833 + }, + { + "epoch": 0.3719578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0003570550890741624, + "loss": 3.1976, + "step": 8834 + }, + { + "epoch": 0.372, + "grad_norm": 0.427734375, + "learning_rate": 0.00035702459425703144, + "loss": 3.6019, + "step": 8835 + }, + { + "epoch": 0.3720421052631579, + "grad_norm": 0.396484375, + "learning_rate": 0.0003569940974900381, + "loss": 2.9547, + "step": 8836 + }, + { + "epoch": 0.3720842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.000356963598773738, + "loss": 3.3466, + "step": 8837 + }, + { + "epoch": 0.3721263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.0003569330981086868, + "loss": 3.5103, + "step": 8838 + }, + { + "epoch": 0.3721684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.00035690259549544026, + "loss": 3.7032, + "step": 8839 + }, + { + "epoch": 0.3722105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00035687209093455404, + "loss": 3.299, + "step": 8840 + }, + { + "epoch": 0.3722526315789474, + "grad_norm": 0.73828125, + "learning_rate": 0.0003568415844265839, + "loss": 3.2712, + "step": 8841 + }, + { + "epoch": 0.3722947368421053, + "grad_norm": 0.41796875, + "learning_rate": 0.0003568110759720855, + "loss": 3.3655, + "step": 8842 + }, + { + "epoch": 0.37233684210526313, + "grad_norm": 0.427734375, + "learning_rate": 0.0003567805655716149, + "loss": 3.0251, + "step": 8843 + }, + { + "epoch": 0.37237894736842103, + "grad_norm": 0.46484375, + "learning_rate": 0.00035675005322572783, + "loss": 3.1441, + "step": 8844 + }, + { + "epoch": 0.37242105263157893, + "grad_norm": 0.44140625, + "learning_rate": 0.0003567195389349802, + "loss": 3.2199, + "step": 8845 + }, + { + "epoch": 0.37246315789473683, + "grad_norm": 0.5, + "learning_rate": 0.00035668902269992797, + "loss": 3.4215, + "step": 8846 + }, + { + "epoch": 0.37250526315789473, + "grad_norm": 0.431640625, + "learning_rate": 0.0003566585045211271, + "loss": 2.9532, + "step": 8847 + }, + { + "epoch": 0.3725473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.0003566279843991337, + "loss": 3.8134, + "step": 8848 + }, + { + "epoch": 0.3725894736842105, + "grad_norm": 0.4140625, + "learning_rate": 0.00035659746233450364, + "loss": 3.0871, + "step": 8849 + }, + { + "epoch": 0.3726315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.00035656693832779296, + "loss": 3.2368, + "step": 8850 + }, + { + "epoch": 0.3726736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.000356536412379558, + "loss": 3.1628, + "step": 8851 + }, + { + "epoch": 0.3727157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0003565058844903547, + "loss": 2.5716, + "step": 8852 + }, + { + "epoch": 0.3727578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.00035647535466073945, + "loss": 3.3633, + "step": 8853 + }, + { + "epoch": 0.3728, + "grad_norm": 0.466796875, + "learning_rate": 0.0003564448228912682, + "loss": 3.1276, + "step": 8854 + }, + { + "epoch": 0.3728421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.0003564142891824974, + "loss": 3.1487, + "step": 8855 + }, + { + "epoch": 0.37288421052631576, + "grad_norm": 0.44921875, + "learning_rate": 0.0003563837535349832, + "loss": 3.1549, + "step": 8856 + }, + { + "epoch": 0.37292631578947366, + "grad_norm": 0.423828125, + "learning_rate": 0.0003563532159492821, + "loss": 3.3066, + "step": 8857 + }, + { + "epoch": 0.37296842105263156, + "grad_norm": 0.458984375, + "learning_rate": 0.0003563226764259504, + "loss": 3.5371, + "step": 8858 + }, + { + "epoch": 0.37301052631578946, + "grad_norm": 0.41796875, + "learning_rate": 0.0003562921349655444, + "loss": 3.4258, + "step": 8859 + }, + { + "epoch": 0.37305263157894736, + "grad_norm": 0.44140625, + "learning_rate": 0.00035626159156862066, + "loss": 3.2932, + "step": 8860 + }, + { + "epoch": 0.37309473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.0003562310462357355, + "loss": 2.9712, + "step": 8861 + }, + { + "epoch": 0.37313684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.0003562004989674454, + "loss": 2.8239, + "step": 8862 + }, + { + "epoch": 0.37317894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.0003561699497643071, + "loss": 3.736, + "step": 8863 + }, + { + "epoch": 0.37322105263157895, + "grad_norm": 0.49609375, + "learning_rate": 0.0003561393986268771, + "loss": 2.8324, + "step": 8864 + }, + { + "epoch": 0.37326315789473685, + "grad_norm": 0.46875, + "learning_rate": 0.00035610884555571193, + "loss": 3.107, + "step": 8865 + }, + { + "epoch": 0.37330526315789475, + "grad_norm": 0.404296875, + "learning_rate": 0.00035607829055136824, + "loss": 3.4908, + "step": 8866 + }, + { + "epoch": 0.37334736842105265, + "grad_norm": 0.39453125, + "learning_rate": 0.0003560477336144028, + "loss": 3.3465, + "step": 8867 + }, + { + "epoch": 0.37338947368421055, + "grad_norm": 0.43359375, + "learning_rate": 0.00035601717474537223, + "loss": 3.2578, + "step": 8868 + }, + { + "epoch": 0.37343157894736845, + "grad_norm": 0.396484375, + "learning_rate": 0.00035598661394483326, + "loss": 2.8672, + "step": 8869 + }, + { + "epoch": 0.3734736842105263, + "grad_norm": 0.44140625, + "learning_rate": 0.00035595605121334275, + "loss": 3.0722, + "step": 8870 + }, + { + "epoch": 0.3735157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.0003559254865514574, + "loss": 3.2482, + "step": 8871 + }, + { + "epoch": 0.3735578947368421, + "grad_norm": 0.453125, + "learning_rate": 0.0003558949199597342, + "loss": 3.1095, + "step": 8872 + }, + { + "epoch": 0.3736, + "grad_norm": 0.40625, + "learning_rate": 0.00035586435143873, + "loss": 3.4075, + "step": 8873 + }, + { + "epoch": 0.3736421052631579, + "grad_norm": 0.3828125, + "learning_rate": 0.0003558337809890017, + "loss": 2.998, + "step": 8874 + }, + { + "epoch": 0.3736842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.0003558032086111063, + "loss": 2.9599, + "step": 8875 + }, + { + "epoch": 0.3737263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00035577263430560063, + "loss": 3.0893, + "step": 8876 + }, + { + "epoch": 0.3737684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00035574205807304196, + "loss": 3.311, + "step": 8877 + }, + { + "epoch": 0.3738105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.00035571147991398717, + "loss": 3.6897, + "step": 8878 + }, + { + "epoch": 0.3738526315789474, + "grad_norm": 0.4140625, + "learning_rate": 0.00035568089982899336, + "loss": 3.2627, + "step": 8879 + }, + { + "epoch": 0.3738947368421053, + "grad_norm": 0.40234375, + "learning_rate": 0.00035565031781861783, + "loss": 3.6754, + "step": 8880 + }, + { + "epoch": 0.3739368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.00035561973388341763, + "loss": 2.9702, + "step": 8881 + }, + { + "epoch": 0.3739789473684211, + "grad_norm": 0.380859375, + "learning_rate": 0.00035558914802395, + "loss": 2.9022, + "step": 8882 + }, + { + "epoch": 0.3740210526315789, + "grad_norm": 0.412109375, + "learning_rate": 0.00035555856024077203, + "loss": 3.0538, + "step": 8883 + }, + { + "epoch": 0.3740631578947368, + "grad_norm": 0.44140625, + "learning_rate": 0.00035552797053444124, + "loss": 2.8616, + "step": 8884 + }, + { + "epoch": 0.3741052631578947, + "grad_norm": 0.40625, + "learning_rate": 0.0003554973789055148, + "loss": 3.7329, + "step": 8885 + }, + { + "epoch": 0.3741473684210526, + "grad_norm": 0.40625, + "learning_rate": 0.0003554667853545501, + "loss": 3.1809, + "step": 8886 + }, + { + "epoch": 0.3741894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.00035543618988210433, + "loss": 3.1128, + "step": 8887 + }, + { + "epoch": 0.3742315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.0003554055924887352, + "loss": 3.4302, + "step": 8888 + }, + { + "epoch": 0.3742736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0003553749931750001, + "loss": 3.4367, + "step": 8889 + }, + { + "epoch": 0.3743157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.00035534439194145624, + "loss": 3.357, + "step": 8890 + }, + { + "epoch": 0.3743578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00035531378878866145, + "loss": 3.2225, + "step": 8891 + }, + { + "epoch": 0.3744, + "grad_norm": 0.37890625, + "learning_rate": 0.0003552831837171731, + "loss": 2.9735, + "step": 8892 + }, + { + "epoch": 0.3744421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00035525257672754893, + "loss": 3.0731, + "step": 8893 + }, + { + "epoch": 0.3744842105263158, + "grad_norm": 0.4140625, + "learning_rate": 0.00035522196782034645, + "loss": 3.1161, + "step": 8894 + }, + { + "epoch": 0.3745263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.00035519135699612324, + "loss": 3.1231, + "step": 8895 + }, + { + "epoch": 0.3745684210526316, + "grad_norm": 0.384765625, + "learning_rate": 0.00035516074425543726, + "loss": 3.0028, + "step": 8896 + }, + { + "epoch": 0.37461052631578945, + "grad_norm": 0.41796875, + "learning_rate": 0.0003551301295988459, + "loss": 3.2266, + "step": 8897 + }, + { + "epoch": 0.37465263157894735, + "grad_norm": 0.408203125, + "learning_rate": 0.0003550995130269073, + "loss": 3.2177, + "step": 8898 + }, + { + "epoch": 0.37469473684210525, + "grad_norm": 0.392578125, + "learning_rate": 0.000355068894540179, + "loss": 2.9098, + "step": 8899 + }, + { + "epoch": 0.37473684210526315, + "grad_norm": 0.4140625, + "learning_rate": 0.0003550382741392189, + "loss": 2.9361, + "step": 8900 + }, + { + "epoch": 0.37477894736842104, + "grad_norm": 0.439453125, + "learning_rate": 0.00035500765182458477, + "loss": 3.4568, + "step": 8901 + }, + { + "epoch": 0.37482105263157894, + "grad_norm": 0.39453125, + "learning_rate": 0.0003549770275968347, + "loss": 3.3822, + "step": 8902 + }, + { + "epoch": 0.37486315789473684, + "grad_norm": 0.400390625, + "learning_rate": 0.00035494640145652647, + "loss": 3.5677, + "step": 8903 + }, + { + "epoch": 0.37490526315789474, + "grad_norm": 0.3828125, + "learning_rate": 0.0003549157734042181, + "loss": 3.1161, + "step": 8904 + }, + { + "epoch": 0.37494736842105264, + "grad_norm": 0.40234375, + "learning_rate": 0.0003548851434404676, + "loss": 2.6879, + "step": 8905 + }, + { + "epoch": 0.37498947368421054, + "grad_norm": 0.42578125, + "learning_rate": 0.0003548545115658331, + "loss": 2.9278, + "step": 8906 + }, + { + "epoch": 0.37503157894736844, + "grad_norm": 0.443359375, + "learning_rate": 0.00035482387778087255, + "loss": 3.1649, + "step": 8907 + }, + { + "epoch": 0.37507368421052634, + "grad_norm": 0.412109375, + "learning_rate": 0.00035479324208614416, + "loss": 3.5147, + "step": 8908 + }, + { + "epoch": 0.37511578947368424, + "grad_norm": 0.419921875, + "learning_rate": 0.000354762604482206, + "loss": 3.3122, + "step": 8909 + }, + { + "epoch": 0.3751578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.0003547319649696162, + "loss": 2.9256, + "step": 8910 + }, + { + "epoch": 0.3752, + "grad_norm": 0.4453125, + "learning_rate": 0.00035470132354893316, + "loss": 2.842, + "step": 8911 + }, + { + "epoch": 0.3752421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00035467068022071495, + "loss": 3.192, + "step": 8912 + }, + { + "epoch": 0.3752842105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.0003546400349855199, + "loss": 3.364, + "step": 8913 + }, + { + "epoch": 0.3753263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00035460938784390644, + "loss": 3.4077, + "step": 8914 + }, + { + "epoch": 0.3753684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.00035457873879643286, + "loss": 3.2066, + "step": 8915 + }, + { + "epoch": 0.3754105263157895, + "grad_norm": 0.40234375, + "learning_rate": 0.00035454808784365747, + "loss": 2.8645, + "step": 8916 + }, + { + "epoch": 0.37545263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.00035451743498613874, + "loss": 3.0542, + "step": 8917 + }, + { + "epoch": 0.37549473684210527, + "grad_norm": 0.4140625, + "learning_rate": 0.00035448678022443524, + "loss": 3.0786, + "step": 8918 + }, + { + "epoch": 0.37553684210526317, + "grad_norm": 0.384765625, + "learning_rate": 0.00035445612355910534, + "loss": 2.6355, + "step": 8919 + }, + { + "epoch": 0.37557894736842107, + "grad_norm": 0.439453125, + "learning_rate": 0.0003544254649907076, + "loss": 3.3552, + "step": 8920 + }, + { + "epoch": 0.37562105263157897, + "grad_norm": 0.396484375, + "learning_rate": 0.00035439480451980056, + "loss": 3.0728, + "step": 8921 + }, + { + "epoch": 0.37566315789473687, + "grad_norm": 0.404296875, + "learning_rate": 0.0003543641421469428, + "loss": 2.9536, + "step": 8922 + }, + { + "epoch": 0.37570526315789476, + "grad_norm": 0.431640625, + "learning_rate": 0.0003543334778726931, + "loss": 3.5309, + "step": 8923 + }, + { + "epoch": 0.3757473684210526, + "grad_norm": 0.484375, + "learning_rate": 0.0003543028116976099, + "loss": 3.4626, + "step": 8924 + }, + { + "epoch": 0.3757894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.0003542721436222521, + "loss": 3.2538, + "step": 8925 + }, + { + "epoch": 0.3758315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.0003542414736471783, + "loss": 3.2549, + "step": 8926 + }, + { + "epoch": 0.3758736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.0003542108017729474, + "loss": 3.0534, + "step": 8927 + }, + { + "epoch": 0.3759157894736842, + "grad_norm": 0.45703125, + "learning_rate": 0.0003541801280001181, + "loss": 3.0592, + "step": 8928 + }, + { + "epoch": 0.3759578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00035414945232924925, + "loss": 3.0419, + "step": 8929 + }, + { + "epoch": 0.376, + "grad_norm": 0.515625, + "learning_rate": 0.00035411877476089975, + "loss": 3.4069, + "step": 8930 + }, + { + "epoch": 0.3760421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00035408809529562855, + "loss": 3.0436, + "step": 8931 + }, + { + "epoch": 0.3760842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0003540574139339945, + "loss": 3.5367, + "step": 8932 + }, + { + "epoch": 0.3761263157894737, + "grad_norm": 0.470703125, + "learning_rate": 0.0003540267306765567, + "loss": 3.0923, + "step": 8933 + }, + { + "epoch": 0.3761684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.0003539960455238741, + "loss": 3.1992, + "step": 8934 + }, + { + "epoch": 0.3762105263157895, + "grad_norm": 0.390625, + "learning_rate": 0.00035396535847650567, + "loss": 3.3281, + "step": 8935 + }, + { + "epoch": 0.3762526315789474, + "grad_norm": 0.40625, + "learning_rate": 0.00035393466953501063, + "loss": 3.6561, + "step": 8936 + }, + { + "epoch": 0.37629473684210524, + "grad_norm": 0.427734375, + "learning_rate": 0.00035390397869994794, + "loss": 3.1228, + "step": 8937 + }, + { + "epoch": 0.37633684210526314, + "grad_norm": 0.45703125, + "learning_rate": 0.0003538732859718769, + "loss": 3.327, + "step": 8938 + }, + { + "epoch": 0.37637894736842104, + "grad_norm": 0.416015625, + "learning_rate": 0.00035384259135135665, + "loss": 3.2079, + "step": 8939 + }, + { + "epoch": 0.37642105263157893, + "grad_norm": 0.427734375, + "learning_rate": 0.00035381189483894637, + "loss": 3.4769, + "step": 8940 + }, + { + "epoch": 0.37646315789473683, + "grad_norm": 0.44140625, + "learning_rate": 0.00035378119643520535, + "loss": 3.3818, + "step": 8941 + }, + { + "epoch": 0.37650526315789473, + "grad_norm": 0.578125, + "learning_rate": 0.0003537504961406929, + "loss": 3.0938, + "step": 8942 + }, + { + "epoch": 0.37654736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00035371979395596825, + "loss": 3.1622, + "step": 8943 + }, + { + "epoch": 0.37658947368421053, + "grad_norm": 0.44140625, + "learning_rate": 0.0003536890898815908, + "loss": 3.1466, + "step": 8944 + }, + { + "epoch": 0.37663157894736843, + "grad_norm": 0.39453125, + "learning_rate": 0.00035365838391812, + "loss": 3.024, + "step": 8945 + }, + { + "epoch": 0.3766736842105263, + "grad_norm": 0.447265625, + "learning_rate": 0.00035362767606611533, + "loss": 3.4037, + "step": 8946 + }, + { + "epoch": 0.3767157894736842, + "grad_norm": 0.51953125, + "learning_rate": 0.00035359696632613605, + "loss": 2.6377, + "step": 8947 + }, + { + "epoch": 0.3767578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.0003535662546987418, + "loss": 3.0471, + "step": 8948 + }, + { + "epoch": 0.3768, + "grad_norm": 0.42578125, + "learning_rate": 0.00035353554118449206, + "loss": 3.1841, + "step": 8949 + }, + { + "epoch": 0.37684210526315787, + "grad_norm": 0.419921875, + "learning_rate": 0.00035350482578394636, + "loss": 3.2254, + "step": 8950 + }, + { + "epoch": 0.37688421052631577, + "grad_norm": 0.45703125, + "learning_rate": 0.0003534741084976644, + "loss": 3.1085, + "step": 8951 + }, + { + "epoch": 0.37692631578947366, + "grad_norm": 0.40234375, + "learning_rate": 0.00035344338932620577, + "loss": 3.1488, + "step": 8952 + }, + { + "epoch": 0.37696842105263156, + "grad_norm": 0.3984375, + "learning_rate": 0.0003534126682701302, + "loss": 2.7812, + "step": 8953 + }, + { + "epoch": 0.37701052631578946, + "grad_norm": 0.423828125, + "learning_rate": 0.0003533819453299972, + "loss": 3.2079, + "step": 8954 + }, + { + "epoch": 0.37705263157894736, + "grad_norm": 0.435546875, + "learning_rate": 0.00035335122050636665, + "loss": 3.3812, + "step": 8955 + }, + { + "epoch": 0.37709473684210526, + "grad_norm": 0.392578125, + "learning_rate": 0.0003533204937997983, + "loss": 3.0381, + "step": 8956 + }, + { + "epoch": 0.37713684210526316, + "grad_norm": 0.5078125, + "learning_rate": 0.00035328976521085194, + "loss": 3.0779, + "step": 8957 + }, + { + "epoch": 0.37717894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.0003532590347400874, + "loss": 3.3667, + "step": 8958 + }, + { + "epoch": 0.37722105263157896, + "grad_norm": 0.40234375, + "learning_rate": 0.00035322830238806465, + "loss": 3.2379, + "step": 8959 + }, + { + "epoch": 0.37726315789473686, + "grad_norm": 0.396484375, + "learning_rate": 0.0003531975681553435, + "loss": 3.5144, + "step": 8960 + }, + { + "epoch": 0.37730526315789475, + "grad_norm": 0.455078125, + "learning_rate": 0.0003531668320424839, + "loss": 3.6362, + "step": 8961 + }, + { + "epoch": 0.37734736842105265, + "grad_norm": 0.435546875, + "learning_rate": 0.00035313609405004584, + "loss": 2.9789, + "step": 8962 + }, + { + "epoch": 0.37738947368421055, + "grad_norm": 0.4140625, + "learning_rate": 0.0003531053541785892, + "loss": 3.0765, + "step": 8963 + }, + { + "epoch": 0.3774315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.0003530746124286743, + "loss": 3.0762, + "step": 8964 + }, + { + "epoch": 0.3774736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.00035304386880086105, + "loss": 3.2973, + "step": 8965 + }, + { + "epoch": 0.3775157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.0003530131232957096, + "loss": 2.9986, + "step": 8966 + }, + { + "epoch": 0.3775578947368421, + "grad_norm": 0.5078125, + "learning_rate": 0.00035298237591378003, + "loss": 2.6886, + "step": 8967 + }, + { + "epoch": 0.3776, + "grad_norm": 0.498046875, + "learning_rate": 0.00035295162665563255, + "loss": 3.7233, + "step": 8968 + }, + { + "epoch": 0.3776421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.00035292087552182746, + "loss": 2.8755, + "step": 8969 + }, + { + "epoch": 0.3776842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.0003528901225129249, + "loss": 3.265, + "step": 8970 + }, + { + "epoch": 0.3777263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.0003528593676294852, + "loss": 3.3299, + "step": 8971 + }, + { + "epoch": 0.3777684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.00035282861087206866, + "loss": 3.2227, + "step": 8972 + }, + { + "epoch": 0.3778105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.00035279785224123566, + "loss": 3.2662, + "step": 8973 + }, + { + "epoch": 0.3778526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00035276709173754663, + "loss": 2.9549, + "step": 8974 + }, + { + "epoch": 0.3778947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.0003527363293615619, + "loss": 3.0989, + "step": 8975 + }, + { + "epoch": 0.3779368421052632, + "grad_norm": 0.384765625, + "learning_rate": 0.00035270556511384193, + "loss": 3.15, + "step": 8976 + }, + { + "epoch": 0.377978947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.0003526747989949473, + "loss": 3.1652, + "step": 8977 + }, + { + "epoch": 0.3780210526315789, + "grad_norm": 0.439453125, + "learning_rate": 0.0003526440310054384, + "loss": 2.8999, + "step": 8978 + }, + { + "epoch": 0.3780631578947368, + "grad_norm": 0.435546875, + "learning_rate": 0.00035261326114587586, + "loss": 3.601, + "step": 8979 + }, + { + "epoch": 0.3781052631578947, + "grad_norm": 0.44140625, + "learning_rate": 0.00035258248941682027, + "loss": 2.7751, + "step": 8980 + }, + { + "epoch": 0.3781473684210526, + "grad_norm": 0.404296875, + "learning_rate": 0.00035255171581883226, + "loss": 3.3878, + "step": 8981 + }, + { + "epoch": 0.3781894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.0003525209403524726, + "loss": 3.5145, + "step": 8982 + }, + { + "epoch": 0.3782315789473684, + "grad_norm": 0.453125, + "learning_rate": 0.0003524901630183017, + "loss": 3.0004, + "step": 8983 + }, + { + "epoch": 0.3782736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.00035245938381688047, + "loss": 2.7245, + "step": 8984 + }, + { + "epoch": 0.3783157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.00035242860274876974, + "loss": 3.0773, + "step": 8985 + }, + { + "epoch": 0.3783578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.0003523978198145302, + "loss": 3.2519, + "step": 8986 + }, + { + "epoch": 0.3784, + "grad_norm": 0.44140625, + "learning_rate": 0.00035236703501472264, + "loss": 3.3628, + "step": 8987 + }, + { + "epoch": 0.3784421052631579, + "grad_norm": 0.470703125, + "learning_rate": 0.00035233624834990794, + "loss": 3.2578, + "step": 8988 + }, + { + "epoch": 0.3784842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.00035230545982064713, + "loss": 3.214, + "step": 8989 + }, + { + "epoch": 0.3785263157894737, + "grad_norm": 0.392578125, + "learning_rate": 0.0003522746694275011, + "loss": 3.3579, + "step": 8990 + }, + { + "epoch": 0.37856842105263155, + "grad_norm": 0.40625, + "learning_rate": 0.0003522438771710306, + "loss": 2.9936, + "step": 8991 + }, + { + "epoch": 0.37861052631578945, + "grad_norm": 0.4140625, + "learning_rate": 0.00035221308305179687, + "loss": 3.5256, + "step": 8992 + }, + { + "epoch": 0.37865263157894735, + "grad_norm": 0.400390625, + "learning_rate": 0.00035218228707036085, + "loss": 2.9523, + "step": 8993 + }, + { + "epoch": 0.37869473684210525, + "grad_norm": 0.4375, + "learning_rate": 0.00035215148922728367, + "loss": 3.5945, + "step": 8994 + }, + { + "epoch": 0.37873684210526315, + "grad_norm": 0.40234375, + "learning_rate": 0.00035212068952312635, + "loss": 2.8541, + "step": 8995 + }, + { + "epoch": 0.37877894736842105, + "grad_norm": 0.453125, + "learning_rate": 0.00035208988795844997, + "loss": 2.8665, + "step": 8996 + }, + { + "epoch": 0.37882105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.0003520590845338159, + "loss": 3.259, + "step": 8997 + }, + { + "epoch": 0.37886315789473685, + "grad_norm": 0.416015625, + "learning_rate": 0.00035202827924978513, + "loss": 3.2115, + "step": 8998 + }, + { + "epoch": 0.37890526315789474, + "grad_norm": 0.396484375, + "learning_rate": 0.0003519974721069191, + "loss": 3.0475, + "step": 8999 + }, + { + "epoch": 0.37894736842105264, + "grad_norm": 0.439453125, + "learning_rate": 0.0003519666631057789, + "loss": 2.898, + "step": 9000 + }, + { + "epoch": 0.37894736842105264, + "eval_loss": 3.1893510818481445, + "eval_runtime": 335.4664, + "eval_samples_per_second": 44.714, + "eval_steps_per_second": 5.589, + "step": 9000 + }, + { + "epoch": 0.37898947368421054, + "grad_norm": 0.41015625, + "learning_rate": 0.0003519358522469259, + "loss": 3.2336, + "step": 9001 + }, + { + "epoch": 0.37903157894736844, + "grad_norm": 0.419921875, + "learning_rate": 0.00035190503953092146, + "loss": 3.3474, + "step": 9002 + }, + { + "epoch": 0.37907368421052634, + "grad_norm": 0.44921875, + "learning_rate": 0.000351874224958327, + "loss": 3.1011, + "step": 9003 + }, + { + "epoch": 0.3791157894736842, + "grad_norm": 0.55078125, + "learning_rate": 0.0003518434085297037, + "loss": 2.8481, + "step": 9004 + }, + { + "epoch": 0.3791578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.0003518125902456133, + "loss": 3.0454, + "step": 9005 + }, + { + "epoch": 0.3792, + "grad_norm": 0.404296875, + "learning_rate": 0.000351781770106617, + "loss": 3.114, + "step": 9006 + }, + { + "epoch": 0.3792421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00035175094811327654, + "loss": 3.1363, + "step": 9007 + }, + { + "epoch": 0.3792842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.00035172012426615333, + "loss": 3.0914, + "step": 9008 + }, + { + "epoch": 0.3793263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.000351689298565809, + "loss": 3.4058, + "step": 9009 + }, + { + "epoch": 0.3793684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.0003516584710128051, + "loss": 3.2226, + "step": 9010 + }, + { + "epoch": 0.3794105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.0003516276416077033, + "loss": 3.2155, + "step": 9011 + }, + { + "epoch": 0.3794526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.0003515968103510653, + "loss": 3.1663, + "step": 9012 + }, + { + "epoch": 0.3794947368421053, + "grad_norm": 0.443359375, + "learning_rate": 0.00035156597724345274, + "loss": 2.9861, + "step": 9013 + }, + { + "epoch": 0.3795368421052632, + "grad_norm": 0.40234375, + "learning_rate": 0.00035153514228542735, + "loss": 3.4498, + "step": 9014 + }, + { + "epoch": 0.37957894736842107, + "grad_norm": 0.41796875, + "learning_rate": 0.0003515043054775511, + "loss": 3.0106, + "step": 9015 + }, + { + "epoch": 0.37962105263157897, + "grad_norm": 0.4140625, + "learning_rate": 0.00035147346682038567, + "loss": 3.119, + "step": 9016 + }, + { + "epoch": 0.37966315789473687, + "grad_norm": 0.431640625, + "learning_rate": 0.0003514426263144929, + "loss": 3.455, + "step": 9017 + }, + { + "epoch": 0.3797052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.00035141178396043453, + "loss": 2.9395, + "step": 9018 + }, + { + "epoch": 0.3797473684210526, + "grad_norm": 0.40234375, + "learning_rate": 0.00035138093975877274, + "loss": 3.0246, + "step": 9019 + }, + { + "epoch": 0.3797894736842105, + "grad_norm": 0.39453125, + "learning_rate": 0.00035135009371006933, + "loss": 3.1106, + "step": 9020 + }, + { + "epoch": 0.3798315789473684, + "grad_norm": 0.400390625, + "learning_rate": 0.0003513192458148863, + "loss": 3.452, + "step": 9021 + }, + { + "epoch": 0.3798736842105263, + "grad_norm": 0.390625, + "learning_rate": 0.00035128839607378553, + "loss": 3.1534, + "step": 9022 + }, + { + "epoch": 0.3799157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0003512575444873293, + "loss": 2.9802, + "step": 9023 + }, + { + "epoch": 0.3799578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0003512266910560795, + "loss": 3.1454, + "step": 9024 + }, + { + "epoch": 0.38, + "grad_norm": 0.4140625, + "learning_rate": 0.00035119583578059843, + "loss": 3.2772, + "step": 9025 + }, + { + "epoch": 0.3800421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.00035116497866144816, + "loss": 3.2466, + "step": 9026 + }, + { + "epoch": 0.3800842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.00035113411969919073, + "loss": 3.2412, + "step": 9027 + }, + { + "epoch": 0.3801263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.00035110325889438853, + "loss": 3.1463, + "step": 9028 + }, + { + "epoch": 0.3801684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.0003510723962476038, + "loss": 3.0792, + "step": 9029 + }, + { + "epoch": 0.3802105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.00035104153175939863, + "loss": 3.2869, + "step": 9030 + }, + { + "epoch": 0.38025263157894734, + "grad_norm": 0.4609375, + "learning_rate": 0.00035101066543033547, + "loss": 3.1292, + "step": 9031 + }, + { + "epoch": 0.38029473684210524, + "grad_norm": 0.3984375, + "learning_rate": 0.00035097979726097676, + "loss": 3.319, + "step": 9032 + }, + { + "epoch": 0.38033684210526314, + "grad_norm": 0.4140625, + "learning_rate": 0.00035094892725188483, + "loss": 3.203, + "step": 9033 + }, + { + "epoch": 0.38037894736842104, + "grad_norm": 0.4609375, + "learning_rate": 0.000350918055403622, + "loss": 2.8066, + "step": 9034 + }, + { + "epoch": 0.38042105263157894, + "grad_norm": 0.408203125, + "learning_rate": 0.0003508871817167508, + "loss": 3.0446, + "step": 9035 + }, + { + "epoch": 0.38046315789473684, + "grad_norm": 0.4140625, + "learning_rate": 0.00035085630619183364, + "loss": 3.2608, + "step": 9036 + }, + { + "epoch": 0.38050526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00035082542882943315, + "loss": 2.8483, + "step": 9037 + }, + { + "epoch": 0.38054736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.0003507945496301117, + "loss": 3.5058, + "step": 9038 + }, + { + "epoch": 0.38058947368421053, + "grad_norm": 0.439453125, + "learning_rate": 0.00035076366859443196, + "loss": 3.3024, + "step": 9039 + }, + { + "epoch": 0.38063157894736843, + "grad_norm": 0.427734375, + "learning_rate": 0.00035073278572295667, + "loss": 3.0329, + "step": 9040 + }, + { + "epoch": 0.38067368421052633, + "grad_norm": 0.4140625, + "learning_rate": 0.0003507019010162484, + "loss": 3.5822, + "step": 9041 + }, + { + "epoch": 0.38071578947368423, + "grad_norm": 0.42578125, + "learning_rate": 0.00035067101447486975, + "loss": 3.3362, + "step": 9042 + }, + { + "epoch": 0.38075789473684213, + "grad_norm": 0.43359375, + "learning_rate": 0.0003506401260993836, + "loss": 3.1626, + "step": 9043 + }, + { + "epoch": 0.3808, + "grad_norm": 0.41015625, + "learning_rate": 0.00035060923589035254, + "loss": 3.3958, + "step": 9044 + }, + { + "epoch": 0.38084210526315787, + "grad_norm": 0.400390625, + "learning_rate": 0.00035057834384833933, + "loss": 2.8822, + "step": 9045 + }, + { + "epoch": 0.38088421052631577, + "grad_norm": 0.44140625, + "learning_rate": 0.000350547449973907, + "loss": 3.2318, + "step": 9046 + }, + { + "epoch": 0.38092631578947367, + "grad_norm": 0.396484375, + "learning_rate": 0.0003505165542676181, + "loss": 3.1886, + "step": 9047 + }, + { + "epoch": 0.38096842105263157, + "grad_norm": 0.443359375, + "learning_rate": 0.00035048565673003585, + "loss": 3.1682, + "step": 9048 + }, + { + "epoch": 0.38101052631578947, + "grad_norm": 0.474609375, + "learning_rate": 0.00035045475736172297, + "loss": 2.963, + "step": 9049 + }, + { + "epoch": 0.38105263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.0003504238561632424, + "loss": 3.1009, + "step": 9050 + }, + { + "epoch": 0.38109473684210526, + "grad_norm": 0.412109375, + "learning_rate": 0.0003503929531351571, + "loss": 2.9971, + "step": 9051 + }, + { + "epoch": 0.38113684210526316, + "grad_norm": 0.462890625, + "learning_rate": 0.00035036204827803023, + "loss": 3.1567, + "step": 9052 + }, + { + "epoch": 0.38117894736842106, + "grad_norm": 0.416015625, + "learning_rate": 0.0003503311415924248, + "loss": 3.1449, + "step": 9053 + }, + { + "epoch": 0.38122105263157896, + "grad_norm": 0.400390625, + "learning_rate": 0.00035030023307890384, + "loss": 3.3972, + "step": 9054 + }, + { + "epoch": 0.38126315789473686, + "grad_norm": 0.42578125, + "learning_rate": 0.0003502693227380304, + "loss": 2.9572, + "step": 9055 + }, + { + "epoch": 0.38130526315789476, + "grad_norm": 0.44921875, + "learning_rate": 0.0003502384105703678, + "loss": 3.136, + "step": 9056 + }, + { + "epoch": 0.38134736842105266, + "grad_norm": 0.427734375, + "learning_rate": 0.0003502074965764791, + "loss": 3.204, + "step": 9057 + }, + { + "epoch": 0.3813894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.0003501765807569275, + "loss": 3.1345, + "step": 9058 + }, + { + "epoch": 0.3814315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.00035014566311227635, + "loss": 3.192, + "step": 9059 + }, + { + "epoch": 0.3814736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00035011474364308895, + "loss": 3.2295, + "step": 9060 + }, + { + "epoch": 0.3815157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0003500838223499285, + "loss": 3.1093, + "step": 9061 + }, + { + "epoch": 0.3815578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00035005289923335835, + "loss": 3.4212, + "step": 9062 + }, + { + "epoch": 0.3816, + "grad_norm": 0.40625, + "learning_rate": 0.000350021974293942, + "loss": 2.9013, + "step": 9063 + }, + { + "epoch": 0.3816421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00034999104753224275, + "loss": 2.9509, + "step": 9064 + }, + { + "epoch": 0.3816842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.00034996011894882406, + "loss": 3.2612, + "step": 9065 + }, + { + "epoch": 0.3817263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00034992918854424957, + "loss": 3.2508, + "step": 9066 + }, + { + "epoch": 0.3817684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.0003498982563190826, + "loss": 3.2259, + "step": 9067 + }, + { + "epoch": 0.3818105263157895, + "grad_norm": 0.396484375, + "learning_rate": 0.00034986732227388685, + "loss": 3.4245, + "step": 9068 + }, + { + "epoch": 0.3818526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.00034983638640922576, + "loss": 2.9004, + "step": 9069 + }, + { + "epoch": 0.3818947368421053, + "grad_norm": 0.4375, + "learning_rate": 0.000349805448725663, + "loss": 2.8261, + "step": 9070 + }, + { + "epoch": 0.38193684210526313, + "grad_norm": 0.419921875, + "learning_rate": 0.0003497745092237623, + "loss": 2.9816, + "step": 9071 + }, + { + "epoch": 0.38197894736842103, + "grad_norm": 0.37890625, + "learning_rate": 0.00034974356790408716, + "loss": 2.7078, + "step": 9072 + }, + { + "epoch": 0.38202105263157893, + "grad_norm": 0.40234375, + "learning_rate": 0.0003497126247672014, + "loss": 3.0877, + "step": 9073 + }, + { + "epoch": 0.3820631578947368, + "grad_norm": 0.416015625, + "learning_rate": 0.0003496816798136689, + "loss": 3.2347, + "step": 9074 + }, + { + "epoch": 0.3821052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.00034965073304405326, + "loss": 3.5157, + "step": 9075 + }, + { + "epoch": 0.3821473684210526, + "grad_norm": 0.4375, + "learning_rate": 0.0003496197844589184, + "loss": 3.0099, + "step": 9076 + }, + { + "epoch": 0.3821894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.0003495888340588281, + "loss": 3.3364, + "step": 9077 + }, + { + "epoch": 0.3822315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00034955788184434615, + "loss": 2.9762, + "step": 9078 + }, + { + "epoch": 0.3822736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.0003495269278160367, + "loss": 3.3684, + "step": 9079 + }, + { + "epoch": 0.3823157894736842, + "grad_norm": 0.380859375, + "learning_rate": 0.0003494959719744635, + "loss": 2.8853, + "step": 9080 + }, + { + "epoch": 0.3823578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0003494650143201905, + "loss": 3.4443, + "step": 9081 + }, + { + "epoch": 0.3824, + "grad_norm": 0.41015625, + "learning_rate": 0.000349434054853782, + "loss": 2.6348, + "step": 9082 + }, + { + "epoch": 0.3824421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0003494030935758017, + "loss": 2.9665, + "step": 9083 + }, + { + "epoch": 0.3824842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.00034937213048681393, + "loss": 3.072, + "step": 9084 + }, + { + "epoch": 0.38252631578947366, + "grad_norm": 0.44140625, + "learning_rate": 0.0003493411655873826, + "loss": 3.4029, + "step": 9085 + }, + { + "epoch": 0.38256842105263156, + "grad_norm": 0.431640625, + "learning_rate": 0.000349310198878072, + "loss": 2.9327, + "step": 9086 + }, + { + "epoch": 0.38261052631578946, + "grad_norm": 0.42578125, + "learning_rate": 0.00034927923035944634, + "loss": 2.8851, + "step": 9087 + }, + { + "epoch": 0.38265263157894736, + "grad_norm": 0.41796875, + "learning_rate": 0.0003492482600320697, + "loss": 3.0079, + "step": 9088 + }, + { + "epoch": 0.38269473684210525, + "grad_norm": 0.4453125, + "learning_rate": 0.0003492172878965063, + "loss": 3.5401, + "step": 9089 + }, + { + "epoch": 0.38273684210526315, + "grad_norm": 0.421875, + "learning_rate": 0.00034918631395332056, + "loss": 3.0147, + "step": 9090 + }, + { + "epoch": 0.38277894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.0003491553382030767, + "loss": 3.0488, + "step": 9091 + }, + { + "epoch": 0.38282105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.000349124360646339, + "loss": 2.9225, + "step": 9092 + }, + { + "epoch": 0.38286315789473685, + "grad_norm": 0.40625, + "learning_rate": 0.0003490933812836721, + "loss": 3.4416, + "step": 9093 + }, + { + "epoch": 0.38290526315789475, + "grad_norm": 0.439453125, + "learning_rate": 0.00034906240011564003, + "loss": 3.678, + "step": 9094 + }, + { + "epoch": 0.38294736842105265, + "grad_norm": 0.416015625, + "learning_rate": 0.0003490314171428076, + "loss": 2.9728, + "step": 9095 + }, + { + "epoch": 0.38298947368421055, + "grad_norm": 0.408203125, + "learning_rate": 0.00034900043236573905, + "loss": 2.9566, + "step": 9096 + }, + { + "epoch": 0.38303157894736845, + "grad_norm": 0.5078125, + "learning_rate": 0.00034896944578499884, + "loss": 3.2297, + "step": 9097 + }, + { + "epoch": 0.3830736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.00034893845740115165, + "loss": 3.1214, + "step": 9098 + }, + { + "epoch": 0.3831157894736842, + "grad_norm": 0.46484375, + "learning_rate": 0.0003489074672147621, + "loss": 3.4199, + "step": 9099 + }, + { + "epoch": 0.3831578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00034887647522639467, + "loss": 3.1668, + "step": 9100 + }, + { + "epoch": 0.3832, + "grad_norm": 0.421875, + "learning_rate": 0.00034884548143661396, + "loss": 3.2169, + "step": 9101 + }, + { + "epoch": 0.3832421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.00034881448584598483, + "loss": 3.3098, + "step": 9102 + }, + { + "epoch": 0.3832842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.0003487834884550718, + "loss": 2.8624, + "step": 9103 + }, + { + "epoch": 0.3833263157894737, + "grad_norm": 0.48046875, + "learning_rate": 0.00034875248926443975, + "loss": 2.9868, + "step": 9104 + }, + { + "epoch": 0.3833684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00034872148827465326, + "loss": 3.3611, + "step": 9105 + }, + { + "epoch": 0.3834105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.00034869048548627734, + "loss": 3.4054, + "step": 9106 + }, + { + "epoch": 0.3834526315789474, + "grad_norm": 0.412109375, + "learning_rate": 0.0003486594808998767, + "loss": 2.9441, + "step": 9107 + }, + { + "epoch": 0.3834947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.0003486284745160162, + "loss": 3.1384, + "step": 9108 + }, + { + "epoch": 0.3835368421052632, + "grad_norm": 0.4296875, + "learning_rate": 0.00034859746633526085, + "loss": 3.2456, + "step": 9109 + }, + { + "epoch": 0.3835789473684211, + "grad_norm": 0.4140625, + "learning_rate": 0.0003485664563581755, + "loss": 3.0276, + "step": 9110 + }, + { + "epoch": 0.383621052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0003485354445853252, + "loss": 3.4695, + "step": 9111 + }, + { + "epoch": 0.3836631578947368, + "grad_norm": 0.431640625, + "learning_rate": 0.0003485044310172747, + "loss": 3.3797, + "step": 9112 + }, + { + "epoch": 0.3837052631578947, + "grad_norm": 0.388671875, + "learning_rate": 0.0003484734156545892, + "loss": 2.8459, + "step": 9113 + }, + { + "epoch": 0.3837473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.00034844239849783393, + "loss": 3.267, + "step": 9114 + }, + { + "epoch": 0.3837894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.00034841137954757373, + "loss": 3.0776, + "step": 9115 + }, + { + "epoch": 0.3838315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.0003483803588043738, + "loss": 2.7924, + "step": 9116 + }, + { + "epoch": 0.3838736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.0003483493362687994, + "loss": 3.2586, + "step": 9117 + }, + { + "epoch": 0.3839157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0003483183119414156, + "loss": 2.9108, + "step": 9118 + }, + { + "epoch": 0.3839578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.00034828728582278766, + "loss": 3.0227, + "step": 9119 + }, + { + "epoch": 0.384, + "grad_norm": 0.41796875, + "learning_rate": 0.0003482562579134809, + "loss": 3.0682, + "step": 9120 + }, + { + "epoch": 0.3840421052631579, + "grad_norm": 0.458984375, + "learning_rate": 0.00034822522821406057, + "loss": 3.1137, + "step": 9121 + }, + { + "epoch": 0.3840842105263158, + "grad_norm": 0.39453125, + "learning_rate": 0.000348194196725092, + "loss": 3.1703, + "step": 9122 + }, + { + "epoch": 0.3841263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.0003481631634471405, + "loss": 3.2669, + "step": 9123 + }, + { + "epoch": 0.3841684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00034813212838077153, + "loss": 3.1843, + "step": 9124 + }, + { + "epoch": 0.38421052631578945, + "grad_norm": 0.4375, + "learning_rate": 0.0003481010915265504, + "loss": 2.879, + "step": 9125 + }, + { + "epoch": 0.38425263157894735, + "grad_norm": 0.39453125, + "learning_rate": 0.0003480700528850427, + "loss": 3.1436, + "step": 9126 + }, + { + "epoch": 0.38429473684210524, + "grad_norm": 0.443359375, + "learning_rate": 0.0003480390124568139, + "loss": 3.0416, + "step": 9127 + }, + { + "epoch": 0.38433684210526314, + "grad_norm": 0.4296875, + "learning_rate": 0.0003480079702424295, + "loss": 3.1986, + "step": 9128 + }, + { + "epoch": 0.38437894736842104, + "grad_norm": 0.453125, + "learning_rate": 0.00034797692624245497, + "loss": 3.4086, + "step": 9129 + }, + { + "epoch": 0.38442105263157894, + "grad_norm": 0.462890625, + "learning_rate": 0.000347945880457456, + "loss": 3.3974, + "step": 9130 + }, + { + "epoch": 0.38446315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.00034791483288799816, + "loss": 3.5608, + "step": 9131 + }, + { + "epoch": 0.38450526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.000347883783534647, + "loss": 3.2104, + "step": 9132 + }, + { + "epoch": 0.38454736842105264, + "grad_norm": 0.435546875, + "learning_rate": 0.00034785273239796845, + "loss": 3.3573, + "step": 9133 + }, + { + "epoch": 0.38458947368421054, + "grad_norm": 0.412109375, + "learning_rate": 0.0003478216794785281, + "loss": 3.2512, + "step": 9134 + }, + { + "epoch": 0.38463157894736844, + "grad_norm": 0.400390625, + "learning_rate": 0.00034779062477689163, + "loss": 2.8528, + "step": 9135 + }, + { + "epoch": 0.38467368421052633, + "grad_norm": 0.375, + "learning_rate": 0.00034775956829362494, + "loss": 2.7154, + "step": 9136 + }, + { + "epoch": 0.38471578947368423, + "grad_norm": 0.421875, + "learning_rate": 0.00034772851002929376, + "loss": 3.0953, + "step": 9137 + }, + { + "epoch": 0.38475789473684213, + "grad_norm": 0.4140625, + "learning_rate": 0.0003476974499844639, + "loss": 3.2482, + "step": 9138 + }, + { + "epoch": 0.3848, + "grad_norm": 0.416015625, + "learning_rate": 0.00034766638815970127, + "loss": 3.3156, + "step": 9139 + }, + { + "epoch": 0.3848421052631579, + "grad_norm": 0.46484375, + "learning_rate": 0.00034763532455557176, + "loss": 3.4069, + "step": 9140 + }, + { + "epoch": 0.3848842105263158, + "grad_norm": 0.439453125, + "learning_rate": 0.00034760425917264147, + "loss": 2.835, + "step": 9141 + }, + { + "epoch": 0.38492631578947367, + "grad_norm": 0.41796875, + "learning_rate": 0.00034757319201147627, + "loss": 2.9664, + "step": 9142 + }, + { + "epoch": 0.38496842105263157, + "grad_norm": 0.40625, + "learning_rate": 0.0003475421230726421, + "loss": 3.5689, + "step": 9143 + }, + { + "epoch": 0.38501052631578947, + "grad_norm": 0.3984375, + "learning_rate": 0.0003475110523567051, + "loss": 3.6589, + "step": 9144 + }, + { + "epoch": 0.38505263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00034747997986423124, + "loss": 3.1623, + "step": 9145 + }, + { + "epoch": 0.38509473684210527, + "grad_norm": 0.5703125, + "learning_rate": 0.00034744890559578666, + "loss": 3.3893, + "step": 9146 + }, + { + "epoch": 0.38513684210526317, + "grad_norm": 0.41796875, + "learning_rate": 0.00034741782955193754, + "loss": 3.1992, + "step": 9147 + }, + { + "epoch": 0.38517894736842107, + "grad_norm": 0.431640625, + "learning_rate": 0.00034738675173325007, + "loss": 3.1352, + "step": 9148 + }, + { + "epoch": 0.38522105263157896, + "grad_norm": 0.408203125, + "learning_rate": 0.0003473556721402904, + "loss": 2.9454, + "step": 9149 + }, + { + "epoch": 0.38526315789473686, + "grad_norm": 0.40234375, + "learning_rate": 0.0003473245907736248, + "loss": 2.888, + "step": 9150 + }, + { + "epoch": 0.38530526315789476, + "grad_norm": 0.419921875, + "learning_rate": 0.0003472935076338194, + "loss": 3.086, + "step": 9151 + }, + { + "epoch": 0.3853473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.0003472624227214406, + "loss": 3.3419, + "step": 9152 + }, + { + "epoch": 0.3853894736842105, + "grad_norm": 0.400390625, + "learning_rate": 0.0003472313360370549, + "loss": 3.1707, + "step": 9153 + }, + { + "epoch": 0.3854315789473684, + "grad_norm": 0.404296875, + "learning_rate": 0.00034720024758122834, + "loss": 3.6091, + "step": 9154 + }, + { + "epoch": 0.3854736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.0003471691573545276, + "loss": 3.4001, + "step": 9155 + }, + { + "epoch": 0.3855157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0003471380653575189, + "loss": 2.9759, + "step": 9156 + }, + { + "epoch": 0.3855578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0003471069715907688, + "loss": 3.2656, + "step": 9157 + }, + { + "epoch": 0.3856, + "grad_norm": 0.41015625, + "learning_rate": 0.0003470758760548438, + "loss": 3.1051, + "step": 9158 + }, + { + "epoch": 0.3856421052631579, + "grad_norm": 0.453125, + "learning_rate": 0.0003470447787503103, + "loss": 3.3771, + "step": 9159 + }, + { + "epoch": 0.3856842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00034701367967773507, + "loss": 3.4899, + "step": 9160 + }, + { + "epoch": 0.3857263157894737, + "grad_norm": 0.404296875, + "learning_rate": 0.00034698257883768456, + "loss": 3.2793, + "step": 9161 + }, + { + "epoch": 0.3857684210526316, + "grad_norm": 0.435546875, + "learning_rate": 0.00034695147623072545, + "loss": 3.2151, + "step": 9162 + }, + { + "epoch": 0.3858105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.0003469203718574243, + "loss": 2.8305, + "step": 9163 + }, + { + "epoch": 0.3858526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.0003468892657183478, + "loss": 3.3651, + "step": 9164 + }, + { + "epoch": 0.3858947368421053, + "grad_norm": 0.3984375, + "learning_rate": 0.0003468581578140628, + "loss": 3.3477, + "step": 9165 + }, + { + "epoch": 0.38593684210526313, + "grad_norm": 0.408203125, + "learning_rate": 0.0003468270481451359, + "loss": 3.3132, + "step": 9166 + }, + { + "epoch": 0.38597894736842103, + "grad_norm": 0.431640625, + "learning_rate": 0.000346795936712134, + "loss": 3.0948, + "step": 9167 + }, + { + "epoch": 0.38602105263157893, + "grad_norm": 0.431640625, + "learning_rate": 0.00034676482351562386, + "loss": 3.6712, + "step": 9168 + }, + { + "epoch": 0.38606315789473683, + "grad_norm": 0.392578125, + "learning_rate": 0.0003467337085561723, + "loss": 3.3806, + "step": 9169 + }, + { + "epoch": 0.38610526315789473, + "grad_norm": 0.439453125, + "learning_rate": 0.0003467025918343463, + "loss": 2.8861, + "step": 9170 + }, + { + "epoch": 0.38614736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00034667147335071263, + "loss": 3.2543, + "step": 9171 + }, + { + "epoch": 0.3861894736842105, + "grad_norm": 0.53125, + "learning_rate": 0.0003466403531058383, + "loss": 3.1865, + "step": 9172 + }, + { + "epoch": 0.3862315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.0003466092311002903, + "loss": 2.9101, + "step": 9173 + }, + { + "epoch": 0.3862736842105263, + "grad_norm": 0.3984375, + "learning_rate": 0.00034657810733463565, + "loss": 3.0891, + "step": 9174 + }, + { + "epoch": 0.3863157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00034654698180944135, + "loss": 2.9474, + "step": 9175 + }, + { + "epoch": 0.3863578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0003465158545252745, + "loss": 3.3993, + "step": 9176 + }, + { + "epoch": 0.3864, + "grad_norm": 0.4140625, + "learning_rate": 0.0003464847254827021, + "loss": 2.9952, + "step": 9177 + }, + { + "epoch": 0.3864421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.0003464535946822914, + "loss": 3.1781, + "step": 9178 + }, + { + "epoch": 0.38648421052631576, + "grad_norm": 0.423828125, + "learning_rate": 0.0003464224621246095, + "loss": 3.3455, + "step": 9179 + }, + { + "epoch": 0.38652631578947366, + "grad_norm": 0.427734375, + "learning_rate": 0.0003463913278102237, + "loss": 3.1198, + "step": 9180 + }, + { + "epoch": 0.38656842105263156, + "grad_norm": 0.41015625, + "learning_rate": 0.00034636019173970114, + "loss": 3.5939, + "step": 9181 + }, + { + "epoch": 0.38661052631578946, + "grad_norm": 0.40234375, + "learning_rate": 0.0003463290539136091, + "loss": 2.9391, + "step": 9182 + }, + { + "epoch": 0.38665263157894736, + "grad_norm": 0.455078125, + "learning_rate": 0.0003462979143325149, + "loss": 3.5022, + "step": 9183 + }, + { + "epoch": 0.38669473684210526, + "grad_norm": 0.390625, + "learning_rate": 0.00034626677299698583, + "loss": 3.0262, + "step": 9184 + }, + { + "epoch": 0.38673684210526316, + "grad_norm": 0.39453125, + "learning_rate": 0.00034623562990758927, + "loss": 3.2477, + "step": 9185 + }, + { + "epoch": 0.38677894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.00034620448506489257, + "loss": 3.3257, + "step": 9186 + }, + { + "epoch": 0.38682105263157895, + "grad_norm": 0.3984375, + "learning_rate": 0.0003461733384694632, + "loss": 3.1583, + "step": 9187 + }, + { + "epoch": 0.38686315789473685, + "grad_norm": 0.384765625, + "learning_rate": 0.00034614219012186863, + "loss": 3.101, + "step": 9188 + }, + { + "epoch": 0.38690526315789475, + "grad_norm": 0.404296875, + "learning_rate": 0.00034611104002267633, + "loss": 3.1601, + "step": 9189 + }, + { + "epoch": 0.38694736842105265, + "grad_norm": 0.40625, + "learning_rate": 0.0003460798881724537, + "loss": 3.0, + "step": 9190 + }, + { + "epoch": 0.38698947368421055, + "grad_norm": 0.408203125, + "learning_rate": 0.00034604873457176854, + "loss": 3.0523, + "step": 9191 + }, + { + "epoch": 0.38703157894736845, + "grad_norm": 0.392578125, + "learning_rate": 0.00034601757922118827, + "loss": 3.3357, + "step": 9192 + }, + { + "epoch": 0.3870736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00034598642212128037, + "loss": 3.2214, + "step": 9193 + }, + { + "epoch": 0.3871157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.00034595526327261286, + "loss": 3.1781, + "step": 9194 + }, + { + "epoch": 0.3871578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00034592410267575305, + "loss": 2.7701, + "step": 9195 + }, + { + "epoch": 0.3872, + "grad_norm": 0.41015625, + "learning_rate": 0.00034589294033126884, + "loss": 3.1996, + "step": 9196 + }, + { + "epoch": 0.3872421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0003458617762397279, + "loss": 3.3618, + "step": 9197 + }, + { + "epoch": 0.3872842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.00034583061040169803, + "loss": 3.1964, + "step": 9198 + }, + { + "epoch": 0.3873263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.00034579944281774703, + "loss": 3.1649, + "step": 9199 + }, + { + "epoch": 0.3873684210526316, + "grad_norm": 0.3828125, + "learning_rate": 0.0003457682734884428, + "loss": 3.0159, + "step": 9200 + }, + { + "epoch": 0.3874105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.00034573710241435313, + "loss": 3.1175, + "step": 9201 + }, + { + "epoch": 0.3874526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.0003457059295960459, + "loss": 3.1072, + "step": 9202 + }, + { + "epoch": 0.3874947368421053, + "grad_norm": 0.376953125, + "learning_rate": 0.00034567475503408915, + "loss": 2.8826, + "step": 9203 + }, + { + "epoch": 0.3875368421052632, + "grad_norm": 0.41015625, + "learning_rate": 0.0003456435787290507, + "loss": 3.1208, + "step": 9204 + }, + { + "epoch": 0.3875789473684211, + "grad_norm": 0.373046875, + "learning_rate": 0.0003456124006814987, + "loss": 2.8116, + "step": 9205 + }, + { + "epoch": 0.3876210526315789, + "grad_norm": 0.40625, + "learning_rate": 0.00034558122089200106, + "loss": 2.973, + "step": 9206 + }, + { + "epoch": 0.3876631578947368, + "grad_norm": 0.451171875, + "learning_rate": 0.0003455500393611258, + "loss": 3.7385, + "step": 9207 + }, + { + "epoch": 0.3877052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.00034551885608944115, + "loss": 2.984, + "step": 9208 + }, + { + "epoch": 0.3877473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.00034548767107751523, + "loss": 2.9402, + "step": 9209 + }, + { + "epoch": 0.3877894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00034545648432591613, + "loss": 3.1153, + "step": 9210 + }, + { + "epoch": 0.3878315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.00034542529583521197, + "loss": 3.331, + "step": 9211 + }, + { + "epoch": 0.3878736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.00034539410560597113, + "loss": 2.6639, + "step": 9212 + }, + { + "epoch": 0.3879157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00034536291363876164, + "loss": 2.9007, + "step": 9213 + }, + { + "epoch": 0.3879578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.000345331719934152, + "loss": 3.1507, + "step": 9214 + }, + { + "epoch": 0.388, + "grad_norm": 0.3984375, + "learning_rate": 0.00034530052449271043, + "loss": 2.8033, + "step": 9215 + }, + { + "epoch": 0.3880421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0003452693273150053, + "loss": 3.2249, + "step": 9216 + }, + { + "epoch": 0.3880842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.0003452381284016049, + "loss": 2.801, + "step": 9217 + }, + { + "epoch": 0.3881263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.00034520692775307773, + "loss": 3.317, + "step": 9218 + }, + { + "epoch": 0.38816842105263155, + "grad_norm": 0.423828125, + "learning_rate": 0.0003451757253699922, + "loss": 3.1291, + "step": 9219 + }, + { + "epoch": 0.38821052631578945, + "grad_norm": 0.4609375, + "learning_rate": 0.0003451445212529167, + "loss": 3.0008, + "step": 9220 + }, + { + "epoch": 0.38825263157894735, + "grad_norm": 0.427734375, + "learning_rate": 0.00034511331540242, + "loss": 3.1768, + "step": 9221 + }, + { + "epoch": 0.38829473684210525, + "grad_norm": 0.423828125, + "learning_rate": 0.0003450821078190703, + "loss": 3.0742, + "step": 9222 + }, + { + "epoch": 0.38833684210526315, + "grad_norm": 0.41015625, + "learning_rate": 0.00034505089850343627, + "loss": 3.1913, + "step": 9223 + }, + { + "epoch": 0.38837894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.0003450196874560867, + "loss": 3.0506, + "step": 9224 + }, + { + "epoch": 0.38842105263157894, + "grad_norm": 0.412109375, + "learning_rate": 0.00034498847467759, + "loss": 3.1226, + "step": 9225 + }, + { + "epoch": 0.38846315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.0003449572601685149, + "loss": 3.5846, + "step": 9226 + }, + { + "epoch": 0.38850526315789474, + "grad_norm": 0.490234375, + "learning_rate": 0.00034492604392943004, + "loss": 3.1976, + "step": 9227 + }, + { + "epoch": 0.38854736842105264, + "grad_norm": 0.435546875, + "learning_rate": 0.00034489482596090427, + "loss": 2.8722, + "step": 9228 + }, + { + "epoch": 0.38858947368421054, + "grad_norm": 0.3984375, + "learning_rate": 0.00034486360626350624, + "loss": 3.0238, + "step": 9229 + }, + { + "epoch": 0.38863157894736844, + "grad_norm": 0.423828125, + "learning_rate": 0.0003448323848378048, + "loss": 3.2199, + "step": 9230 + }, + { + "epoch": 0.38867368421052634, + "grad_norm": 0.470703125, + "learning_rate": 0.00034480116168436863, + "loss": 3.2655, + "step": 9231 + }, + { + "epoch": 0.38871578947368424, + "grad_norm": 0.404296875, + "learning_rate": 0.0003447699368037668, + "loss": 3.3257, + "step": 9232 + }, + { + "epoch": 0.3887578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.000344738710196568, + "loss": 2.9313, + "step": 9233 + }, + { + "epoch": 0.3888, + "grad_norm": 0.40234375, + "learning_rate": 0.0003447074818633412, + "loss": 3.5461, + "step": 9234 + }, + { + "epoch": 0.3888421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0003446762518046554, + "loss": 3.5364, + "step": 9235 + }, + { + "epoch": 0.3888842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.00034464502002107955, + "loss": 3.3248, + "step": 9236 + }, + { + "epoch": 0.3889263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0003446137865131826, + "loss": 3.2285, + "step": 9237 + }, + { + "epoch": 0.3889684210526316, + "grad_norm": 0.44140625, + "learning_rate": 0.0003445825512815337, + "loss": 3.5039, + "step": 9238 + }, + { + "epoch": 0.3890105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.0003445513143267017, + "loss": 3.5397, + "step": 9239 + }, + { + "epoch": 0.38905263157894737, + "grad_norm": 0.39453125, + "learning_rate": 0.0003445200756492559, + "loss": 2.9977, + "step": 9240 + }, + { + "epoch": 0.38909473684210527, + "grad_norm": 0.412109375, + "learning_rate": 0.0003444888352497654, + "loss": 2.7339, + "step": 9241 + }, + { + "epoch": 0.38913684210526317, + "grad_norm": 0.435546875, + "learning_rate": 0.0003444575931287994, + "loss": 3.4114, + "step": 9242 + }, + { + "epoch": 0.38917894736842107, + "grad_norm": 0.4140625, + "learning_rate": 0.00034442634928692696, + "loss": 3.2276, + "step": 9243 + }, + { + "epoch": 0.38922105263157897, + "grad_norm": 0.431640625, + "learning_rate": 0.0003443951037247174, + "loss": 3.2348, + "step": 9244 + }, + { + "epoch": 0.38926315789473687, + "grad_norm": 0.423828125, + "learning_rate": 0.00034436385644274, + "loss": 3.0852, + "step": 9245 + }, + { + "epoch": 0.3893052631578947, + "grad_norm": 0.41015625, + "learning_rate": 0.000344332607441564, + "loss": 3.6413, + "step": 9246 + }, + { + "epoch": 0.3893473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00034430135672175867, + "loss": 3.2108, + "step": 9247 + }, + { + "epoch": 0.3893894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.00034427010428389346, + "loss": 3.6164, + "step": 9248 + }, + { + "epoch": 0.3894315789473684, + "grad_norm": 0.400390625, + "learning_rate": 0.00034423885012853763, + "loss": 2.682, + "step": 9249 + }, + { + "epoch": 0.3894736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.0003442075942562608, + "loss": 3.3274, + "step": 9250 + }, + { + "epoch": 0.3895157894736842, + "grad_norm": 0.486328125, + "learning_rate": 0.00034417633666763227, + "loss": 3.354, + "step": 9251 + }, + { + "epoch": 0.3895578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0003441450773632215, + "loss": 3.0121, + "step": 9252 + }, + { + "epoch": 0.3896, + "grad_norm": 0.4140625, + "learning_rate": 0.00034411381634359803, + "loss": 3.0092, + "step": 9253 + }, + { + "epoch": 0.3896421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0003440825536093314, + "loss": 3.2775, + "step": 9254 + }, + { + "epoch": 0.3896842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.0003440512891609912, + "loss": 3.6086, + "step": 9255 + }, + { + "epoch": 0.3897263157894737, + "grad_norm": 0.44921875, + "learning_rate": 0.000344020022999147, + "loss": 3.575, + "step": 9256 + }, + { + "epoch": 0.3897684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.0003439887551243685, + "loss": 3.3348, + "step": 9257 + }, + { + "epoch": 0.3898105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.00034395748553722527, + "loss": 3.5396, + "step": 9258 + }, + { + "epoch": 0.3898526315789474, + "grad_norm": 0.45703125, + "learning_rate": 0.00034392621423828714, + "loss": 3.886, + "step": 9259 + }, + { + "epoch": 0.38989473684210524, + "grad_norm": 0.435546875, + "learning_rate": 0.00034389494122812357, + "loss": 3.2329, + "step": 9260 + }, + { + "epoch": 0.38993684210526314, + "grad_norm": 0.49609375, + "learning_rate": 0.00034386366650730467, + "loss": 2.9259, + "step": 9261 + }, + { + "epoch": 0.38997894736842104, + "grad_norm": 0.427734375, + "learning_rate": 0.00034383239007639997, + "loss": 3.1411, + "step": 9262 + }, + { + "epoch": 0.39002105263157893, + "grad_norm": 0.38671875, + "learning_rate": 0.0003438011119359794, + "loss": 2.7093, + "step": 9263 + }, + { + "epoch": 0.39006315789473683, + "grad_norm": 0.44140625, + "learning_rate": 0.0003437698320866128, + "loss": 3.5848, + "step": 9264 + }, + { + "epoch": 0.39010526315789473, + "grad_norm": 0.43359375, + "learning_rate": 0.00034373855052886995, + "loss": 3.3434, + "step": 9265 + }, + { + "epoch": 0.39014736842105263, + "grad_norm": 0.45703125, + "learning_rate": 0.0003437072672633209, + "loss": 2.8476, + "step": 9266 + }, + { + "epoch": 0.39018947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00034367598229053547, + "loss": 3.2013, + "step": 9267 + }, + { + "epoch": 0.39023157894736843, + "grad_norm": 0.43359375, + "learning_rate": 0.0003436446956110838, + "loss": 3.2493, + "step": 9268 + }, + { + "epoch": 0.39027368421052633, + "grad_norm": 0.42578125, + "learning_rate": 0.0003436134072255358, + "loss": 3.2342, + "step": 9269 + }, + { + "epoch": 0.3903157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.0003435821171344615, + "loss": 3.1419, + "step": 9270 + }, + { + "epoch": 0.3903578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.000343550825338431, + "loss": 3.486, + "step": 9271 + }, + { + "epoch": 0.3904, + "grad_norm": 0.40234375, + "learning_rate": 0.0003435195318380143, + "loss": 2.7725, + "step": 9272 + }, + { + "epoch": 0.39044210526315787, + "grad_norm": 0.416015625, + "learning_rate": 0.00034348823663378166, + "loss": 3.297, + "step": 9273 + }, + { + "epoch": 0.39048421052631577, + "grad_norm": 0.4296875, + "learning_rate": 0.0003434569397263032, + "loss": 3.2614, + "step": 9274 + }, + { + "epoch": 0.39052631578947367, + "grad_norm": 0.408203125, + "learning_rate": 0.00034342564111614906, + "loss": 3.2127, + "step": 9275 + }, + { + "epoch": 0.39056842105263156, + "grad_norm": 0.4375, + "learning_rate": 0.0003433943408038895, + "loss": 3.1767, + "step": 9276 + }, + { + "epoch": 0.39061052631578946, + "grad_norm": 0.42578125, + "learning_rate": 0.0003433630387900949, + "loss": 2.7615, + "step": 9277 + }, + { + "epoch": 0.39065263157894736, + "grad_norm": 0.419921875, + "learning_rate": 0.0003433317350753354, + "loss": 3.0941, + "step": 9278 + }, + { + "epoch": 0.39069473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.00034330042966018134, + "loss": 3.3639, + "step": 9279 + }, + { + "epoch": 0.39073684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.000343269122545203, + "loss": 3.1707, + "step": 9280 + }, + { + "epoch": 0.39077894736842106, + "grad_norm": 0.42578125, + "learning_rate": 0.0003432378137309708, + "loss": 3.16, + "step": 9281 + }, + { + "epoch": 0.39082105263157896, + "grad_norm": 0.439453125, + "learning_rate": 0.00034320650321805537, + "loss": 3.2619, + "step": 9282 + }, + { + "epoch": 0.39086315789473686, + "grad_norm": 0.43359375, + "learning_rate": 0.00034317519100702674, + "loss": 3.4493, + "step": 9283 + }, + { + "epoch": 0.39090526315789476, + "grad_norm": 0.439453125, + "learning_rate": 0.0003431438770984558, + "loss": 3.0853, + "step": 9284 + }, + { + "epoch": 0.39094736842105265, + "grad_norm": 0.419921875, + "learning_rate": 0.00034311256149291284, + "loss": 2.9384, + "step": 9285 + }, + { + "epoch": 0.39098947368421055, + "grad_norm": 0.421875, + "learning_rate": 0.0003430812441909683, + "loss": 3.2078, + "step": 9286 + }, + { + "epoch": 0.3910315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.0003430499251931929, + "loss": 3.2524, + "step": 9287 + }, + { + "epoch": 0.3910736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.0003430186045001572, + "loss": 3.5292, + "step": 9288 + }, + { + "epoch": 0.3911157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.00034298728211243183, + "loss": 2.9912, + "step": 9289 + }, + { + "epoch": 0.3911578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.0003429559580305874, + "loss": 3.3431, + "step": 9290 + }, + { + "epoch": 0.3912, + "grad_norm": 0.40625, + "learning_rate": 0.00034292463225519464, + "loss": 3.2726, + "step": 9291 + }, + { + "epoch": 0.3912421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.00034289330478682434, + "loss": 3.3585, + "step": 9292 + }, + { + "epoch": 0.3912842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00034286197562604715, + "loss": 3.4024, + "step": 9293 + }, + { + "epoch": 0.3913263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.0003428306447734338, + "loss": 3.4011, + "step": 9294 + }, + { + "epoch": 0.3913684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00034279931222955517, + "loss": 3.1112, + "step": 9295 + }, + { + "epoch": 0.3914105263157895, + "grad_norm": 0.3828125, + "learning_rate": 0.00034276797799498215, + "loss": 2.9329, + "step": 9296 + }, + { + "epoch": 0.3914526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.0003427366420702856, + "loss": 3.0345, + "step": 9297 + }, + { + "epoch": 0.3914947368421053, + "grad_norm": 0.416015625, + "learning_rate": 0.0003427053044560364, + "loss": 3.1107, + "step": 9298 + }, + { + "epoch": 0.3915368421052632, + "grad_norm": 0.44140625, + "learning_rate": 0.0003426739651528054, + "loss": 3.0583, + "step": 9299 + }, + { + "epoch": 0.391578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00034264262416116366, + "loss": 2.8402, + "step": 9300 + }, + { + "epoch": 0.3916210526315789, + "grad_norm": 0.404296875, + "learning_rate": 0.00034261128148168214, + "loss": 3.214, + "step": 9301 + }, + { + "epoch": 0.3916631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.00034257993711493196, + "loss": 3.7449, + "step": 9302 + }, + { + "epoch": 0.3917052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.00034254859106148407, + "loss": 3.5061, + "step": 9303 + }, + { + "epoch": 0.3917473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.0003425172433219096, + "loss": 3.0373, + "step": 9304 + }, + { + "epoch": 0.3917894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.00034248589389677963, + "loss": 3.5142, + "step": 9305 + }, + { + "epoch": 0.3918315789473684, + "grad_norm": 0.3984375, + "learning_rate": 0.0003424545427866654, + "loss": 3.4156, + "step": 9306 + }, + { + "epoch": 0.3918736842105263, + "grad_norm": 0.49609375, + "learning_rate": 0.000342423189992138, + "loss": 2.8161, + "step": 9307 + }, + { + "epoch": 0.3919157894736842, + "grad_norm": 0.451171875, + "learning_rate": 0.00034239183551376857, + "loss": 2.9438, + "step": 9308 + }, + { + "epoch": 0.3919578947368421, + "grad_norm": 0.375, + "learning_rate": 0.00034236047935212857, + "loss": 2.8596, + "step": 9309 + }, + { + "epoch": 0.392, + "grad_norm": 0.388671875, + "learning_rate": 0.0003423291215077891, + "loss": 3.431, + "step": 9310 + }, + { + "epoch": 0.3920421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00034229776198132155, + "loss": 3.4723, + "step": 9311 + }, + { + "epoch": 0.3920842105263158, + "grad_norm": 0.462890625, + "learning_rate": 0.00034226640077329723, + "loss": 3.3518, + "step": 9312 + }, + { + "epoch": 0.3921263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.0003422350378842875, + "loss": 3.1741, + "step": 9313 + }, + { + "epoch": 0.39216842105263156, + "grad_norm": 0.40625, + "learning_rate": 0.0003422036733148637, + "loss": 3.3177, + "step": 9314 + }, + { + "epoch": 0.39221052631578945, + "grad_norm": 0.423828125, + "learning_rate": 0.0003421723070655973, + "loss": 3.8976, + "step": 9315 + }, + { + "epoch": 0.39225263157894735, + "grad_norm": 0.392578125, + "learning_rate": 0.0003421409391370598, + "loss": 2.6924, + "step": 9316 + }, + { + "epoch": 0.39229473684210525, + "grad_norm": 0.404296875, + "learning_rate": 0.00034210956952982266, + "loss": 3.2678, + "step": 9317 + }, + { + "epoch": 0.39233684210526315, + "grad_norm": 0.4609375, + "learning_rate": 0.0003420781982444574, + "loss": 2.7106, + "step": 9318 + }, + { + "epoch": 0.39237894736842105, + "grad_norm": 0.47265625, + "learning_rate": 0.00034204682528153553, + "loss": 3.5089, + "step": 9319 + }, + { + "epoch": 0.39242105263157895, + "grad_norm": 0.408203125, + "learning_rate": 0.0003420154506416287, + "loss": 3.1609, + "step": 9320 + }, + { + "epoch": 0.39246315789473685, + "grad_norm": 0.388671875, + "learning_rate": 0.0003419840743253084, + "loss": 2.9988, + "step": 9321 + }, + { + "epoch": 0.39250526315789475, + "grad_norm": 0.416015625, + "learning_rate": 0.0003419526963331464, + "loss": 3.2808, + "step": 9322 + }, + { + "epoch": 0.39254736842105264, + "grad_norm": 0.40625, + "learning_rate": 0.0003419213166657143, + "loss": 3.0894, + "step": 9323 + }, + { + "epoch": 0.39258947368421054, + "grad_norm": 0.40625, + "learning_rate": 0.0003418899353235838, + "loss": 3.499, + "step": 9324 + }, + { + "epoch": 0.39263157894736844, + "grad_norm": 0.423828125, + "learning_rate": 0.0003418585523073266, + "loss": 3.5154, + "step": 9325 + }, + { + "epoch": 0.39267368421052634, + "grad_norm": 0.390625, + "learning_rate": 0.0003418271676175146, + "loss": 3.0868, + "step": 9326 + }, + { + "epoch": 0.3927157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.0003417957812547194, + "loss": 3.2105, + "step": 9327 + }, + { + "epoch": 0.3927578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.000341764393219513, + "loss": 3.2384, + "step": 9328 + }, + { + "epoch": 0.3928, + "grad_norm": 0.404296875, + "learning_rate": 0.0003417330035124671, + "loss": 2.7008, + "step": 9329 + }, + { + "epoch": 0.3928421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.0003417016121341537, + "loss": 3.0429, + "step": 9330 + }, + { + "epoch": 0.3928842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.00034167021908514475, + "loss": 3.0118, + "step": 9331 + }, + { + "epoch": 0.3929263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.000341638824366012, + "loss": 3.5525, + "step": 9332 + }, + { + "epoch": 0.3929684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0003416074279773276, + "loss": 2.8257, + "step": 9333 + }, + { + "epoch": 0.3930105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00034157602991966344, + "loss": 3.1373, + "step": 9334 + }, + { + "epoch": 0.3930526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00034154463019359164, + "loss": 3.2645, + "step": 9335 + }, + { + "epoch": 0.3930947368421053, + "grad_norm": 0.416015625, + "learning_rate": 0.00034151322879968427, + "loss": 2.8609, + "step": 9336 + }, + { + "epoch": 0.3931368421052632, + "grad_norm": 0.39453125, + "learning_rate": 0.0003414818257385134, + "loss": 3.4161, + "step": 9337 + }, + { + "epoch": 0.3931789473684211, + "grad_norm": 0.41015625, + "learning_rate": 0.0003414504210106512, + "loss": 3.0461, + "step": 9338 + }, + { + "epoch": 0.39322105263157897, + "grad_norm": 0.462890625, + "learning_rate": 0.00034141901461666967, + "loss": 3.3991, + "step": 9339 + }, + { + "epoch": 0.3932631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.00034138760655714106, + "loss": 3.3579, + "step": 9340 + }, + { + "epoch": 0.3933052631578947, + "grad_norm": 0.376953125, + "learning_rate": 0.00034135619683263777, + "loss": 2.8748, + "step": 9341 + }, + { + "epoch": 0.3933473684210526, + "grad_norm": 0.40234375, + "learning_rate": 0.0003413247854437318, + "loss": 2.9787, + "step": 9342 + }, + { + "epoch": 0.3933894736842105, + "grad_norm": 3.609375, + "learning_rate": 0.0003412933723909956, + "loss": 3.2526, + "step": 9343 + }, + { + "epoch": 0.3934315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.00034126195767500146, + "loss": 3.4516, + "step": 9344 + }, + { + "epoch": 0.3934736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.00034123054129632166, + "loss": 3.3662, + "step": 9345 + }, + { + "epoch": 0.3935157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0003411991232555286, + "loss": 3.488, + "step": 9346 + }, + { + "epoch": 0.3935578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0003411677035531946, + "loss": 3.3121, + "step": 9347 + }, + { + "epoch": 0.3936, + "grad_norm": 0.447265625, + "learning_rate": 0.0003411362821898922, + "loss": 3.1821, + "step": 9348 + }, + { + "epoch": 0.3936421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0003411048591661938, + "loss": 2.9684, + "step": 9349 + }, + { + "epoch": 0.3936842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0003410734344826719, + "loss": 2.8663, + "step": 9350 + }, + { + "epoch": 0.3937263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.000341042008139899, + "loss": 3.4578, + "step": 9351 + }, + { + "epoch": 0.3937684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.00034101058013844785, + "loss": 3.1262, + "step": 9352 + }, + { + "epoch": 0.3938105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.00034097915047889073, + "loss": 2.8811, + "step": 9353 + }, + { + "epoch": 0.39385263157894734, + "grad_norm": 0.40234375, + "learning_rate": 0.0003409477191618004, + "loss": 3.334, + "step": 9354 + }, + { + "epoch": 0.39389473684210524, + "grad_norm": 0.5078125, + "learning_rate": 0.0003409162861877495, + "loss": 3.3667, + "step": 9355 + }, + { + "epoch": 0.39393684210526314, + "grad_norm": 0.443359375, + "learning_rate": 0.0003408848515573107, + "loss": 3.3957, + "step": 9356 + }, + { + "epoch": 0.39397894736842104, + "grad_norm": 0.439453125, + "learning_rate": 0.0003408534152710567, + "loss": 3.4571, + "step": 9357 + }, + { + "epoch": 0.39402105263157894, + "grad_norm": 0.4140625, + "learning_rate": 0.0003408219773295601, + "loss": 3.084, + "step": 9358 + }, + { + "epoch": 0.39406315789473684, + "grad_norm": 0.4140625, + "learning_rate": 0.0003407905377333939, + "loss": 3.1206, + "step": 9359 + }, + { + "epoch": 0.39410526315789474, + "grad_norm": 0.4140625, + "learning_rate": 0.0003407590964831308, + "loss": 3.3634, + "step": 9360 + }, + { + "epoch": 0.39414736842105264, + "grad_norm": 0.41015625, + "learning_rate": 0.0003407276535793435, + "loss": 3.0231, + "step": 9361 + }, + { + "epoch": 0.39418947368421053, + "grad_norm": 0.44140625, + "learning_rate": 0.00034069620902260503, + "loss": 3.1187, + "step": 9362 + }, + { + "epoch": 0.39423157894736843, + "grad_norm": 0.451171875, + "learning_rate": 0.0003406647628134882, + "loss": 3.3034, + "step": 9363 + }, + { + "epoch": 0.39427368421052633, + "grad_norm": 0.447265625, + "learning_rate": 0.0003406333149525659, + "loss": 3.0686, + "step": 9364 + }, + { + "epoch": 0.39431578947368423, + "grad_norm": 0.404296875, + "learning_rate": 0.00034060186544041107, + "loss": 3.0862, + "step": 9365 + }, + { + "epoch": 0.39435789473684213, + "grad_norm": 0.41796875, + "learning_rate": 0.0003405704142775967, + "loss": 2.8707, + "step": 9366 + }, + { + "epoch": 0.3944, + "grad_norm": 0.421875, + "learning_rate": 0.0003405389614646958, + "loss": 3.5068, + "step": 9367 + }, + { + "epoch": 0.39444210526315787, + "grad_norm": 0.474609375, + "learning_rate": 0.0003405075070022814, + "loss": 2.9891, + "step": 9368 + }, + { + "epoch": 0.39448421052631577, + "grad_norm": 0.41015625, + "learning_rate": 0.0003404760508909266, + "loss": 3.341, + "step": 9369 + }, + { + "epoch": 0.39452631578947367, + "grad_norm": 0.44921875, + "learning_rate": 0.00034044459313120444, + "loss": 3.3758, + "step": 9370 + }, + { + "epoch": 0.39456842105263157, + "grad_norm": 0.455078125, + "learning_rate": 0.000340413133723688, + "loss": 3.2097, + "step": 9371 + }, + { + "epoch": 0.39461052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.0003403816726689506, + "loss": 3.3644, + "step": 9372 + }, + { + "epoch": 0.39465263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0003403502099675653, + "loss": 3.1723, + "step": 9373 + }, + { + "epoch": 0.39469473684210526, + "grad_norm": 0.412109375, + "learning_rate": 0.0003403187456201053, + "loss": 3.0189, + "step": 9374 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 0.396484375, + "learning_rate": 0.0003402872796271438, + "loss": 3.0011, + "step": 9375 + }, + { + "epoch": 0.39477894736842106, + "grad_norm": 0.404296875, + "learning_rate": 0.00034025581198925424, + "loss": 3.3418, + "step": 9376 + }, + { + "epoch": 0.39482105263157896, + "grad_norm": 0.416015625, + "learning_rate": 0.0003402243427070099, + "loss": 3.0275, + "step": 9377 + }, + { + "epoch": 0.39486315789473686, + "grad_norm": 0.416015625, + "learning_rate": 0.0003401928717809839, + "loss": 3.2846, + "step": 9378 + }, + { + "epoch": 0.39490526315789476, + "grad_norm": 0.41796875, + "learning_rate": 0.0003401613992117498, + "loss": 3.0887, + "step": 9379 + }, + { + "epoch": 0.39494736842105266, + "grad_norm": 0.4375, + "learning_rate": 0.000340129924999881, + "loss": 3.2651, + "step": 9380 + }, + { + "epoch": 0.3949894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.0003400984491459508, + "loss": 3.2532, + "step": 9381 + }, + { + "epoch": 0.3950315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.00034006697165053274, + "loss": 3.5388, + "step": 9382 + }, + { + "epoch": 0.3950736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.00034003549251420025, + "loss": 3.6687, + "step": 9383 + }, + { + "epoch": 0.3951157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.0003400040117375269, + "loss": 2.8632, + "step": 9384 + }, + { + "epoch": 0.3951578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0003399725293210862, + "loss": 3.1374, + "step": 9385 + }, + { + "epoch": 0.3952, + "grad_norm": 0.44140625, + "learning_rate": 0.0003399410452654518, + "loss": 3.1815, + "step": 9386 + }, + { + "epoch": 0.3952421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0003399095595711971, + "loss": 3.5786, + "step": 9387 + }, + { + "epoch": 0.3952842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.00033987807223889596, + "loss": 3.2187, + "step": 9388 + }, + { + "epoch": 0.3953263157894737, + "grad_norm": 0.5859375, + "learning_rate": 0.0003398465832691219, + "loss": 3.2285, + "step": 9389 + }, + { + "epoch": 0.3953684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.0003398150926624487, + "loss": 3.3207, + "step": 9390 + }, + { + "epoch": 0.3954105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.00033978360041945, + "loss": 3.351, + "step": 9391 + }, + { + "epoch": 0.3954526315789474, + "grad_norm": 0.439453125, + "learning_rate": 0.0003397521065406996, + "loss": 3.1505, + "step": 9392 + }, + { + "epoch": 0.3954947368421053, + "grad_norm": 0.4453125, + "learning_rate": 0.0003397206110267713, + "loss": 3.4979, + "step": 9393 + }, + { + "epoch": 0.39553684210526313, + "grad_norm": 0.408203125, + "learning_rate": 0.0003396891138782389, + "loss": 2.8629, + "step": 9394 + }, + { + "epoch": 0.39557894736842103, + "grad_norm": 0.40234375, + "learning_rate": 0.0003396576150956761, + "loss": 3.326, + "step": 9395 + }, + { + "epoch": 0.39562105263157893, + "grad_norm": 0.41015625, + "learning_rate": 0.000339626114679657, + "loss": 3.1833, + "step": 9396 + }, + { + "epoch": 0.3956631578947368, + "grad_norm": 0.412109375, + "learning_rate": 0.0003395946126307554, + "loss": 3.1944, + "step": 9397 + }, + { + "epoch": 0.3957052631578947, + "grad_norm": 0.451171875, + "learning_rate": 0.0003395631089495452, + "loss": 3.4424, + "step": 9398 + }, + { + "epoch": 0.3957473684210526, + "grad_norm": 0.419921875, + "learning_rate": 0.00033953160363660046, + "loss": 3.2839, + "step": 9399 + }, + { + "epoch": 0.3957894736842105, + "grad_norm": 0.58984375, + "learning_rate": 0.000339500096692495, + "loss": 2.7449, + "step": 9400 + }, + { + "epoch": 0.3958315789473684, + "grad_norm": 0.462890625, + "learning_rate": 0.0003394685881178029, + "loss": 2.9628, + "step": 9401 + }, + { + "epoch": 0.3958736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00033943707791309833, + "loss": 3.2985, + "step": 9402 + }, + { + "epoch": 0.3959157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.0003394055660789553, + "loss": 3.1842, + "step": 9403 + }, + { + "epoch": 0.3959578947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.0003393740526159479, + "loss": 2.9547, + "step": 9404 + }, + { + "epoch": 0.396, + "grad_norm": 0.41796875, + "learning_rate": 0.0003393425375246503, + "loss": 3.5107, + "step": 9405 + }, + { + "epoch": 0.3960421052631579, + "grad_norm": 0.39453125, + "learning_rate": 0.00033931102080563653, + "loss": 2.876, + "step": 9406 + }, + { + "epoch": 0.3960842105263158, + "grad_norm": 0.439453125, + "learning_rate": 0.000339279502459481, + "loss": 3.3906, + "step": 9407 + }, + { + "epoch": 0.39612631578947366, + "grad_norm": 0.455078125, + "learning_rate": 0.0003392479824867577, + "loss": 3.4675, + "step": 9408 + }, + { + "epoch": 0.39616842105263156, + "grad_norm": 0.39453125, + "learning_rate": 0.0003392164608880411, + "loss": 2.8985, + "step": 9409 + }, + { + "epoch": 0.39621052631578946, + "grad_norm": 0.46484375, + "learning_rate": 0.0003391849376639055, + "loss": 2.9495, + "step": 9410 + }, + { + "epoch": 0.39625263157894736, + "grad_norm": 0.421875, + "learning_rate": 0.00033915341281492495, + "loss": 3.4323, + "step": 9411 + }, + { + "epoch": 0.39629473684210526, + "grad_norm": 0.40234375, + "learning_rate": 0.0003391218863416741, + "loss": 3.0022, + "step": 9412 + }, + { + "epoch": 0.39633684210526315, + "grad_norm": 0.40234375, + "learning_rate": 0.0003390903582447272, + "loss": 3.087, + "step": 9413 + }, + { + "epoch": 0.39637894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.0003390588285246586, + "loss": 3.0441, + "step": 9414 + }, + { + "epoch": 0.39642105263157895, + "grad_norm": 0.439453125, + "learning_rate": 0.00033902729718204275, + "loss": 3.4206, + "step": 9415 + }, + { + "epoch": 0.39646315789473685, + "grad_norm": 0.5546875, + "learning_rate": 0.0003389957642174542, + "loss": 3.0876, + "step": 9416 + }, + { + "epoch": 0.39650526315789475, + "grad_norm": 0.4140625, + "learning_rate": 0.00033896422963146735, + "loss": 3.0009, + "step": 9417 + }, + { + "epoch": 0.39654736842105265, + "grad_norm": 0.435546875, + "learning_rate": 0.0003389326934246568, + "loss": 3.3044, + "step": 9418 + }, + { + "epoch": 0.39658947368421055, + "grad_norm": 0.40625, + "learning_rate": 0.00033890115559759705, + "loss": 3.1981, + "step": 9419 + }, + { + "epoch": 0.39663157894736845, + "grad_norm": 0.431640625, + "learning_rate": 0.00033886961615086275, + "loss": 3.0344, + "step": 9420 + }, + { + "epoch": 0.3966736842105263, + "grad_norm": 0.45703125, + "learning_rate": 0.00033883807508502836, + "loss": 3.456, + "step": 9421 + }, + { + "epoch": 0.3967157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0003388065324006687, + "loss": 3.3726, + "step": 9422 + }, + { + "epoch": 0.3967578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00033877498809835836, + "loss": 3.187, + "step": 9423 + }, + { + "epoch": 0.3968, + "grad_norm": 0.39453125, + "learning_rate": 0.00033874344217867206, + "loss": 3.365, + "step": 9424 + }, + { + "epoch": 0.3968421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.00033871189464218445, + "loss": 3.4772, + "step": 9425 + }, + { + "epoch": 0.3968842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.00033868034548947037, + "loss": 3.1937, + "step": 9426 + }, + { + "epoch": 0.3969263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.0003386487947211046, + "loss": 3.2586, + "step": 9427 + }, + { + "epoch": 0.3969684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00033861724233766197, + "loss": 3.0654, + "step": 9428 + }, + { + "epoch": 0.3970105263157895, + "grad_norm": 0.455078125, + "learning_rate": 0.0003385856883397173, + "loss": 3.6218, + "step": 9429 + }, + { + "epoch": 0.3970526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.0003385541327278454, + "loss": 3.4129, + "step": 9430 + }, + { + "epoch": 0.3970947368421053, + "grad_norm": 0.458984375, + "learning_rate": 0.0003385225755026213, + "loss": 2.9773, + "step": 9431 + }, + { + "epoch": 0.3971368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.00033849101666461994, + "loss": 3.5603, + "step": 9432 + }, + { + "epoch": 0.3971789473684211, + "grad_norm": 0.4921875, + "learning_rate": 0.00033845945621441617, + "loss": 3.0165, + "step": 9433 + }, + { + "epoch": 0.397221052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.00033842789415258507, + "loss": 3.4725, + "step": 9434 + }, + { + "epoch": 0.3972631578947368, + "grad_norm": 0.416015625, + "learning_rate": 0.0003383963304797016, + "loss": 3.0359, + "step": 9435 + }, + { + "epoch": 0.3973052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.0003383647651963409, + "loss": 3.3101, + "step": 9436 + }, + { + "epoch": 0.3973473684210526, + "grad_norm": 0.4375, + "learning_rate": 0.00033833319830307797, + "loss": 3.3951, + "step": 9437 + }, + { + "epoch": 0.3973894736842105, + "grad_norm": 0.455078125, + "learning_rate": 0.00033830162980048796, + "loss": 3.1684, + "step": 9438 + }, + { + "epoch": 0.3974315789473684, + "grad_norm": 0.39453125, + "learning_rate": 0.00033827005968914604, + "loss": 3.0398, + "step": 9439 + }, + { + "epoch": 0.3974736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.0003382384879696273, + "loss": 3.1277, + "step": 9440 + }, + { + "epoch": 0.3975157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.000338206914642507, + "loss": 2.8026, + "step": 9441 + }, + { + "epoch": 0.3975578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.00033817533970836037, + "loss": 2.9444, + "step": 9442 + }, + { + "epoch": 0.3976, + "grad_norm": 0.515625, + "learning_rate": 0.0003381437631677625, + "loss": 3.4598, + "step": 9443 + }, + { + "epoch": 0.3976421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00033811218502128904, + "loss": 3.1898, + "step": 9444 + }, + { + "epoch": 0.3976842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.000338080605269515, + "loss": 3.2844, + "step": 9445 + }, + { + "epoch": 0.3977263157894737, + "grad_norm": 0.396484375, + "learning_rate": 0.0003380490239130158, + "loss": 3.1321, + "step": 9446 + }, + { + "epoch": 0.3977684210526316, + "grad_norm": 0.40234375, + "learning_rate": 0.000338017440952367, + "loss": 3.1293, + "step": 9447 + }, + { + "epoch": 0.39781052631578945, + "grad_norm": 0.416015625, + "learning_rate": 0.0003379858563881436, + "loss": 3.3363, + "step": 9448 + }, + { + "epoch": 0.39785263157894735, + "grad_norm": 0.439453125, + "learning_rate": 0.00033795427022092144, + "loss": 3.4328, + "step": 9449 + }, + { + "epoch": 0.39789473684210525, + "grad_norm": 0.427734375, + "learning_rate": 0.00033792268245127577, + "loss": 3.5661, + "step": 9450 + }, + { + "epoch": 0.39793684210526314, + "grad_norm": 0.41015625, + "learning_rate": 0.0003378910930797821, + "loss": 3.0374, + "step": 9451 + }, + { + "epoch": 0.39797894736842104, + "grad_norm": 0.451171875, + "learning_rate": 0.00033785950210701605, + "loss": 3.1964, + "step": 9452 + }, + { + "epoch": 0.39802105263157894, + "grad_norm": 0.43359375, + "learning_rate": 0.0003378279095335531, + "loss": 3.1254, + "step": 9453 + }, + { + "epoch": 0.39806315789473684, + "grad_norm": 0.400390625, + "learning_rate": 0.0003377963153599688, + "loss": 3.0871, + "step": 9454 + }, + { + "epoch": 0.39810526315789474, + "grad_norm": 0.466796875, + "learning_rate": 0.00033776471958683874, + "loss": 3.2675, + "step": 9455 + }, + { + "epoch": 0.39814736842105264, + "grad_norm": 0.478515625, + "learning_rate": 0.00033773312221473865, + "loss": 3.302, + "step": 9456 + }, + { + "epoch": 0.39818947368421054, + "grad_norm": 0.482421875, + "learning_rate": 0.0003377015232442442, + "loss": 3.0438, + "step": 9457 + }, + { + "epoch": 0.39823157894736844, + "grad_norm": 0.408203125, + "learning_rate": 0.000337669922675931, + "loss": 3.2306, + "step": 9458 + }, + { + "epoch": 0.39827368421052634, + "grad_norm": 0.4140625, + "learning_rate": 0.0003376383205103748, + "loss": 3.0962, + "step": 9459 + }, + { + "epoch": 0.39831578947368423, + "grad_norm": 0.40234375, + "learning_rate": 0.0003376067167481514, + "loss": 3.5966, + "step": 9460 + }, + { + "epoch": 0.3983578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00033757511138983654, + "loss": 3.5273, + "step": 9461 + }, + { + "epoch": 0.3984, + "grad_norm": 0.400390625, + "learning_rate": 0.0003375435044360061, + "loss": 3.3945, + "step": 9462 + }, + { + "epoch": 0.3984421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.0003375118958872358, + "loss": 3.7384, + "step": 9463 + }, + { + "epoch": 0.3984842105263158, + "grad_norm": 0.4140625, + "learning_rate": 0.0003374802857441016, + "loss": 3.5527, + "step": 9464 + }, + { + "epoch": 0.3985263157894737, + "grad_norm": 0.390625, + "learning_rate": 0.0003374486740071794, + "loss": 3.0801, + "step": 9465 + }, + { + "epoch": 0.39856842105263157, + "grad_norm": 0.423828125, + "learning_rate": 0.0003374170606770451, + "loss": 3.1779, + "step": 9466 + }, + { + "epoch": 0.39861052631578947, + "grad_norm": 0.4140625, + "learning_rate": 0.0003373854457542746, + "loss": 3.4532, + "step": 9467 + }, + { + "epoch": 0.39865263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.000337353829239444, + "loss": 3.077, + "step": 9468 + }, + { + "epoch": 0.39869473684210527, + "grad_norm": 0.408203125, + "learning_rate": 0.0003373222111331293, + "loss": 2.9798, + "step": 9469 + }, + { + "epoch": 0.39873684210526317, + "grad_norm": 0.412109375, + "learning_rate": 0.00033729059143590645, + "loss": 3.1474, + "step": 9470 + }, + { + "epoch": 0.39877894736842107, + "grad_norm": 0.419921875, + "learning_rate": 0.0003372589701483517, + "loss": 2.9066, + "step": 9471 + }, + { + "epoch": 0.39882105263157897, + "grad_norm": 0.42578125, + "learning_rate": 0.0003372273472710409, + "loss": 3.2572, + "step": 9472 + }, + { + "epoch": 0.39886315789473686, + "grad_norm": 0.404296875, + "learning_rate": 0.00033719572280455047, + "loss": 3.2899, + "step": 9473 + }, + { + "epoch": 0.39890526315789476, + "grad_norm": 0.423828125, + "learning_rate": 0.0003371640967494563, + "loss": 3.5375, + "step": 9474 + }, + { + "epoch": 0.3989473684210526, + "grad_norm": 0.400390625, + "learning_rate": 0.0003371324691063347, + "loss": 2.7358, + "step": 9475 + }, + { + "epoch": 0.3989894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.0003371008398757619, + "loss": 3.2017, + "step": 9476 + }, + { + "epoch": 0.3990315789473684, + "grad_norm": 0.419921875, + "learning_rate": 0.0003370692090583142, + "loss": 2.9936, + "step": 9477 + }, + { + "epoch": 0.3990736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.00033703757665456773, + "loss": 3.5172, + "step": 9478 + }, + { + "epoch": 0.3991157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.000337005942665099, + "loss": 3.5242, + "step": 9479 + }, + { + "epoch": 0.3991578947368421, + "grad_norm": 0.404296875, + "learning_rate": 0.00033697430709048415, + "loss": 3.3897, + "step": 9480 + }, + { + "epoch": 0.3992, + "grad_norm": 0.41015625, + "learning_rate": 0.00033694266993129963, + "loss": 2.9709, + "step": 9481 + }, + { + "epoch": 0.3992421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.0003369110311881217, + "loss": 3.229, + "step": 9482 + }, + { + "epoch": 0.3992842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.000336879390861527, + "loss": 2.7598, + "step": 9483 + }, + { + "epoch": 0.3993263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00033684774895209184, + "loss": 3.261, + "step": 9484 + }, + { + "epoch": 0.3993684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00033681610546039284, + "loss": 3.0724, + "step": 9485 + }, + { + "epoch": 0.3994105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.0003367844603870063, + "loss": 3.4202, + "step": 9486 + }, + { + "epoch": 0.3994526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.00033675281373250895, + "loss": 3.1828, + "step": 9487 + }, + { + "epoch": 0.39949473684210524, + "grad_norm": 0.41015625, + "learning_rate": 0.0003367211654974772, + "loss": 3.0022, + "step": 9488 + }, + { + "epoch": 0.39953684210526313, + "grad_norm": 0.421875, + "learning_rate": 0.0003366895156824877, + "loss": 3.2549, + "step": 9489 + }, + { + "epoch": 0.39957894736842103, + "grad_norm": 0.42578125, + "learning_rate": 0.00033665786428811707, + "loss": 3.2343, + "step": 9490 + }, + { + "epoch": 0.39962105263157893, + "grad_norm": 0.453125, + "learning_rate": 0.000336626211314942, + "loss": 3.3211, + "step": 9491 + }, + { + "epoch": 0.39966315789473683, + "grad_norm": 0.427734375, + "learning_rate": 0.0003365945567635391, + "loss": 3.2794, + "step": 9492 + }, + { + "epoch": 0.39970526315789473, + "grad_norm": 0.447265625, + "learning_rate": 0.00033656290063448517, + "loss": 2.8306, + "step": 9493 + }, + { + "epoch": 0.39974736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.0003365312429283569, + "loss": 2.3731, + "step": 9494 + }, + { + "epoch": 0.39978947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.000336499583645731, + "loss": 2.9537, + "step": 9495 + }, + { + "epoch": 0.3998315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.0003364679227871844, + "loss": 2.9668, + "step": 9496 + }, + { + "epoch": 0.3998736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.0003364362603532938, + "loss": 2.8881, + "step": 9497 + }, + { + "epoch": 0.3999157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.0003364045963446361, + "loss": 3.194, + "step": 9498 + }, + { + "epoch": 0.3999578947368421, + "grad_norm": 0.46484375, + "learning_rate": 0.0003363729307617882, + "loss": 3.1098, + "step": 9499 + }, + { + "epoch": 0.4, + "grad_norm": 0.42578125, + "learning_rate": 0.0003363412636053269, + "loss": 3.5797, + "step": 9500 + }, + { + "epoch": 0.4000421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.0003363095948758293, + "loss": 3.3311, + "step": 9501 + }, + { + "epoch": 0.40008421052631576, + "grad_norm": 0.423828125, + "learning_rate": 0.0003362779245738722, + "loss": 3.476, + "step": 9502 + }, + { + "epoch": 0.40012631578947366, + "grad_norm": 0.416015625, + "learning_rate": 0.00033624625270003283, + "loss": 3.073, + "step": 9503 + }, + { + "epoch": 0.40016842105263156, + "grad_norm": 0.4375, + "learning_rate": 0.00033621457925488797, + "loss": 3.0007, + "step": 9504 + }, + { + "epoch": 0.40021052631578946, + "grad_norm": 0.42578125, + "learning_rate": 0.0003361829042390148, + "loss": 3.6113, + "step": 9505 + }, + { + "epoch": 0.40025263157894736, + "grad_norm": 0.4609375, + "learning_rate": 0.00033615122765299045, + "loss": 3.2049, + "step": 9506 + }, + { + "epoch": 0.40029473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.0003361195494973919, + "loss": 3.5253, + "step": 9507 + }, + { + "epoch": 0.40033684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.00033608786977279626, + "loss": 3.234, + "step": 9508 + }, + { + "epoch": 0.40037894736842106, + "grad_norm": 0.416015625, + "learning_rate": 0.0003360561884797809, + "loss": 3.1876, + "step": 9509 + }, + { + "epoch": 0.40042105263157896, + "grad_norm": 0.42578125, + "learning_rate": 0.0003360245056189228, + "loss": 3.2033, + "step": 9510 + }, + { + "epoch": 0.40046315789473685, + "grad_norm": 0.421875, + "learning_rate": 0.0003359928211907993, + "loss": 3.1127, + "step": 9511 + }, + { + "epoch": 0.40050526315789475, + "grad_norm": 0.412109375, + "learning_rate": 0.0003359611351959877, + "loss": 3.5139, + "step": 9512 + }, + { + "epoch": 0.40054736842105265, + "grad_norm": 0.4140625, + "learning_rate": 0.0003359294476350652, + "loss": 3.2633, + "step": 9513 + }, + { + "epoch": 0.40058947368421055, + "grad_norm": 0.419921875, + "learning_rate": 0.00033589775850860915, + "loss": 3.2925, + "step": 9514 + }, + { + "epoch": 0.4006315789473684, + "grad_norm": 0.48828125, + "learning_rate": 0.00033586606781719683, + "loss": 3.0153, + "step": 9515 + }, + { + "epoch": 0.4006736842105263, + "grad_norm": 0.392578125, + "learning_rate": 0.00033583437556140557, + "loss": 2.9299, + "step": 9516 + }, + { + "epoch": 0.4007157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.00033580268174181297, + "loss": 3.1478, + "step": 9517 + }, + { + "epoch": 0.4007578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00033577098635899615, + "loss": 3.4643, + "step": 9518 + }, + { + "epoch": 0.4008, + "grad_norm": 0.396484375, + "learning_rate": 0.0003357392894135329, + "loss": 3.1587, + "step": 9519 + }, + { + "epoch": 0.4008421052631579, + "grad_norm": 0.396484375, + "learning_rate": 0.00033570759090600043, + "loss": 3.1527, + "step": 9520 + }, + { + "epoch": 0.4008842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.00033567589083697647, + "loss": 3.3283, + "step": 9521 + }, + { + "epoch": 0.4009263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.00033564418920703834, + "loss": 3.4517, + "step": 9522 + }, + { + "epoch": 0.4009684210526316, + "grad_norm": 1.5625, + "learning_rate": 0.0003356124860167637, + "loss": 3.0725, + "step": 9523 + }, + { + "epoch": 0.4010105263157895, + "grad_norm": 0.44921875, + "learning_rate": 0.00033558078126673033, + "loss": 3.3593, + "step": 9524 + }, + { + "epoch": 0.4010526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00033554907495751555, + "loss": 3.31, + "step": 9525 + }, + { + "epoch": 0.4010947368421053, + "grad_norm": 0.5078125, + "learning_rate": 0.0003355173670896971, + "loss": 2.5479, + "step": 9526 + }, + { + "epoch": 0.4011368421052632, + "grad_norm": 0.439453125, + "learning_rate": 0.00033548565766385274, + "loss": 3.721, + "step": 9527 + }, + { + "epoch": 0.4011789473684211, + "grad_norm": 0.4296875, + "learning_rate": 0.00033545394668056016, + "loss": 2.8006, + "step": 9528 + }, + { + "epoch": 0.4012210526315789, + "grad_norm": 0.435546875, + "learning_rate": 0.000335422234140397, + "loss": 3.1794, + "step": 9529 + }, + { + "epoch": 0.4012631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.0003353905200439412, + "loss": 3.2252, + "step": 9530 + }, + { + "epoch": 0.4013052631578947, + "grad_norm": 0.408203125, + "learning_rate": 0.00033535880439177043, + "loss": 3.2743, + "step": 9531 + }, + { + "epoch": 0.4013473684210526, + "grad_norm": 0.4765625, + "learning_rate": 0.0003353270871844625, + "loss": 2.9152, + "step": 9532 + }, + { + "epoch": 0.4013894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.0003352953684225953, + "loss": 3.2095, + "step": 9533 + }, + { + "epoch": 0.4014315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.00033526364810674674, + "loss": 3.0716, + "step": 9534 + }, + { + "epoch": 0.4014736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.0003352319262374947, + "loss": 3.0851, + "step": 9535 + }, + { + "epoch": 0.4015157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0003352002028154171, + "loss": 2.9024, + "step": 9536 + }, + { + "epoch": 0.4015578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00033516847784109184, + "loss": 3.2751, + "step": 9537 + }, + { + "epoch": 0.4016, + "grad_norm": 0.408203125, + "learning_rate": 0.0003351367513150971, + "loss": 2.552, + "step": 9538 + }, + { + "epoch": 0.4016421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.0003351050232380107, + "loss": 3.4615, + "step": 9539 + }, + { + "epoch": 0.4016842105263158, + "grad_norm": 0.46484375, + "learning_rate": 0.0003350732936104108, + "loss": 3.3881, + "step": 9540 + }, + { + "epoch": 0.4017263157894737, + "grad_norm": 0.66796875, + "learning_rate": 0.0003350415624328754, + "loss": 2.6621, + "step": 9541 + }, + { + "epoch": 0.40176842105263155, + "grad_norm": 0.51171875, + "learning_rate": 0.00033500982970598273, + "loss": 3.0586, + "step": 9542 + }, + { + "epoch": 0.40181052631578945, + "grad_norm": 0.478515625, + "learning_rate": 0.0003349780954303108, + "loss": 3.2474, + "step": 9543 + }, + { + "epoch": 0.40185263157894735, + "grad_norm": 0.4375, + "learning_rate": 0.00033494635960643775, + "loss": 3.3123, + "step": 9544 + }, + { + "epoch": 0.40189473684210525, + "grad_norm": 0.46484375, + "learning_rate": 0.00033491462223494197, + "loss": 2.974, + "step": 9545 + }, + { + "epoch": 0.40193684210526315, + "grad_norm": 0.427734375, + "learning_rate": 0.00033488288331640147, + "loss": 3.0403, + "step": 9546 + }, + { + "epoch": 0.40197894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0003348511428513946, + "loss": 3.1178, + "step": 9547 + }, + { + "epoch": 0.40202105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.0003348194008404996, + "loss": 3.0713, + "step": 9548 + }, + { + "epoch": 0.40206315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.0003347876572842947, + "loss": 3.0525, + "step": 9549 + }, + { + "epoch": 0.40210526315789474, + "grad_norm": 0.447265625, + "learning_rate": 0.00033475591218335836, + "loss": 3.2282, + "step": 9550 + }, + { + "epoch": 0.40214736842105264, + "grad_norm": 0.421875, + "learning_rate": 0.0003347241655382689, + "loss": 3.1419, + "step": 9551 + }, + { + "epoch": 0.40218947368421054, + "grad_norm": 0.40234375, + "learning_rate": 0.0003346924173496046, + "loss": 3.5581, + "step": 9552 + }, + { + "epoch": 0.40223157894736844, + "grad_norm": 0.408203125, + "learning_rate": 0.00033466066761794413, + "loss": 2.9949, + "step": 9553 + }, + { + "epoch": 0.40227368421052634, + "grad_norm": 0.421875, + "learning_rate": 0.0003346289163438657, + "loss": 3.4723, + "step": 9554 + }, + { + "epoch": 0.40231578947368424, + "grad_norm": 0.41796875, + "learning_rate": 0.0003345971635279478, + "loss": 3.3126, + "step": 9555 + }, + { + "epoch": 0.4023578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.000334565409170769, + "loss": 3.5224, + "step": 9556 + }, + { + "epoch": 0.4024, + "grad_norm": 0.4765625, + "learning_rate": 0.00033453365327290786, + "loss": 3.1665, + "step": 9557 + }, + { + "epoch": 0.4024421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.00033450189583494286, + "loss": 3.6346, + "step": 9558 + }, + { + "epoch": 0.4024842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.0003344701368574525, + "loss": 2.9938, + "step": 9559 + }, + { + "epoch": 0.4025263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.0003344383763410156, + "loss": 3.3557, + "step": 9560 + }, + { + "epoch": 0.4025684210526316, + "grad_norm": 0.400390625, + "learning_rate": 0.00033440661428621064, + "loss": 2.9688, + "step": 9561 + }, + { + "epoch": 0.4026105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.0003343748506936164, + "loss": 2.9146, + "step": 9562 + }, + { + "epoch": 0.4026526315789474, + "grad_norm": 0.408203125, + "learning_rate": 0.0003343430855638115, + "loss": 3.3913, + "step": 9563 + }, + { + "epoch": 0.40269473684210527, + "grad_norm": 0.419921875, + "learning_rate": 0.00033431131889737466, + "loss": 3.3432, + "step": 9564 + }, + { + "epoch": 0.40273684210526317, + "grad_norm": 0.40234375, + "learning_rate": 0.0003342795506948847, + "loss": 3.1368, + "step": 9565 + }, + { + "epoch": 0.40277894736842107, + "grad_norm": 0.423828125, + "learning_rate": 0.00033424778095692025, + "loss": 3.4826, + "step": 9566 + }, + { + "epoch": 0.40282105263157897, + "grad_norm": 0.41796875, + "learning_rate": 0.0003342160096840603, + "loss": 3.2695, + "step": 9567 + }, + { + "epoch": 0.40286315789473687, + "grad_norm": 0.423828125, + "learning_rate": 0.00033418423687688357, + "loss": 3.2549, + "step": 9568 + }, + { + "epoch": 0.4029052631578947, + "grad_norm": 0.412109375, + "learning_rate": 0.000334152462535969, + "loss": 3.4559, + "step": 9569 + }, + { + "epoch": 0.4029473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.00033412068666189535, + "loss": 3.5343, + "step": 9570 + }, + { + "epoch": 0.4029894736842105, + "grad_norm": 0.40234375, + "learning_rate": 0.0003340889092552417, + "loss": 3.3089, + "step": 9571 + }, + { + "epoch": 0.4030315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.0003340571303165869, + "loss": 2.8076, + "step": 9572 + }, + { + "epoch": 0.4030736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.00033402534984650994, + "loss": 3.341, + "step": 9573 + }, + { + "epoch": 0.4031157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00033399356784558987, + "loss": 3.4694, + "step": 9574 + }, + { + "epoch": 0.4031578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.0003339617843144057, + "loss": 3.0503, + "step": 9575 + }, + { + "epoch": 0.4032, + "grad_norm": 0.408203125, + "learning_rate": 0.0003339299992535364, + "loss": 3.153, + "step": 9576 + }, + { + "epoch": 0.4032421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.0003338982126635611, + "loss": 3.301, + "step": 9577 + }, + { + "epoch": 0.4032842105263158, + "grad_norm": 0.49609375, + "learning_rate": 0.000333866424545059, + "loss": 3.4547, + "step": 9578 + }, + { + "epoch": 0.4033263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.0003338346348986092, + "loss": 3.3145, + "step": 9579 + }, + { + "epoch": 0.4033684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.00033380284372479084, + "loss": 2.9829, + "step": 9580 + }, + { + "epoch": 0.4034105263157895, + "grad_norm": 0.490234375, + "learning_rate": 0.00033377105102418316, + "loss": 2.9806, + "step": 9581 + }, + { + "epoch": 0.4034526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.0003337392567973653, + "loss": 3.078, + "step": 9582 + }, + { + "epoch": 0.40349473684210524, + "grad_norm": 0.515625, + "learning_rate": 0.0003337074610449166, + "loss": 3.2246, + "step": 9583 + }, + { + "epoch": 0.40353684210526314, + "grad_norm": 0.408203125, + "learning_rate": 0.0003336756637674163, + "loss": 2.7454, + "step": 9584 + }, + { + "epoch": 0.40357894736842104, + "grad_norm": 0.42578125, + "learning_rate": 0.0003336438649654437, + "loss": 3.0879, + "step": 9585 + }, + { + "epoch": 0.40362105263157894, + "grad_norm": 0.484375, + "learning_rate": 0.00033361206463957814, + "loss": 3.4993, + "step": 9586 + }, + { + "epoch": 0.40366315789473683, + "grad_norm": 0.423828125, + "learning_rate": 0.00033358026279039913, + "loss": 2.9112, + "step": 9587 + }, + { + "epoch": 0.40370526315789473, + "grad_norm": 0.4296875, + "learning_rate": 0.0003335484594184858, + "loss": 3.2015, + "step": 9588 + }, + { + "epoch": 0.40374736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0003335166545244178, + "loss": 3.2426, + "step": 9589 + }, + { + "epoch": 0.40378947368421053, + "grad_norm": 0.4765625, + "learning_rate": 0.00033348484810877445, + "loss": 3.646, + "step": 9590 + }, + { + "epoch": 0.40383157894736843, + "grad_norm": 0.453125, + "learning_rate": 0.00033345304017213525, + "loss": 3.3021, + "step": 9591 + }, + { + "epoch": 0.40387368421052633, + "grad_norm": 0.42578125, + "learning_rate": 0.00033342123071507967, + "loss": 3.0405, + "step": 9592 + }, + { + "epoch": 0.40391578947368423, + "grad_norm": 0.408203125, + "learning_rate": 0.0003333894197381873, + "loss": 3.0886, + "step": 9593 + }, + { + "epoch": 0.4039578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00033335760724203774, + "loss": 2.9195, + "step": 9594 + }, + { + "epoch": 0.404, + "grad_norm": 0.41015625, + "learning_rate": 0.00033332579322721046, + "loss": 3.2085, + "step": 9595 + }, + { + "epoch": 0.40404210526315787, + "grad_norm": 0.4375, + "learning_rate": 0.00033329397769428515, + "loss": 3.441, + "step": 9596 + }, + { + "epoch": 0.40408421052631577, + "grad_norm": 0.431640625, + "learning_rate": 0.0003332621606438414, + "loss": 3.1667, + "step": 9597 + }, + { + "epoch": 0.40412631578947367, + "grad_norm": 0.419921875, + "learning_rate": 0.00033323034207645896, + "loss": 3.125, + "step": 9598 + }, + { + "epoch": 0.40416842105263157, + "grad_norm": 0.404296875, + "learning_rate": 0.0003331985219927175, + "loss": 3.143, + "step": 9599 + }, + { + "epoch": 0.40421052631578946, + "grad_norm": 0.41015625, + "learning_rate": 0.0003331667003931967, + "loss": 3.0768, + "step": 9600 + }, + { + "epoch": 0.40425263157894736, + "grad_norm": 0.439453125, + "learning_rate": 0.0003331348772784762, + "loss": 2.6815, + "step": 9601 + }, + { + "epoch": 0.40429473684210526, + "grad_norm": 0.435546875, + "learning_rate": 0.00033310305264913615, + "loss": 2.9858, + "step": 9602 + }, + { + "epoch": 0.40433684210526316, + "grad_norm": 0.45703125, + "learning_rate": 0.00033307122650575593, + "loss": 3.0674, + "step": 9603 + }, + { + "epoch": 0.40437894736842106, + "grad_norm": 0.447265625, + "learning_rate": 0.0003330393988489156, + "loss": 3.166, + "step": 9604 + }, + { + "epoch": 0.40442105263157896, + "grad_norm": 0.447265625, + "learning_rate": 0.00033300756967919517, + "loss": 3.4479, + "step": 9605 + }, + { + "epoch": 0.40446315789473686, + "grad_norm": 0.462890625, + "learning_rate": 0.0003329757389971742, + "loss": 3.4272, + "step": 9606 + }, + { + "epoch": 0.40450526315789476, + "grad_norm": 0.421875, + "learning_rate": 0.0003329439068034328, + "loss": 3.4884, + "step": 9607 + }, + { + "epoch": 0.40454736842105266, + "grad_norm": 0.4296875, + "learning_rate": 0.00033291207309855085, + "loss": 3.1125, + "step": 9608 + }, + { + "epoch": 0.4045894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.0003328802378831083, + "loss": 3.3973, + "step": 9609 + }, + { + "epoch": 0.4046315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00033284840115768526, + "loss": 3.26, + "step": 9610 + }, + { + "epoch": 0.4046736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.0003328165629228617, + "loss": 3.6006, + "step": 9611 + }, + { + "epoch": 0.4047157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0003327847231792176, + "loss": 3.4155, + "step": 9612 + }, + { + "epoch": 0.4047578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00033275288192733315, + "loss": 2.8228, + "step": 9613 + }, + { + "epoch": 0.4048, + "grad_norm": 0.447265625, + "learning_rate": 0.00033272103916778844, + "loss": 3.4277, + "step": 9614 + }, + { + "epoch": 0.4048421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0003326891949011636, + "loss": 3.0916, + "step": 9615 + }, + { + "epoch": 0.4048842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.0003326573491280388, + "loss": 3.5064, + "step": 9616 + }, + { + "epoch": 0.4049263157894737, + "grad_norm": 0.390625, + "learning_rate": 0.00033262550184899413, + "loss": 3.0846, + "step": 9617 + }, + { + "epoch": 0.4049684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.00033259365306461, + "loss": 2.8935, + "step": 9618 + }, + { + "epoch": 0.4050105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.0003325618027754664, + "loss": 3.2446, + "step": 9619 + }, + { + "epoch": 0.4050526315789474, + "grad_norm": 0.3984375, + "learning_rate": 0.0003325299509821439, + "loss": 3.2455, + "step": 9620 + }, + { + "epoch": 0.4050947368421053, + "grad_norm": 0.466796875, + "learning_rate": 0.0003324980976852227, + "loss": 3.3241, + "step": 9621 + }, + { + "epoch": 0.4051368421052632, + "grad_norm": 0.435546875, + "learning_rate": 0.00033246624288528297, + "loss": 3.1677, + "step": 9622 + }, + { + "epoch": 0.405178947368421, + "grad_norm": 0.3984375, + "learning_rate": 0.0003324343865829052, + "loss": 3.3264, + "step": 9623 + }, + { + "epoch": 0.4052210526315789, + "grad_norm": 0.419921875, + "learning_rate": 0.0003324025287786698, + "loss": 3.0443, + "step": 9624 + }, + { + "epoch": 0.4052631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.0003323706694731572, + "loss": 3.5171, + "step": 9625 + }, + { + "epoch": 0.4053052631578947, + "grad_norm": 0.40625, + "learning_rate": 0.0003323388086669477, + "loss": 3.3871, + "step": 9626 + }, + { + "epoch": 0.4053473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.00033230694636062185, + "loss": 2.7277, + "step": 9627 + }, + { + "epoch": 0.4053894736842105, + "grad_norm": 0.3984375, + "learning_rate": 0.0003322750825547602, + "loss": 3.2167, + "step": 9628 + }, + { + "epoch": 0.4054315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.00033224321724994316, + "loss": 3.1942, + "step": 9629 + }, + { + "epoch": 0.4054736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.0003322113504467513, + "loss": 2.9969, + "step": 9630 + }, + { + "epoch": 0.4055157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0003321794821457653, + "loss": 3.6783, + "step": 9631 + }, + { + "epoch": 0.4055578947368421, + "grad_norm": 0.484375, + "learning_rate": 0.00033214761234756575, + "loss": 3.2277, + "step": 9632 + }, + { + "epoch": 0.4056, + "grad_norm": 0.41015625, + "learning_rate": 0.00033211574105273314, + "loss": 3.4145, + "step": 9633 + }, + { + "epoch": 0.4056421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.0003320838682618482, + "loss": 3.4892, + "step": 9634 + }, + { + "epoch": 0.4056842105263158, + "grad_norm": 0.38671875, + "learning_rate": 0.0003320519939754917, + "loss": 2.8889, + "step": 9635 + }, + { + "epoch": 0.40572631578947366, + "grad_norm": 0.431640625, + "learning_rate": 0.00033202011819424414, + "loss": 3.0783, + "step": 9636 + }, + { + "epoch": 0.40576842105263156, + "grad_norm": 0.40625, + "learning_rate": 0.0003319882409186865, + "loss": 3.6964, + "step": 9637 + }, + { + "epoch": 0.40581052631578945, + "grad_norm": 0.416015625, + "learning_rate": 0.0003319563621493994, + "loss": 3.2361, + "step": 9638 + }, + { + "epoch": 0.40585263157894735, + "grad_norm": 0.416015625, + "learning_rate": 0.00033192448188696375, + "loss": 3.5128, + "step": 9639 + }, + { + "epoch": 0.40589473684210525, + "grad_norm": 0.40625, + "learning_rate": 0.00033189260013196027, + "loss": 3.2741, + "step": 9640 + }, + { + "epoch": 0.40593684210526315, + "grad_norm": 0.431640625, + "learning_rate": 0.00033186071688496975, + "loss": 3.055, + "step": 9641 + }, + { + "epoch": 0.40597894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.00033182883214657327, + "loss": 3.0144, + "step": 9642 + }, + { + "epoch": 0.40602105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.00033179694591735153, + "loss": 3.4203, + "step": 9643 + }, + { + "epoch": 0.40606315789473685, + "grad_norm": 0.546875, + "learning_rate": 0.0003317650581978855, + "loss": 2.8581, + "step": 9644 + }, + { + "epoch": 0.40610526315789475, + "grad_norm": 0.423828125, + "learning_rate": 0.0003317331689887563, + "loss": 3.7203, + "step": 9645 + }, + { + "epoch": 0.40614736842105265, + "grad_norm": 0.4140625, + "learning_rate": 0.00033170127829054474, + "loss": 3.0822, + "step": 9646 + }, + { + "epoch": 0.40618947368421054, + "grad_norm": 0.45703125, + "learning_rate": 0.0003316693861038319, + "loss": 3.0134, + "step": 9647 + }, + { + "epoch": 0.40623157894736844, + "grad_norm": 0.419921875, + "learning_rate": 0.00033163749242919884, + "loss": 2.9565, + "step": 9648 + }, + { + "epoch": 0.40627368421052634, + "grad_norm": 0.462890625, + "learning_rate": 0.0003316055972672265, + "loss": 3.0561, + "step": 9649 + }, + { + "epoch": 0.4063157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00033157370061849604, + "loss": 3.2031, + "step": 9650 + }, + { + "epoch": 0.4063578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00033154180248358867, + "loss": 3.0986, + "step": 9651 + }, + { + "epoch": 0.4064, + "grad_norm": 0.419921875, + "learning_rate": 0.00033150990286308547, + "loss": 3.2941, + "step": 9652 + }, + { + "epoch": 0.4064421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.0003314780017575676, + "loss": 3.1769, + "step": 9653 + }, + { + "epoch": 0.4064842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0003314460991676163, + "loss": 3.2008, + "step": 9654 + }, + { + "epoch": 0.4065263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.0003314141950938127, + "loss": 3.3516, + "step": 9655 + }, + { + "epoch": 0.4065684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.0003313822895367382, + "loss": 3.0113, + "step": 9656 + }, + { + "epoch": 0.4066105263157895, + "grad_norm": 0.40234375, + "learning_rate": 0.00033135038249697393, + "loss": 3.2706, + "step": 9657 + }, + { + "epoch": 0.4066526315789474, + "grad_norm": 0.390625, + "learning_rate": 0.0003313184739751013, + "loss": 3.1267, + "step": 9658 + }, + { + "epoch": 0.4066947368421053, + "grad_norm": 0.4375, + "learning_rate": 0.00033128656397170167, + "loss": 2.7876, + "step": 9659 + }, + { + "epoch": 0.4067368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.0003312546524873562, + "loss": 3.3756, + "step": 9660 + }, + { + "epoch": 0.4067789473684211, + "grad_norm": 0.57421875, + "learning_rate": 0.00033122273952264656, + "loss": 3.3481, + "step": 9661 + }, + { + "epoch": 0.40682105263157897, + "grad_norm": 0.427734375, + "learning_rate": 0.000331190825078154, + "loss": 3.3248, + "step": 9662 + }, + { + "epoch": 0.4068631578947368, + "grad_norm": 0.4375, + "learning_rate": 0.00033115890915446007, + "loss": 2.9776, + "step": 9663 + }, + { + "epoch": 0.4069052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.0003311269917521461, + "loss": 2.8864, + "step": 9664 + }, + { + "epoch": 0.4069473684210526, + "grad_norm": 0.419921875, + "learning_rate": 0.00033109507287179367, + "loss": 3.0867, + "step": 9665 + }, + { + "epoch": 0.4069894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.00033106315251398434, + "loss": 3.1672, + "step": 9666 + }, + { + "epoch": 0.4070315789473684, + "grad_norm": 0.46875, + "learning_rate": 0.0003310312306792996, + "loss": 3.0602, + "step": 9667 + }, + { + "epoch": 0.4070736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00033099930736832096, + "loss": 2.9953, + "step": 9668 + }, + { + "epoch": 0.4071157894736842, + "grad_norm": 0.388671875, + "learning_rate": 0.0003309673825816302, + "loss": 3.3506, + "step": 9669 + }, + { + "epoch": 0.4071578947368421, + "grad_norm": 0.458984375, + "learning_rate": 0.0003309354563198088, + "loss": 2.8665, + "step": 9670 + }, + { + "epoch": 0.4072, + "grad_norm": 0.416015625, + "learning_rate": 0.00033090352858343853, + "loss": 2.6563, + "step": 9671 + }, + { + "epoch": 0.4072421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00033087159937310106, + "loss": 2.9886, + "step": 9672 + }, + { + "epoch": 0.4072842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00033083966868937805, + "loss": 3.4338, + "step": 9673 + }, + { + "epoch": 0.4073263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0003308077365328512, + "loss": 3.3274, + "step": 9674 + }, + { + "epoch": 0.4073684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.0003307758029041024, + "loss": 3.4929, + "step": 9675 + }, + { + "epoch": 0.4074105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.00033074386780371337, + "loss": 3.0753, + "step": 9676 + }, + { + "epoch": 0.40745263157894734, + "grad_norm": 0.404296875, + "learning_rate": 0.0003307119312322659, + "loss": 3.265, + "step": 9677 + }, + { + "epoch": 0.40749473684210524, + "grad_norm": 0.419921875, + "learning_rate": 0.0003306799931903419, + "loss": 3.2584, + "step": 9678 + }, + { + "epoch": 0.40753684210526314, + "grad_norm": 0.4140625, + "learning_rate": 0.00033064805367852317, + "loss": 3.1428, + "step": 9679 + }, + { + "epoch": 0.40757894736842104, + "grad_norm": 0.419921875, + "learning_rate": 0.0003306161126973918, + "loss": 3.2893, + "step": 9680 + }, + { + "epoch": 0.40762105263157894, + "grad_norm": 0.4375, + "learning_rate": 0.0003305841702475295, + "loss": 3.3859, + "step": 9681 + }, + { + "epoch": 0.40766315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.00033055222632951824, + "loss": 3.5214, + "step": 9682 + }, + { + "epoch": 0.40770526315789474, + "grad_norm": 0.400390625, + "learning_rate": 0.0003305202809439401, + "loss": 3.074, + "step": 9683 + }, + { + "epoch": 0.40774736842105264, + "grad_norm": 0.40625, + "learning_rate": 0.000330488334091377, + "loss": 2.8221, + "step": 9684 + }, + { + "epoch": 0.40778947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.0003304563857724111, + "loss": 3.2497, + "step": 9685 + }, + { + "epoch": 0.40783157894736843, + "grad_norm": 0.5234375, + "learning_rate": 0.0003304244359876243, + "loss": 3.4423, + "step": 9686 + }, + { + "epoch": 0.40787368421052633, + "grad_norm": 0.416015625, + "learning_rate": 0.00033039248473759885, + "loss": 3.5976, + "step": 9687 + }, + { + "epoch": 0.40791578947368423, + "grad_norm": 0.447265625, + "learning_rate": 0.00033036053202291685, + "loss": 3.0927, + "step": 9688 + }, + { + "epoch": 0.40795789473684213, + "grad_norm": 0.42578125, + "learning_rate": 0.0003303285778441603, + "loss": 3.2877, + "step": 9689 + }, + { + "epoch": 0.408, + "grad_norm": 0.416015625, + "learning_rate": 0.0003302966222019114, + "loss": 3.482, + "step": 9690 + }, + { + "epoch": 0.4080421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.0003302646650967524, + "loss": 3.1242, + "step": 9691 + }, + { + "epoch": 0.40808421052631577, + "grad_norm": 0.41796875, + "learning_rate": 0.0003302327065292655, + "loss": 2.9952, + "step": 9692 + }, + { + "epoch": 0.40812631578947367, + "grad_norm": 0.388671875, + "learning_rate": 0.00033020074650003307, + "loss": 3.0478, + "step": 9693 + }, + { + "epoch": 0.40816842105263157, + "grad_norm": 0.408203125, + "learning_rate": 0.00033016878500963714, + "loss": 2.9878, + "step": 9694 + }, + { + "epoch": 0.40821052631578947, + "grad_norm": 0.396484375, + "learning_rate": 0.0003301368220586602, + "loss": 3.0597, + "step": 9695 + }, + { + "epoch": 0.40825263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.0003301048576476845, + "loss": 3.2524, + "step": 9696 + }, + { + "epoch": 0.40829473684210527, + "grad_norm": 0.494140625, + "learning_rate": 0.00033007289177729236, + "loss": 3.2568, + "step": 9697 + }, + { + "epoch": 0.40833684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.0003300409244480663, + "loss": 3.2716, + "step": 9698 + }, + { + "epoch": 0.40837894736842106, + "grad_norm": 0.421875, + "learning_rate": 0.00033000895566058866, + "loss": 3.1387, + "step": 9699 + }, + { + "epoch": 0.40842105263157896, + "grad_norm": 0.4609375, + "learning_rate": 0.00032997698541544184, + "loss": 3.0485, + "step": 9700 + }, + { + "epoch": 0.40846315789473686, + "grad_norm": 0.3984375, + "learning_rate": 0.0003299450137132083, + "loss": 3.2091, + "step": 9701 + }, + { + "epoch": 0.40850526315789476, + "grad_norm": 0.443359375, + "learning_rate": 0.00032991304055447046, + "loss": 3.1921, + "step": 9702 + }, + { + "epoch": 0.40854736842105266, + "grad_norm": 0.48046875, + "learning_rate": 0.00032988106593981103, + "loss": 3.1941, + "step": 9703 + }, + { + "epoch": 0.4085894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.00032984908986981234, + "loss": 2.709, + "step": 9704 + }, + { + "epoch": 0.4086315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.00032981711234505713, + "loss": 3.2018, + "step": 9705 + }, + { + "epoch": 0.4086736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.00032978513336612785, + "loss": 3.3015, + "step": 9706 + }, + { + "epoch": 0.4087157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.00032975315293360725, + "loss": 3.074, + "step": 9707 + }, + { + "epoch": 0.4087578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00032972117104807793, + "loss": 3.4442, + "step": 9708 + }, + { + "epoch": 0.4088, + "grad_norm": 0.462890625, + "learning_rate": 0.00032968918771012244, + "loss": 3.0092, + "step": 9709 + }, + { + "epoch": 0.4088421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00032965720292032363, + "loss": 2.7784, + "step": 9710 + }, + { + "epoch": 0.4088842105263158, + "grad_norm": 0.396484375, + "learning_rate": 0.00032962521667926416, + "loss": 3.1199, + "step": 9711 + }, + { + "epoch": 0.4089263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.0003295932289875268, + "loss": 3.5696, + "step": 9712 + }, + { + "epoch": 0.4089684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.0003295612398456944, + "loss": 3.0235, + "step": 9713 + }, + { + "epoch": 0.4090105263157895, + "grad_norm": 0.384765625, + "learning_rate": 0.00032952924925434965, + "loss": 3.1547, + "step": 9714 + }, + { + "epoch": 0.4090526315789474, + "grad_norm": 0.39453125, + "learning_rate": 0.0003294972572140754, + "loss": 3.38, + "step": 9715 + }, + { + "epoch": 0.4090947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.00032946526372545454, + "loss": 3.6133, + "step": 9716 + }, + { + "epoch": 0.40913684210526313, + "grad_norm": 0.44140625, + "learning_rate": 0.00032943326878906996, + "loss": 2.7362, + "step": 9717 + }, + { + "epoch": 0.40917894736842103, + "grad_norm": 0.4296875, + "learning_rate": 0.00032940127240550446, + "loss": 3.6014, + "step": 9718 + }, + { + "epoch": 0.40922105263157893, + "grad_norm": 0.41796875, + "learning_rate": 0.0003293692745753412, + "loss": 2.9918, + "step": 9719 + }, + { + "epoch": 0.40926315789473683, + "grad_norm": 0.4140625, + "learning_rate": 0.00032933727529916283, + "loss": 2.5423, + "step": 9720 + }, + { + "epoch": 0.4093052631578947, + "grad_norm": 0.46875, + "learning_rate": 0.00032930527457755265, + "loss": 3.0446, + "step": 9721 + }, + { + "epoch": 0.4093473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.0003292732724110936, + "loss": 3.2719, + "step": 9722 + }, + { + "epoch": 0.4093894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00032924126880036866, + "loss": 3.3202, + "step": 9723 + }, + { + "epoch": 0.4094315789473684, + "grad_norm": 0.404296875, + "learning_rate": 0.0003292092637459608, + "loss": 2.8208, + "step": 9724 + }, + { + "epoch": 0.4094736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.0003291772572484533, + "loss": 3.4516, + "step": 9725 + }, + { + "epoch": 0.4095157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.00032914524930842913, + "loss": 3.4475, + "step": 9726 + }, + { + "epoch": 0.4095578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00032911323992647156, + "loss": 2.9029, + "step": 9727 + }, + { + "epoch": 0.4096, + "grad_norm": 0.44140625, + "learning_rate": 0.0003290812291031637, + "loss": 3.0591, + "step": 9728 + }, + { + "epoch": 0.4096421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00032904921683908877, + "loss": 2.5879, + "step": 9729 + }, + { + "epoch": 0.40968421052631576, + "grad_norm": 0.41796875, + "learning_rate": 0.00032901720313483, + "loss": 3.1034, + "step": 9730 + }, + { + "epoch": 0.40972631578947366, + "grad_norm": 0.435546875, + "learning_rate": 0.0003289851879909706, + "loss": 3.3222, + "step": 9731 + }, + { + "epoch": 0.40976842105263156, + "grad_norm": 0.453125, + "learning_rate": 0.0003289531714080939, + "loss": 3.222, + "step": 9732 + }, + { + "epoch": 0.40981052631578946, + "grad_norm": 0.427734375, + "learning_rate": 0.0003289211533867832, + "loss": 3.4566, + "step": 9733 + }, + { + "epoch": 0.40985263157894736, + "grad_norm": 0.419921875, + "learning_rate": 0.00032888913392762183, + "loss": 2.7881, + "step": 9734 + }, + { + "epoch": 0.40989473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.0003288571130311931, + "loss": 3.228, + "step": 9735 + }, + { + "epoch": 0.40993684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.00032882509069808043, + "loss": 3.2224, + "step": 9736 + }, + { + "epoch": 0.40997894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.0003287930669288672, + "loss": 2.8991, + "step": 9737 + }, + { + "epoch": 0.41002105263157895, + "grad_norm": 0.439453125, + "learning_rate": 0.00032876104172413685, + "loss": 3.139, + "step": 9738 + }, + { + "epoch": 0.41006315789473685, + "grad_norm": 0.412109375, + "learning_rate": 0.0003287290150844729, + "loss": 3.0915, + "step": 9739 + }, + { + "epoch": 0.41010526315789475, + "grad_norm": 0.41015625, + "learning_rate": 0.0003286969870104588, + "loss": 2.923, + "step": 9740 + }, + { + "epoch": 0.41014736842105265, + "grad_norm": 0.408203125, + "learning_rate": 0.00032866495750267805, + "loss": 3.3063, + "step": 9741 + }, + { + "epoch": 0.41018947368421055, + "grad_norm": 0.4296875, + "learning_rate": 0.0003286329265617142, + "loss": 3.5232, + "step": 9742 + }, + { + "epoch": 0.41023157894736845, + "grad_norm": 0.40234375, + "learning_rate": 0.00032860089418815084, + "loss": 2.81, + "step": 9743 + }, + { + "epoch": 0.4102736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.0003285688603825715, + "loss": 3.4731, + "step": 9744 + }, + { + "epoch": 0.4103157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00032853682514555983, + "loss": 3.2572, + "step": 9745 + }, + { + "epoch": 0.4103578947368421, + "grad_norm": 0.404296875, + "learning_rate": 0.00032850478847769955, + "loss": 3.3471, + "step": 9746 + }, + { + "epoch": 0.4104, + "grad_norm": 0.49609375, + "learning_rate": 0.00032847275037957424, + "loss": 2.5486, + "step": 9747 + }, + { + "epoch": 0.4104421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00032844071085176763, + "loss": 3.3354, + "step": 9748 + }, + { + "epoch": 0.4104842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.0003284086698948634, + "loss": 3.1434, + "step": 9749 + }, + { + "epoch": 0.4105263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00032837662750944535, + "loss": 3.2549, + "step": 9750 + }, + { + "epoch": 0.4105684210526316, + "grad_norm": 0.435546875, + "learning_rate": 0.0003283445836960972, + "loss": 2.9685, + "step": 9751 + }, + { + "epoch": 0.4106105263157895, + "grad_norm": 0.408203125, + "learning_rate": 0.0003283125384554028, + "loss": 3.1856, + "step": 9752 + }, + { + "epoch": 0.4106526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00032828049178794595, + "loss": 2.879, + "step": 9753 + }, + { + "epoch": 0.4106947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.0003282484436943106, + "loss": 3.3675, + "step": 9754 + }, + { + "epoch": 0.4107368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.0003282163941750804, + "loss": 2.9492, + "step": 9755 + }, + { + "epoch": 0.4107789473684211, + "grad_norm": 0.408203125, + "learning_rate": 0.00032818434323083956, + "loss": 3.3967, + "step": 9756 + }, + { + "epoch": 0.4108210526315789, + "grad_norm": 0.423828125, + "learning_rate": 0.0003281522908621718, + "loss": 3.2263, + "step": 9757 + }, + { + "epoch": 0.4108631578947368, + "grad_norm": 0.43359375, + "learning_rate": 0.00032812023706966104, + "loss": 3.2892, + "step": 9758 + }, + { + "epoch": 0.4109052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.0003280881818538914, + "loss": 3.0762, + "step": 9759 + }, + { + "epoch": 0.4109473684210526, + "grad_norm": 0.41015625, + "learning_rate": 0.00032805612521544686, + "loss": 3.3244, + "step": 9760 + }, + { + "epoch": 0.4109894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.0003280240671549114, + "loss": 3.7043, + "step": 9761 + }, + { + "epoch": 0.4110315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.0003279920076728691, + "loss": 3.1436, + "step": 9762 + }, + { + "epoch": 0.4110736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.0003279599467699041, + "loss": 3.0941, + "step": 9763 + }, + { + "epoch": 0.4111157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.0003279278844466005, + "loss": 3.4766, + "step": 9764 + }, + { + "epoch": 0.4111578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.00032789582070354227, + "loss": 2.8604, + "step": 9765 + }, + { + "epoch": 0.4112, + "grad_norm": 0.39453125, + "learning_rate": 0.0003278637555413138, + "loss": 2.9306, + "step": 9766 + }, + { + "epoch": 0.4112421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.0003278316889604993, + "loss": 3.516, + "step": 9767 + }, + { + "epoch": 0.4112842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0003277996209616827, + "loss": 3.2826, + "step": 9768 + }, + { + "epoch": 0.4113263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.00032776755154544844, + "loss": 3.0299, + "step": 9769 + }, + { + "epoch": 0.4113684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.0003277354807123808, + "loss": 3.1031, + "step": 9770 + }, + { + "epoch": 0.41141052631578945, + "grad_norm": 0.43359375, + "learning_rate": 0.00032770340846306403, + "loss": 3.3292, + "step": 9771 + }, + { + "epoch": 0.41145263157894735, + "grad_norm": 0.458984375, + "learning_rate": 0.00032767133479808246, + "loss": 2.9134, + "step": 9772 + }, + { + "epoch": 0.41149473684210525, + "grad_norm": 0.42578125, + "learning_rate": 0.00032763925971802043, + "loss": 2.5583, + "step": 9773 + }, + { + "epoch": 0.41153684210526315, + "grad_norm": 0.4375, + "learning_rate": 0.0003276071832234624, + "loss": 3.3809, + "step": 9774 + }, + { + "epoch": 0.41157894736842104, + "grad_norm": 0.416015625, + "learning_rate": 0.00032757510531499255, + "loss": 3.1292, + "step": 9775 + }, + { + "epoch": 0.41162105263157894, + "grad_norm": 0.40625, + "learning_rate": 0.0003275430259931955, + "loss": 3.599, + "step": 9776 + }, + { + "epoch": 0.41166315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00032751094525865557, + "loss": 2.9445, + "step": 9777 + }, + { + "epoch": 0.41170526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.00032747886311195736, + "loss": 3.0075, + "step": 9778 + }, + { + "epoch": 0.41174736842105264, + "grad_norm": 0.4296875, + "learning_rate": 0.0003274467795536852, + "loss": 3.2097, + "step": 9779 + }, + { + "epoch": 0.41178947368421054, + "grad_norm": 0.3984375, + "learning_rate": 0.00032741469458442377, + "loss": 3.0515, + "step": 9780 + }, + { + "epoch": 0.41183157894736844, + "grad_norm": 0.416015625, + "learning_rate": 0.0003273826082047576, + "loss": 3.2655, + "step": 9781 + }, + { + "epoch": 0.41187368421052634, + "grad_norm": 0.431640625, + "learning_rate": 0.00032735052041527117, + "loss": 3.4306, + "step": 9782 + }, + { + "epoch": 0.41191578947368424, + "grad_norm": 0.400390625, + "learning_rate": 0.0003273184312165492, + "loss": 2.6865, + "step": 9783 + }, + { + "epoch": 0.4119578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.0003272863406091762, + "loss": 3.5431, + "step": 9784 + }, + { + "epoch": 0.412, + "grad_norm": 0.396484375, + "learning_rate": 0.00032725424859373687, + "loss": 2.9599, + "step": 9785 + }, + { + "epoch": 0.4120421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.0003272221551708159, + "loss": 3.4338, + "step": 9786 + }, + { + "epoch": 0.4120842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.00032719006034099803, + "loss": 3.3016, + "step": 9787 + }, + { + "epoch": 0.4121263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.000327157964104868, + "loss": 3.2408, + "step": 9788 + }, + { + "epoch": 0.4121684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.00032712586646301044, + "loss": 3.2086, + "step": 9789 + }, + { + "epoch": 0.41221052631578947, + "grad_norm": 0.455078125, + "learning_rate": 0.0003270937674160103, + "loss": 2.9384, + "step": 9790 + }, + { + "epoch": 0.41225263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.0003270616669644522, + "loss": 3.3137, + "step": 9791 + }, + { + "epoch": 0.41229473684210527, + "grad_norm": 0.39453125, + "learning_rate": 0.00032702956510892114, + "loss": 2.9307, + "step": 9792 + }, + { + "epoch": 0.41233684210526317, + "grad_norm": 0.40625, + "learning_rate": 0.0003269974618500019, + "loss": 3.3631, + "step": 9793 + }, + { + "epoch": 0.41237894736842107, + "grad_norm": 0.421875, + "learning_rate": 0.00032696535718827936, + "loss": 3.4386, + "step": 9794 + }, + { + "epoch": 0.41242105263157897, + "grad_norm": 0.423828125, + "learning_rate": 0.00032693325112433835, + "loss": 2.9951, + "step": 9795 + }, + { + "epoch": 0.41246315789473686, + "grad_norm": 0.4296875, + "learning_rate": 0.0003269011436587641, + "loss": 3.3136, + "step": 9796 + }, + { + "epoch": 0.41250526315789476, + "grad_norm": 0.416015625, + "learning_rate": 0.0003268690347921412, + "loss": 3.2238, + "step": 9797 + }, + { + "epoch": 0.4125473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.0003268369245250548, + "loss": 2.987, + "step": 9798 + }, + { + "epoch": 0.4125894736842105, + "grad_norm": 0.404296875, + "learning_rate": 0.00032680481285809007, + "loss": 2.9411, + "step": 9799 + }, + { + "epoch": 0.4126315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.00032677269979183176, + "loss": 3.2473, + "step": 9800 + }, + { + "epoch": 0.4126736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.0003267405853268651, + "loss": 3.0204, + "step": 9801 + }, + { + "epoch": 0.4127157894736842, + "grad_norm": 0.5234375, + "learning_rate": 0.0003267084694637752, + "loss": 3.4134, + "step": 9802 + }, + { + "epoch": 0.4127578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.00032667635220314694, + "loss": 3.6243, + "step": 9803 + }, + { + "epoch": 0.4128, + "grad_norm": 0.42578125, + "learning_rate": 0.0003266442335455658, + "loss": 3.7101, + "step": 9804 + }, + { + "epoch": 0.4128421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00032661211349161665, + "loss": 3.4791, + "step": 9805 + }, + { + "epoch": 0.4128842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0003265799920418849, + "loss": 3.1093, + "step": 9806 + }, + { + "epoch": 0.4129263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00032654786919695573, + "loss": 3.4331, + "step": 9807 + }, + { + "epoch": 0.4129684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.00032651574495741426, + "loss": 3.4692, + "step": 9808 + }, + { + "epoch": 0.4130105263157895, + "grad_norm": 0.462890625, + "learning_rate": 0.00032648361932384587, + "loss": 3.1984, + "step": 9809 + }, + { + "epoch": 0.4130526315789474, + "grad_norm": 0.4375, + "learning_rate": 0.0003264514922968357, + "loss": 2.9987, + "step": 9810 + }, + { + "epoch": 0.41309473684210524, + "grad_norm": 0.451171875, + "learning_rate": 0.0003264193638769692, + "loss": 3.7, + "step": 9811 + }, + { + "epoch": 0.41313684210526314, + "grad_norm": 0.421875, + "learning_rate": 0.0003263872340648316, + "loss": 3.186, + "step": 9812 + }, + { + "epoch": 0.41317894736842103, + "grad_norm": 0.451171875, + "learning_rate": 0.0003263551028610085, + "loss": 3.2846, + "step": 9813 + }, + { + "epoch": 0.41322105263157893, + "grad_norm": 0.421875, + "learning_rate": 0.000326322970266085, + "loss": 3.1472, + "step": 9814 + }, + { + "epoch": 0.41326315789473683, + "grad_norm": 0.412109375, + "learning_rate": 0.0003262908362806468, + "loss": 3.083, + "step": 9815 + }, + { + "epoch": 0.41330526315789473, + "grad_norm": 0.404296875, + "learning_rate": 0.00032625870090527913, + "loss": 2.8058, + "step": 9816 + }, + { + "epoch": 0.41334736842105263, + "grad_norm": 0.412109375, + "learning_rate": 0.00032622656414056764, + "loss": 3.3321, + "step": 9817 + }, + { + "epoch": 0.41338947368421053, + "grad_norm": 0.4140625, + "learning_rate": 0.0003261944259870975, + "loss": 3.3838, + "step": 9818 + }, + { + "epoch": 0.4134315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.0003261622864454546, + "loss": 3.2222, + "step": 9819 + }, + { + "epoch": 0.4134736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00032613014551622423, + "loss": 2.8741, + "step": 9820 + }, + { + "epoch": 0.4135157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.0003260980031999921, + "loss": 3.0814, + "step": 9821 + }, + { + "epoch": 0.4135578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.0003260658594973438, + "loss": 3.2751, + "step": 9822 + }, + { + "epoch": 0.4136, + "grad_norm": 0.439453125, + "learning_rate": 0.00032603371440886484, + "loss": 3.2581, + "step": 9823 + }, + { + "epoch": 0.4136421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.000326001567935141, + "loss": 3.6344, + "step": 9824 + }, + { + "epoch": 0.41368421052631577, + "grad_norm": 0.462890625, + "learning_rate": 0.0003259694200767579, + "loss": 2.9319, + "step": 9825 + }, + { + "epoch": 0.41372631578947366, + "grad_norm": 0.421875, + "learning_rate": 0.0003259372708343012, + "loss": 3.1244, + "step": 9826 + }, + { + "epoch": 0.41376842105263156, + "grad_norm": 0.404296875, + "learning_rate": 0.00032590512020835665, + "loss": 3.1019, + "step": 9827 + }, + { + "epoch": 0.41381052631578946, + "grad_norm": 0.408203125, + "learning_rate": 0.00032587296819950993, + "loss": 3.2263, + "step": 9828 + }, + { + "epoch": 0.41385263157894736, + "grad_norm": 0.451171875, + "learning_rate": 0.00032584081480834697, + "loss": 3.2399, + "step": 9829 + }, + { + "epoch": 0.41389473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.00032580866003545345, + "loss": 3.2034, + "step": 9830 + }, + { + "epoch": 0.41393684210526316, + "grad_norm": 0.4765625, + "learning_rate": 0.0003257765038814152, + "loss": 2.8272, + "step": 9831 + }, + { + "epoch": 0.41397894736842106, + "grad_norm": 0.419921875, + "learning_rate": 0.0003257443463468181, + "loss": 3.2751, + "step": 9832 + }, + { + "epoch": 0.41402105263157896, + "grad_norm": 0.427734375, + "learning_rate": 0.00032571218743224803, + "loss": 3.1496, + "step": 9833 + }, + { + "epoch": 0.41406315789473686, + "grad_norm": 0.462890625, + "learning_rate": 0.0003256800271382908, + "loss": 3.0077, + "step": 9834 + }, + { + "epoch": 0.41410526315789475, + "grad_norm": 0.451171875, + "learning_rate": 0.00032564786546553245, + "loss": 3.0288, + "step": 9835 + }, + { + "epoch": 0.41414736842105265, + "grad_norm": 0.41796875, + "learning_rate": 0.00032561570241455883, + "loss": 3.1003, + "step": 9836 + }, + { + "epoch": 0.41418947368421055, + "grad_norm": 0.416015625, + "learning_rate": 0.00032558353798595597, + "loss": 3.0699, + "step": 9837 + }, + { + "epoch": 0.4142315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.0003255513721803099, + "loss": 2.9111, + "step": 9838 + }, + { + "epoch": 0.4142736842105263, + "grad_norm": 0.55859375, + "learning_rate": 0.00032551920499820653, + "loss": 2.688, + "step": 9839 + }, + { + "epoch": 0.4143157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.00032548703644023197, + "loss": 3.4043, + "step": 9840 + }, + { + "epoch": 0.4143578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.0003254548665069724, + "loss": 2.8924, + "step": 9841 + }, + { + "epoch": 0.4144, + "grad_norm": 0.39453125, + "learning_rate": 0.0003254226951990137, + "loss": 3.2164, + "step": 9842 + }, + { + "epoch": 0.4144421052631579, + "grad_norm": 0.39453125, + "learning_rate": 0.00032539052251694226, + "loss": 2.7516, + "step": 9843 + }, + { + "epoch": 0.4144842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00032535834846134386, + "loss": 3.383, + "step": 9844 + }, + { + "epoch": 0.4145263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0003253261730328051, + "loss": 3.272, + "step": 9845 + }, + { + "epoch": 0.4145684210526316, + "grad_norm": 0.453125, + "learning_rate": 0.0003252939962319118, + "loss": 2.9747, + "step": 9846 + }, + { + "epoch": 0.4146105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.00032526181805925047, + "loss": 3.0692, + "step": 9847 + }, + { + "epoch": 0.4146526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.0003252296385154072, + "loss": 2.8142, + "step": 9848 + }, + { + "epoch": 0.4146947368421053, + "grad_norm": 0.44140625, + "learning_rate": 0.0003251974576009684, + "loss": 2.9298, + "step": 9849 + }, + { + "epoch": 0.4147368421052632, + "grad_norm": 0.41015625, + "learning_rate": 0.00032516527531652016, + "loss": 3.1044, + "step": 9850 + }, + { + "epoch": 0.4147789473684211, + "grad_norm": 0.416015625, + "learning_rate": 0.00032513309166264906, + "loss": 3.221, + "step": 9851 + }, + { + "epoch": 0.4148210526315789, + "grad_norm": 0.419921875, + "learning_rate": 0.0003251009066399411, + "loss": 3.048, + "step": 9852 + }, + { + "epoch": 0.4148631578947368, + "grad_norm": 0.4140625, + "learning_rate": 0.00032506872024898304, + "loss": 3.2, + "step": 9853 + }, + { + "epoch": 0.4149052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.00032503653249036095, + "loss": 3.5713, + "step": 9854 + }, + { + "epoch": 0.4149473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.0003250043433646615, + "loss": 3.3172, + "step": 9855 + }, + { + "epoch": 0.4149894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00032497215287247106, + "loss": 3.0392, + "step": 9856 + }, + { + "epoch": 0.4150315789473684, + "grad_norm": 0.447265625, + "learning_rate": 0.00032493996101437604, + "loss": 3.0969, + "step": 9857 + }, + { + "epoch": 0.4150736842105263, + "grad_norm": 0.4765625, + "learning_rate": 0.000324907767790963, + "loss": 3.0283, + "step": 9858 + }, + { + "epoch": 0.4151157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0003248755732028184, + "loss": 3.2518, + "step": 9859 + }, + { + "epoch": 0.4151578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.0003248433772505289, + "loss": 3.4062, + "step": 9860 + }, + { + "epoch": 0.4152, + "grad_norm": 0.408203125, + "learning_rate": 0.00032481117993468097, + "loss": 3.0963, + "step": 9861 + }, + { + "epoch": 0.4152421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.0003247789812558612, + "loss": 3.5872, + "step": 9862 + }, + { + "epoch": 0.4152842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0003247467812146563, + "loss": 3.0918, + "step": 9863 + }, + { + "epoch": 0.4153263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.0003247145798116529, + "loss": 3.3447, + "step": 9864 + }, + { + "epoch": 0.41536842105263155, + "grad_norm": 0.59765625, + "learning_rate": 0.00032468237704743756, + "loss": 3.0961, + "step": 9865 + }, + { + "epoch": 0.41541052631578945, + "grad_norm": 0.51171875, + "learning_rate": 0.0003246501729225971, + "loss": 3.3209, + "step": 9866 + }, + { + "epoch": 0.41545263157894735, + "grad_norm": 0.39453125, + "learning_rate": 0.00032461796743771823, + "loss": 3.4434, + "step": 9867 + }, + { + "epoch": 0.41549473684210525, + "grad_norm": 0.435546875, + "learning_rate": 0.0003245857605933876, + "loss": 3.2151, + "step": 9868 + }, + { + "epoch": 0.41553684210526315, + "grad_norm": 0.37890625, + "learning_rate": 0.0003245535523901921, + "loss": 2.6901, + "step": 9869 + }, + { + "epoch": 0.41557894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.0003245213428287184, + "loss": 2.5731, + "step": 9870 + }, + { + "epoch": 0.41562105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.0003244891319095535, + "loss": 3.1945, + "step": 9871 + }, + { + "epoch": 0.41566315789473685, + "grad_norm": 0.41796875, + "learning_rate": 0.00032445691963328406, + "loss": 3.3934, + "step": 9872 + }, + { + "epoch": 0.41570526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.000324424706000497, + "loss": 3.3875, + "step": 9873 + }, + { + "epoch": 0.41574736842105264, + "grad_norm": 0.43359375, + "learning_rate": 0.00032439249101177937, + "loss": 2.7071, + "step": 9874 + }, + { + "epoch": 0.41578947368421054, + "grad_norm": 0.416015625, + "learning_rate": 0.00032436027466771787, + "loss": 2.7539, + "step": 9875 + }, + { + "epoch": 0.41583157894736844, + "grad_norm": 0.40234375, + "learning_rate": 0.00032432805696889957, + "loss": 3.2034, + "step": 9876 + }, + { + "epoch": 0.41587368421052634, + "grad_norm": 0.419921875, + "learning_rate": 0.0003242958379159114, + "loss": 3.351, + "step": 9877 + }, + { + "epoch": 0.4159157894736842, + "grad_norm": 0.404296875, + "learning_rate": 0.0003242636175093403, + "loss": 3.3835, + "step": 9878 + }, + { + "epoch": 0.4159578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.0003242313957497734, + "loss": 3.292, + "step": 9879 + }, + { + "epoch": 0.416, + "grad_norm": 0.39453125, + "learning_rate": 0.0003241991726377976, + "loss": 3.1166, + "step": 9880 + }, + { + "epoch": 0.4160421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.00032416694817400017, + "loss": 3.1668, + "step": 9881 + }, + { + "epoch": 0.4160842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.00032413472235896806, + "loss": 3.1273, + "step": 9882 + }, + { + "epoch": 0.4161263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00032410249519328844, + "loss": 3.1253, + "step": 9883 + }, + { + "epoch": 0.4161684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.0003240702666775484, + "loss": 3.2969, + "step": 9884 + }, + { + "epoch": 0.4162105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.0003240380368123351, + "loss": 3.4129, + "step": 9885 + }, + { + "epoch": 0.4162526315789474, + "grad_norm": 0.388671875, + "learning_rate": 0.0003240058055982358, + "loss": 2.9068, + "step": 9886 + }, + { + "epoch": 0.4162947368421053, + "grad_norm": 0.482421875, + "learning_rate": 0.0003239735730358376, + "loss": 2.9856, + "step": 9887 + }, + { + "epoch": 0.41633684210526317, + "grad_norm": 0.408203125, + "learning_rate": 0.0003239413391257279, + "loss": 3.0167, + "step": 9888 + }, + { + "epoch": 0.41637894736842107, + "grad_norm": 0.41015625, + "learning_rate": 0.0003239091038684938, + "loss": 3.2524, + "step": 9889 + }, + { + "epoch": 0.41642105263157897, + "grad_norm": 0.419921875, + "learning_rate": 0.0003238768672647228, + "loss": 3.3682, + "step": 9890 + }, + { + "epoch": 0.41646315789473687, + "grad_norm": 0.423828125, + "learning_rate": 0.0003238446293150019, + "loss": 3.452, + "step": 9891 + }, + { + "epoch": 0.4165052631578947, + "grad_norm": 0.45703125, + "learning_rate": 0.0003238123900199187, + "loss": 2.7072, + "step": 9892 + }, + { + "epoch": 0.4165473684210526, + "grad_norm": 0.400390625, + "learning_rate": 0.00032378014938006053, + "loss": 3.0442, + "step": 9893 + }, + { + "epoch": 0.4165894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.00032374790739601465, + "loss": 3.1754, + "step": 9894 + }, + { + "epoch": 0.4166315789473684, + "grad_norm": 0.392578125, + "learning_rate": 0.0003237156640683686, + "loss": 3.2779, + "step": 9895 + }, + { + "epoch": 0.4166736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0003236834193977097, + "loss": 3.4479, + "step": 9896 + }, + { + "epoch": 0.4167157894736842, + "grad_norm": 0.48046875, + "learning_rate": 0.00032365117338462555, + "loss": 3.2864, + "step": 9897 + }, + { + "epoch": 0.4167578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00032361892602970357, + "loss": 3.3573, + "step": 9898 + }, + { + "epoch": 0.4168, + "grad_norm": 0.453125, + "learning_rate": 0.00032358667733353117, + "loss": 3.2354, + "step": 9899 + }, + { + "epoch": 0.4168421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00032355442729669606, + "loss": 3.3138, + "step": 9900 + }, + { + "epoch": 0.4168842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.0003235221759197856, + "loss": 3.2333, + "step": 9901 + }, + { + "epoch": 0.4169263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.0003234899232033876, + "loss": 3.158, + "step": 9902 + }, + { + "epoch": 0.4169684210526316, + "grad_norm": 0.40234375, + "learning_rate": 0.0003234576691480895, + "loss": 3.172, + "step": 9903 + }, + { + "epoch": 0.4170105263157895, + "grad_norm": 0.404296875, + "learning_rate": 0.00032342541375447894, + "loss": 3.1702, + "step": 9904 + }, + { + "epoch": 0.41705263157894734, + "grad_norm": 0.40234375, + "learning_rate": 0.00032339315702314367, + "loss": 3.4364, + "step": 9905 + }, + { + "epoch": 0.41709473684210524, + "grad_norm": 0.412109375, + "learning_rate": 0.0003233608989546712, + "loss": 2.6916, + "step": 9906 + }, + { + "epoch": 0.41713684210526314, + "grad_norm": 0.466796875, + "learning_rate": 0.0003233286395496494, + "loss": 2.665, + "step": 9907 + }, + { + "epoch": 0.41717894736842104, + "grad_norm": 0.419921875, + "learning_rate": 0.000323296378808666, + "loss": 3.1891, + "step": 9908 + }, + { + "epoch": 0.41722105263157894, + "grad_norm": 0.4140625, + "learning_rate": 0.0003232641167323087, + "loss": 3.3153, + "step": 9909 + }, + { + "epoch": 0.41726315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.00032323185332116524, + "loss": 3.2422, + "step": 9910 + }, + { + "epoch": 0.41730526315789473, + "grad_norm": 0.4140625, + "learning_rate": 0.0003231995885758235, + "loss": 3.4683, + "step": 9911 + }, + { + "epoch": 0.41734736842105263, + "grad_norm": 0.47265625, + "learning_rate": 0.0003231673224968712, + "loss": 3.0819, + "step": 9912 + }, + { + "epoch": 0.41738947368421053, + "grad_norm": 0.4453125, + "learning_rate": 0.00032313505508489623, + "loss": 2.9248, + "step": 9913 + }, + { + "epoch": 0.41743157894736843, + "grad_norm": 0.43359375, + "learning_rate": 0.0003231027863404865, + "loss": 3.0531, + "step": 9914 + }, + { + "epoch": 0.41747368421052633, + "grad_norm": 0.466796875, + "learning_rate": 0.00032307051626422994, + "loss": 3.2802, + "step": 9915 + }, + { + "epoch": 0.41751578947368423, + "grad_norm": 0.435546875, + "learning_rate": 0.00032303824485671443, + "loss": 3.2528, + "step": 9916 + }, + { + "epoch": 0.41755789473684213, + "grad_norm": 0.474609375, + "learning_rate": 0.00032300597211852794, + "loss": 2.6933, + "step": 9917 + }, + { + "epoch": 0.4176, + "grad_norm": 0.412109375, + "learning_rate": 0.0003229736980502584, + "loss": 3.1386, + "step": 9918 + }, + { + "epoch": 0.41764210526315787, + "grad_norm": 0.416015625, + "learning_rate": 0.0003229414226524937, + "loss": 3.5457, + "step": 9919 + }, + { + "epoch": 0.41768421052631577, + "grad_norm": 0.470703125, + "learning_rate": 0.0003229091459258221, + "loss": 3.6531, + "step": 9920 + }, + { + "epoch": 0.41772631578947367, + "grad_norm": 0.390625, + "learning_rate": 0.0003228768678708315, + "loss": 3.0543, + "step": 9921 + }, + { + "epoch": 0.41776842105263157, + "grad_norm": 0.42578125, + "learning_rate": 0.00032284458848811, + "loss": 2.7756, + "step": 9922 + }, + { + "epoch": 0.41781052631578947, + "grad_norm": 0.546875, + "learning_rate": 0.0003228123077782457, + "loss": 2.7894, + "step": 9923 + }, + { + "epoch": 0.41785263157894736, + "grad_norm": 0.412109375, + "learning_rate": 0.00032278002574182674, + "loss": 3.4973, + "step": 9924 + }, + { + "epoch": 0.41789473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.00032274774237944114, + "loss": 3.3233, + "step": 9925 + }, + { + "epoch": 0.41793684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.00032271545769167715, + "loss": 3.3199, + "step": 9926 + }, + { + "epoch": 0.41797894736842106, + "grad_norm": 0.412109375, + "learning_rate": 0.00032268317167912305, + "loss": 3.5432, + "step": 9927 + }, + { + "epoch": 0.41802105263157896, + "grad_norm": 0.41015625, + "learning_rate": 0.00032265088434236687, + "loss": 3.522, + "step": 9928 + }, + { + "epoch": 0.41806315789473686, + "grad_norm": 0.42578125, + "learning_rate": 0.00032261859568199694, + "loss": 3.1878, + "step": 9929 + }, + { + "epoch": 0.41810526315789476, + "grad_norm": 0.390625, + "learning_rate": 0.0003225863056986016, + "loss": 3.0819, + "step": 9930 + }, + { + "epoch": 0.41814736842105266, + "grad_norm": 0.45703125, + "learning_rate": 0.000322554014392769, + "loss": 3.25, + "step": 9931 + }, + { + "epoch": 0.4181894736842105, + "grad_norm": 0.453125, + "learning_rate": 0.0003225217217650875, + "loss": 3.5337, + "step": 9932 + }, + { + "epoch": 0.4182315789473684, + "grad_norm": 0.478515625, + "learning_rate": 0.00032248942781614543, + "loss": 3.3138, + "step": 9933 + }, + { + "epoch": 0.4182736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0003224571325465312, + "loss": 2.7497, + "step": 9934 + }, + { + "epoch": 0.4183157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.00032242483595683314, + "loss": 3.1407, + "step": 9935 + }, + { + "epoch": 0.4183578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.0003223925380476396, + "loss": 2.8838, + "step": 9936 + }, + { + "epoch": 0.4184, + "grad_norm": 0.388671875, + "learning_rate": 0.0003223602388195391, + "loss": 3.2789, + "step": 9937 + }, + { + "epoch": 0.4184421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00032232793827312013, + "loss": 3.4798, + "step": 9938 + }, + { + "epoch": 0.4184842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.0003222956364089711, + "loss": 3.4487, + "step": 9939 + }, + { + "epoch": 0.4185263157894737, + "grad_norm": 0.392578125, + "learning_rate": 0.0003222633332276804, + "loss": 3.1943, + "step": 9940 + }, + { + "epoch": 0.4185684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.0003222310287298368, + "loss": 3.0457, + "step": 9941 + }, + { + "epoch": 0.4186105263157895, + "grad_norm": 0.392578125, + "learning_rate": 0.0003221987229160287, + "loss": 2.9015, + "step": 9942 + }, + { + "epoch": 0.4186526315789474, + "grad_norm": 0.412109375, + "learning_rate": 0.0003221664157868447, + "loss": 3.4956, + "step": 9943 + }, + { + "epoch": 0.4186947368421053, + "grad_norm": 0.439453125, + "learning_rate": 0.00032213410734287343, + "loss": 3.1667, + "step": 9944 + }, + { + "epoch": 0.4187368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.00032210179758470335, + "loss": 2.9965, + "step": 9945 + }, + { + "epoch": 0.41877894736842103, + "grad_norm": 0.46484375, + "learning_rate": 0.00032206948651292334, + "loss": 3.3478, + "step": 9946 + }, + { + "epoch": 0.4188210526315789, + "grad_norm": 0.408203125, + "learning_rate": 0.00032203717412812187, + "loss": 3.1169, + "step": 9947 + }, + { + "epoch": 0.4188631578947368, + "grad_norm": 0.41015625, + "learning_rate": 0.0003220048604308878, + "loss": 3.5785, + "step": 9948 + }, + { + "epoch": 0.4189052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.00032197254542180984, + "loss": 3.2262, + "step": 9949 + }, + { + "epoch": 0.4189473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.00032194022910147656, + "loss": 3.0814, + "step": 9950 + }, + { + "epoch": 0.4189894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.0003219079114704769, + "loss": 2.812, + "step": 9951 + }, + { + "epoch": 0.4190315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.0003218755925293996, + "loss": 2.9703, + "step": 9952 + }, + { + "epoch": 0.4190736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.00032184327227883337, + "loss": 3.2367, + "step": 9953 + }, + { + "epoch": 0.4191157894736842, + "grad_norm": 0.453125, + "learning_rate": 0.0003218109507193672, + "loss": 2.7638, + "step": 9954 + }, + { + "epoch": 0.4191578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.0003217786278515897, + "loss": 2.9502, + "step": 9955 + }, + { + "epoch": 0.4192, + "grad_norm": 0.39453125, + "learning_rate": 0.00032174630367609017, + "loss": 2.4855, + "step": 9956 + }, + { + "epoch": 0.4192421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0003217139781934572, + "loss": 3.0994, + "step": 9957 + }, + { + "epoch": 0.4192842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.00032168165140427984, + "loss": 3.0774, + "step": 9958 + }, + { + "epoch": 0.41932631578947366, + "grad_norm": 0.43359375, + "learning_rate": 0.000321649323309147, + "loss": 3.0791, + "step": 9959 + }, + { + "epoch": 0.41936842105263156, + "grad_norm": 0.453125, + "learning_rate": 0.00032161699390864765, + "loss": 2.8809, + "step": 9960 + }, + { + "epoch": 0.41941052631578946, + "grad_norm": 0.41796875, + "learning_rate": 0.0003215846632033708, + "loss": 3.0215, + "step": 9961 + }, + { + "epoch": 0.41945263157894735, + "grad_norm": 0.42578125, + "learning_rate": 0.00032155233119390554, + "loss": 3.2099, + "step": 9962 + }, + { + "epoch": 0.41949473684210525, + "grad_norm": 0.435546875, + "learning_rate": 0.0003215199978808408, + "loss": 3.2015, + "step": 9963 + }, + { + "epoch": 0.41953684210526315, + "grad_norm": 0.458984375, + "learning_rate": 0.00032148766326476583, + "loss": 2.9152, + "step": 9964 + }, + { + "epoch": 0.41957894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.00032145532734626956, + "loss": 3.0224, + "step": 9965 + }, + { + "epoch": 0.41962105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.0003214229901259412, + "loss": 3.5088, + "step": 9966 + }, + { + "epoch": 0.41966315789473685, + "grad_norm": 0.5, + "learning_rate": 0.00032139065160436986, + "loss": 3.5367, + "step": 9967 + }, + { + "epoch": 0.41970526315789475, + "grad_norm": 0.412109375, + "learning_rate": 0.0003213583117821448, + "loss": 3.3454, + "step": 9968 + }, + { + "epoch": 0.41974736842105265, + "grad_norm": 0.4140625, + "learning_rate": 0.0003213259706598551, + "loss": 3.0399, + "step": 9969 + }, + { + "epoch": 0.41978947368421055, + "grad_norm": 0.416015625, + "learning_rate": 0.0003212936282380901, + "loss": 3.3484, + "step": 9970 + }, + { + "epoch": 0.41983157894736844, + "grad_norm": 0.427734375, + "learning_rate": 0.00032126128451743873, + "loss": 3.0956, + "step": 9971 + }, + { + "epoch": 0.41987368421052634, + "grad_norm": 0.40625, + "learning_rate": 0.0003212289394984907, + "loss": 3.203, + "step": 9972 + }, + { + "epoch": 0.4199157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.00032119659318183497, + "loss": 3.4919, + "step": 9973 + }, + { + "epoch": 0.4199578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00032116424556806103, + "loss": 3.4529, + "step": 9974 + }, + { + "epoch": 0.42, + "grad_norm": 0.447265625, + "learning_rate": 0.0003211318966577581, + "loss": 3.2855, + "step": 9975 + }, + { + "epoch": 0.4200421052631579, + "grad_norm": 0.49609375, + "learning_rate": 0.00032109954645151564, + "loss": 3.2054, + "step": 9976 + }, + { + "epoch": 0.4200842105263158, + "grad_norm": 0.55859375, + "learning_rate": 0.000321067194949923, + "loss": 2.8804, + "step": 9977 + }, + { + "epoch": 0.4201263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.0003210348421535695, + "loss": 2.9569, + "step": 9978 + }, + { + "epoch": 0.4201684210526316, + "grad_norm": 0.470703125, + "learning_rate": 0.00032100248806304465, + "loss": 3.225, + "step": 9979 + }, + { + "epoch": 0.4202105263157895, + "grad_norm": 0.451171875, + "learning_rate": 0.0003209701326789379, + "loss": 3.4321, + "step": 9980 + }, + { + "epoch": 0.4202526315789474, + "grad_norm": 0.486328125, + "learning_rate": 0.0003209377760018387, + "loss": 3.2132, + "step": 9981 + }, + { + "epoch": 0.4202947368421053, + "grad_norm": 0.4453125, + "learning_rate": 0.00032090541803233663, + "loss": 3.298, + "step": 9982 + }, + { + "epoch": 0.4203368421052632, + "grad_norm": 0.400390625, + "learning_rate": 0.00032087305877102115, + "loss": 3.2368, + "step": 9983 + }, + { + "epoch": 0.4203789473684211, + "grad_norm": 0.42578125, + "learning_rate": 0.00032084069821848175, + "loss": 3.4467, + "step": 9984 + }, + { + "epoch": 0.420421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00032080833637530815, + "loss": 2.8654, + "step": 9985 + }, + { + "epoch": 0.4204631578947368, + "grad_norm": 0.5, + "learning_rate": 0.00032077597324208987, + "loss": 3.1979, + "step": 9986 + }, + { + "epoch": 0.4205052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.00032074360881941634, + "loss": 3.414, + "step": 9987 + }, + { + "epoch": 0.4205473684210526, + "grad_norm": 0.45703125, + "learning_rate": 0.00032071124310787747, + "loss": 3.3114, + "step": 9988 + }, + { + "epoch": 0.4205894736842105, + "grad_norm": 0.462890625, + "learning_rate": 0.00032067887610806286, + "loss": 3.0939, + "step": 9989 + }, + { + "epoch": 0.4206315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.00032064650782056214, + "loss": 2.9733, + "step": 9990 + }, + { + "epoch": 0.4206736842105263, + "grad_norm": 0.390625, + "learning_rate": 0.0003206141382459651, + "loss": 2.7988, + "step": 9991 + }, + { + "epoch": 0.4207157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00032058176738486145, + "loss": 3.3119, + "step": 9992 + }, + { + "epoch": 0.4207578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.0003205493952378408, + "loss": 3.3106, + "step": 9993 + }, + { + "epoch": 0.4208, + "grad_norm": 0.43359375, + "learning_rate": 0.0003205170218054932, + "loss": 2.9681, + "step": 9994 + }, + { + "epoch": 0.4208421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0003204846470884083, + "loss": 3.2842, + "step": 9995 + }, + { + "epoch": 0.4208842105263158, + "grad_norm": 0.6953125, + "learning_rate": 0.00032045227108717586, + "loss": 3.0096, + "step": 9996 + }, + { + "epoch": 0.4209263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00032041989380238585, + "loss": 3.1242, + "step": 9997 + }, + { + "epoch": 0.4209684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.0003203875152346282, + "loss": 3.1947, + "step": 9998 + }, + { + "epoch": 0.42101052631578945, + "grad_norm": 0.419921875, + "learning_rate": 0.0003203551353844926, + "loss": 2.913, + "step": 9999 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.408203125, + "learning_rate": 0.00032032275425256915, + "loss": 3.3064, + "step": 10000 + }, + { + "epoch": 0.42109473684210524, + "grad_norm": 0.408203125, + "learning_rate": 0.0003202903718394478, + "loss": 3.4375, + "step": 10001 + }, + { + "epoch": 0.42113684210526314, + "grad_norm": 0.416015625, + "learning_rate": 0.00032025798814571836, + "loss": 3.0528, + "step": 10002 + }, + { + "epoch": 0.42117894736842104, + "grad_norm": 0.419921875, + "learning_rate": 0.000320225603171971, + "loss": 3.5987, + "step": 10003 + }, + { + "epoch": 0.42122105263157894, + "grad_norm": 0.412109375, + "learning_rate": 0.0003201932169187956, + "loss": 3.3923, + "step": 10004 + }, + { + "epoch": 0.42126315789473684, + "grad_norm": 0.482421875, + "learning_rate": 0.00032016082938678234, + "loss": 2.849, + "step": 10005 + }, + { + "epoch": 0.42130526315789474, + "grad_norm": 0.40234375, + "learning_rate": 0.0003201284405765211, + "loss": 3.0765, + "step": 10006 + }, + { + "epoch": 0.42134736842105264, + "grad_norm": 0.4296875, + "learning_rate": 0.00032009605048860214, + "loss": 3.1943, + "step": 10007 + }, + { + "epoch": 0.42138947368421054, + "grad_norm": 0.423828125, + "learning_rate": 0.00032006365912361554, + "loss": 3.1438, + "step": 10008 + }, + { + "epoch": 0.42143157894736843, + "grad_norm": 0.42578125, + "learning_rate": 0.0003200312664821513, + "loss": 3.4497, + "step": 10009 + }, + { + "epoch": 0.42147368421052633, + "grad_norm": 0.4375, + "learning_rate": 0.00031999887256479976, + "loss": 3.0464, + "step": 10010 + }, + { + "epoch": 0.42151578947368423, + "grad_norm": 0.55078125, + "learning_rate": 0.00031996647737215096, + "loss": 2.8291, + "step": 10011 + }, + { + "epoch": 0.42155789473684213, + "grad_norm": 0.42578125, + "learning_rate": 0.00031993408090479514, + "loss": 3.3749, + "step": 10012 + }, + { + "epoch": 0.4216, + "grad_norm": 0.4296875, + "learning_rate": 0.0003199016831633225, + "loss": 3.1971, + "step": 10013 + }, + { + "epoch": 0.4216421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00031986928414832333, + "loss": 2.7438, + "step": 10014 + }, + { + "epoch": 0.4216842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00031983688386038803, + "loss": 3.3877, + "step": 10015 + }, + { + "epoch": 0.42172631578947367, + "grad_norm": 0.38671875, + "learning_rate": 0.0003198044823001066, + "loss": 3.3923, + "step": 10016 + }, + { + "epoch": 0.42176842105263157, + "grad_norm": 0.408203125, + "learning_rate": 0.0003197720794680696, + "loss": 3.4403, + "step": 10017 + }, + { + "epoch": 0.42181052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.0003197396753648673, + "loss": 3.7153, + "step": 10018 + }, + { + "epoch": 0.42185263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.00031970726999109005, + "loss": 3.5582, + "step": 10019 + }, + { + "epoch": 0.42189473684210527, + "grad_norm": 0.44140625, + "learning_rate": 0.0003196748633473282, + "loss": 3.1249, + "step": 10020 + }, + { + "epoch": 0.42193684210526317, + "grad_norm": 0.41796875, + "learning_rate": 0.00031964245543417223, + "loss": 3.2499, + "step": 10021 + }, + { + "epoch": 0.42197894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.0003196100462522125, + "loss": 3.3743, + "step": 10022 + }, + { + "epoch": 0.42202105263157896, + "grad_norm": 0.5, + "learning_rate": 0.0003195776358020396, + "loss": 3.2409, + "step": 10023 + }, + { + "epoch": 0.42206315789473686, + "grad_norm": 0.447265625, + "learning_rate": 0.0003195452240842439, + "loss": 3.7971, + "step": 10024 + }, + { + "epoch": 0.42210526315789476, + "grad_norm": 0.4140625, + "learning_rate": 0.0003195128110994159, + "loss": 2.9724, + "step": 10025 + }, + { + "epoch": 0.4221473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.0003194803968481462, + "loss": 2.9188, + "step": 10026 + }, + { + "epoch": 0.4221894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.0003194479813310253, + "loss": 3.1805, + "step": 10027 + }, + { + "epoch": 0.4222315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00031941556454864376, + "loss": 3.4091, + "step": 10028 + }, + { + "epoch": 0.4222736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.0003193831465015922, + "loss": 3.1602, + "step": 10029 + }, + { + "epoch": 0.4223157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00031935072719046115, + "loss": 3.1512, + "step": 10030 + }, + { + "epoch": 0.4223578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00031931830661584146, + "loss": 3.0206, + "step": 10031 + }, + { + "epoch": 0.4224, + "grad_norm": 0.419921875, + "learning_rate": 0.0003192858847783236, + "loss": 3.1342, + "step": 10032 + }, + { + "epoch": 0.4224421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.00031925346167849836, + "loss": 2.843, + "step": 10033 + }, + { + "epoch": 0.4224842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00031922103731695634, + "loss": 3.0924, + "step": 10034 + }, + { + "epoch": 0.4225263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0003191886116942884, + "loss": 3.3818, + "step": 10035 + }, + { + "epoch": 0.4225684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.00031915618481108535, + "loss": 3.005, + "step": 10036 + }, + { + "epoch": 0.4226105263157895, + "grad_norm": 0.48828125, + "learning_rate": 0.00031912375666793777, + "loss": 3.1221, + "step": 10037 + }, + { + "epoch": 0.4226526315789474, + "grad_norm": 0.404296875, + "learning_rate": 0.00031909132726543656, + "loss": 2.8993, + "step": 10038 + }, + { + "epoch": 0.4226947368421053, + "grad_norm": 0.439453125, + "learning_rate": 0.0003190588966041725, + "loss": 3.076, + "step": 10039 + }, + { + "epoch": 0.42273684210526313, + "grad_norm": 0.3984375, + "learning_rate": 0.00031902646468473663, + "loss": 3.0939, + "step": 10040 + }, + { + "epoch": 0.42277894736842103, + "grad_norm": 0.396484375, + "learning_rate": 0.0003189940315077195, + "loss": 2.8906, + "step": 10041 + }, + { + "epoch": 0.42282105263157893, + "grad_norm": 0.4140625, + "learning_rate": 0.0003189615970737123, + "loss": 2.5777, + "step": 10042 + }, + { + "epoch": 0.42286315789473683, + "grad_norm": 0.45703125, + "learning_rate": 0.0003189291613833058, + "loss": 3.0734, + "step": 10043 + }, + { + "epoch": 0.42290526315789473, + "grad_norm": 0.41796875, + "learning_rate": 0.0003188967244370909, + "loss": 3.3007, + "step": 10044 + }, + { + "epoch": 0.4229473684210526, + "grad_norm": 0.40234375, + "learning_rate": 0.0003188642862356588, + "loss": 2.9522, + "step": 10045 + }, + { + "epoch": 0.4229894736842105, + "grad_norm": 0.5390625, + "learning_rate": 0.00031883184677960016, + "loss": 2.8573, + "step": 10046 + }, + { + "epoch": 0.4230315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.0003187994060695062, + "loss": 2.7364, + "step": 10047 + }, + { + "epoch": 0.4230736842105263, + "grad_norm": 0.390625, + "learning_rate": 0.00031876696410596786, + "loss": 2.9881, + "step": 10048 + }, + { + "epoch": 0.4231157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00031873452088957625, + "loss": 3.117, + "step": 10049 + }, + { + "epoch": 0.4231578947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.00031870207642092246, + "loss": 3.3258, + "step": 10050 + }, + { + "epoch": 0.4232, + "grad_norm": 0.400390625, + "learning_rate": 0.0003186696307005976, + "loss": 3.1575, + "step": 10051 + }, + { + "epoch": 0.4232421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0003186371837291927, + "loss": 3.0875, + "step": 10052 + }, + { + "epoch": 0.42328421052631576, + "grad_norm": 0.41796875, + "learning_rate": 0.00031860473550729895, + "loss": 3.2147, + "step": 10053 + }, + { + "epoch": 0.42332631578947366, + "grad_norm": 0.4453125, + "learning_rate": 0.00031857228603550755, + "loss": 3.1935, + "step": 10054 + }, + { + "epoch": 0.42336842105263156, + "grad_norm": 0.4140625, + "learning_rate": 0.0003185398353144096, + "loss": 3.0529, + "step": 10055 + }, + { + "epoch": 0.42341052631578946, + "grad_norm": 0.39453125, + "learning_rate": 0.0003185073833445964, + "loss": 3.0873, + "step": 10056 + }, + { + "epoch": 0.42345263157894736, + "grad_norm": 0.396484375, + "learning_rate": 0.00031847493012665925, + "loss": 3.2185, + "step": 10057 + }, + { + "epoch": 0.42349473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.00031844247566118925, + "loss": 3.2159, + "step": 10058 + }, + { + "epoch": 0.42353684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.00031841001994877785, + "loss": 2.7102, + "step": 10059 + }, + { + "epoch": 0.42357894736842105, + "grad_norm": 0.466796875, + "learning_rate": 0.0003183775629900162, + "loss": 3.0398, + "step": 10060 + }, + { + "epoch": 0.42362105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.0003183451047854956, + "loss": 3.238, + "step": 10061 + }, + { + "epoch": 0.42366315789473685, + "grad_norm": 0.4375, + "learning_rate": 0.0003183126453358076, + "loss": 3.0174, + "step": 10062 + }, + { + "epoch": 0.42370526315789475, + "grad_norm": 0.404296875, + "learning_rate": 0.00031828018464154353, + "loss": 3.2731, + "step": 10063 + }, + { + "epoch": 0.42374736842105265, + "grad_norm": 0.4609375, + "learning_rate": 0.00031824772270329463, + "loss": 2.6642, + "step": 10064 + }, + { + "epoch": 0.42378947368421055, + "grad_norm": 0.470703125, + "learning_rate": 0.00031821525952165243, + "loss": 2.9698, + "step": 10065 + }, + { + "epoch": 0.42383157894736845, + "grad_norm": 0.482421875, + "learning_rate": 0.0003181827950972083, + "loss": 2.9055, + "step": 10066 + }, + { + "epoch": 0.4238736842105263, + "grad_norm": 0.458984375, + "learning_rate": 0.00031815032943055375, + "loss": 3.081, + "step": 10067 + }, + { + "epoch": 0.4239157894736842, + "grad_norm": 0.451171875, + "learning_rate": 0.00031811786252228035, + "loss": 3.4177, + "step": 10068 + }, + { + "epoch": 0.4239578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00031808539437297955, + "loss": 2.746, + "step": 10069 + }, + { + "epoch": 0.424, + "grad_norm": 0.416015625, + "learning_rate": 0.00031805292498324276, + "loss": 2.7964, + "step": 10070 + }, + { + "epoch": 0.4240421052631579, + "grad_norm": 0.486328125, + "learning_rate": 0.00031802045435366176, + "loss": 3.0636, + "step": 10071 + }, + { + "epoch": 0.4240842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.0003179879824848279, + "loss": 2.5872, + "step": 10072 + }, + { + "epoch": 0.4241263157894737, + "grad_norm": 0.5234375, + "learning_rate": 0.00031795550937733293, + "loss": 3.0958, + "step": 10073 + }, + { + "epoch": 0.4241684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.00031792303503176835, + "loss": 2.901, + "step": 10074 + }, + { + "epoch": 0.4242105263157895, + "grad_norm": 0.4609375, + "learning_rate": 0.000317890559448726, + "loss": 3.2005, + "step": 10075 + }, + { + "epoch": 0.4242526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.0003178580826287973, + "loss": 2.8681, + "step": 10076 + }, + { + "epoch": 0.4242947368421053, + "grad_norm": 0.427734375, + "learning_rate": 0.0003178256045725741, + "loss": 3.1837, + "step": 10077 + }, + { + "epoch": 0.4243368421052632, + "grad_norm": 0.439453125, + "learning_rate": 0.0003177931252806482, + "loss": 3.1584, + "step": 10078 + }, + { + "epoch": 0.4243789473684211, + "grad_norm": 0.431640625, + "learning_rate": 0.0003177606447536111, + "loss": 3.3915, + "step": 10079 + }, + { + "epoch": 0.4244210526315789, + "grad_norm": 0.43359375, + "learning_rate": 0.00031772816299205467, + "loss": 3.5969, + "step": 10080 + }, + { + "epoch": 0.4244631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.00031769567999657066, + "loss": 3.2616, + "step": 10081 + }, + { + "epoch": 0.4245052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00031766319576775095, + "loss": 3.1007, + "step": 10082 + }, + { + "epoch": 0.4245473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.0003176307103061873, + "loss": 3.4711, + "step": 10083 + }, + { + "epoch": 0.4245894736842105, + "grad_norm": 0.4140625, + "learning_rate": 0.0003175982236124716, + "loss": 3.1098, + "step": 10084 + }, + { + "epoch": 0.4246315789473684, + "grad_norm": 0.453125, + "learning_rate": 0.00031756573568719564, + "loss": 3.2801, + "step": 10085 + }, + { + "epoch": 0.4246736842105263, + "grad_norm": 0.48828125, + "learning_rate": 0.0003175332465309514, + "loss": 3.1742, + "step": 10086 + }, + { + "epoch": 0.4247157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00031750075614433076, + "loss": 3.5646, + "step": 10087 + }, + { + "epoch": 0.4247578947368421, + "grad_norm": 0.466796875, + "learning_rate": 0.0003174682645279256, + "loss": 3.4282, + "step": 10088 + }, + { + "epoch": 0.4248, + "grad_norm": 0.42578125, + "learning_rate": 0.000317435771682328, + "loss": 3.2263, + "step": 10089 + }, + { + "epoch": 0.4248421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0003174032776081298, + "loss": 2.8816, + "step": 10090 + }, + { + "epoch": 0.4248842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00031737078230592315, + "loss": 2.8598, + "step": 10091 + }, + { + "epoch": 0.4249263157894737, + "grad_norm": 0.3984375, + "learning_rate": 0.0003173382857763, + "loss": 3.3723, + "step": 10092 + }, + { + "epoch": 0.4249684210526316, + "grad_norm": 0.46484375, + "learning_rate": 0.0003173057880198523, + "loss": 3.5783, + "step": 10093 + }, + { + "epoch": 0.42501052631578945, + "grad_norm": 0.443359375, + "learning_rate": 0.00031727328903717237, + "loss": 3.4208, + "step": 10094 + }, + { + "epoch": 0.42505263157894735, + "grad_norm": 0.46484375, + "learning_rate": 0.00031724078882885206, + "loss": 2.985, + "step": 10095 + }, + { + "epoch": 0.42509473684210525, + "grad_norm": 0.435546875, + "learning_rate": 0.0003172082873954836, + "loss": 3.4192, + "step": 10096 + }, + { + "epoch": 0.42513684210526315, + "grad_norm": 0.423828125, + "learning_rate": 0.0003171757847376591, + "loss": 3.0772, + "step": 10097 + }, + { + "epoch": 0.42517894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.0003171432808559708, + "loss": 3.1269, + "step": 10098 + }, + { + "epoch": 0.42522105263157894, + "grad_norm": 0.4140625, + "learning_rate": 0.00031711077575101074, + "loss": 3.5053, + "step": 10099 + }, + { + "epoch": 0.42526315789473684, + "grad_norm": 0.453125, + "learning_rate": 0.0003170782694233712, + "loss": 2.44, + "step": 10100 + }, + { + "epoch": 0.42530526315789474, + "grad_norm": 0.51171875, + "learning_rate": 0.0003170457618736444, + "loss": 3.053, + "step": 10101 + }, + { + "epoch": 0.42534736842105264, + "grad_norm": 0.41796875, + "learning_rate": 0.00031701325310242264, + "loss": 2.965, + "step": 10102 + }, + { + "epoch": 0.42538947368421054, + "grad_norm": 0.435546875, + "learning_rate": 0.00031698074311029814, + "loss": 3.5375, + "step": 10103 + }, + { + "epoch": 0.42543157894736844, + "grad_norm": 0.4140625, + "learning_rate": 0.00031694823189786316, + "loss": 3.0056, + "step": 10104 + }, + { + "epoch": 0.42547368421052634, + "grad_norm": 0.419921875, + "learning_rate": 0.00031691571946571007, + "loss": 3.4757, + "step": 10105 + }, + { + "epoch": 0.42551578947368424, + "grad_norm": 0.455078125, + "learning_rate": 0.0003168832058144312, + "loss": 3.0534, + "step": 10106 + }, + { + "epoch": 0.4255578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00031685069094461893, + "loss": 3.2086, + "step": 10107 + }, + { + "epoch": 0.4256, + "grad_norm": 0.419921875, + "learning_rate": 0.00031681817485686555, + "loss": 3.4178, + "step": 10108 + }, + { + "epoch": 0.4256421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.00031678565755176356, + "loss": 3.7112, + "step": 10109 + }, + { + "epoch": 0.4256842105263158, + "grad_norm": 0.40625, + "learning_rate": 0.00031675313902990545, + "loss": 2.7696, + "step": 10110 + }, + { + "epoch": 0.4257263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.00031672061929188344, + "loss": 3.2333, + "step": 10111 + }, + { + "epoch": 0.4257684210526316, + "grad_norm": 0.474609375, + "learning_rate": 0.0003166880983382903, + "loss": 3.198, + "step": 10112 + }, + { + "epoch": 0.4258105263157895, + "grad_norm": 0.37109375, + "learning_rate": 0.00031665557616971816, + "loss": 2.88, + "step": 10113 + }, + { + "epoch": 0.42585263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.0003166230527867599, + "loss": 2.7536, + "step": 10114 + }, + { + "epoch": 0.42589473684210527, + "grad_norm": 0.41015625, + "learning_rate": 0.0003165905281900078, + "loss": 3.2176, + "step": 10115 + }, + { + "epoch": 0.42593684210526317, + "grad_norm": 0.453125, + "learning_rate": 0.00031655800238005453, + "loss": 3.5902, + "step": 10116 + }, + { + "epoch": 0.42597894736842107, + "grad_norm": 0.431640625, + "learning_rate": 0.00031652547535749274, + "loss": 3.0308, + "step": 10117 + }, + { + "epoch": 0.42602105263157897, + "grad_norm": 0.4453125, + "learning_rate": 0.0003164929471229149, + "loss": 2.6443, + "step": 10118 + }, + { + "epoch": 0.42606315789473687, + "grad_norm": 0.40625, + "learning_rate": 0.00031646041767691374, + "loss": 2.9164, + "step": 10119 + }, + { + "epoch": 0.4261052631578947, + "grad_norm": 0.38671875, + "learning_rate": 0.0003164278870200818, + "loss": 3.0683, + "step": 10120 + }, + { + "epoch": 0.4261473684210526, + "grad_norm": 0.41015625, + "learning_rate": 0.0003163953551530118, + "loss": 2.7613, + "step": 10121 + }, + { + "epoch": 0.4261894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.00031636282207629646, + "loss": 3.0128, + "step": 10122 + }, + { + "epoch": 0.4262315789473684, + "grad_norm": 0.4921875, + "learning_rate": 0.0003163302877905284, + "loss": 2.9972, + "step": 10123 + }, + { + "epoch": 0.4262736842105263, + "grad_norm": 0.458984375, + "learning_rate": 0.00031629775229630056, + "loss": 3.4395, + "step": 10124 + }, + { + "epoch": 0.4263157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.0003162652155942055, + "loss": 3.3965, + "step": 10125 + }, + { + "epoch": 0.4263578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.0003162326776848361, + "loss": 2.8352, + "step": 10126 + }, + { + "epoch": 0.4264, + "grad_norm": 0.40234375, + "learning_rate": 0.0003162001385687852, + "loss": 3.0641, + "step": 10127 + }, + { + "epoch": 0.4264421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00031616759824664543, + "loss": 3.4249, + "step": 10128 + }, + { + "epoch": 0.4264842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.00031613505671900975, + "loss": 2.8441, + "step": 10129 + }, + { + "epoch": 0.4265263157894737, + "grad_norm": 0.86328125, + "learning_rate": 0.0003161025139864712, + "loss": 3.1708, + "step": 10130 + }, + { + "epoch": 0.4265684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.0003160699700496223, + "loss": 3.5297, + "step": 10131 + }, + { + "epoch": 0.4266105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.00031603742490905637, + "loss": 2.8646, + "step": 10132 + }, + { + "epoch": 0.4266526315789474, + "grad_norm": 0.46484375, + "learning_rate": 0.000316004878565366, + "loss": 2.7702, + "step": 10133 + }, + { + "epoch": 0.42669473684210524, + "grad_norm": 0.41796875, + "learning_rate": 0.0003159723310191444, + "loss": 3.4487, + "step": 10134 + }, + { + "epoch": 0.42673684210526314, + "grad_norm": 0.4375, + "learning_rate": 0.0003159397822709844, + "loss": 3.2339, + "step": 10135 + }, + { + "epoch": 0.42677894736842104, + "grad_norm": 0.671875, + "learning_rate": 0.00031590723232147904, + "loss": 3.0922, + "step": 10136 + }, + { + "epoch": 0.42682105263157893, + "grad_norm": 0.4375, + "learning_rate": 0.0003158746811712214, + "loss": 3.4962, + "step": 10137 + }, + { + "epoch": 0.42686315789473683, + "grad_norm": 0.46484375, + "learning_rate": 0.00031584212882080444, + "loss": 3.1091, + "step": 10138 + }, + { + "epoch": 0.42690526315789473, + "grad_norm": 0.38671875, + "learning_rate": 0.00031580957527082117, + "loss": 3.1597, + "step": 10139 + }, + { + "epoch": 0.42694736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00031577702052186486, + "loss": 3.2436, + "step": 10140 + }, + { + "epoch": 0.42698947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00031574446457452844, + "loss": 2.8126, + "step": 10141 + }, + { + "epoch": 0.42703157894736843, + "grad_norm": 0.42578125, + "learning_rate": 0.00031571190742940515, + "loss": 2.7193, + "step": 10142 + }, + { + "epoch": 0.4270736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.0003156793490870882, + "loss": 3.528, + "step": 10143 + }, + { + "epoch": 0.4271157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0003156467895481706, + "loss": 3.0729, + "step": 10144 + }, + { + "epoch": 0.4271578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.0003156142288132456, + "loss": 3.0067, + "step": 10145 + }, + { + "epoch": 0.4272, + "grad_norm": 0.44921875, + "learning_rate": 0.0003155816668829066, + "loss": 3.3205, + "step": 10146 + }, + { + "epoch": 0.42724210526315787, + "grad_norm": 0.427734375, + "learning_rate": 0.00031554910375774646, + "loss": 3.2467, + "step": 10147 + }, + { + "epoch": 0.42728421052631577, + "grad_norm": 0.46875, + "learning_rate": 0.0003155165394383588, + "loss": 3.3178, + "step": 10148 + }, + { + "epoch": 0.42732631578947367, + "grad_norm": 0.455078125, + "learning_rate": 0.00031548397392533673, + "loss": 3.2583, + "step": 10149 + }, + { + "epoch": 0.42736842105263156, + "grad_norm": 0.435546875, + "learning_rate": 0.0003154514072192736, + "loss": 3.5147, + "step": 10150 + }, + { + "epoch": 0.42741052631578946, + "grad_norm": 0.451171875, + "learning_rate": 0.0003154188393207628, + "loss": 2.9281, + "step": 10151 + }, + { + "epoch": 0.42745263157894736, + "grad_norm": 0.42578125, + "learning_rate": 0.00031538627023039754, + "loss": 2.5242, + "step": 10152 + }, + { + "epoch": 0.42749473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.00031535369994877135, + "loss": 3.312, + "step": 10153 + }, + { + "epoch": 0.42753684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.0003153211284764774, + "loss": 3.21, + "step": 10154 + }, + { + "epoch": 0.42757894736842106, + "grad_norm": 0.40234375, + "learning_rate": 0.0003152885558141093, + "loss": 3.2412, + "step": 10155 + }, + { + "epoch": 0.42762105263157896, + "grad_norm": 0.44921875, + "learning_rate": 0.0003152559819622604, + "loss": 2.7756, + "step": 10156 + }, + { + "epoch": 0.42766315789473686, + "grad_norm": 0.431640625, + "learning_rate": 0.00031522340692152425, + "loss": 3.2571, + "step": 10157 + }, + { + "epoch": 0.42770526315789476, + "grad_norm": 0.427734375, + "learning_rate": 0.0003151908306924942, + "loss": 2.7833, + "step": 10158 + }, + { + "epoch": 0.42774736842105265, + "grad_norm": 0.439453125, + "learning_rate": 0.00031515825327576387, + "loss": 3.3814, + "step": 10159 + }, + { + "epoch": 0.42778947368421055, + "grad_norm": 0.435546875, + "learning_rate": 0.00031512567467192665, + "loss": 2.9172, + "step": 10160 + }, + { + "epoch": 0.4278315789473684, + "grad_norm": 0.462890625, + "learning_rate": 0.00031509309488157624, + "loss": 3.4523, + "step": 10161 + }, + { + "epoch": 0.4278736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.000315060513905306, + "loss": 2.9854, + "step": 10162 + }, + { + "epoch": 0.4279157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.00031502793174370975, + "loss": 3.2274, + "step": 10163 + }, + { + "epoch": 0.4279578947368421, + "grad_norm": 0.5390625, + "learning_rate": 0.00031499534839738087, + "loss": 3.1963, + "step": 10164 + }, + { + "epoch": 0.428, + "grad_norm": 0.416015625, + "learning_rate": 0.00031496276386691326, + "loss": 3.1917, + "step": 10165 + }, + { + "epoch": 0.4280421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0003149301781529003, + "loss": 3.0492, + "step": 10166 + }, + { + "epoch": 0.4280842105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.0003148975912559359, + "loss": 3.1299, + "step": 10167 + }, + { + "epoch": 0.4281263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.0003148650031766135, + "loss": 3.2426, + "step": 10168 + }, + { + "epoch": 0.4281684210526316, + "grad_norm": 0.400390625, + "learning_rate": 0.0003148324139155271, + "loss": 3.2413, + "step": 10169 + }, + { + "epoch": 0.4282105263157895, + "grad_norm": 0.68359375, + "learning_rate": 0.0003147998234732702, + "loss": 3.2582, + "step": 10170 + }, + { + "epoch": 0.4282526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.00031476723185043667, + "loss": 3.3921, + "step": 10171 + }, + { + "epoch": 0.4282947368421053, + "grad_norm": 0.384765625, + "learning_rate": 0.00031473463904762027, + "loss": 2.907, + "step": 10172 + }, + { + "epoch": 0.4283368421052632, + "grad_norm": 0.435546875, + "learning_rate": 0.00031470204506541476, + "loss": 3.0413, + "step": 10173 + }, + { + "epoch": 0.428378947368421, + "grad_norm": 0.498046875, + "learning_rate": 0.0003146694499044141, + "loss": 3.0903, + "step": 10174 + }, + { + "epoch": 0.4284210526315789, + "grad_norm": 0.41796875, + "learning_rate": 0.000314636853565212, + "loss": 3.4594, + "step": 10175 + }, + { + "epoch": 0.4284631578947368, + "grad_norm": 0.423828125, + "learning_rate": 0.0003146042560484024, + "loss": 3.4647, + "step": 10176 + }, + { + "epoch": 0.4285052631578947, + "grad_norm": 0.412109375, + "learning_rate": 0.00031457165735457917, + "loss": 3.3174, + "step": 10177 + }, + { + "epoch": 0.4285473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.0003145390574843362, + "loss": 3.3223, + "step": 10178 + }, + { + "epoch": 0.4285894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00031450645643826743, + "loss": 3.0605, + "step": 10179 + }, + { + "epoch": 0.4286315789473684, + "grad_norm": 0.470703125, + "learning_rate": 0.00031447385421696676, + "loss": 3.0823, + "step": 10180 + }, + { + "epoch": 0.4286736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0003144412508210283, + "loss": 3.0949, + "step": 10181 + }, + { + "epoch": 0.4287157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0003144086462510458, + "loss": 3.2474, + "step": 10182 + }, + { + "epoch": 0.4287578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00031437604050761365, + "loss": 2.6926, + "step": 10183 + }, + { + "epoch": 0.4288, + "grad_norm": 0.431640625, + "learning_rate": 0.0003143434335913256, + "loss": 3.4124, + "step": 10184 + }, + { + "epoch": 0.4288421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.0003143108255027758, + "loss": 3.0115, + "step": 10185 + }, + { + "epoch": 0.4288842105263158, + "grad_norm": 0.396484375, + "learning_rate": 0.0003142782162425584, + "loss": 3.1265, + "step": 10186 + }, + { + "epoch": 0.4289263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00031424560581126725, + "loss": 3.0554, + "step": 10187 + }, + { + "epoch": 0.42896842105263155, + "grad_norm": 0.4296875, + "learning_rate": 0.00031421299420949676, + "loss": 3.2927, + "step": 10188 + }, + { + "epoch": 0.42901052631578945, + "grad_norm": 0.416015625, + "learning_rate": 0.00031418038143784095, + "loss": 3.0832, + "step": 10189 + }, + { + "epoch": 0.42905263157894735, + "grad_norm": 0.453125, + "learning_rate": 0.0003141477674968939, + "loss": 3.2295, + "step": 10190 + }, + { + "epoch": 0.42909473684210525, + "grad_norm": 0.42578125, + "learning_rate": 0.00031411515238725, + "loss": 3.5265, + "step": 10191 + }, + { + "epoch": 0.42913684210526315, + "grad_norm": 0.435546875, + "learning_rate": 0.0003140825361095034, + "loss": 2.8361, + "step": 10192 + }, + { + "epoch": 0.42917894736842105, + "grad_norm": 0.4140625, + "learning_rate": 0.00031404991866424824, + "loss": 3.2419, + "step": 10193 + }, + { + "epoch": 0.42922105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.0003140173000520788, + "loss": 3.0404, + "step": 10194 + }, + { + "epoch": 0.42926315789473685, + "grad_norm": 0.50390625, + "learning_rate": 0.00031398468027358937, + "loss": 2.4601, + "step": 10195 + }, + { + "epoch": 0.42930526315789475, + "grad_norm": 0.48828125, + "learning_rate": 0.0003139520593293742, + "loss": 3.4189, + "step": 10196 + }, + { + "epoch": 0.42934736842105264, + "grad_norm": 0.427734375, + "learning_rate": 0.0003139194372200278, + "loss": 3.3392, + "step": 10197 + }, + { + "epoch": 0.42938947368421054, + "grad_norm": 0.4140625, + "learning_rate": 0.00031388681394614414, + "loss": 3.2365, + "step": 10198 + }, + { + "epoch": 0.42943157894736844, + "grad_norm": 0.40234375, + "learning_rate": 0.00031385418950831797, + "loss": 2.9478, + "step": 10199 + }, + { + "epoch": 0.42947368421052634, + "grad_norm": 0.48046875, + "learning_rate": 0.00031382156390714346, + "loss": 2.7601, + "step": 10200 + }, + { + "epoch": 0.4295157894736842, + "grad_norm": 0.390625, + "learning_rate": 0.000313788937143215, + "loss": 2.9426, + "step": 10201 + }, + { + "epoch": 0.4295578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.000313756309217127, + "loss": 3.3544, + "step": 10202 + }, + { + "epoch": 0.4296, + "grad_norm": 0.423828125, + "learning_rate": 0.0003137236801294741, + "loss": 3.1918, + "step": 10203 + }, + { + "epoch": 0.4296421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0003136910498808505, + "loss": 3.1501, + "step": 10204 + }, + { + "epoch": 0.4296842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0003136584184718509, + "loss": 3.2413, + "step": 10205 + }, + { + "epoch": 0.4297263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0003136257859030696, + "loss": 3.1982, + "step": 10206 + }, + { + "epoch": 0.4297684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.00031359315217510133, + "loss": 2.9473, + "step": 10207 + }, + { + "epoch": 0.4298105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.0003135605172885405, + "loss": 2.6487, + "step": 10208 + }, + { + "epoch": 0.4298526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.0003135278812439817, + "loss": 3.5155, + "step": 10209 + }, + { + "epoch": 0.4298947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.0003134952440420196, + "loss": 3.0402, + "step": 10210 + }, + { + "epoch": 0.4299368421052632, + "grad_norm": 0.59375, + "learning_rate": 0.0003134626056832488, + "loss": 3.1165, + "step": 10211 + }, + { + "epoch": 0.42997894736842107, + "grad_norm": 0.404296875, + "learning_rate": 0.0003134299661682638, + "loss": 3.1961, + "step": 10212 + }, + { + "epoch": 0.43002105263157897, + "grad_norm": 0.69140625, + "learning_rate": 0.0003133973254976594, + "loss": 2.9106, + "step": 10213 + }, + { + "epoch": 0.43006315789473687, + "grad_norm": 0.408203125, + "learning_rate": 0.0003133646836720302, + "loss": 2.8769, + "step": 10214 + }, + { + "epoch": 0.4301052631578947, + "grad_norm": 0.453125, + "learning_rate": 0.0003133320406919709, + "loss": 2.9815, + "step": 10215 + }, + { + "epoch": 0.4301473684210526, + "grad_norm": 0.734375, + "learning_rate": 0.0003132993965580762, + "loss": 3.2554, + "step": 10216 + }, + { + "epoch": 0.4301894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00031326675127094094, + "loss": 3.8991, + "step": 10217 + }, + { + "epoch": 0.4302315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.0003132341048311598, + "loss": 3.1336, + "step": 10218 + }, + { + "epoch": 0.4302736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.00031320145723932755, + "loss": 3.1203, + "step": 10219 + }, + { + "epoch": 0.4303157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.000313168808496039, + "loss": 3.2426, + "step": 10220 + }, + { + "epoch": 0.4303578947368421, + "grad_norm": 0.5, + "learning_rate": 0.000313136158601889, + "loss": 3.2841, + "step": 10221 + }, + { + "epoch": 0.4304, + "grad_norm": 0.427734375, + "learning_rate": 0.0003131035075574723, + "loss": 2.8845, + "step": 10222 + }, + { + "epoch": 0.4304421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00031307085536338395, + "loss": 3.3125, + "step": 10223 + }, + { + "epoch": 0.4304842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0003130382020202186, + "loss": 2.7131, + "step": 10224 + }, + { + "epoch": 0.4305263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00031300554752857135, + "loss": 3.3339, + "step": 10225 + }, + { + "epoch": 0.4305684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.00031297289188903703, + "loss": 3.2773, + "step": 10226 + }, + { + "epoch": 0.4306105263157895, + "grad_norm": 0.408203125, + "learning_rate": 0.00031294023510221064, + "loss": 3.1077, + "step": 10227 + }, + { + "epoch": 0.43065263157894734, + "grad_norm": 0.48828125, + "learning_rate": 0.0003129075771686871, + "loss": 2.9743, + "step": 10228 + }, + { + "epoch": 0.43069473684210524, + "grad_norm": 0.423828125, + "learning_rate": 0.00031287491808906136, + "loss": 3.3877, + "step": 10229 + }, + { + "epoch": 0.43073684210526314, + "grad_norm": 0.4453125, + "learning_rate": 0.0003128422578639286, + "loss": 3.1964, + "step": 10230 + }, + { + "epoch": 0.43077894736842104, + "grad_norm": 0.4375, + "learning_rate": 0.00031280959649388363, + "loss": 3.3281, + "step": 10231 + }, + { + "epoch": 0.43082105263157894, + "grad_norm": 0.451171875, + "learning_rate": 0.00031277693397952167, + "loss": 3.4426, + "step": 10232 + }, + { + "epoch": 0.43086315789473684, + "grad_norm": 0.6953125, + "learning_rate": 0.0003127442703214377, + "loss": 2.839, + "step": 10233 + }, + { + "epoch": 0.43090526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.0003127116055202269, + "loss": 3.0567, + "step": 10234 + }, + { + "epoch": 0.43094736842105263, + "grad_norm": 0.453125, + "learning_rate": 0.0003126789395764843, + "loss": 3.3066, + "step": 10235 + }, + { + "epoch": 0.43098947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.000312646272490805, + "loss": 3.0747, + "step": 10236 + }, + { + "epoch": 0.43103157894736843, + "grad_norm": 0.4453125, + "learning_rate": 0.0003126136042637843, + "loss": 3.4677, + "step": 10237 + }, + { + "epoch": 0.43107368421052633, + "grad_norm": 0.4921875, + "learning_rate": 0.00031258093489601734, + "loss": 2.7609, + "step": 10238 + }, + { + "epoch": 0.43111578947368423, + "grad_norm": 0.427734375, + "learning_rate": 0.00031254826438809925, + "loss": 2.8618, + "step": 10239 + }, + { + "epoch": 0.43115789473684213, + "grad_norm": 0.494140625, + "learning_rate": 0.00031251559274062516, + "loss": 3.1001, + "step": 10240 + }, + { + "epoch": 0.4312, + "grad_norm": 0.4453125, + "learning_rate": 0.0003124829199541905, + "loss": 3.1229, + "step": 10241 + }, + { + "epoch": 0.43124210526315787, + "grad_norm": 0.435546875, + "learning_rate": 0.00031245024602939047, + "loss": 3.2554, + "step": 10242 + }, + { + "epoch": 0.43128421052631577, + "grad_norm": 0.40234375, + "learning_rate": 0.0003124175709668203, + "loss": 2.7909, + "step": 10243 + }, + { + "epoch": 0.43132631578947367, + "grad_norm": 0.435546875, + "learning_rate": 0.0003123848947670754, + "loss": 3.6928, + "step": 10244 + }, + { + "epoch": 0.43136842105263157, + "grad_norm": 0.46875, + "learning_rate": 0.000312352217430751, + "loss": 3.2343, + "step": 10245 + }, + { + "epoch": 0.43141052631578947, + "grad_norm": 0.486328125, + "learning_rate": 0.0003123195389584424, + "loss": 3.2263, + "step": 10246 + }, + { + "epoch": 0.43145263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.0003122868593507451, + "loss": 3.085, + "step": 10247 + }, + { + "epoch": 0.43149473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.00031225417860825425, + "loss": 3.21, + "step": 10248 + }, + { + "epoch": 0.43153684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.0003122214967315655, + "loss": 3.1692, + "step": 10249 + }, + { + "epoch": 0.43157894736842106, + "grad_norm": 0.4375, + "learning_rate": 0.0003121888137212742, + "loss": 3.3765, + "step": 10250 + }, + { + "epoch": 0.43162105263157896, + "grad_norm": 0.43359375, + "learning_rate": 0.0003121561295779758, + "loss": 2.9009, + "step": 10251 + }, + { + "epoch": 0.43166315789473686, + "grad_norm": 0.5078125, + "learning_rate": 0.00031212344430226575, + "loss": 3.5694, + "step": 10252 + }, + { + "epoch": 0.43170526315789476, + "grad_norm": 0.427734375, + "learning_rate": 0.00031209075789473954, + "loss": 3.0949, + "step": 10253 + }, + { + "epoch": 0.43174736842105266, + "grad_norm": 0.466796875, + "learning_rate": 0.00031205807035599266, + "loss": 2.7531, + "step": 10254 + }, + { + "epoch": 0.4317894736842105, + "grad_norm": 0.388671875, + "learning_rate": 0.0003120253816866207, + "loss": 2.9114, + "step": 10255 + }, + { + "epoch": 0.4318315789473684, + "grad_norm": 0.3984375, + "learning_rate": 0.0003119926918872191, + "loss": 3.1408, + "step": 10256 + }, + { + "epoch": 0.4318736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0003119600009583834, + "loss": 3.3425, + "step": 10257 + }, + { + "epoch": 0.4319157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0003119273089007094, + "loss": 2.8746, + "step": 10258 + }, + { + "epoch": 0.4319578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00031189461571479263, + "loss": 3.3955, + "step": 10259 + }, + { + "epoch": 0.432, + "grad_norm": 0.421875, + "learning_rate": 0.0003118619214012286, + "loss": 3.4417, + "step": 10260 + }, + { + "epoch": 0.4320421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00031182922596061313, + "loss": 3.2029, + "step": 10261 + }, + { + "epoch": 0.4320842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.00031179652939354173, + "loss": 3.3395, + "step": 10262 + }, + { + "epoch": 0.4321263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.0003117638317006102, + "loss": 3.1245, + "step": 10263 + }, + { + "epoch": 0.4321684210526316, + "grad_norm": 0.390625, + "learning_rate": 0.0003117311328824142, + "loss": 2.8827, + "step": 10264 + }, + { + "epoch": 0.4322105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.0003116984329395495, + "loss": 3.1062, + "step": 10265 + }, + { + "epoch": 0.4322526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00031166573187261186, + "loss": 3.5417, + "step": 10266 + }, + { + "epoch": 0.4322947368421053, + "grad_norm": 0.48828125, + "learning_rate": 0.0003116330296821971, + "loss": 3.1474, + "step": 10267 + }, + { + "epoch": 0.43233684210526313, + "grad_norm": 0.416015625, + "learning_rate": 0.0003116003263689009, + "loss": 3.2847, + "step": 10268 + }, + { + "epoch": 0.43237894736842103, + "grad_norm": 0.4375, + "learning_rate": 0.0003115676219333191, + "loss": 3.2136, + "step": 10269 + }, + { + "epoch": 0.43242105263157893, + "grad_norm": 0.43359375, + "learning_rate": 0.0003115349163760476, + "loss": 3.3208, + "step": 10270 + }, + { + "epoch": 0.4324631578947368, + "grad_norm": 0.443359375, + "learning_rate": 0.00031150220969768225, + "loss": 3.2305, + "step": 10271 + }, + { + "epoch": 0.4325052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.0003114695018988189, + "loss": 3.2357, + "step": 10272 + }, + { + "epoch": 0.4325473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.0003114367929800533, + "loss": 3.1619, + "step": 10273 + }, + { + "epoch": 0.4325894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00031140408294198167, + "loss": 2.7291, + "step": 10274 + }, + { + "epoch": 0.4326315789473684, + "grad_norm": 0.396484375, + "learning_rate": 0.0003113713717851998, + "loss": 3.0375, + "step": 10275 + }, + { + "epoch": 0.4326736842105263, + "grad_norm": 0.48828125, + "learning_rate": 0.0003113386595103036, + "loss": 2.9779, + "step": 10276 + }, + { + "epoch": 0.4327157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0003113059461178891, + "loss": 2.8716, + "step": 10277 + }, + { + "epoch": 0.4327578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.0003112732316085523, + "loss": 2.5239, + "step": 10278 + }, + { + "epoch": 0.4328, + "grad_norm": 0.4140625, + "learning_rate": 0.00031124051598288924, + "loss": 3.071, + "step": 10279 + }, + { + "epoch": 0.4328421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.0003112077992414959, + "loss": 3.494, + "step": 10280 + }, + { + "epoch": 0.4328842105263158, + "grad_norm": 0.40625, + "learning_rate": 0.0003111750813849684, + "loss": 3.0461, + "step": 10281 + }, + { + "epoch": 0.43292631578947366, + "grad_norm": 0.546875, + "learning_rate": 0.0003111423624139027, + "loss": 2.9781, + "step": 10282 + }, + { + "epoch": 0.43296842105263156, + "grad_norm": 0.5546875, + "learning_rate": 0.00031110964232889505, + "loss": 2.8699, + "step": 10283 + }, + { + "epoch": 0.43301052631578946, + "grad_norm": 1.7890625, + "learning_rate": 0.00031107692113054155, + "loss": 3.0424, + "step": 10284 + }, + { + "epoch": 0.43305263157894736, + "grad_norm": 0.421875, + "learning_rate": 0.0003110441988194383, + "loss": 3.3475, + "step": 10285 + }, + { + "epoch": 0.43309473684210525, + "grad_norm": 0.44921875, + "learning_rate": 0.00031101147539618144, + "loss": 3.6058, + "step": 10286 + }, + { + "epoch": 0.43313684210526315, + "grad_norm": 0.423828125, + "learning_rate": 0.0003109787508613673, + "loss": 3.3069, + "step": 10287 + }, + { + "epoch": 0.43317894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00031094602521559186, + "loss": 3.0923, + "step": 10288 + }, + { + "epoch": 0.43322105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.00031091329845945144, + "loss": 3.4624, + "step": 10289 + }, + { + "epoch": 0.43326315789473685, + "grad_norm": 0.423828125, + "learning_rate": 0.0003108805705935423, + "loss": 3.3341, + "step": 10290 + }, + { + "epoch": 0.43330526315789475, + "grad_norm": 0.431640625, + "learning_rate": 0.0003108478416184607, + "loss": 3.1184, + "step": 10291 + }, + { + "epoch": 0.43334736842105265, + "grad_norm": 0.451171875, + "learning_rate": 0.0003108151115348029, + "loss": 3.4846, + "step": 10292 + }, + { + "epoch": 0.43338947368421055, + "grad_norm": 0.4375, + "learning_rate": 0.0003107823803431653, + "loss": 2.9394, + "step": 10293 + }, + { + "epoch": 0.43343157894736845, + "grad_norm": 0.43359375, + "learning_rate": 0.00031074964804414415, + "loss": 3.4557, + "step": 10294 + }, + { + "epoch": 0.4334736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.0003107169146383357, + "loss": 3.0079, + "step": 10295 + }, + { + "epoch": 0.4335157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.0003106841801263364, + "loss": 3.6496, + "step": 10296 + }, + { + "epoch": 0.4335578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.0003106514445087427, + "loss": 3.2014, + "step": 10297 + }, + { + "epoch": 0.4336, + "grad_norm": 0.435546875, + "learning_rate": 0.000310618707786151, + "loss": 3.4901, + "step": 10298 + }, + { + "epoch": 0.4336421052631579, + "grad_norm": 0.47265625, + "learning_rate": 0.0003105859699591575, + "loss": 3.03, + "step": 10299 + }, + { + "epoch": 0.4336842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.00031055323102835897, + "loss": 3.119, + "step": 10300 + }, + { + "epoch": 0.4337263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.0003105204909943516, + "loss": 3.2656, + "step": 10301 + }, + { + "epoch": 0.4337684210526316, + "grad_norm": 0.455078125, + "learning_rate": 0.0003104877498577321, + "loss": 3.2354, + "step": 10302 + }, + { + "epoch": 0.4338105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.00031045500761909677, + "loss": 3.2186, + "step": 10303 + }, + { + "epoch": 0.4338526315789474, + "grad_norm": 0.439453125, + "learning_rate": 0.0003104222642790423, + "loss": 3.1294, + "step": 10304 + }, + { + "epoch": 0.4338947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.00031038951983816513, + "loss": 3.2637, + "step": 10305 + }, + { + "epoch": 0.4339368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.00031035677429706193, + "loss": 3.295, + "step": 10306 + }, + { + "epoch": 0.4339789473684211, + "grad_norm": 0.416015625, + "learning_rate": 0.00031032402765632916, + "loss": 3.2949, + "step": 10307 + }, + { + "epoch": 0.434021052631579, + "grad_norm": 0.3984375, + "learning_rate": 0.0003102912799165635, + "loss": 3.3416, + "step": 10308 + }, + { + "epoch": 0.4340631578947368, + "grad_norm": 0.3828125, + "learning_rate": 0.00031025853107836157, + "loss": 3.2374, + "step": 10309 + }, + { + "epoch": 0.4341052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.0003102257811423199, + "loss": 3.2134, + "step": 10310 + }, + { + "epoch": 0.4341473684210526, + "grad_norm": 0.451171875, + "learning_rate": 0.0003101930301090354, + "loss": 2.6182, + "step": 10311 + }, + { + "epoch": 0.4341894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.00031016027797910455, + "loss": 3.136, + "step": 10312 + }, + { + "epoch": 0.4342315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.0003101275247531242, + "loss": 3.8209, + "step": 10313 + }, + { + "epoch": 0.4342736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00031009477043169095, + "loss": 2.9243, + "step": 10314 + }, + { + "epoch": 0.4343157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.00031006201501540147, + "loss": 3.4454, + "step": 10315 + }, + { + "epoch": 0.4343578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00031002925850485275, + "loss": 3.1152, + "step": 10316 + }, + { + "epoch": 0.4344, + "grad_norm": 0.42578125, + "learning_rate": 0.0003099965009006415, + "loss": 3.2999, + "step": 10317 + }, + { + "epoch": 0.4344421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00030996374220336443, + "loss": 3.3634, + "step": 10318 + }, + { + "epoch": 0.4344842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.0003099309824136185, + "loss": 3.1904, + "step": 10319 + }, + { + "epoch": 0.4345263157894737, + "grad_norm": 0.396484375, + "learning_rate": 0.0003098982215320005, + "loss": 3.1303, + "step": 10320 + }, + { + "epoch": 0.4345684210526316, + "grad_norm": 0.3984375, + "learning_rate": 0.00030986545955910723, + "loss": 3.2103, + "step": 10321 + }, + { + "epoch": 0.43461052631578945, + "grad_norm": 0.48046875, + "learning_rate": 0.00030983269649553563, + "loss": 3.0465, + "step": 10322 + }, + { + "epoch": 0.43465263157894735, + "grad_norm": 0.462890625, + "learning_rate": 0.0003097999323418826, + "loss": 2.7737, + "step": 10323 + }, + { + "epoch": 0.43469473684210524, + "grad_norm": 0.421875, + "learning_rate": 0.00030976716709874496, + "loss": 3.4548, + "step": 10324 + }, + { + "epoch": 0.43473684210526314, + "grad_norm": 0.396484375, + "learning_rate": 0.00030973440076671987, + "loss": 2.8545, + "step": 10325 + }, + { + "epoch": 0.43477894736842104, + "grad_norm": 0.56640625, + "learning_rate": 0.00030970163334640425, + "loss": 2.8709, + "step": 10326 + }, + { + "epoch": 0.43482105263157894, + "grad_norm": 0.45703125, + "learning_rate": 0.00030966886483839494, + "loss": 3.0409, + "step": 10327 + }, + { + "epoch": 0.43486315789473684, + "grad_norm": 0.447265625, + "learning_rate": 0.0003096360952432891, + "loss": 3.4493, + "step": 10328 + }, + { + "epoch": 0.43490526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.00030960332456168356, + "loss": 3.5565, + "step": 10329 + }, + { + "epoch": 0.43494736842105264, + "grad_norm": 0.40625, + "learning_rate": 0.00030957055279417546, + "loss": 3.0773, + "step": 10330 + }, + { + "epoch": 0.43498947368421054, + "grad_norm": 0.412109375, + "learning_rate": 0.00030953777994136196, + "loss": 3.2632, + "step": 10331 + }, + { + "epoch": 0.43503157894736844, + "grad_norm": 0.443359375, + "learning_rate": 0.00030950500600384007, + "loss": 3.2477, + "step": 10332 + }, + { + "epoch": 0.43507368421052633, + "grad_norm": 0.50390625, + "learning_rate": 0.00030947223098220676, + "loss": 3.0646, + "step": 10333 + }, + { + "epoch": 0.43511578947368423, + "grad_norm": 0.451171875, + "learning_rate": 0.0003094394548770594, + "loss": 2.9225, + "step": 10334 + }, + { + "epoch": 0.43515789473684213, + "grad_norm": 0.439453125, + "learning_rate": 0.00030940667768899506, + "loss": 3.1923, + "step": 10335 + }, + { + "epoch": 0.4352, + "grad_norm": 0.435546875, + "learning_rate": 0.00030937389941861076, + "loss": 3.1587, + "step": 10336 + }, + { + "epoch": 0.4352421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.00030934112006650375, + "loss": 2.8878, + "step": 10337 + }, + { + "epoch": 0.4352842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.0003093083396332713, + "loss": 3.4308, + "step": 10338 + }, + { + "epoch": 0.4353263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.00030927555811951065, + "loss": 2.5809, + "step": 10339 + }, + { + "epoch": 0.43536842105263157, + "grad_norm": 0.384765625, + "learning_rate": 0.0003092427755258189, + "loss": 3.1825, + "step": 10340 + }, + { + "epoch": 0.43541052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.0003092099918527934, + "loss": 3.2957, + "step": 10341 + }, + { + "epoch": 0.43545263157894737, + "grad_norm": 0.384765625, + "learning_rate": 0.00030917720710103136, + "loss": 3.017, + "step": 10342 + }, + { + "epoch": 0.43549473684210527, + "grad_norm": 0.416015625, + "learning_rate": 0.0003091444212711302, + "loss": 3.2214, + "step": 10343 + }, + { + "epoch": 0.43553684210526317, + "grad_norm": 0.41796875, + "learning_rate": 0.00030911163436368717, + "loss": 3.4679, + "step": 10344 + }, + { + "epoch": 0.43557894736842107, + "grad_norm": 0.53125, + "learning_rate": 0.0003090788463792996, + "loss": 2.9486, + "step": 10345 + }, + { + "epoch": 0.43562105263157896, + "grad_norm": 0.427734375, + "learning_rate": 0.00030904605731856493, + "loss": 3.0537, + "step": 10346 + }, + { + "epoch": 0.43566315789473686, + "grad_norm": 0.443359375, + "learning_rate": 0.00030901326718208046, + "loss": 3.5625, + "step": 10347 + }, + { + "epoch": 0.43570526315789476, + "grad_norm": 0.4296875, + "learning_rate": 0.00030898047597044355, + "loss": 3.3916, + "step": 10348 + }, + { + "epoch": 0.4357473684210526, + "grad_norm": 0.451171875, + "learning_rate": 0.00030894768368425164, + "loss": 3.4189, + "step": 10349 + }, + { + "epoch": 0.4357894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.0003089148903241022, + "loss": 3.3901, + "step": 10350 + }, + { + "epoch": 0.4358315789473684, + "grad_norm": 0.4140625, + "learning_rate": 0.0003088820958905927, + "loss": 3.1573, + "step": 10351 + }, + { + "epoch": 0.4358736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.00030884930038432064, + "loss": 2.9614, + "step": 10352 + }, + { + "epoch": 0.4359157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.00030881650380588347, + "loss": 2.9518, + "step": 10353 + }, + { + "epoch": 0.4359578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00030878370615587865, + "loss": 3.0883, + "step": 10354 + }, + { + "epoch": 0.436, + "grad_norm": 0.439453125, + "learning_rate": 0.0003087509074349038, + "loss": 3.3926, + "step": 10355 + }, + { + "epoch": 0.4360421052631579, + "grad_norm": 0.462890625, + "learning_rate": 0.00030871810764355643, + "loss": 2.8601, + "step": 10356 + }, + { + "epoch": 0.4360842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00030868530678243415, + "loss": 3.0935, + "step": 10357 + }, + { + "epoch": 0.4361263157894737, + "grad_norm": 0.46875, + "learning_rate": 0.00030865250485213446, + "loss": 3.3136, + "step": 10358 + }, + { + "epoch": 0.4361684210526316, + "grad_norm": 0.453125, + "learning_rate": 0.00030861970185325514, + "loss": 3.2019, + "step": 10359 + }, + { + "epoch": 0.4362105263157895, + "grad_norm": 0.396484375, + "learning_rate": 0.0003085868977863937, + "loss": 2.9428, + "step": 10360 + }, + { + "epoch": 0.4362526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00030855409265214784, + "loss": 3.0771, + "step": 10361 + }, + { + "epoch": 0.4362947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00030852128645111524, + "loss": 3.284, + "step": 10362 + }, + { + "epoch": 0.43633684210526313, + "grad_norm": 0.4140625, + "learning_rate": 0.0003084884791838935, + "loss": 3.0459, + "step": 10363 + }, + { + "epoch": 0.43637894736842103, + "grad_norm": 0.4453125, + "learning_rate": 0.0003084556708510804, + "loss": 3.1154, + "step": 10364 + }, + { + "epoch": 0.43642105263157893, + "grad_norm": 0.43359375, + "learning_rate": 0.00030842286145327366, + "loss": 3.1394, + "step": 10365 + }, + { + "epoch": 0.43646315789473683, + "grad_norm": 0.421875, + "learning_rate": 0.000308390050991071, + "loss": 3.3277, + "step": 10366 + }, + { + "epoch": 0.43650526315789473, + "grad_norm": 0.435546875, + "learning_rate": 0.0003083572394650702, + "loss": 3.0497, + "step": 10367 + }, + { + "epoch": 0.43654736842105263, + "grad_norm": 0.39453125, + "learning_rate": 0.00030832442687586914, + "loss": 2.8033, + "step": 10368 + }, + { + "epoch": 0.4365894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.0003082916132240656, + "loss": 3.2446, + "step": 10369 + }, + { + "epoch": 0.4366315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00030825879851025716, + "loss": 3.4533, + "step": 10370 + }, + { + "epoch": 0.4366736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.000308225982735042, + "loss": 3.0644, + "step": 10371 + }, + { + "epoch": 0.4367157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.00030819316589901784, + "loss": 2.8828, + "step": 10372 + }, + { + "epoch": 0.4367578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.0003081603480027826, + "loss": 3.1742, + "step": 10373 + }, + { + "epoch": 0.4368, + "grad_norm": 0.423828125, + "learning_rate": 0.0003081275290469341, + "loss": 3.4915, + "step": 10374 + }, + { + "epoch": 0.4368421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00030809470903207033, + "loss": 3.4146, + "step": 10375 + }, + { + "epoch": 0.43688421052631576, + "grad_norm": 0.412109375, + "learning_rate": 0.0003080618879587892, + "loss": 2.8529, + "step": 10376 + }, + { + "epoch": 0.43692631578947366, + "grad_norm": 0.388671875, + "learning_rate": 0.0003080290658276887, + "loss": 3.1891, + "step": 10377 + }, + { + "epoch": 0.43696842105263156, + "grad_norm": 0.4921875, + "learning_rate": 0.0003079962426393668, + "loss": 3.2417, + "step": 10378 + }, + { + "epoch": 0.43701052631578946, + "grad_norm": 0.431640625, + "learning_rate": 0.00030796341839442156, + "loss": 3.0707, + "step": 10379 + }, + { + "epoch": 0.43705263157894736, + "grad_norm": 0.40234375, + "learning_rate": 0.0003079305930934509, + "loss": 2.8132, + "step": 10380 + }, + { + "epoch": 0.43709473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.0003078977667370529, + "loss": 3.6715, + "step": 10381 + }, + { + "epoch": 0.43713684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.0003078649393258256, + "loss": 2.6121, + "step": 10382 + }, + { + "epoch": 0.43717894736842106, + "grad_norm": 0.42578125, + "learning_rate": 0.0003078321108603671, + "loss": 2.9402, + "step": 10383 + }, + { + "epoch": 0.43722105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.00030779928134127546, + "loss": 2.992, + "step": 10384 + }, + { + "epoch": 0.43726315789473685, + "grad_norm": 0.400390625, + "learning_rate": 0.0003077664507691489, + "loss": 3.2131, + "step": 10385 + }, + { + "epoch": 0.43730526315789475, + "grad_norm": 0.4140625, + "learning_rate": 0.0003077336191445854, + "loss": 3.1198, + "step": 10386 + }, + { + "epoch": 0.43734736842105265, + "grad_norm": 0.40234375, + "learning_rate": 0.0003077007864681833, + "loss": 3.1842, + "step": 10387 + }, + { + "epoch": 0.43738947368421055, + "grad_norm": 0.46484375, + "learning_rate": 0.0003076679527405406, + "loss": 3.1635, + "step": 10388 + }, + { + "epoch": 0.4374315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.0003076351179622556, + "loss": 2.8915, + "step": 10389 + }, + { + "epoch": 0.4374736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0003076022821339264, + "loss": 3.1661, + "step": 10390 + }, + { + "epoch": 0.4375157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00030756944525615133, + "loss": 2.6321, + "step": 10391 + }, + { + "epoch": 0.4375578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.00030753660732952854, + "loss": 3.4021, + "step": 10392 + }, + { + "epoch": 0.4376, + "grad_norm": 0.41015625, + "learning_rate": 0.0003075037683546564, + "loss": 3.1174, + "step": 10393 + }, + { + "epoch": 0.4376421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.00030747092833213323, + "loss": 3.4748, + "step": 10394 + }, + { + "epoch": 0.4376842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.0003074380872625573, + "loss": 3.3705, + "step": 10395 + }, + { + "epoch": 0.4377263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00030740524514652685, + "loss": 3.1499, + "step": 10396 + }, + { + "epoch": 0.4377684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.0003073724019846402, + "loss": 3.216, + "step": 10397 + }, + { + "epoch": 0.4378105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.00030733955777749596, + "loss": 3.2239, + "step": 10398 + }, + { + "epoch": 0.4378526315789474, + "grad_norm": 0.388671875, + "learning_rate": 0.00030730671252569223, + "loss": 3.4278, + "step": 10399 + }, + { + "epoch": 0.4378947368421053, + "grad_norm": 0.5234375, + "learning_rate": 0.00030727386622982756, + "loss": 3.167, + "step": 10400 + }, + { + "epoch": 0.4379368421052632, + "grad_norm": 0.4375, + "learning_rate": 0.00030724101889050036, + "loss": 2.7846, + "step": 10401 + }, + { + "epoch": 0.4379789473684211, + "grad_norm": 0.53515625, + "learning_rate": 0.00030720817050830903, + "loss": 3.8017, + "step": 10402 + }, + { + "epoch": 0.4380210526315789, + "grad_norm": 0.396484375, + "learning_rate": 0.0003071753210838521, + "loss": 2.8716, + "step": 10403 + }, + { + "epoch": 0.4380631578947368, + "grad_norm": 0.462890625, + "learning_rate": 0.0003071424706177279, + "loss": 3.3399, + "step": 10404 + }, + { + "epoch": 0.4381052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.0003071096191105351, + "loss": 3.0516, + "step": 10405 + }, + { + "epoch": 0.4381473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.00030707676656287214, + "loss": 3.1168, + "step": 10406 + }, + { + "epoch": 0.4381894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.00030704391297533754, + "loss": 3.2377, + "step": 10407 + }, + { + "epoch": 0.4382315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.00030701105834852983, + "loss": 3.2906, + "step": 10408 + }, + { + "epoch": 0.4382736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.00030697820268304766, + "loss": 3.0429, + "step": 10409 + }, + { + "epoch": 0.4383157894736842, + "grad_norm": 0.40625, + "learning_rate": 0.00030694534597948956, + "loss": 3.0752, + "step": 10410 + }, + { + "epoch": 0.4383578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.0003069124882384542, + "loss": 2.8575, + "step": 10411 + }, + { + "epoch": 0.4384, + "grad_norm": 0.39453125, + "learning_rate": 0.00030687962946054014, + "loss": 2.9696, + "step": 10412 + }, + { + "epoch": 0.4384421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.0003068467696463461, + "loss": 3.0476, + "step": 10413 + }, + { + "epoch": 0.4384842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.0003068139087964707, + "loss": 3.0782, + "step": 10414 + }, + { + "epoch": 0.4385263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.00030678104691151265, + "loss": 3.5278, + "step": 10415 + }, + { + "epoch": 0.43856842105263155, + "grad_norm": 0.392578125, + "learning_rate": 0.00030674818399207063, + "loss": 3.4843, + "step": 10416 + }, + { + "epoch": 0.43861052631578945, + "grad_norm": 0.421875, + "learning_rate": 0.0003067153200387433, + "loss": 3.2554, + "step": 10417 + }, + { + "epoch": 0.43865263157894735, + "grad_norm": 0.46875, + "learning_rate": 0.00030668245505212957, + "loss": 3.2461, + "step": 10418 + }, + { + "epoch": 0.43869473684210525, + "grad_norm": 0.466796875, + "learning_rate": 0.00030664958903282803, + "loss": 3.4586, + "step": 10419 + }, + { + "epoch": 0.43873684210526315, + "grad_norm": 0.41015625, + "learning_rate": 0.00030661672198143764, + "loss": 3.0523, + "step": 10420 + }, + { + "epoch": 0.43877894736842105, + "grad_norm": 0.439453125, + "learning_rate": 0.00030658385389855704, + "loss": 2.9867, + "step": 10421 + }, + { + "epoch": 0.43882105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.00030655098478478513, + "loss": 2.6462, + "step": 10422 + }, + { + "epoch": 0.43886315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.0003065181146407207, + "loss": 3.4885, + "step": 10423 + }, + { + "epoch": 0.43890526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.0003064852434669626, + "loss": 3.1478, + "step": 10424 + }, + { + "epoch": 0.43894736842105264, + "grad_norm": 0.4453125, + "learning_rate": 0.0003064523712641097, + "loss": 3.1935, + "step": 10425 + }, + { + "epoch": 0.43898947368421054, + "grad_norm": 0.4375, + "learning_rate": 0.00030641949803276097, + "loss": 3.5379, + "step": 10426 + }, + { + "epoch": 0.43903157894736844, + "grad_norm": 0.53515625, + "learning_rate": 0.00030638662377351533, + "loss": 3.1917, + "step": 10427 + }, + { + "epoch": 0.43907368421052634, + "grad_norm": 0.421875, + "learning_rate": 0.0003063537484869716, + "loss": 3.2609, + "step": 10428 + }, + { + "epoch": 0.43911578947368424, + "grad_norm": 0.419921875, + "learning_rate": 0.00030632087217372886, + "loss": 2.9715, + "step": 10429 + }, + { + "epoch": 0.4391578947368421, + "grad_norm": 0.453125, + "learning_rate": 0.00030628799483438596, + "loss": 3.0849, + "step": 10430 + }, + { + "epoch": 0.4392, + "grad_norm": 0.43359375, + "learning_rate": 0.00030625511646954187, + "loss": 3.5909, + "step": 10431 + }, + { + "epoch": 0.4392421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00030622223707979567, + "loss": 2.7738, + "step": 10432 + }, + { + "epoch": 0.4392842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00030618935666574637, + "loss": 3.2497, + "step": 10433 + }, + { + "epoch": 0.4393263157894737, + "grad_norm": 0.455078125, + "learning_rate": 0.00030615647522799305, + "loss": 3.0866, + "step": 10434 + }, + { + "epoch": 0.4393684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.00030612359276713474, + "loss": 3.3239, + "step": 10435 + }, + { + "epoch": 0.4394105263157895, + "grad_norm": 0.4140625, + "learning_rate": 0.0003060907092837705, + "loss": 2.9103, + "step": 10436 + }, + { + "epoch": 0.4394526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.00030605782477849945, + "loss": 2.983, + "step": 10437 + }, + { + "epoch": 0.43949473684210527, + "grad_norm": 0.447265625, + "learning_rate": 0.00030602493925192066, + "loss": 3.053, + "step": 10438 + }, + { + "epoch": 0.43953684210526317, + "grad_norm": 0.4140625, + "learning_rate": 0.0003059920527046333, + "loss": 2.7399, + "step": 10439 + }, + { + "epoch": 0.43957894736842107, + "grad_norm": 0.412109375, + "learning_rate": 0.0003059591651372366, + "loss": 3.0214, + "step": 10440 + }, + { + "epoch": 0.43962105263157897, + "grad_norm": 0.43359375, + "learning_rate": 0.00030592627655032955, + "loss": 2.9279, + "step": 10441 + }, + { + "epoch": 0.43966315789473687, + "grad_norm": 0.43359375, + "learning_rate": 0.00030589338694451154, + "loss": 3.4296, + "step": 10442 + }, + { + "epoch": 0.4397052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.0003058604963203817, + "loss": 2.9419, + "step": 10443 + }, + { + "epoch": 0.4397473684210526, + "grad_norm": 0.40234375, + "learning_rate": 0.00030582760467853917, + "loss": 3.1846, + "step": 10444 + }, + { + "epoch": 0.4397894736842105, + "grad_norm": 0.466796875, + "learning_rate": 0.0003057947120195833, + "loss": 3.07, + "step": 10445 + }, + { + "epoch": 0.4398315789473684, + "grad_norm": 0.390625, + "learning_rate": 0.0003057618183441134, + "loss": 3.4921, + "step": 10446 + }, + { + "epoch": 0.4398736842105263, + "grad_norm": 0.490234375, + "learning_rate": 0.00030572892365272864, + "loss": 3.2273, + "step": 10447 + }, + { + "epoch": 0.4399157894736842, + "grad_norm": 0.453125, + "learning_rate": 0.0003056960279460284, + "loss": 3.3551, + "step": 10448 + }, + { + "epoch": 0.4399578947368421, + "grad_norm": 0.48046875, + "learning_rate": 0.0003056631312246119, + "loss": 3.1826, + "step": 10449 + }, + { + "epoch": 0.44, + "grad_norm": 0.408203125, + "learning_rate": 0.0003056302334890786, + "loss": 2.9921, + "step": 10450 + }, + { + "epoch": 0.4400421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00030559733474002775, + "loss": 3.1729, + "step": 10451 + }, + { + "epoch": 0.4400842105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.00030556443497805887, + "loss": 2.9209, + "step": 10452 + }, + { + "epoch": 0.4401263157894737, + "grad_norm": 0.478515625, + "learning_rate": 0.0003055315342037713, + "loss": 3.624, + "step": 10453 + }, + { + "epoch": 0.4401684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.00030549863241776434, + "loss": 3.2372, + "step": 10454 + }, + { + "epoch": 0.4402105263157895, + "grad_norm": 0.451171875, + "learning_rate": 0.00030546572962063755, + "loss": 3.3846, + "step": 10455 + }, + { + "epoch": 0.4402526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.0003054328258129903, + "loss": 3.2372, + "step": 10456 + }, + { + "epoch": 0.44029473684210524, + "grad_norm": 0.408203125, + "learning_rate": 0.0003053999209954221, + "loss": 3.3276, + "step": 10457 + }, + { + "epoch": 0.44033684210526314, + "grad_norm": 0.427734375, + "learning_rate": 0.0003053670151685324, + "loss": 2.5826, + "step": 10458 + }, + { + "epoch": 0.44037894736842104, + "grad_norm": 0.390625, + "learning_rate": 0.0003053341083329208, + "loss": 2.9943, + "step": 10459 + }, + { + "epoch": 0.44042105263157894, + "grad_norm": 0.4375, + "learning_rate": 0.00030530120048918673, + "loss": 3.1917, + "step": 10460 + }, + { + "epoch": 0.44046315789473683, + "grad_norm": 0.50390625, + "learning_rate": 0.0003052682916379298, + "loss": 3.0646, + "step": 10461 + }, + { + "epoch": 0.44050526315789473, + "grad_norm": 0.41015625, + "learning_rate": 0.0003052353817797495, + "loss": 3.172, + "step": 10462 + }, + { + "epoch": 0.44054736842105263, + "grad_norm": 0.44921875, + "learning_rate": 0.0003052024709152455, + "loss": 3.1724, + "step": 10463 + }, + { + "epoch": 0.44058947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00030516955904501725, + "loss": 2.8461, + "step": 10464 + }, + { + "epoch": 0.44063157894736843, + "grad_norm": 0.419921875, + "learning_rate": 0.0003051366461696645, + "loss": 3.3684, + "step": 10465 + }, + { + "epoch": 0.44067368421052633, + "grad_norm": 0.4140625, + "learning_rate": 0.00030510373228978684, + "loss": 3.4937, + "step": 10466 + }, + { + "epoch": 0.4407157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00030507081740598383, + "loss": 2.8291, + "step": 10467 + }, + { + "epoch": 0.4407578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.0003050379015188554, + "loss": 3.2368, + "step": 10468 + }, + { + "epoch": 0.4408, + "grad_norm": 0.44140625, + "learning_rate": 0.000305004984629001, + "loss": 3.3286, + "step": 10469 + }, + { + "epoch": 0.44084210526315787, + "grad_norm": 0.52734375, + "learning_rate": 0.0003049720667370204, + "loss": 3.53, + "step": 10470 + }, + { + "epoch": 0.44088421052631577, + "grad_norm": 0.4765625, + "learning_rate": 0.0003049391478435133, + "loss": 3.0017, + "step": 10471 + }, + { + "epoch": 0.44092631578947367, + "grad_norm": 0.412109375, + "learning_rate": 0.0003049062279490795, + "loss": 2.8856, + "step": 10472 + }, + { + "epoch": 0.44096842105263157, + "grad_norm": 0.455078125, + "learning_rate": 0.0003048733070543188, + "loss": 3.7096, + "step": 10473 + }, + { + "epoch": 0.44101052631578946, + "grad_norm": 0.439453125, + "learning_rate": 0.00030484038515983086, + "loss": 3.3101, + "step": 10474 + }, + { + "epoch": 0.44105263157894736, + "grad_norm": 0.466796875, + "learning_rate": 0.0003048074622662155, + "loss": 3.4182, + "step": 10475 + }, + { + "epoch": 0.44109473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.00030477453837407265, + "loss": 2.9342, + "step": 10476 + }, + { + "epoch": 0.44113684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.000304741613484002, + "loss": 3.3998, + "step": 10477 + }, + { + "epoch": 0.44117894736842106, + "grad_norm": 0.41796875, + "learning_rate": 0.0003047086875966035, + "loss": 3.5417, + "step": 10478 + }, + { + "epoch": 0.44122105263157896, + "grad_norm": 0.6796875, + "learning_rate": 0.000304675760712477, + "loss": 3.2562, + "step": 10479 + }, + { + "epoch": 0.44126315789473686, + "grad_norm": 0.423828125, + "learning_rate": 0.00030464283283222244, + "loss": 3.4119, + "step": 10480 + }, + { + "epoch": 0.44130526315789476, + "grad_norm": 0.43359375, + "learning_rate": 0.0003046099039564396, + "loss": 3.4781, + "step": 10481 + }, + { + "epoch": 0.44134736842105265, + "grad_norm": 0.4140625, + "learning_rate": 0.0003045769740857285, + "loss": 2.7657, + "step": 10482 + }, + { + "epoch": 0.44138947368421055, + "grad_norm": 0.4140625, + "learning_rate": 0.00030454404322068903, + "loss": 3.1645, + "step": 10483 + }, + { + "epoch": 0.4414315789473684, + "grad_norm": 0.462890625, + "learning_rate": 0.0003045111113619212, + "loss": 3.3728, + "step": 10484 + }, + { + "epoch": 0.4414736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.0003044781785100249, + "loss": 2.675, + "step": 10485 + }, + { + "epoch": 0.4415157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.00030444524466560026, + "loss": 3.2005, + "step": 10486 + }, + { + "epoch": 0.4415578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00030441230982924727, + "loss": 3.0171, + "step": 10487 + }, + { + "epoch": 0.4416, + "grad_norm": 0.40625, + "learning_rate": 0.0003043793740015659, + "loss": 3.5587, + "step": 10488 + }, + { + "epoch": 0.4416421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0003043464371831562, + "loss": 2.7387, + "step": 10489 + }, + { + "epoch": 0.4416842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00030431349937461833, + "loss": 3.2309, + "step": 10490 + }, + { + "epoch": 0.4417263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00030428056057655217, + "loss": 3.0492, + "step": 10491 + }, + { + "epoch": 0.4417684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.0003042476207895581, + "loss": 3.1302, + "step": 10492 + }, + { + "epoch": 0.4418105263157895, + "grad_norm": 0.44921875, + "learning_rate": 0.0003042146800142361, + "loss": 3.1055, + "step": 10493 + }, + { + "epoch": 0.4418526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00030418173825118636, + "loss": 3.2716, + "step": 10494 + }, + { + "epoch": 0.4418947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.00030414879550100894, + "loss": 3.227, + "step": 10495 + }, + { + "epoch": 0.4419368421052632, + "grad_norm": 0.4140625, + "learning_rate": 0.0003041158517643041, + "loss": 2.7149, + "step": 10496 + }, + { + "epoch": 0.441978947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0003040829070416721, + "loss": 3.0994, + "step": 10497 + }, + { + "epoch": 0.4420210526315789, + "grad_norm": 0.5, + "learning_rate": 0.00030404996133371297, + "loss": 2.8689, + "step": 10498 + }, + { + "epoch": 0.4420631578947368, + "grad_norm": 0.423828125, + "learning_rate": 0.0003040170146410271, + "loss": 3.4841, + "step": 10499 + }, + { + "epoch": 0.4421052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.0003039840669642147, + "loss": 3.2117, + "step": 10500 + }, + { + "epoch": 0.4421473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.00030395111830387603, + "loss": 3.2922, + "step": 10501 + }, + { + "epoch": 0.4421894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00030391816866061133, + "loss": 3.6952, + "step": 10502 + }, + { + "epoch": 0.4422315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.000303885218035021, + "loss": 2.9978, + "step": 10503 + }, + { + "epoch": 0.4422736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.0003038522664277054, + "loss": 2.8767, + "step": 10504 + }, + { + "epoch": 0.4423157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00030381931383926457, + "loss": 3.0088, + "step": 10505 + }, + { + "epoch": 0.4423578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.0003037863602702992, + "loss": 3.1206, + "step": 10506 + }, + { + "epoch": 0.4424, + "grad_norm": 0.423828125, + "learning_rate": 0.0003037534057214095, + "loss": 3.1215, + "step": 10507 + }, + { + "epoch": 0.4424421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.0003037204501931959, + "loss": 2.668, + "step": 10508 + }, + { + "epoch": 0.4424842105263158, + "grad_norm": 0.451171875, + "learning_rate": 0.0003036874936862589, + "loss": 3.3978, + "step": 10509 + }, + { + "epoch": 0.44252631578947366, + "grad_norm": 0.3984375, + "learning_rate": 0.0003036545362011988, + "loss": 3.4685, + "step": 10510 + }, + { + "epoch": 0.44256842105263156, + "grad_norm": 0.41015625, + "learning_rate": 0.0003036215777386161, + "loss": 3.304, + "step": 10511 + }, + { + "epoch": 0.44261052631578945, + "grad_norm": 0.421875, + "learning_rate": 0.00030358861829911116, + "loss": 3.2584, + "step": 10512 + }, + { + "epoch": 0.44265263157894735, + "grad_norm": 0.408203125, + "learning_rate": 0.00030355565788328464, + "loss": 2.985, + "step": 10513 + }, + { + "epoch": 0.44269473684210525, + "grad_norm": 0.427734375, + "learning_rate": 0.000303522696491737, + "loss": 2.9167, + "step": 10514 + }, + { + "epoch": 0.44273684210526315, + "grad_norm": 0.4296875, + "learning_rate": 0.00030348973412506867, + "loss": 2.9929, + "step": 10515 + }, + { + "epoch": 0.44277894736842105, + "grad_norm": 0.408203125, + "learning_rate": 0.0003034567707838802, + "loss": 3.148, + "step": 10516 + }, + { + "epoch": 0.44282105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.00030342380646877223, + "loss": 3.4823, + "step": 10517 + }, + { + "epoch": 0.44286315789473685, + "grad_norm": 0.4453125, + "learning_rate": 0.0003033908411803452, + "loss": 2.7543, + "step": 10518 + }, + { + "epoch": 0.44290526315789475, + "grad_norm": 0.439453125, + "learning_rate": 0.00030335787491919984, + "loss": 2.9379, + "step": 10519 + }, + { + "epoch": 0.44294736842105265, + "grad_norm": 0.4140625, + "learning_rate": 0.0003033249076859367, + "loss": 3.2164, + "step": 10520 + }, + { + "epoch": 0.44298947368421054, + "grad_norm": 0.408203125, + "learning_rate": 0.0003032919394811564, + "loss": 3.3342, + "step": 10521 + }, + { + "epoch": 0.44303157894736844, + "grad_norm": 0.40234375, + "learning_rate": 0.00030325897030545956, + "loss": 3.1205, + "step": 10522 + }, + { + "epoch": 0.44307368421052634, + "grad_norm": 0.41796875, + "learning_rate": 0.0003032260001594469, + "loss": 3.2605, + "step": 10523 + }, + { + "epoch": 0.4431157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00030319302904371903, + "loss": 3.0136, + "step": 10524 + }, + { + "epoch": 0.4431578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.0003031600569588766, + "loss": 3.4244, + "step": 10525 + }, + { + "epoch": 0.4432, + "grad_norm": 0.44921875, + "learning_rate": 0.00030312708390552043, + "loss": 2.7623, + "step": 10526 + }, + { + "epoch": 0.4432421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00030309410988425123, + "loss": 3.2121, + "step": 10527 + }, + { + "epoch": 0.4432842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.0003030611348956697, + "loss": 2.9423, + "step": 10528 + }, + { + "epoch": 0.4433263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.0003030281589403767, + "loss": 3.3202, + "step": 10529 + }, + { + "epoch": 0.4433684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.00030299518201897294, + "loss": 3.3964, + "step": 10530 + }, + { + "epoch": 0.4434105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.00030296220413205923, + "loss": 3.1689, + "step": 10531 + }, + { + "epoch": 0.4434526315789474, + "grad_norm": 0.404296875, + "learning_rate": 0.0003029292252802363, + "loss": 2.8913, + "step": 10532 + }, + { + "epoch": 0.4434947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00030289624546410513, + "loss": 2.8485, + "step": 10533 + }, + { + "epoch": 0.4435368421052632, + "grad_norm": 0.41015625, + "learning_rate": 0.0003028632646842665, + "loss": 3.3953, + "step": 10534 + }, + { + "epoch": 0.4435789473684211, + "grad_norm": 0.41015625, + "learning_rate": 0.00030283028294132133, + "loss": 3.6729, + "step": 10535 + }, + { + "epoch": 0.44362105263157897, + "grad_norm": 0.5703125, + "learning_rate": 0.0003027973002358705, + "loss": 3.0697, + "step": 10536 + }, + { + "epoch": 0.4436631578947368, + "grad_norm": 0.427734375, + "learning_rate": 0.0003027643165685149, + "loss": 3.0972, + "step": 10537 + }, + { + "epoch": 0.4437052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.00030273133193985536, + "loss": 3.3721, + "step": 10538 + }, + { + "epoch": 0.4437473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.000302698346350493, + "loss": 3.105, + "step": 10539 + }, + { + "epoch": 0.4437894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.00030266535980102863, + "loss": 3.3508, + "step": 10540 + }, + { + "epoch": 0.4438315789473684, + "grad_norm": 0.419921875, + "learning_rate": 0.00030263237229206333, + "loss": 3.4048, + "step": 10541 + }, + { + "epoch": 0.4438736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.0003025993838241979, + "loss": 3.4544, + "step": 10542 + }, + { + "epoch": 0.4439157894736842, + "grad_norm": 0.515625, + "learning_rate": 0.0003025663943980337, + "loss": 2.9386, + "step": 10543 + }, + { + "epoch": 0.4439578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0003025334040141715, + "loss": 3.4232, + "step": 10544 + }, + { + "epoch": 0.444, + "grad_norm": 0.46484375, + "learning_rate": 0.0003025004126732123, + "loss": 3.1231, + "step": 10545 + }, + { + "epoch": 0.4440421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0003024674203757574, + "loss": 3.5238, + "step": 10546 + }, + { + "epoch": 0.4440842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.0003024344271224077, + "loss": 3.3495, + "step": 10547 + }, + { + "epoch": 0.4441263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.0003024014329137644, + "loss": 2.7878, + "step": 10548 + }, + { + "epoch": 0.4441684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.0003023684377504284, + "loss": 3.5051, + "step": 10549 + }, + { + "epoch": 0.4442105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.00030233544163300115, + "loss": 3.1932, + "step": 10550 + }, + { + "epoch": 0.44425263157894734, + "grad_norm": 0.408203125, + "learning_rate": 0.00030230244456208355, + "loss": 3.1534, + "step": 10551 + }, + { + "epoch": 0.44429473684210524, + "grad_norm": 0.47265625, + "learning_rate": 0.00030226944653827687, + "loss": 3.0033, + "step": 10552 + }, + { + "epoch": 0.44433684210526314, + "grad_norm": 0.435546875, + "learning_rate": 0.0003022364475621824, + "loss": 2.9797, + "step": 10553 + }, + { + "epoch": 0.44437894736842104, + "grad_norm": 0.423828125, + "learning_rate": 0.0003022034476344011, + "loss": 3.2166, + "step": 10554 + }, + { + "epoch": 0.44442105263157894, + "grad_norm": 0.3984375, + "learning_rate": 0.0003021704467555344, + "loss": 2.9338, + "step": 10555 + }, + { + "epoch": 0.44446315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.00030213744492618347, + "loss": 3.3273, + "step": 10556 + }, + { + "epoch": 0.44450526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.0003021044421469495, + "loss": 3.1894, + "step": 10557 + }, + { + "epoch": 0.44454736842105264, + "grad_norm": 0.447265625, + "learning_rate": 0.00030207143841843383, + "loss": 2.7169, + "step": 10558 + }, + { + "epoch": 0.44458947368421053, + "grad_norm": 0.416015625, + "learning_rate": 0.0003020384337412377, + "loss": 3.4001, + "step": 10559 + }, + { + "epoch": 0.44463157894736843, + "grad_norm": 0.416015625, + "learning_rate": 0.00030200542811596243, + "loss": 2.7693, + "step": 10560 + }, + { + "epoch": 0.44467368421052633, + "grad_norm": 0.427734375, + "learning_rate": 0.0003019724215432095, + "loss": 2.4381, + "step": 10561 + }, + { + "epoch": 0.44471578947368423, + "grad_norm": 0.4296875, + "learning_rate": 0.00030193941402358, + "loss": 3.337, + "step": 10562 + }, + { + "epoch": 0.44475789473684213, + "grad_norm": 0.43359375, + "learning_rate": 0.00030190640555767545, + "loss": 3.365, + "step": 10563 + }, + { + "epoch": 0.4448, + "grad_norm": 0.421875, + "learning_rate": 0.0003018733961460972, + "loss": 3.04, + "step": 10564 + }, + { + "epoch": 0.44484210526315787, + "grad_norm": 0.404296875, + "learning_rate": 0.0003018403857894465, + "loss": 3.2255, + "step": 10565 + }, + { + "epoch": 0.44488421052631577, + "grad_norm": 0.408203125, + "learning_rate": 0.00030180737448832497, + "loss": 2.8701, + "step": 10566 + }, + { + "epoch": 0.44492631578947367, + "grad_norm": 0.423828125, + "learning_rate": 0.000301774362243334, + "loss": 3.2432, + "step": 10567 + }, + { + "epoch": 0.44496842105263157, + "grad_norm": 0.421875, + "learning_rate": 0.00030174134905507484, + "loss": 3.2066, + "step": 10568 + }, + { + "epoch": 0.44501052631578947, + "grad_norm": 0.45703125, + "learning_rate": 0.00030170833492414916, + "loss": 2.7211, + "step": 10569 + }, + { + "epoch": 0.44505263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0003016753198511584, + "loss": 3.3144, + "step": 10570 + }, + { + "epoch": 0.44509473684210527, + "grad_norm": 0.4296875, + "learning_rate": 0.0003016423038367041, + "loss": 3.4753, + "step": 10571 + }, + { + "epoch": 0.44513684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.00030160928688138756, + "loss": 3.5059, + "step": 10572 + }, + { + "epoch": 0.44517894736842106, + "grad_norm": 0.484375, + "learning_rate": 0.00030157626898581053, + "loss": 2.8016, + "step": 10573 + }, + { + "epoch": 0.44522105263157896, + "grad_norm": 0.443359375, + "learning_rate": 0.00030154325015057444, + "loss": 2.814, + "step": 10574 + }, + { + "epoch": 0.44526315789473686, + "grad_norm": 0.4609375, + "learning_rate": 0.00030151023037628094, + "loss": 2.9643, + "step": 10575 + }, + { + "epoch": 0.44530526315789476, + "grad_norm": 0.451171875, + "learning_rate": 0.00030147720966353146, + "loss": 2.9215, + "step": 10576 + }, + { + "epoch": 0.44534736842105266, + "grad_norm": 0.4296875, + "learning_rate": 0.0003014441880129278, + "loss": 3.2176, + "step": 10577 + }, + { + "epoch": 0.4453894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0003014111654250715, + "loss": 3.0316, + "step": 10578 + }, + { + "epoch": 0.4454315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.0003013781419005641, + "loss": 3.4477, + "step": 10579 + }, + { + "epoch": 0.4454736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.0003013451174400074, + "loss": 2.8588, + "step": 10580 + }, + { + "epoch": 0.4455157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.0003013120920440029, + "loss": 3.1239, + "step": 10581 + }, + { + "epoch": 0.4455578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0003012790657131524, + "loss": 3.2689, + "step": 10582 + }, + { + "epoch": 0.4456, + "grad_norm": 0.4375, + "learning_rate": 0.00030124603844805764, + "loss": 3.191, + "step": 10583 + }, + { + "epoch": 0.4456421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.0003012130102493202, + "loss": 3.0597, + "step": 10584 + }, + { + "epoch": 0.4456842105263158, + "grad_norm": 0.39453125, + "learning_rate": 0.00030117998111754186, + "loss": 3.0704, + "step": 10585 + }, + { + "epoch": 0.4457263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.0003011469510533244, + "loss": 3.1581, + "step": 10586 + }, + { + "epoch": 0.4457684210526316, + "grad_norm": 0.396484375, + "learning_rate": 0.00030111392005726967, + "loss": 3.2039, + "step": 10587 + }, + { + "epoch": 0.4458105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.00030108088812997935, + "loss": 3.1708, + "step": 10588 + }, + { + "epoch": 0.4458526315789474, + "grad_norm": 0.4140625, + "learning_rate": 0.00030104785527205525, + "loss": 3.0177, + "step": 10589 + }, + { + "epoch": 0.4458947368421053, + "grad_norm": 0.41796875, + "learning_rate": 0.0003010148214840992, + "loss": 3.4261, + "step": 10590 + }, + { + "epoch": 0.44593684210526313, + "grad_norm": 0.43359375, + "learning_rate": 0.00030098178676671306, + "loss": 2.9311, + "step": 10591 + }, + { + "epoch": 0.44597894736842103, + "grad_norm": 0.435546875, + "learning_rate": 0.00030094875112049864, + "loss": 2.8207, + "step": 10592 + }, + { + "epoch": 0.44602105263157893, + "grad_norm": 0.4140625, + "learning_rate": 0.00030091571454605775, + "loss": 3.2118, + "step": 10593 + }, + { + "epoch": 0.44606315789473683, + "grad_norm": 0.439453125, + "learning_rate": 0.0003008826770439925, + "loss": 3.1249, + "step": 10594 + }, + { + "epoch": 0.4461052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.00030084963861490467, + "loss": 2.8836, + "step": 10595 + }, + { + "epoch": 0.4461473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.0003008165992593961, + "loss": 3.1556, + "step": 10596 + }, + { + "epoch": 0.4461894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.00030078355897806885, + "loss": 3.0289, + "step": 10597 + }, + { + "epoch": 0.4462315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.0003007505177715248, + "loss": 2.9401, + "step": 10598 + }, + { + "epoch": 0.4462736842105263, + "grad_norm": 0.400390625, + "learning_rate": 0.0003007174756403659, + "loss": 2.8606, + "step": 10599 + }, + { + "epoch": 0.4463157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.00030068443258519424, + "loss": 3.4791, + "step": 10600 + }, + { + "epoch": 0.4463578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00030065138860661166, + "loss": 2.4884, + "step": 10601 + }, + { + "epoch": 0.4464, + "grad_norm": 0.40234375, + "learning_rate": 0.0003006183437052204, + "loss": 2.8169, + "step": 10602 + }, + { + "epoch": 0.4464421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00030058529788162244, + "loss": 3.2385, + "step": 10603 + }, + { + "epoch": 0.4464842105263158, + "grad_norm": 0.396484375, + "learning_rate": 0.0003005522511364197, + "loss": 2.9201, + "step": 10604 + }, + { + "epoch": 0.44652631578947366, + "grad_norm": 0.408203125, + "learning_rate": 0.00030051920347021443, + "loss": 3.504, + "step": 10605 + }, + { + "epoch": 0.44656842105263156, + "grad_norm": 0.400390625, + "learning_rate": 0.0003004861548836085, + "loss": 2.9129, + "step": 10606 + }, + { + "epoch": 0.44661052631578946, + "grad_norm": 0.455078125, + "learning_rate": 0.0003004531053772042, + "loss": 3.5747, + "step": 10607 + }, + { + "epoch": 0.44665263157894736, + "grad_norm": 0.486328125, + "learning_rate": 0.0003004200549516037, + "loss": 2.9357, + "step": 10608 + }, + { + "epoch": 0.44669473684210526, + "grad_norm": 0.400390625, + "learning_rate": 0.0003003870036074089, + "loss": 3.269, + "step": 10609 + }, + { + "epoch": 0.44673684210526315, + "grad_norm": 0.396484375, + "learning_rate": 0.00030035395134522217, + "loss": 2.9713, + "step": 10610 + }, + { + "epoch": 0.44677894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.00030032089816564556, + "loss": 2.8832, + "step": 10611 + }, + { + "epoch": 0.44682105263157895, + "grad_norm": 0.404296875, + "learning_rate": 0.0003002878440692813, + "loss": 2.9304, + "step": 10612 + }, + { + "epoch": 0.44686315789473685, + "grad_norm": 0.40234375, + "learning_rate": 0.00030025478905673164, + "loss": 2.9484, + "step": 10613 + }, + { + "epoch": 0.44690526315789475, + "grad_norm": 0.41015625, + "learning_rate": 0.0003002217331285988, + "loss": 2.9904, + "step": 10614 + }, + { + "epoch": 0.44694736842105265, + "grad_norm": 0.46875, + "learning_rate": 0.00030018867628548503, + "loss": 2.9531, + "step": 10615 + }, + { + "epoch": 0.44698947368421055, + "grad_norm": 0.4453125, + "learning_rate": 0.00030015561852799247, + "loss": 2.8714, + "step": 10616 + }, + { + "epoch": 0.44703157894736845, + "grad_norm": 0.40234375, + "learning_rate": 0.0003001225598567234, + "loss": 3.3532, + "step": 10617 + }, + { + "epoch": 0.4470736842105263, + "grad_norm": 0.6640625, + "learning_rate": 0.0003000895002722803, + "loss": 3.5382, + "step": 10618 + }, + { + "epoch": 0.4471157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0003000564397752653, + "loss": 3.2694, + "step": 10619 + }, + { + "epoch": 0.4471578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0003000233783662808, + "loss": 2.919, + "step": 10620 + }, + { + "epoch": 0.4472, + "grad_norm": 0.4921875, + "learning_rate": 0.0002999903160459292, + "loss": 3.5945, + "step": 10621 + }, + { + "epoch": 0.4472421052631579, + "grad_norm": 0.453125, + "learning_rate": 0.00029995725281481265, + "loss": 3.4309, + "step": 10622 + }, + { + "epoch": 0.4472842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.0002999241886735338, + "loss": 3.058, + "step": 10623 + }, + { + "epoch": 0.4473263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.00029989112362269476, + "loss": 3.1303, + "step": 10624 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 0.44140625, + "learning_rate": 0.0002998580576628981, + "loss": 3.2856, + "step": 10625 + }, + { + "epoch": 0.4474105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.0002998249907947462, + "loss": 2.9981, + "step": 10626 + }, + { + "epoch": 0.4474526315789474, + "grad_norm": 0.3984375, + "learning_rate": 0.0002997919230188415, + "loss": 3.149, + "step": 10627 + }, + { + "epoch": 0.4474947368421053, + "grad_norm": 0.4296875, + "learning_rate": 0.0002997588543357865, + "loss": 3.0835, + "step": 10628 + }, + { + "epoch": 0.4475368421052632, + "grad_norm": 0.39453125, + "learning_rate": 0.0002997257847461836, + "loss": 2.7491, + "step": 10629 + }, + { + "epoch": 0.4475789473684211, + "grad_norm": 0.435546875, + "learning_rate": 0.00029969271425063536, + "loss": 2.8589, + "step": 10630 + }, + { + "epoch": 0.447621052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0002996596428497443, + "loss": 3.4192, + "step": 10631 + }, + { + "epoch": 0.4476631578947368, + "grad_norm": 0.416015625, + "learning_rate": 0.00029962657054411287, + "loss": 3.8253, + "step": 10632 + }, + { + "epoch": 0.4477052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.00029959349733434354, + "loss": 2.9984, + "step": 10633 + }, + { + "epoch": 0.4477473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.00029956042322103904, + "loss": 3.6453, + "step": 10634 + }, + { + "epoch": 0.4477894736842105, + "grad_norm": 0.45703125, + "learning_rate": 0.0002995273482048018, + "loss": 2.9385, + "step": 10635 + }, + { + "epoch": 0.4478315789473684, + "grad_norm": 0.4609375, + "learning_rate": 0.00029949427228623455, + "loss": 3.4606, + "step": 10636 + }, + { + "epoch": 0.4478736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00029946119546593976, + "loss": 3.5396, + "step": 10637 + }, + { + "epoch": 0.4479157894736842, + "grad_norm": 0.73828125, + "learning_rate": 0.00029942811774452015, + "loss": 3.3423, + "step": 10638 + }, + { + "epoch": 0.4479578947368421, + "grad_norm": 0.478515625, + "learning_rate": 0.0002993950391225782, + "loss": 3.1435, + "step": 10639 + }, + { + "epoch": 0.448, + "grad_norm": 0.4453125, + "learning_rate": 0.00029936195960071675, + "loss": 3.3942, + "step": 10640 + }, + { + "epoch": 0.4480421052631579, + "grad_norm": 0.46875, + "learning_rate": 0.00029932887917953843, + "loss": 3.5683, + "step": 10641 + }, + { + "epoch": 0.4480842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.0002992957978596459, + "loss": 3.2275, + "step": 10642 + }, + { + "epoch": 0.4481263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00029926271564164177, + "loss": 3.4429, + "step": 10643 + }, + { + "epoch": 0.4481684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.0002992296325261289, + "loss": 3.3622, + "step": 10644 + }, + { + "epoch": 0.44821052631578945, + "grad_norm": 0.39453125, + "learning_rate": 0.00029919654851371, + "loss": 2.8307, + "step": 10645 + }, + { + "epoch": 0.44825263157894735, + "grad_norm": 0.466796875, + "learning_rate": 0.00029916346360498765, + "loss": 3.1204, + "step": 10646 + }, + { + "epoch": 0.44829473684210525, + "grad_norm": 0.392578125, + "learning_rate": 0.00029913037780056486, + "loss": 2.8019, + "step": 10647 + }, + { + "epoch": 0.44833684210526314, + "grad_norm": 0.41015625, + "learning_rate": 0.0002990972911010443, + "loss": 2.9142, + "step": 10648 + }, + { + "epoch": 0.44837894736842104, + "grad_norm": 0.388671875, + "learning_rate": 0.0002990642035070288, + "loss": 2.553, + "step": 10649 + }, + { + "epoch": 0.44842105263157894, + "grad_norm": 0.45703125, + "learning_rate": 0.0002990311150191212, + "loss": 3.2851, + "step": 10650 + }, + { + "epoch": 0.44846315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00029899802563792416, + "loss": 2.9604, + "step": 10651 + }, + { + "epoch": 0.44850526315789474, + "grad_norm": 0.466796875, + "learning_rate": 0.0002989649353640408, + "loss": 3.1207, + "step": 10652 + }, + { + "epoch": 0.44854736842105264, + "grad_norm": 0.4296875, + "learning_rate": 0.00029893184419807375, + "loss": 3.0314, + "step": 10653 + }, + { + "epoch": 0.44858947368421054, + "grad_norm": 0.443359375, + "learning_rate": 0.000298898752140626, + "loss": 3.2684, + "step": 10654 + }, + { + "epoch": 0.44863157894736844, + "grad_norm": 0.431640625, + "learning_rate": 0.00029886565919230046, + "loss": 3.284, + "step": 10655 + }, + { + "epoch": 0.44867368421052634, + "grad_norm": 0.4375, + "learning_rate": 0.00029883256535370007, + "loss": 3.0025, + "step": 10656 + }, + { + "epoch": 0.44871578947368423, + "grad_norm": 0.4140625, + "learning_rate": 0.00029879947062542766, + "loss": 3.0816, + "step": 10657 + }, + { + "epoch": 0.4487578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0002987663750080863, + "loss": 3.3353, + "step": 10658 + }, + { + "epoch": 0.4488, + "grad_norm": 0.45703125, + "learning_rate": 0.00029873327850227883, + "loss": 3.4437, + "step": 10659 + }, + { + "epoch": 0.4488421052631579, + "grad_norm": 0.400390625, + "learning_rate": 0.0002987001811086083, + "loss": 3.45, + "step": 10660 + }, + { + "epoch": 0.4488842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.0002986670828276776, + "loss": 3.1667, + "step": 10661 + }, + { + "epoch": 0.4489263157894737, + "grad_norm": 0.40234375, + "learning_rate": 0.00029863398366008993, + "loss": 2.6959, + "step": 10662 + }, + { + "epoch": 0.44896842105263157, + "grad_norm": 0.431640625, + "learning_rate": 0.0002986008836064482, + "loss": 3.1509, + "step": 10663 + }, + { + "epoch": 0.44901052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.00029856778266735553, + "loss": 3.3621, + "step": 10664 + }, + { + "epoch": 0.44905263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.0002985346808434149, + "loss": 2.951, + "step": 10665 + }, + { + "epoch": 0.44909473684210527, + "grad_norm": 0.419921875, + "learning_rate": 0.0002985015781352293, + "loss": 2.5951, + "step": 10666 + }, + { + "epoch": 0.44913684210526317, + "grad_norm": 0.40625, + "learning_rate": 0.0002984684745434021, + "loss": 3.3283, + "step": 10667 + }, + { + "epoch": 0.44917894736842107, + "grad_norm": 0.40234375, + "learning_rate": 0.00029843537006853614, + "loss": 3.091, + "step": 10668 + }, + { + "epoch": 0.44922105263157897, + "grad_norm": 0.40625, + "learning_rate": 0.0002984022647112346, + "loss": 3.0417, + "step": 10669 + }, + { + "epoch": 0.44926315789473686, + "grad_norm": 0.416015625, + "learning_rate": 0.0002983691584721008, + "loss": 2.6233, + "step": 10670 + }, + { + "epoch": 0.44930526315789476, + "grad_norm": 0.435546875, + "learning_rate": 0.0002983360513517377, + "loss": 3.0433, + "step": 10671 + }, + { + "epoch": 0.4493473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.00029830294335074855, + "loss": 3.4448, + "step": 10672 + }, + { + "epoch": 0.4493894736842105, + "grad_norm": 0.482421875, + "learning_rate": 0.00029826983446973646, + "loss": 2.9305, + "step": 10673 + }, + { + "epoch": 0.4494315789473684, + "grad_norm": 0.4140625, + "learning_rate": 0.00029823672470930473, + "loss": 3.1799, + "step": 10674 + }, + { + "epoch": 0.4494736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0002982036140700566, + "loss": 3.7231, + "step": 10675 + }, + { + "epoch": 0.4495157894736842, + "grad_norm": 0.40234375, + "learning_rate": 0.00029817050255259524, + "loss": 3.5226, + "step": 10676 + }, + { + "epoch": 0.4495578947368421, + "grad_norm": 0.494140625, + "learning_rate": 0.0002981373901575239, + "loss": 2.7976, + "step": 10677 + }, + { + "epoch": 0.4496, + "grad_norm": 0.42578125, + "learning_rate": 0.0002981042768854459, + "loss": 3.3463, + "step": 10678 + }, + { + "epoch": 0.4496421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.0002980711627369645, + "loss": 3.0226, + "step": 10679 + }, + { + "epoch": 0.4496842105263158, + "grad_norm": 0.396484375, + "learning_rate": 0.0002980380477126829, + "loss": 3.0923, + "step": 10680 + }, + { + "epoch": 0.4497263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.0002980049318132046, + "loss": 2.811, + "step": 10681 + }, + { + "epoch": 0.4497684210526316, + "grad_norm": 0.40234375, + "learning_rate": 0.0002979718150391328, + "loss": 2.9547, + "step": 10682 + }, + { + "epoch": 0.4498105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.000297938697391071, + "loss": 3.3961, + "step": 10683 + }, + { + "epoch": 0.4498526315789474, + "grad_norm": 0.4609375, + "learning_rate": 0.00029790557886962243, + "loss": 3.0942, + "step": 10684 + }, + { + "epoch": 0.44989473684210524, + "grad_norm": 0.44140625, + "learning_rate": 0.0002978724594753904, + "loss": 2.9861, + "step": 10685 + }, + { + "epoch": 0.44993684210526314, + "grad_norm": 0.46484375, + "learning_rate": 0.00029783933920897844, + "loss": 2.6441, + "step": 10686 + }, + { + "epoch": 0.44997894736842103, + "grad_norm": 0.423828125, + "learning_rate": 0.00029780621807098994, + "loss": 3.4641, + "step": 10687 + }, + { + "epoch": 0.45002105263157893, + "grad_norm": 0.40234375, + "learning_rate": 0.0002977730960620283, + "loss": 3.2242, + "step": 10688 + }, + { + "epoch": 0.45006315789473683, + "grad_norm": 0.416015625, + "learning_rate": 0.00029773997318269707, + "loss": 3.0055, + "step": 10689 + }, + { + "epoch": 0.45010526315789473, + "grad_norm": 0.408203125, + "learning_rate": 0.0002977068494335996, + "loss": 2.9299, + "step": 10690 + }, + { + "epoch": 0.45014736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00029767372481533935, + "loss": 3.3921, + "step": 10691 + }, + { + "epoch": 0.45018947368421053, + "grad_norm": 0.3984375, + "learning_rate": 0.0002976405993285198, + "loss": 3.4299, + "step": 10692 + }, + { + "epoch": 0.4502315789473684, + "grad_norm": 0.419921875, + "learning_rate": 0.00029760747297374456, + "loss": 2.9678, + "step": 10693 + }, + { + "epoch": 0.4502736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.000297574345751617, + "loss": 3.2219, + "step": 10694 + }, + { + "epoch": 0.4503157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0002975412176627409, + "loss": 3.0976, + "step": 10695 + }, + { + "epoch": 0.4503578947368421, + "grad_norm": 0.50390625, + "learning_rate": 0.0002975080887077196, + "loss": 3.122, + "step": 10696 + }, + { + "epoch": 0.4504, + "grad_norm": 0.41796875, + "learning_rate": 0.0002974749588871568, + "loss": 3.5867, + "step": 10697 + }, + { + "epoch": 0.4504421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.000297441828201656, + "loss": 3.2347, + "step": 10698 + }, + { + "epoch": 0.45048421052631576, + "grad_norm": 0.419921875, + "learning_rate": 0.00029740869665182085, + "loss": 3.4858, + "step": 10699 + }, + { + "epoch": 0.45052631578947366, + "grad_norm": 0.423828125, + "learning_rate": 0.0002973755642382549, + "loss": 3.23, + "step": 10700 + }, + { + "epoch": 0.45056842105263156, + "grad_norm": 0.4609375, + "learning_rate": 0.0002973424309615619, + "loss": 2.8876, + "step": 10701 + }, + { + "epoch": 0.45061052631578946, + "grad_norm": 0.443359375, + "learning_rate": 0.0002973092968223453, + "loss": 2.9289, + "step": 10702 + }, + { + "epoch": 0.45065263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.000297276161821209, + "loss": 3.0171, + "step": 10703 + }, + { + "epoch": 0.45069473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.0002972430259587566, + "loss": 3.3216, + "step": 10704 + }, + { + "epoch": 0.45073684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.0002972098892355918, + "loss": 3.217, + "step": 10705 + }, + { + "epoch": 0.45077894736842106, + "grad_norm": 0.423828125, + "learning_rate": 0.0002971767516523182, + "loss": 2.7713, + "step": 10706 + }, + { + "epoch": 0.45082105263157896, + "grad_norm": 0.443359375, + "learning_rate": 0.00029714361320953966, + "loss": 2.9664, + "step": 10707 + }, + { + "epoch": 0.45086315789473685, + "grad_norm": 0.4375, + "learning_rate": 0.00029711047390785985, + "loss": 3.1795, + "step": 10708 + }, + { + "epoch": 0.45090526315789475, + "grad_norm": 0.4140625, + "learning_rate": 0.0002970773337478826, + "loss": 3.132, + "step": 10709 + }, + { + "epoch": 0.45094736842105265, + "grad_norm": 0.439453125, + "learning_rate": 0.00029704419273021156, + "loss": 3.5035, + "step": 10710 + }, + { + "epoch": 0.45098947368421055, + "grad_norm": 0.416015625, + "learning_rate": 0.0002970110508554507, + "loss": 2.9548, + "step": 10711 + }, + { + "epoch": 0.4510315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.0002969779081242037, + "loss": 3.4058, + "step": 10712 + }, + { + "epoch": 0.4510736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.0002969447645370743, + "loss": 3.236, + "step": 10713 + }, + { + "epoch": 0.4511157894736842, + "grad_norm": 0.462890625, + "learning_rate": 0.0002969116200946666, + "loss": 3.6311, + "step": 10714 + }, + { + "epoch": 0.4511578947368421, + "grad_norm": 0.458984375, + "learning_rate": 0.0002968784747975842, + "loss": 3.3382, + "step": 10715 + }, + { + "epoch": 0.4512, + "grad_norm": 0.484375, + "learning_rate": 0.0002968453286464312, + "loss": 3.4923, + "step": 10716 + }, + { + "epoch": 0.4512421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00029681218164181127, + "loss": 3.1058, + "step": 10717 + }, + { + "epoch": 0.4512842105263158, + "grad_norm": 0.40625, + "learning_rate": 0.00029677903378432836, + "loss": 2.9032, + "step": 10718 + }, + { + "epoch": 0.4513263157894737, + "grad_norm": 0.455078125, + "learning_rate": 0.00029674588507458643, + "loss": 3.0587, + "step": 10719 + }, + { + "epoch": 0.4513684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.0002967127355131894, + "loss": 3.0987, + "step": 10720 + }, + { + "epoch": 0.4514105263157895, + "grad_norm": 0.40234375, + "learning_rate": 0.00029667958510074124, + "loss": 3.0001, + "step": 10721 + }, + { + "epoch": 0.4514526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.0002966464338378459, + "loss": 2.9063, + "step": 10722 + }, + { + "epoch": 0.4514947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.0002966132817251073, + "loss": 3.4609, + "step": 10723 + }, + { + "epoch": 0.4515368421052632, + "grad_norm": 0.404296875, + "learning_rate": 0.00029658012876312956, + "loss": 3.0503, + "step": 10724 + }, + { + "epoch": 0.4515789473684211, + "grad_norm": 0.4296875, + "learning_rate": 0.0002965469749525165, + "loss": 2.9545, + "step": 10725 + }, + { + "epoch": 0.4516210526315789, + "grad_norm": 0.447265625, + "learning_rate": 0.0002965138202938723, + "loss": 2.8021, + "step": 10726 + }, + { + "epoch": 0.4516631578947368, + "grad_norm": 0.455078125, + "learning_rate": 0.00029648066478780093, + "loss": 3.1825, + "step": 10727 + }, + { + "epoch": 0.4517052631578947, + "grad_norm": 0.447265625, + "learning_rate": 0.0002964475084349064, + "loss": 3.0208, + "step": 10728 + }, + { + "epoch": 0.4517473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.00029641435123579294, + "loss": 3.0671, + "step": 10729 + }, + { + "epoch": 0.4517894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.0002963811931910645, + "loss": 3.1906, + "step": 10730 + }, + { + "epoch": 0.4518315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.0002963480343013252, + "loss": 2.9931, + "step": 10731 + }, + { + "epoch": 0.4518736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.00029631487456717925, + "loss": 3.2474, + "step": 10732 + }, + { + "epoch": 0.4519157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00029628171398923066, + "loss": 3.5211, + "step": 10733 + }, + { + "epoch": 0.4519578947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.0002962485525680835, + "loss": 2.8547, + "step": 10734 + }, + { + "epoch": 0.452, + "grad_norm": 0.388671875, + "learning_rate": 0.0002962153903043422, + "loss": 3.0589, + "step": 10735 + }, + { + "epoch": 0.4520421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.0002961822271986107, + "loss": 3.3236, + "step": 10736 + }, + { + "epoch": 0.4520842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.00029614906325149334, + "loss": 3.3026, + "step": 10737 + }, + { + "epoch": 0.4521263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.0002961158984635943, + "loss": 2.8896, + "step": 10738 + }, + { + "epoch": 0.45216842105263155, + "grad_norm": 0.41796875, + "learning_rate": 0.0002960827328355177, + "loss": 2.9734, + "step": 10739 + }, + { + "epoch": 0.45221052631578945, + "grad_norm": 0.427734375, + "learning_rate": 0.00029604956636786786, + "loss": 3.1898, + "step": 10740 + }, + { + "epoch": 0.45225263157894735, + "grad_norm": 0.427734375, + "learning_rate": 0.00029601639906124907, + "loss": 2.9725, + "step": 10741 + }, + { + "epoch": 0.45229473684210525, + "grad_norm": 0.435546875, + "learning_rate": 0.0002959832309162655, + "loss": 3.4522, + "step": 10742 + }, + { + "epoch": 0.45233684210526315, + "grad_norm": 0.392578125, + "learning_rate": 0.0002959500619335216, + "loss": 3.2405, + "step": 10743 + }, + { + "epoch": 0.45237894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0002959168921136214, + "loss": 3.1098, + "step": 10744 + }, + { + "epoch": 0.45242105263157895, + "grad_norm": 0.49609375, + "learning_rate": 0.00029588372145716944, + "loss": 2.8774, + "step": 10745 + }, + { + "epoch": 0.45246315789473684, + "grad_norm": 0.392578125, + "learning_rate": 0.0002958505499647701, + "loss": 3.1624, + "step": 10746 + }, + { + "epoch": 0.45250526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.0002958173776370274, + "loss": 2.9587, + "step": 10747 + }, + { + "epoch": 0.45254736842105264, + "grad_norm": 0.412109375, + "learning_rate": 0.00029578420447454607, + "loss": 2.8961, + "step": 10748 + }, + { + "epoch": 0.45258947368421054, + "grad_norm": 0.419921875, + "learning_rate": 0.00029575103047793026, + "loss": 3.0483, + "step": 10749 + }, + { + "epoch": 0.45263157894736844, + "grad_norm": 0.41015625, + "learning_rate": 0.00029571785564778453, + "loss": 3.0328, + "step": 10750 + }, + { + "epoch": 0.45267368421052634, + "grad_norm": 0.443359375, + "learning_rate": 0.0002956846799847131, + "loss": 3.2593, + "step": 10751 + }, + { + "epoch": 0.45271578947368424, + "grad_norm": 0.458984375, + "learning_rate": 0.00029565150348932046, + "loss": 3.3335, + "step": 10752 + }, + { + "epoch": 0.4527578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0002956183261622111, + "loss": 3.399, + "step": 10753 + }, + { + "epoch": 0.4528, + "grad_norm": 0.40234375, + "learning_rate": 0.0002955851480039894, + "loss": 2.9145, + "step": 10754 + }, + { + "epoch": 0.4528421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.0002955519690152599, + "loss": 3.2448, + "step": 10755 + }, + { + "epoch": 0.4528842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.0002955187891966271, + "loss": 2.6433, + "step": 10756 + }, + { + "epoch": 0.4529263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.0002954856085486954, + "loss": 3.5114, + "step": 10757 + }, + { + "epoch": 0.4529684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0002954524270720693, + "loss": 3.2221, + "step": 10758 + }, + { + "epoch": 0.4530105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.00029541924476735347, + "loss": 3.3745, + "step": 10759 + }, + { + "epoch": 0.4530526315789474, + "grad_norm": 0.451171875, + "learning_rate": 0.0002953860616351523, + "loss": 3.1892, + "step": 10760 + }, + { + "epoch": 0.4530947368421053, + "grad_norm": 0.42578125, + "learning_rate": 0.0002953528776760704, + "loss": 2.9072, + "step": 10761 + }, + { + "epoch": 0.45313684210526317, + "grad_norm": 0.3984375, + "learning_rate": 0.00029531969289071236, + "loss": 3.0702, + "step": 10762 + }, + { + "epoch": 0.45317894736842107, + "grad_norm": 0.42578125, + "learning_rate": 0.00029528650727968285, + "loss": 3.0727, + "step": 10763 + }, + { + "epoch": 0.45322105263157897, + "grad_norm": 0.453125, + "learning_rate": 0.0002952533208435864, + "loss": 3.2286, + "step": 10764 + }, + { + "epoch": 0.45326315789473687, + "grad_norm": 0.431640625, + "learning_rate": 0.0002952201335830275, + "loss": 3.2382, + "step": 10765 + }, + { + "epoch": 0.4533052631578947, + "grad_norm": 0.451171875, + "learning_rate": 0.00029518694549861095, + "loss": 3.169, + "step": 10766 + }, + { + "epoch": 0.4533473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.0002951537565909413, + "loss": 2.8632, + "step": 10767 + }, + { + "epoch": 0.4533894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.00029512056686062334, + "loss": 2.9703, + "step": 10768 + }, + { + "epoch": 0.4534315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.00029508737630826154, + "loss": 3.3141, + "step": 10769 + }, + { + "epoch": 0.4534736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.00029505418493446086, + "loss": 3.3536, + "step": 10770 + }, + { + "epoch": 0.4535157894736842, + "grad_norm": 0.46875, + "learning_rate": 0.0002950209927398258, + "loss": 3.3407, + "step": 10771 + }, + { + "epoch": 0.4535578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.0002949877997249612, + "loss": 3.0141, + "step": 10772 + }, + { + "epoch": 0.4536, + "grad_norm": 0.419921875, + "learning_rate": 0.0002949546058904717, + "loss": 3.0799, + "step": 10773 + }, + { + "epoch": 0.4536421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0002949214112369621, + "loss": 3.19, + "step": 10774 + }, + { + "epoch": 0.4536842105263158, + "grad_norm": 0.5703125, + "learning_rate": 0.00029488821576503717, + "loss": 2.6722, + "step": 10775 + }, + { + "epoch": 0.4537263157894737, + "grad_norm": 0.447265625, + "learning_rate": 0.0002948550194753017, + "loss": 3.5367, + "step": 10776 + }, + { + "epoch": 0.4537684210526316, + "grad_norm": 0.396484375, + "learning_rate": 0.00029482182236836044, + "loss": 3.2778, + "step": 10777 + }, + { + "epoch": 0.4538105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.0002947886244448183, + "loss": 3.0304, + "step": 10778 + }, + { + "epoch": 0.45385263157894734, + "grad_norm": 0.44140625, + "learning_rate": 0.00029475542570528, + "loss": 3.087, + "step": 10779 + }, + { + "epoch": 0.45389473684210524, + "grad_norm": 0.4453125, + "learning_rate": 0.0002947222261503505, + "loss": 2.9305, + "step": 10780 + }, + { + "epoch": 0.45393684210526314, + "grad_norm": 0.49609375, + "learning_rate": 0.0002946890257806345, + "loss": 3.4819, + "step": 10781 + }, + { + "epoch": 0.45397894736842104, + "grad_norm": 0.40625, + "learning_rate": 0.00029465582459673703, + "loss": 3.2831, + "step": 10782 + }, + { + "epoch": 0.45402105263157894, + "grad_norm": 0.6796875, + "learning_rate": 0.0002946226225992628, + "loss": 3.0029, + "step": 10783 + }, + { + "epoch": 0.45406315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.00029458941978881693, + "loss": 3.108, + "step": 10784 + }, + { + "epoch": 0.45410526315789473, + "grad_norm": 0.46484375, + "learning_rate": 0.00029455621616600417, + "loss": 2.8524, + "step": 10785 + }, + { + "epoch": 0.45414736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.0002945230117314295, + "loss": 3.2725, + "step": 10786 + }, + { + "epoch": 0.45418947368421053, + "grad_norm": 0.4296875, + "learning_rate": 0.0002944898064856978, + "loss": 3.5465, + "step": 10787 + }, + { + "epoch": 0.45423157894736843, + "grad_norm": 0.423828125, + "learning_rate": 0.0002944566004294142, + "loss": 3.2026, + "step": 10788 + }, + { + "epoch": 0.45427368421052633, + "grad_norm": 0.416015625, + "learning_rate": 0.0002944233935631836, + "loss": 3.1862, + "step": 10789 + }, + { + "epoch": 0.45431578947368423, + "grad_norm": 0.51953125, + "learning_rate": 0.0002943901858876109, + "loss": 3.1205, + "step": 10790 + }, + { + "epoch": 0.4543578947368421, + "grad_norm": 0.484375, + "learning_rate": 0.0002943569774033012, + "loss": 3.0645, + "step": 10791 + }, + { + "epoch": 0.4544, + "grad_norm": 0.455078125, + "learning_rate": 0.00029432376811085946, + "loss": 2.8163, + "step": 10792 + }, + { + "epoch": 0.45444210526315787, + "grad_norm": 0.447265625, + "learning_rate": 0.00029429055801089075, + "loss": 2.9481, + "step": 10793 + }, + { + "epoch": 0.45448421052631577, + "grad_norm": 0.431640625, + "learning_rate": 0.00029425734710400014, + "loss": 2.7775, + "step": 10794 + }, + { + "epoch": 0.45452631578947367, + "grad_norm": 0.421875, + "learning_rate": 0.00029422413539079265, + "loss": 3.406, + "step": 10795 + }, + { + "epoch": 0.45456842105263157, + "grad_norm": 0.416015625, + "learning_rate": 0.0002941909228718733, + "loss": 3.2039, + "step": 10796 + }, + { + "epoch": 0.45461052631578946, + "grad_norm": 0.439453125, + "learning_rate": 0.00029415770954784737, + "loss": 3.3115, + "step": 10797 + }, + { + "epoch": 0.45465263157894736, + "grad_norm": 0.43359375, + "learning_rate": 0.00029412449541931987, + "loss": 3.129, + "step": 10798 + }, + { + "epoch": 0.45469473684210526, + "grad_norm": 0.41015625, + "learning_rate": 0.0002940912804868958, + "loss": 3.3204, + "step": 10799 + }, + { + "epoch": 0.45473684210526316, + "grad_norm": 0.396484375, + "learning_rate": 0.0002940580647511805, + "loss": 2.9686, + "step": 10800 + }, + { + "epoch": 0.45477894736842106, + "grad_norm": 0.4453125, + "learning_rate": 0.0002940248482127789, + "loss": 3.7095, + "step": 10801 + }, + { + "epoch": 0.45482105263157896, + "grad_norm": 0.3984375, + "learning_rate": 0.0002939916308722964, + "loss": 3.1674, + "step": 10802 + }, + { + "epoch": 0.45486315789473686, + "grad_norm": 0.43359375, + "learning_rate": 0.000293958412730338, + "loss": 3.158, + "step": 10803 + }, + { + "epoch": 0.45490526315789476, + "grad_norm": 0.435546875, + "learning_rate": 0.00029392519378750905, + "loss": 2.4594, + "step": 10804 + }, + { + "epoch": 0.45494736842105266, + "grad_norm": 0.4296875, + "learning_rate": 0.00029389197404441463, + "loss": 2.8781, + "step": 10805 + }, + { + "epoch": 0.4549894736842105, + "grad_norm": 0.447265625, + "learning_rate": 0.0002938587535016601, + "loss": 3.3049, + "step": 10806 + }, + { + "epoch": 0.4550315789473684, + "grad_norm": 0.4453125, + "learning_rate": 0.00029382553215985053, + "loss": 3.5205, + "step": 10807 + }, + { + "epoch": 0.4550736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.00029379231001959124, + "loss": 3.0637, + "step": 10808 + }, + { + "epoch": 0.4551157894736842, + "grad_norm": 0.40234375, + "learning_rate": 0.0002937590870814876, + "loss": 2.7098, + "step": 10809 + }, + { + "epoch": 0.4551578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.0002937258633461447, + "loss": 3.6539, + "step": 10810 + }, + { + "epoch": 0.4552, + "grad_norm": 0.55078125, + "learning_rate": 0.000293692638814168, + "loss": 3.2849, + "step": 10811 + }, + { + "epoch": 0.4552421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.0002936594134861628, + "loss": 2.8887, + "step": 10812 + }, + { + "epoch": 0.4552842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0002936261873627343, + "loss": 2.8937, + "step": 10813 + }, + { + "epoch": 0.4553263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.0002935929604444879, + "loss": 3.2617, + "step": 10814 + }, + { + "epoch": 0.4553684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.00029355973273202906, + "loss": 3.758, + "step": 10815 + }, + { + "epoch": 0.4554105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.00029352650422596313, + "loss": 3.0081, + "step": 10816 + }, + { + "epoch": 0.4554526315789474, + "grad_norm": 0.4140625, + "learning_rate": 0.0002934932749268953, + "loss": 3.3197, + "step": 10817 + }, + { + "epoch": 0.4554947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.0002934600448354312, + "loss": 3.046, + "step": 10818 + }, + { + "epoch": 0.4555368421052632, + "grad_norm": 0.4375, + "learning_rate": 0.00029342681395217606, + "loss": 3.0923, + "step": 10819 + }, + { + "epoch": 0.45557894736842103, + "grad_norm": 0.41015625, + "learning_rate": 0.0002933935822777355, + "loss": 2.9966, + "step": 10820 + }, + { + "epoch": 0.4556210526315789, + "grad_norm": 0.451171875, + "learning_rate": 0.00029336034981271476, + "loss": 2.9183, + "step": 10821 + }, + { + "epoch": 0.4556631578947368, + "grad_norm": 0.453125, + "learning_rate": 0.00029332711655771944, + "loss": 3.259, + "step": 10822 + }, + { + "epoch": 0.4557052631578947, + "grad_norm": 0.400390625, + "learning_rate": 0.0002932938825133549, + "loss": 3.1181, + "step": 10823 + }, + { + "epoch": 0.4557473684210526, + "grad_norm": 0.400390625, + "learning_rate": 0.00029326064768022683, + "loss": 3.3532, + "step": 10824 + }, + { + "epoch": 0.4557894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.0002932274120589405, + "loss": 2.6507, + "step": 10825 + }, + { + "epoch": 0.4558315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.00029319417565010155, + "loss": 3.4675, + "step": 10826 + }, + { + "epoch": 0.4558736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00029316093845431533, + "loss": 2.7558, + "step": 10827 + }, + { + "epoch": 0.4559157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.0002931277004721877, + "loss": 2.8448, + "step": 10828 + }, + { + "epoch": 0.4559578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0002930944617043239, + "loss": 2.8134, + "step": 10829 + }, + { + "epoch": 0.456, + "grad_norm": 0.4140625, + "learning_rate": 0.0002930612221513297, + "loss": 3.2255, + "step": 10830 + }, + { + "epoch": 0.4560421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.0002930279818138107, + "loss": 3.111, + "step": 10831 + }, + { + "epoch": 0.4560842105263158, + "grad_norm": 0.439453125, + "learning_rate": 0.0002929947406923724, + "loss": 2.8973, + "step": 10832 + }, + { + "epoch": 0.45612631578947366, + "grad_norm": 0.419921875, + "learning_rate": 0.0002929614987876204, + "loss": 3.1223, + "step": 10833 + }, + { + "epoch": 0.45616842105263156, + "grad_norm": 0.4765625, + "learning_rate": 0.00029292825610016036, + "loss": 2.9167, + "step": 10834 + }, + { + "epoch": 0.45621052631578946, + "grad_norm": 0.4375, + "learning_rate": 0.00029289501263059795, + "loss": 3.2726, + "step": 10835 + }, + { + "epoch": 0.45625263157894735, + "grad_norm": 0.46484375, + "learning_rate": 0.0002928617683795388, + "loss": 3.4003, + "step": 10836 + }, + { + "epoch": 0.45629473684210525, + "grad_norm": 0.4296875, + "learning_rate": 0.0002928285233475886, + "loss": 3.4195, + "step": 10837 + }, + { + "epoch": 0.45633684210526315, + "grad_norm": 0.44140625, + "learning_rate": 0.00029279527753535303, + "loss": 2.9525, + "step": 10838 + }, + { + "epoch": 0.45637894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00029276203094343777, + "loss": 3.2492, + "step": 10839 + }, + { + "epoch": 0.45642105263157895, + "grad_norm": 0.45703125, + "learning_rate": 0.0002927287835724486, + "loss": 3.3463, + "step": 10840 + }, + { + "epoch": 0.45646315789473685, + "grad_norm": 0.45703125, + "learning_rate": 0.0002926955354229911, + "loss": 3.2141, + "step": 10841 + }, + { + "epoch": 0.45650526315789475, + "grad_norm": 0.44140625, + "learning_rate": 0.00029266228649567116, + "loss": 3.1454, + "step": 10842 + }, + { + "epoch": 0.45654736842105265, + "grad_norm": 0.41015625, + "learning_rate": 0.00029262903679109456, + "loss": 3.17, + "step": 10843 + }, + { + "epoch": 0.45658947368421055, + "grad_norm": 0.427734375, + "learning_rate": 0.0002925957863098669, + "loss": 3.4798, + "step": 10844 + }, + { + "epoch": 0.45663157894736844, + "grad_norm": 0.46484375, + "learning_rate": 0.00029256253505259404, + "loss": 3.1126, + "step": 10845 + }, + { + "epoch": 0.45667368421052634, + "grad_norm": 0.435546875, + "learning_rate": 0.0002925292830198819, + "loss": 3.2892, + "step": 10846 + }, + { + "epoch": 0.4567157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.00029249603021233616, + "loss": 3.2025, + "step": 10847 + }, + { + "epoch": 0.4567578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.00029246277663056265, + "loss": 2.9418, + "step": 10848 + }, + { + "epoch": 0.4568, + "grad_norm": 0.408203125, + "learning_rate": 0.0002924295222751672, + "loss": 3.6867, + "step": 10849 + }, + { + "epoch": 0.4568421052631579, + "grad_norm": 0.58203125, + "learning_rate": 0.00029239626714675583, + "loss": 3.2373, + "step": 10850 + }, + { + "epoch": 0.4568842105263158, + "grad_norm": 0.46484375, + "learning_rate": 0.0002923630112459342, + "loss": 2.6255, + "step": 10851 + }, + { + "epoch": 0.4569263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.0002923297545733083, + "loss": 3.1834, + "step": 10852 + }, + { + "epoch": 0.4569684210526316, + "grad_norm": 0.44140625, + "learning_rate": 0.000292296497129484, + "loss": 2.9693, + "step": 10853 + }, + { + "epoch": 0.4570105263157895, + "grad_norm": 0.40234375, + "learning_rate": 0.0002922632389150672, + "loss": 3.2842, + "step": 10854 + }, + { + "epoch": 0.4570526315789474, + "grad_norm": 0.408203125, + "learning_rate": 0.0002922299799306639, + "loss": 3.5767, + "step": 10855 + }, + { + "epoch": 0.4570947368421053, + "grad_norm": 0.474609375, + "learning_rate": 0.00029219672017687993, + "loss": 3.406, + "step": 10856 + }, + { + "epoch": 0.4571368421052632, + "grad_norm": 0.447265625, + "learning_rate": 0.00029216345965432135, + "loss": 3.2245, + "step": 10857 + }, + { + "epoch": 0.4571789473684211, + "grad_norm": 0.44921875, + "learning_rate": 0.00029213019836359403, + "loss": 3.284, + "step": 10858 + }, + { + "epoch": 0.457221052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.0002920969363053041, + "loss": 3.5865, + "step": 10859 + }, + { + "epoch": 0.4572631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.0002920636734800574, + "loss": 2.8177, + "step": 10860 + }, + { + "epoch": 0.4573052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.00029203040988845985, + "loss": 3.277, + "step": 10861 + }, + { + "epoch": 0.4573473684210526, + "grad_norm": 0.43359375, + "learning_rate": 0.00029199714553111765, + "loss": 3.2967, + "step": 10862 + }, + { + "epoch": 0.4573894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0002919638804086369, + "loss": 2.8957, + "step": 10863 + }, + { + "epoch": 0.4574315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.0002919306145216236, + "loss": 3.1055, + "step": 10864 + }, + { + "epoch": 0.4574736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00029189734787068365, + "loss": 3.307, + "step": 10865 + }, + { + "epoch": 0.4575157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0002918640804564234, + "loss": 2.779, + "step": 10866 + }, + { + "epoch": 0.4575578947368421, + "grad_norm": 0.5625, + "learning_rate": 0.0002918308122794487, + "loss": 3.2026, + "step": 10867 + }, + { + "epoch": 0.4576, + "grad_norm": 0.451171875, + "learning_rate": 0.0002917975433403657, + "loss": 3.1163, + "step": 10868 + }, + { + "epoch": 0.4576421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.00029176427363978056, + "loss": 2.857, + "step": 10869 + }, + { + "epoch": 0.4576842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00029173100317829946, + "loss": 3.3748, + "step": 10870 + }, + { + "epoch": 0.4577263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00029169773195652853, + "loss": 3.3672, + "step": 10871 + }, + { + "epoch": 0.4577684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.00029166445997507394, + "loss": 3.3622, + "step": 10872 + }, + { + "epoch": 0.4578105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.0002916311872345419, + "loss": 3.1364, + "step": 10873 + }, + { + "epoch": 0.45785263157894734, + "grad_norm": 0.423828125, + "learning_rate": 0.00029159791373553845, + "loss": 3.3991, + "step": 10874 + }, + { + "epoch": 0.45789473684210524, + "grad_norm": 0.421875, + "learning_rate": 0.00029156463947866985, + "loss": 3.5751, + "step": 10875 + }, + { + "epoch": 0.45793684210526314, + "grad_norm": 0.439453125, + "learning_rate": 0.0002915313644645424, + "loss": 3.1791, + "step": 10876 + }, + { + "epoch": 0.45797894736842104, + "grad_norm": 0.3984375, + "learning_rate": 0.0002914980886937623, + "loss": 2.9953, + "step": 10877 + }, + { + "epoch": 0.45802105263157894, + "grad_norm": 0.4375, + "learning_rate": 0.00029146481216693577, + "loss": 3.1552, + "step": 10878 + }, + { + "epoch": 0.45806315789473684, + "grad_norm": 0.51953125, + "learning_rate": 0.0002914315348846691, + "loss": 2.9996, + "step": 10879 + }, + { + "epoch": 0.45810526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00029139825684756855, + "loss": 3.187, + "step": 10880 + }, + { + "epoch": 0.45814736842105264, + "grad_norm": 0.44921875, + "learning_rate": 0.0002913649780562404, + "loss": 3.2066, + "step": 10881 + }, + { + "epoch": 0.45818947368421054, + "grad_norm": 0.40625, + "learning_rate": 0.0002913316985112909, + "loss": 2.8667, + "step": 10882 + }, + { + "epoch": 0.45823157894736843, + "grad_norm": 0.44921875, + "learning_rate": 0.0002912984182133265, + "loss": 2.8634, + "step": 10883 + }, + { + "epoch": 0.45827368421052633, + "grad_norm": 0.4375, + "learning_rate": 0.00029126513716295333, + "loss": 3.4343, + "step": 10884 + }, + { + "epoch": 0.45831578947368423, + "grad_norm": 0.5, + "learning_rate": 0.00029123185536077795, + "loss": 3.296, + "step": 10885 + }, + { + "epoch": 0.45835789473684213, + "grad_norm": 0.451171875, + "learning_rate": 0.00029119857280740654, + "loss": 2.8707, + "step": 10886 + }, + { + "epoch": 0.4584, + "grad_norm": 0.41796875, + "learning_rate": 0.00029116528950344554, + "loss": 3.2486, + "step": 10887 + }, + { + "epoch": 0.4584421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.00029113200544950135, + "loss": 2.8539, + "step": 10888 + }, + { + "epoch": 0.45848421052631577, + "grad_norm": 0.53515625, + "learning_rate": 0.00029109872064618037, + "loss": 3.1036, + "step": 10889 + }, + { + "epoch": 0.45852631578947367, + "grad_norm": 0.44140625, + "learning_rate": 0.000291065435094089, + "loss": 3.2735, + "step": 10890 + }, + { + "epoch": 0.45856842105263157, + "grad_norm": 0.453125, + "learning_rate": 0.0002910321487938337, + "loss": 3.2538, + "step": 10891 + }, + { + "epoch": 0.45861052631578947, + "grad_norm": 0.451171875, + "learning_rate": 0.00029099886174602073, + "loss": 3.0726, + "step": 10892 + }, + { + "epoch": 0.45865263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00029096557395125676, + "loss": 3.3193, + "step": 10893 + }, + { + "epoch": 0.45869473684210527, + "grad_norm": 0.4140625, + "learning_rate": 0.0002909322854101481, + "loss": 3.2963, + "step": 10894 + }, + { + "epoch": 0.45873684210526317, + "grad_norm": 0.421875, + "learning_rate": 0.0002908989961233014, + "loss": 3.0883, + "step": 10895 + }, + { + "epoch": 0.45877894736842106, + "grad_norm": 0.384765625, + "learning_rate": 0.0002908657060913229, + "loss": 3.082, + "step": 10896 + }, + { + "epoch": 0.45882105263157896, + "grad_norm": 0.44140625, + "learning_rate": 0.00029083241531481936, + "loss": 3.1598, + "step": 10897 + }, + { + "epoch": 0.45886315789473686, + "grad_norm": 0.474609375, + "learning_rate": 0.00029079912379439716, + "loss": 3.3134, + "step": 10898 + }, + { + "epoch": 0.45890526315789476, + "grad_norm": 0.478515625, + "learning_rate": 0.00029076583153066286, + "loss": 3.0177, + "step": 10899 + }, + { + "epoch": 0.4589473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.000290732538524223, + "loss": 3.5606, + "step": 10900 + }, + { + "epoch": 0.4589894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.0002906992447756842, + "loss": 3.1346, + "step": 10901 + }, + { + "epoch": 0.4590315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.0002906659502856529, + "loss": 3.073, + "step": 10902 + }, + { + "epoch": 0.4590736842105263, + "grad_norm": 0.45703125, + "learning_rate": 0.0002906326550547358, + "loss": 2.9789, + "step": 10903 + }, + { + "epoch": 0.4591157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.0002905993590835394, + "loss": 3.4766, + "step": 10904 + }, + { + "epoch": 0.4591578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0002905660623726705, + "loss": 3.1613, + "step": 10905 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4140625, + "learning_rate": 0.00029053276492273553, + "loss": 3.3436, + "step": 10906 + }, + { + "epoch": 0.4592421052631579, + "grad_norm": 0.396484375, + "learning_rate": 0.0002904994667343412, + "loss": 2.7727, + "step": 10907 + }, + { + "epoch": 0.4592842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.00029046616780809426, + "loss": 2.9928, + "step": 10908 + }, + { + "epoch": 0.4593263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00029043286814460116, + "loss": 3.3697, + "step": 10909 + }, + { + "epoch": 0.4593684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.00029039956774446884, + "loss": 3.0619, + "step": 10910 + }, + { + "epoch": 0.4594105263157895, + "grad_norm": 0.44140625, + "learning_rate": 0.0002903662666083038, + "loss": 3.3863, + "step": 10911 + }, + { + "epoch": 0.4594526315789474, + "grad_norm": 0.392578125, + "learning_rate": 0.0002903329647367128, + "loss": 3.1996, + "step": 10912 + }, + { + "epoch": 0.4594947368421053, + "grad_norm": 0.41015625, + "learning_rate": 0.00029029966213030257, + "loss": 3.2989, + "step": 10913 + }, + { + "epoch": 0.45953684210526313, + "grad_norm": 0.4140625, + "learning_rate": 0.0002902663587896799, + "loss": 3.2251, + "step": 10914 + }, + { + "epoch": 0.45957894736842103, + "grad_norm": 0.45703125, + "learning_rate": 0.00029023305471545134, + "loss": 3.2306, + "step": 10915 + }, + { + "epoch": 0.45962105263157893, + "grad_norm": 0.408203125, + "learning_rate": 0.00029019974990822394, + "loss": 2.8315, + "step": 10916 + }, + { + "epoch": 0.45966315789473683, + "grad_norm": 0.42578125, + "learning_rate": 0.00029016644436860426, + "loss": 2.8982, + "step": 10917 + }, + { + "epoch": 0.45970526315789473, + "grad_norm": 0.412109375, + "learning_rate": 0.0002901331380971992, + "loss": 3.0095, + "step": 10918 + }, + { + "epoch": 0.4597473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.0002900998310946156, + "loss": 2.9804, + "step": 10919 + }, + { + "epoch": 0.4597894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.00029006652336146, + "loss": 3.062, + "step": 10920 + }, + { + "epoch": 0.4598315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.00029003321489833955, + "loss": 2.8508, + "step": 10921 + }, + { + "epoch": 0.4598736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.0002899999057058609, + "loss": 3.248, + "step": 10922 + }, + { + "epoch": 0.4599157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00028996659578463095, + "loss": 3.3538, + "step": 10923 + }, + { + "epoch": 0.4599578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.0002899332851352567, + "loss": 2.8977, + "step": 10924 + }, + { + "epoch": 0.46, + "grad_norm": 0.41015625, + "learning_rate": 0.00028989997375834483, + "loss": 3.2788, + "step": 10925 + }, + { + "epoch": 0.4600421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.00028986666165450235, + "loss": 2.8974, + "step": 10926 + }, + { + "epoch": 0.46008421052631576, + "grad_norm": 0.4375, + "learning_rate": 0.00028983334882433614, + "loss": 3.2715, + "step": 10927 + }, + { + "epoch": 0.46012631578947366, + "grad_norm": 0.447265625, + "learning_rate": 0.00028980003526845303, + "loss": 3.5181, + "step": 10928 + }, + { + "epoch": 0.46016842105263156, + "grad_norm": 0.400390625, + "learning_rate": 0.00028976672098746006, + "loss": 3.0002, + "step": 10929 + }, + { + "epoch": 0.46021052631578946, + "grad_norm": 0.443359375, + "learning_rate": 0.0002897334059819642, + "loss": 3.199, + "step": 10930 + }, + { + "epoch": 0.46025263157894736, + "grad_norm": 0.41796875, + "learning_rate": 0.00028970009025257245, + "loss": 3.7068, + "step": 10931 + }, + { + "epoch": 0.46029473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.00028966677379989157, + "loss": 3.3196, + "step": 10932 + }, + { + "epoch": 0.46033684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.0002896334566245288, + "loss": 2.9581, + "step": 10933 + }, + { + "epoch": 0.46037894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.0002896001387270909, + "loss": 3.3289, + "step": 10934 + }, + { + "epoch": 0.46042105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00028956682010818503, + "loss": 3.3524, + "step": 10935 + }, + { + "epoch": 0.46046315789473685, + "grad_norm": 0.45703125, + "learning_rate": 0.00028953350076841824, + "loss": 3.3424, + "step": 10936 + }, + { + "epoch": 0.46050526315789475, + "grad_norm": 0.42578125, + "learning_rate": 0.0002895001807083974, + "loss": 3.4535, + "step": 10937 + }, + { + "epoch": 0.46054736842105265, + "grad_norm": 0.443359375, + "learning_rate": 0.0002894668599287298, + "loss": 2.9181, + "step": 10938 + }, + { + "epoch": 0.46058947368421055, + "grad_norm": 0.462890625, + "learning_rate": 0.0002894335384300223, + "loss": 3.1239, + "step": 10939 + }, + { + "epoch": 0.46063157894736845, + "grad_norm": 0.42578125, + "learning_rate": 0.00028940021621288217, + "loss": 3.2357, + "step": 10940 + }, + { + "epoch": 0.4606736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.0002893668932779163, + "loss": 3.0255, + "step": 10941 + }, + { + "epoch": 0.4607157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0002893335696257319, + "loss": 3.3788, + "step": 10942 + }, + { + "epoch": 0.4607578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.0002893002452569361, + "loss": 3.4051, + "step": 10943 + }, + { + "epoch": 0.4608, + "grad_norm": 0.41796875, + "learning_rate": 0.000289266920172136, + "loss": 2.7226, + "step": 10944 + }, + { + "epoch": 0.4608421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.0002892335943719387, + "loss": 3.1065, + "step": 10945 + }, + { + "epoch": 0.4608842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00028920026785695155, + "loss": 3.1016, + "step": 10946 + }, + { + "epoch": 0.4609263157894737, + "grad_norm": 0.44921875, + "learning_rate": 0.0002891669406277815, + "loss": 3.5839, + "step": 10947 + }, + { + "epoch": 0.4609684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.0002891336126850358, + "loss": 3.0331, + "step": 10948 + }, + { + "epoch": 0.4610105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.0002891002840293216, + "loss": 3.5054, + "step": 10949 + }, + { + "epoch": 0.4610526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.0002890669546612462, + "loss": 3.1413, + "step": 10950 + }, + { + "epoch": 0.4610947368421053, + "grad_norm": 0.404296875, + "learning_rate": 0.00028903362458141687, + "loss": 3.0297, + "step": 10951 + }, + { + "epoch": 0.4611368421052632, + "grad_norm": 0.416015625, + "learning_rate": 0.0002890002937904407, + "loss": 3.3813, + "step": 10952 + }, + { + "epoch": 0.4611789473684211, + "grad_norm": 0.41015625, + "learning_rate": 0.000288966962288925, + "loss": 3.4627, + "step": 10953 + }, + { + "epoch": 0.4612210526315789, + "grad_norm": 0.400390625, + "learning_rate": 0.000288933630077477, + "loss": 3.0975, + "step": 10954 + }, + { + "epoch": 0.4612631578947368, + "grad_norm": 0.462890625, + "learning_rate": 0.00028890029715670406, + "loss": 2.6028, + "step": 10955 + }, + { + "epoch": 0.4613052631578947, + "grad_norm": 0.408203125, + "learning_rate": 0.00028886696352721335, + "loss": 3.401, + "step": 10956 + }, + { + "epoch": 0.4613473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.0002888336291896123, + "loss": 3.2191, + "step": 10957 + }, + { + "epoch": 0.4613894736842105, + "grad_norm": 0.39453125, + "learning_rate": 0.00028880029414450815, + "loss": 2.9675, + "step": 10958 + }, + { + "epoch": 0.4614315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00028876695839250824, + "loss": 3.4638, + "step": 10959 + }, + { + "epoch": 0.4614736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00028873362193421984, + "loss": 2.9631, + "step": 10960 + }, + { + "epoch": 0.4615157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0002887002847702504, + "loss": 3.2601, + "step": 10961 + }, + { + "epoch": 0.4615578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00028866694690120716, + "loss": 3.5154, + "step": 10962 + }, + { + "epoch": 0.4616, + "grad_norm": 0.42578125, + "learning_rate": 0.00028863360832769757, + "loss": 3.104, + "step": 10963 + }, + { + "epoch": 0.4616421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00028860026905032913, + "loss": 3.4063, + "step": 10964 + }, + { + "epoch": 0.4616842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.00028856692906970916, + "loss": 3.114, + "step": 10965 + }, + { + "epoch": 0.4617263157894737, + "grad_norm": 0.462890625, + "learning_rate": 0.000288533588386445, + "loss": 2.7107, + "step": 10966 + }, + { + "epoch": 0.4617684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.00028850024700114417, + "loss": 2.871, + "step": 10967 + }, + { + "epoch": 0.46181052631578945, + "grad_norm": 0.40625, + "learning_rate": 0.0002884669049144141, + "loss": 3.3199, + "step": 10968 + }, + { + "epoch": 0.46185263157894735, + "grad_norm": 0.474609375, + "learning_rate": 0.0002884335621268621, + "loss": 3.0826, + "step": 10969 + }, + { + "epoch": 0.46189473684210525, + "grad_norm": 0.435546875, + "learning_rate": 0.00028840021863909583, + "loss": 3.0368, + "step": 10970 + }, + { + "epoch": 0.46193684210526315, + "grad_norm": 0.44140625, + "learning_rate": 0.00028836687445172264, + "loss": 3.1435, + "step": 10971 + }, + { + "epoch": 0.46197894736842104, + "grad_norm": 0.431640625, + "learning_rate": 0.0002883335295653501, + "loss": 3.3272, + "step": 10972 + }, + { + "epoch": 0.46202105263157894, + "grad_norm": 0.412109375, + "learning_rate": 0.00028830018398058577, + "loss": 3.5651, + "step": 10973 + }, + { + "epoch": 0.46206315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.0002882668376980371, + "loss": 3.0045, + "step": 10974 + }, + { + "epoch": 0.46210526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.0002882334907183115, + "loss": 3.0445, + "step": 10975 + }, + { + "epoch": 0.46214736842105264, + "grad_norm": 0.42578125, + "learning_rate": 0.0002882001430420166, + "loss": 3.1384, + "step": 10976 + }, + { + "epoch": 0.46218947368421054, + "grad_norm": 0.462890625, + "learning_rate": 0.0002881667946697601, + "loss": 3.1792, + "step": 10977 + }, + { + "epoch": 0.46223157894736844, + "grad_norm": 0.419921875, + "learning_rate": 0.0002881334456021494, + "loss": 3.0474, + "step": 10978 + }, + { + "epoch": 0.46227368421052634, + "grad_norm": 0.4453125, + "learning_rate": 0.00028810009583979206, + "loss": 3.3346, + "step": 10979 + }, + { + "epoch": 0.46231578947368424, + "grad_norm": 0.400390625, + "learning_rate": 0.0002880667453832958, + "loss": 2.6771, + "step": 10980 + }, + { + "epoch": 0.4623578947368421, + "grad_norm": 0.46875, + "learning_rate": 0.0002880333942332682, + "loss": 3.0706, + "step": 10981 + }, + { + "epoch": 0.4624, + "grad_norm": 0.421875, + "learning_rate": 0.00028800004239031685, + "loss": 3.3357, + "step": 10982 + }, + { + "epoch": 0.4624421052631579, + "grad_norm": 0.470703125, + "learning_rate": 0.00028796668985504924, + "loss": 2.5734, + "step": 10983 + }, + { + "epoch": 0.4624842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.0002879333366280733, + "loss": 3.1539, + "step": 10984 + }, + { + "epoch": 0.4625263157894737, + "grad_norm": 0.404296875, + "learning_rate": 0.00028789998270999643, + "loss": 3.2092, + "step": 10985 + }, + { + "epoch": 0.4625684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.0002878666281014265, + "loss": 3.2848, + "step": 10986 + }, + { + "epoch": 0.46261052631578947, + "grad_norm": 0.90625, + "learning_rate": 0.000287833272802971, + "loss": 3.0889, + "step": 10987 + }, + { + "epoch": 0.46265263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.0002877999168152378, + "loss": 3.4342, + "step": 10988 + }, + { + "epoch": 0.46269473684210527, + "grad_norm": 0.427734375, + "learning_rate": 0.00028776656013883447, + "loss": 3.1307, + "step": 10989 + }, + { + "epoch": 0.46273684210526317, + "grad_norm": 0.43359375, + "learning_rate": 0.00028773320277436883, + "loss": 3.1539, + "step": 10990 + }, + { + "epoch": 0.46277894736842107, + "grad_norm": 0.41015625, + "learning_rate": 0.0002876998447224486, + "loss": 3.0921, + "step": 10991 + }, + { + "epoch": 0.46282105263157897, + "grad_norm": 0.439453125, + "learning_rate": 0.00028766648598368147, + "loss": 3.2349, + "step": 10992 + }, + { + "epoch": 0.46286315789473687, + "grad_norm": 0.4140625, + "learning_rate": 0.0002876331265586752, + "loss": 3.487, + "step": 10993 + }, + { + "epoch": 0.46290526315789476, + "grad_norm": 0.435546875, + "learning_rate": 0.0002875997664480376, + "loss": 3.1138, + "step": 10994 + }, + { + "epoch": 0.4629473684210526, + "grad_norm": 0.466796875, + "learning_rate": 0.00028756640565237633, + "loss": 3.7036, + "step": 10995 + }, + { + "epoch": 0.4629894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.0002875330441722994, + "loss": 3.1889, + "step": 10996 + }, + { + "epoch": 0.4630315789473684, + "grad_norm": 0.404296875, + "learning_rate": 0.00028749968200841446, + "loss": 3.1696, + "step": 10997 + }, + { + "epoch": 0.4630736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.0002874663191613294, + "loss": 3.0466, + "step": 10998 + }, + { + "epoch": 0.4631157894736842, + "grad_norm": 0.455078125, + "learning_rate": 0.00028743295563165205, + "loss": 2.8668, + "step": 10999 + }, + { + "epoch": 0.4631578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.0002873995914199902, + "loss": 3.1494, + "step": 11000 + }, + { + "epoch": 0.4632, + "grad_norm": 0.431640625, + "learning_rate": 0.00028736622652695176, + "loss": 3.2486, + "step": 11001 + }, + { + "epoch": 0.4632421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.0002873328609531446, + "loss": 3.0619, + "step": 11002 + }, + { + "epoch": 0.4632842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.0002872994946991765, + "loss": 2.9887, + "step": 11003 + }, + { + "epoch": 0.4633263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.0002872661277656555, + "loss": 3.2071, + "step": 11004 + }, + { + "epoch": 0.4633684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.0002872327601531893, + "loss": 2.7406, + "step": 11005 + }, + { + "epoch": 0.4634105263157895, + "grad_norm": 0.44140625, + "learning_rate": 0.0002871993918623861, + "loss": 3.0966, + "step": 11006 + }, + { + "epoch": 0.4634526315789474, + "grad_norm": 0.4453125, + "learning_rate": 0.00028716602289385364, + "loss": 3.284, + "step": 11007 + }, + { + "epoch": 0.46349473684210524, + "grad_norm": 0.421875, + "learning_rate": 0.00028713265324819995, + "loss": 3.2656, + "step": 11008 + }, + { + "epoch": 0.46353684210526314, + "grad_norm": 0.412109375, + "learning_rate": 0.0002870992829260329, + "loss": 3.0845, + "step": 11009 + }, + { + "epoch": 0.46357894736842103, + "grad_norm": 0.427734375, + "learning_rate": 0.0002870659119279605, + "loss": 3.3118, + "step": 11010 + }, + { + "epoch": 0.46362105263157893, + "grad_norm": 0.4296875, + "learning_rate": 0.0002870325402545908, + "loss": 2.9708, + "step": 11011 + }, + { + "epoch": 0.46366315789473683, + "grad_norm": 0.45703125, + "learning_rate": 0.0002869991679065317, + "loss": 3.1585, + "step": 11012 + }, + { + "epoch": 0.46370526315789473, + "grad_norm": 0.431640625, + "learning_rate": 0.00028696579488439114, + "loss": 3.1989, + "step": 11013 + }, + { + "epoch": 0.46374736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0002869324211887773, + "loss": 3.3521, + "step": 11014 + }, + { + "epoch": 0.46378947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.0002868990468202982, + "loss": 2.7115, + "step": 11015 + }, + { + "epoch": 0.46383157894736843, + "grad_norm": 0.421875, + "learning_rate": 0.0002868656717795617, + "loss": 3.1267, + "step": 11016 + }, + { + "epoch": 0.4638736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.0002868322960671761, + "loss": 3.3949, + "step": 11017 + }, + { + "epoch": 0.4639157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0002867989196837493, + "loss": 3.2507, + "step": 11018 + }, + { + "epoch": 0.4639578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.0002867655426298894, + "loss": 3.4806, + "step": 11019 + }, + { + "epoch": 0.464, + "grad_norm": 0.4296875, + "learning_rate": 0.0002867321649062045, + "loss": 3.3975, + "step": 11020 + }, + { + "epoch": 0.4640421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.0002866987865133027, + "loss": 3.2164, + "step": 11021 + }, + { + "epoch": 0.46408421052631577, + "grad_norm": 0.41796875, + "learning_rate": 0.0002866654074517922, + "loss": 3.0578, + "step": 11022 + }, + { + "epoch": 0.46412631578947366, + "grad_norm": 0.38671875, + "learning_rate": 0.00028663202772228095, + "loss": 3.2089, + "step": 11023 + }, + { + "epoch": 0.46416842105263156, + "grad_norm": 0.419921875, + "learning_rate": 0.00028659864732537727, + "loss": 3.5458, + "step": 11024 + }, + { + "epoch": 0.46421052631578946, + "grad_norm": 0.482421875, + "learning_rate": 0.00028656526626168926, + "loss": 3.1491, + "step": 11025 + }, + { + "epoch": 0.46425263157894736, + "grad_norm": 0.498046875, + "learning_rate": 0.00028653188453182503, + "loss": 3.1083, + "step": 11026 + }, + { + "epoch": 0.46429473684210526, + "grad_norm": 0.404296875, + "learning_rate": 0.00028649850213639284, + "loss": 2.7537, + "step": 11027 + }, + { + "epoch": 0.46433684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.00028646511907600076, + "loss": 3.3336, + "step": 11028 + }, + { + "epoch": 0.46437894736842106, + "grad_norm": 0.431640625, + "learning_rate": 0.00028643173535125707, + "loss": 3.2345, + "step": 11029 + }, + { + "epoch": 0.46442105263157896, + "grad_norm": 0.439453125, + "learning_rate": 0.0002863983509627699, + "loss": 3.2096, + "step": 11030 + }, + { + "epoch": 0.46446315789473686, + "grad_norm": 0.41796875, + "learning_rate": 0.00028636496591114756, + "loss": 3.0876, + "step": 11031 + }, + { + "epoch": 0.46450526315789475, + "grad_norm": 0.38671875, + "learning_rate": 0.0002863315801969984, + "loss": 3.0769, + "step": 11032 + }, + { + "epoch": 0.46454736842105265, + "grad_norm": 0.44140625, + "learning_rate": 0.00028629819382093043, + "loss": 3.7223, + "step": 11033 + }, + { + "epoch": 0.46458947368421055, + "grad_norm": 0.412109375, + "learning_rate": 0.00028626480678355207, + "loss": 3.6163, + "step": 11034 + }, + { + "epoch": 0.4646315789473684, + "grad_norm": 0.462890625, + "learning_rate": 0.00028623141908547153, + "loss": 3.3951, + "step": 11035 + }, + { + "epoch": 0.4646736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.00028619803072729705, + "loss": 2.9789, + "step": 11036 + }, + { + "epoch": 0.4647157894736842, + "grad_norm": 0.40234375, + "learning_rate": 0.000286164641709637, + "loss": 3.3617, + "step": 11037 + }, + { + "epoch": 0.4647578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00028613125203309965, + "loss": 4.0109, + "step": 11038 + }, + { + "epoch": 0.4648, + "grad_norm": 0.396484375, + "learning_rate": 0.00028609786169829336, + "loss": 2.8828, + "step": 11039 + }, + { + "epoch": 0.4648421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0002860644707058265, + "loss": 3.1537, + "step": 11040 + }, + { + "epoch": 0.4648842105263158, + "grad_norm": 0.439453125, + "learning_rate": 0.0002860310790563073, + "loss": 3.1394, + "step": 11041 + }, + { + "epoch": 0.4649263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.0002859976867503442, + "loss": 3.222, + "step": 11042 + }, + { + "epoch": 0.4649684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.00028596429378854556, + "loss": 3.1944, + "step": 11043 + }, + { + "epoch": 0.4650105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.0002859309001715197, + "loss": 3.3569, + "step": 11044 + }, + { + "epoch": 0.4650526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.00028589750589987507, + "loss": 3.0646, + "step": 11045 + }, + { + "epoch": 0.4650947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.00028586411097422003, + "loss": 3.0007, + "step": 11046 + }, + { + "epoch": 0.4651368421052632, + "grad_norm": 0.40625, + "learning_rate": 0.0002858307153951631, + "loss": 3.1132, + "step": 11047 + }, + { + "epoch": 0.465178947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.0002857973191633126, + "loss": 2.786, + "step": 11048 + }, + { + "epoch": 0.4652210526315789, + "grad_norm": 0.5546875, + "learning_rate": 0.000285763922279277, + "loss": 2.9335, + "step": 11049 + }, + { + "epoch": 0.4652631578947368, + "grad_norm": 0.44921875, + "learning_rate": 0.00028573052474366474, + "loss": 3.4274, + "step": 11050 + }, + { + "epoch": 0.4653052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.0002856971265570843, + "loss": 3.6876, + "step": 11051 + }, + { + "epoch": 0.4653473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.0002856637277201442, + "loss": 3.0635, + "step": 11052 + }, + { + "epoch": 0.4653894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.0002856303282334529, + "loss": 3.123, + "step": 11053 + }, + { + "epoch": 0.4654315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00028559692809761877, + "loss": 3.3445, + "step": 11054 + }, + { + "epoch": 0.4654736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00028556352731325054, + "loss": 3.2822, + "step": 11055 + }, + { + "epoch": 0.4655157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.0002855301258809566, + "loss": 3.2106, + "step": 11056 + }, + { + "epoch": 0.4655578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00028549672380134546, + "loss": 3.3361, + "step": 11057 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4375, + "learning_rate": 0.00028546332107502574, + "loss": 3.286, + "step": 11058 + }, + { + "epoch": 0.4656421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.0002854299177026061, + "loss": 3.0591, + "step": 11059 + }, + { + "epoch": 0.4656842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.0002853965136846948, + "loss": 3.2394, + "step": 11060 + }, + { + "epoch": 0.4657263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0002853631090219008, + "loss": 2.9695, + "step": 11061 + }, + { + "epoch": 0.46576842105263155, + "grad_norm": 0.41796875, + "learning_rate": 0.0002853297037148323, + "loss": 3.101, + "step": 11062 + }, + { + "epoch": 0.46581052631578945, + "grad_norm": 0.423828125, + "learning_rate": 0.00028529629776409825, + "loss": 3.5741, + "step": 11063 + }, + { + "epoch": 0.46585263157894735, + "grad_norm": 0.4453125, + "learning_rate": 0.0002852628911703071, + "loss": 3.1201, + "step": 11064 + }, + { + "epoch": 0.46589473684210525, + "grad_norm": 0.4609375, + "learning_rate": 0.0002852294839340674, + "loss": 2.9888, + "step": 11065 + }, + { + "epoch": 0.46593684210526315, + "grad_norm": 0.4296875, + "learning_rate": 0.0002851960760559881, + "loss": 3.2819, + "step": 11066 + }, + { + "epoch": 0.46597894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.0002851626675366775, + "loss": 3.3095, + "step": 11067 + }, + { + "epoch": 0.46602105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.0002851292583767445, + "loss": 3.3439, + "step": 11068 + }, + { + "epoch": 0.46606315789473685, + "grad_norm": 0.423828125, + "learning_rate": 0.0002850958485767976, + "loss": 3.1526, + "step": 11069 + }, + { + "epoch": 0.46610526315789474, + "grad_norm": 0.4921875, + "learning_rate": 0.0002850624381374456, + "loss": 3.1498, + "step": 11070 + }, + { + "epoch": 0.46614736842105264, + "grad_norm": 0.384765625, + "learning_rate": 0.00028502902705929725, + "loss": 3.1174, + "step": 11071 + }, + { + "epoch": 0.46618947368421054, + "grad_norm": 0.427734375, + "learning_rate": 0.000284995615342961, + "loss": 3.1359, + "step": 11072 + }, + { + "epoch": 0.46623157894736844, + "grad_norm": 0.421875, + "learning_rate": 0.0002849622029890459, + "loss": 2.6997, + "step": 11073 + }, + { + "epoch": 0.46627368421052634, + "grad_norm": 0.41015625, + "learning_rate": 0.0002849287899981606, + "loss": 2.4903, + "step": 11074 + }, + { + "epoch": 0.4663157894736842, + "grad_norm": 0.451171875, + "learning_rate": 0.00028489537637091367, + "loss": 3.2007, + "step": 11075 + }, + { + "epoch": 0.4663578947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.00028486196210791403, + "loss": 3.5014, + "step": 11076 + }, + { + "epoch": 0.4664, + "grad_norm": 0.423828125, + "learning_rate": 0.0002848285472097704, + "loss": 3.1761, + "step": 11077 + }, + { + "epoch": 0.4664421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00028479513167709155, + "loss": 3.4365, + "step": 11078 + }, + { + "epoch": 0.4664842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0002847617155104862, + "loss": 3.5588, + "step": 11079 + }, + { + "epoch": 0.4665263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.0002847282987105633, + "loss": 3.3281, + "step": 11080 + }, + { + "epoch": 0.4665684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.0002846948812779317, + "loss": 2.904, + "step": 11081 + }, + { + "epoch": 0.4666105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.0002846614632132, + "loss": 3.5838, + "step": 11082 + }, + { + "epoch": 0.4666526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.0002846280445169772, + "loss": 2.8639, + "step": 11083 + }, + { + "epoch": 0.4666947368421053, + "grad_norm": 0.40625, + "learning_rate": 0.0002845946251898722, + "loss": 3.0193, + "step": 11084 + }, + { + "epoch": 0.46673684210526317, + "grad_norm": 0.404296875, + "learning_rate": 0.00028456120523249367, + "loss": 3.4019, + "step": 11085 + }, + { + "epoch": 0.46677894736842107, + "grad_norm": 0.4453125, + "learning_rate": 0.00028452778464545063, + "loss": 2.9672, + "step": 11086 + }, + { + "epoch": 0.46682105263157897, + "grad_norm": 0.4140625, + "learning_rate": 0.000284494363429352, + "loss": 3.1525, + "step": 11087 + }, + { + "epoch": 0.46686315789473687, + "grad_norm": 0.423828125, + "learning_rate": 0.0002844609415848064, + "loss": 3.7454, + "step": 11088 + }, + { + "epoch": 0.4669052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0002844275191124231, + "loss": 3.264, + "step": 11089 + }, + { + "epoch": 0.4669473684210526, + "grad_norm": 0.4375, + "learning_rate": 0.00028439409601281075, + "loss": 3.7794, + "step": 11090 + }, + { + "epoch": 0.4669894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00028436067228657845, + "loss": 3.3333, + "step": 11091 + }, + { + "epoch": 0.4670315789473684, + "grad_norm": 0.400390625, + "learning_rate": 0.0002843272479343351, + "loss": 3.0639, + "step": 11092 + }, + { + "epoch": 0.4670736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.0002842938229566895, + "loss": 3.3473, + "step": 11093 + }, + { + "epoch": 0.4671157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0002842603973542509, + "loss": 3.4347, + "step": 11094 + }, + { + "epoch": 0.4671578947368421, + "grad_norm": 0.392578125, + "learning_rate": 0.000284226971127628, + "loss": 3.0985, + "step": 11095 + }, + { + "epoch": 0.4672, + "grad_norm": 0.41796875, + "learning_rate": 0.00028419354427742995, + "loss": 3.1642, + "step": 11096 + }, + { + "epoch": 0.4672421052631579, + "grad_norm": 0.3984375, + "learning_rate": 0.00028416011680426563, + "loss": 2.8935, + "step": 11097 + }, + { + "epoch": 0.4672842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0002841266887087442, + "loss": 3.3013, + "step": 11098 + }, + { + "epoch": 0.4673263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.00028409325999147455, + "loss": 2.9669, + "step": 11099 + }, + { + "epoch": 0.4673684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00028405983065306583, + "loss": 2.8304, + "step": 11100 + }, + { + "epoch": 0.4674105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.000284026400694127, + "loss": 3.3224, + "step": 11101 + }, + { + "epoch": 0.46745263157894734, + "grad_norm": 0.419921875, + "learning_rate": 0.0002839929701152671, + "loss": 3.4339, + "step": 11102 + }, + { + "epoch": 0.46749473684210524, + "grad_norm": 0.4765625, + "learning_rate": 0.00028395953891709527, + "loss": 3.136, + "step": 11103 + }, + { + "epoch": 0.46753684210526314, + "grad_norm": 0.39453125, + "learning_rate": 0.0002839261071002204, + "loss": 3.067, + "step": 11104 + }, + { + "epoch": 0.46757894736842104, + "grad_norm": 0.423828125, + "learning_rate": 0.0002838926746652518, + "loss": 3.2678, + "step": 11105 + }, + { + "epoch": 0.46762105263157894, + "grad_norm": 0.4296875, + "learning_rate": 0.00028385924161279853, + "loss": 3.1551, + "step": 11106 + }, + { + "epoch": 0.46766315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.00028382580794346964, + "loss": 3.562, + "step": 11107 + }, + { + "epoch": 0.46770526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.0002837923736578742, + "loss": 3.3161, + "step": 11108 + }, + { + "epoch": 0.46774736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00028375893875662154, + "loss": 3.3308, + "step": 11109 + }, + { + "epoch": 0.46778947368421053, + "grad_norm": 0.41015625, + "learning_rate": 0.0002837255032403206, + "loss": 3.0422, + "step": 11110 + }, + { + "epoch": 0.46783157894736843, + "grad_norm": 0.416015625, + "learning_rate": 0.00028369206710958064, + "loss": 3.3357, + "step": 11111 + }, + { + "epoch": 0.46787368421052633, + "grad_norm": 0.412109375, + "learning_rate": 0.00028365863036501084, + "loss": 2.9283, + "step": 11112 + }, + { + "epoch": 0.46791578947368423, + "grad_norm": 0.4296875, + "learning_rate": 0.0002836251930072203, + "loss": 3.2735, + "step": 11113 + }, + { + "epoch": 0.46795789473684213, + "grad_norm": 0.419921875, + "learning_rate": 0.0002835917550368182, + "loss": 3.0573, + "step": 11114 + }, + { + "epoch": 0.468, + "grad_norm": 0.41015625, + "learning_rate": 0.0002835583164544139, + "loss": 3.2945, + "step": 11115 + }, + { + "epoch": 0.46804210526315787, + "grad_norm": 0.45703125, + "learning_rate": 0.0002835248772606165, + "loss": 3.3422, + "step": 11116 + }, + { + "epoch": 0.46808421052631577, + "grad_norm": 0.408203125, + "learning_rate": 0.00028349143745603513, + "loss": 3.2905, + "step": 11117 + }, + { + "epoch": 0.46812631578947367, + "grad_norm": 0.427734375, + "learning_rate": 0.00028345799704127916, + "loss": 3.8991, + "step": 11118 + }, + { + "epoch": 0.46816842105263157, + "grad_norm": 0.416015625, + "learning_rate": 0.00028342455601695785, + "loss": 3.3379, + "step": 11119 + }, + { + "epoch": 0.46821052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00028339111438368034, + "loss": 2.9561, + "step": 11120 + }, + { + "epoch": 0.46825263157894736, + "grad_norm": 0.396484375, + "learning_rate": 0.00028335767214205597, + "loss": 3.0417, + "step": 11121 + }, + { + "epoch": 0.46829473684210526, + "grad_norm": 0.4375, + "learning_rate": 0.000283324229292694, + "loss": 3.5106, + "step": 11122 + }, + { + "epoch": 0.46833684210526316, + "grad_norm": 0.47265625, + "learning_rate": 0.00028329078583620373, + "loss": 3.2043, + "step": 11123 + }, + { + "epoch": 0.46837894736842106, + "grad_norm": 0.423828125, + "learning_rate": 0.00028325734177319446, + "loss": 3.1109, + "step": 11124 + }, + { + "epoch": 0.46842105263157896, + "grad_norm": 0.53125, + "learning_rate": 0.0002832238971042755, + "loss": 3.38, + "step": 11125 + }, + { + "epoch": 0.46846315789473686, + "grad_norm": 0.451171875, + "learning_rate": 0.00028319045183005623, + "loss": 3.1454, + "step": 11126 + }, + { + "epoch": 0.46850526315789476, + "grad_norm": 0.4609375, + "learning_rate": 0.00028315700595114585, + "loss": 3.8016, + "step": 11127 + }, + { + "epoch": 0.46854736842105266, + "grad_norm": 0.4609375, + "learning_rate": 0.0002831235594681538, + "loss": 3.1104, + "step": 11128 + }, + { + "epoch": 0.4685894736842105, + "grad_norm": 0.46484375, + "learning_rate": 0.00028309011238168946, + "loss": 3.2327, + "step": 11129 + }, + { + "epoch": 0.4686315789473684, + "grad_norm": 0.38671875, + "learning_rate": 0.00028305666469236203, + "loss": 2.708, + "step": 11130 + }, + { + "epoch": 0.4686736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00028302321640078106, + "loss": 3.0652, + "step": 11131 + }, + { + "epoch": 0.4687157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.000282989767507556, + "loss": 3.0473, + "step": 11132 + }, + { + "epoch": 0.4687578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00028295631801329604, + "loss": 3.4193, + "step": 11133 + }, + { + "epoch": 0.4688, + "grad_norm": 0.443359375, + "learning_rate": 0.00028292286791861064, + "loss": 3.3724, + "step": 11134 + }, + { + "epoch": 0.4688421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00028288941722410935, + "loss": 3.113, + "step": 11135 + }, + { + "epoch": 0.4688842105263158, + "grad_norm": 0.474609375, + "learning_rate": 0.0002828559659304015, + "loss": 3.3249, + "step": 11136 + }, + { + "epoch": 0.4689263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00028282251403809655, + "loss": 3.0884, + "step": 11137 + }, + { + "epoch": 0.4689684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.000282789061547804, + "loss": 3.3311, + "step": 11138 + }, + { + "epoch": 0.4690105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.00028275560846013323, + "loss": 3.0631, + "step": 11139 + }, + { + "epoch": 0.4690526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.00028272215477569373, + "loss": 3.1261, + "step": 11140 + }, + { + "epoch": 0.4690947368421053, + "grad_norm": 0.400390625, + "learning_rate": 0.0002826887004950951, + "loss": 3.1957, + "step": 11141 + }, + { + "epoch": 0.4691368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.0002826552456189467, + "loss": 3.0753, + "step": 11142 + }, + { + "epoch": 0.46917894736842103, + "grad_norm": 0.44921875, + "learning_rate": 0.0002826217901478581, + "loss": 3.0499, + "step": 11143 + }, + { + "epoch": 0.4692210526315789, + "grad_norm": 0.4453125, + "learning_rate": 0.00028258833408243875, + "loss": 3.5921, + "step": 11144 + }, + { + "epoch": 0.4692631578947368, + "grad_norm": 0.443359375, + "learning_rate": 0.00028255487742329837, + "loss": 3.0914, + "step": 11145 + }, + { + "epoch": 0.4693052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0002825214201710463, + "loss": 3.5112, + "step": 11146 + }, + { + "epoch": 0.4693473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.00028248796232629216, + "loss": 3.1328, + "step": 11147 + }, + { + "epoch": 0.4693894736842105, + "grad_norm": 0.404296875, + "learning_rate": 0.0002824545038896456, + "loss": 3.3992, + "step": 11148 + }, + { + "epoch": 0.4694315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.00028242104486171605, + "loss": 3.2938, + "step": 11149 + }, + { + "epoch": 0.4694736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.00028238758524311313, + "loss": 3.1996, + "step": 11150 + }, + { + "epoch": 0.4695157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.00028235412503444646, + "loss": 3.1004, + "step": 11151 + }, + { + "epoch": 0.4695578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.0002823206642363257, + "loss": 3.4519, + "step": 11152 + }, + { + "epoch": 0.4696, + "grad_norm": 0.419921875, + "learning_rate": 0.0002822872028493604, + "loss": 3.4161, + "step": 11153 + }, + { + "epoch": 0.4696421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.0002822537408741602, + "loss": 3.5683, + "step": 11154 + }, + { + "epoch": 0.4696842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00028222027831133466, + "loss": 3.3038, + "step": 11155 + }, + { + "epoch": 0.46972631578947366, + "grad_norm": 0.427734375, + "learning_rate": 0.00028218681516149356, + "loss": 3.1519, + "step": 11156 + }, + { + "epoch": 0.46976842105263156, + "grad_norm": 0.40234375, + "learning_rate": 0.0002821533514252466, + "loss": 3.2105, + "step": 11157 + }, + { + "epoch": 0.46981052631578946, + "grad_norm": 0.39453125, + "learning_rate": 0.0002821198871032032, + "loss": 3.2832, + "step": 11158 + }, + { + "epoch": 0.46985263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.00028208642219597325, + "loss": 3.3814, + "step": 11159 + }, + { + "epoch": 0.46989473684210525, + "grad_norm": 0.4296875, + "learning_rate": 0.00028205295670416644, + "loss": 3.0818, + "step": 11160 + }, + { + "epoch": 0.46993684210526315, + "grad_norm": 0.419921875, + "learning_rate": 0.0002820194906283924, + "loss": 3.2789, + "step": 11161 + }, + { + "epoch": 0.46997894736842105, + "grad_norm": 0.3984375, + "learning_rate": 0.0002819860239692609, + "loss": 3.1111, + "step": 11162 + }, + { + "epoch": 0.47002105263157895, + "grad_norm": 0.40234375, + "learning_rate": 0.0002819525567273815, + "loss": 3.519, + "step": 11163 + }, + { + "epoch": 0.47006315789473685, + "grad_norm": 0.41015625, + "learning_rate": 0.00028191908890336416, + "loss": 3.4117, + "step": 11164 + }, + { + "epoch": 0.47010526315789475, + "grad_norm": 0.447265625, + "learning_rate": 0.0002818856204978184, + "loss": 3.4008, + "step": 11165 + }, + { + "epoch": 0.47014736842105265, + "grad_norm": 0.4140625, + "learning_rate": 0.0002818521515113542, + "loss": 3.3098, + "step": 11166 + }, + { + "epoch": 0.47018947368421055, + "grad_norm": 0.451171875, + "learning_rate": 0.0002818186819445813, + "loss": 3.1341, + "step": 11167 + }, + { + "epoch": 0.47023157894736844, + "grad_norm": 0.419921875, + "learning_rate": 0.00028178521179810927, + "loss": 3.1343, + "step": 11168 + }, + { + "epoch": 0.4702736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.00028175174107254806, + "loss": 3.2277, + "step": 11169 + }, + { + "epoch": 0.4703157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00028171826976850753, + "loss": 3.4406, + "step": 11170 + }, + { + "epoch": 0.4703578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0002816847978865972, + "loss": 3.0975, + "step": 11171 + }, + { + "epoch": 0.4704, + "grad_norm": 0.431640625, + "learning_rate": 0.00028165132542742725, + "loss": 3.553, + "step": 11172 + }, + { + "epoch": 0.4704421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00028161785239160723, + "loss": 3.2445, + "step": 11173 + }, + { + "epoch": 0.4704842105263158, + "grad_norm": 0.39453125, + "learning_rate": 0.0002815843787797472, + "loss": 3.0592, + "step": 11174 + }, + { + "epoch": 0.4705263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.0002815509045924568, + "loss": 3.1512, + "step": 11175 + }, + { + "epoch": 0.4705684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.0002815174298303461, + "loss": 3.6098, + "step": 11176 + }, + { + "epoch": 0.4706105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.00028148395449402474, + "loss": 3.4264, + "step": 11177 + }, + { + "epoch": 0.4706526315789474, + "grad_norm": 0.404296875, + "learning_rate": 0.0002814504785841028, + "loss": 2.8674, + "step": 11178 + }, + { + "epoch": 0.4706947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00028141700210119005, + "loss": 3.0892, + "step": 11179 + }, + { + "epoch": 0.4707368421052632, + "grad_norm": 0.44140625, + "learning_rate": 0.0002813835250458964, + "loss": 3.4468, + "step": 11180 + }, + { + "epoch": 0.4707789473684211, + "grad_norm": 0.52734375, + "learning_rate": 0.00028135004741883186, + "loss": 3.2925, + "step": 11181 + }, + { + "epoch": 0.470821052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0002813165692206063, + "loss": 3.2443, + "step": 11182 + }, + { + "epoch": 0.4708631578947368, + "grad_norm": 0.451171875, + "learning_rate": 0.0002812830904518296, + "loss": 3.084, + "step": 11183 + }, + { + "epoch": 0.4709052631578947, + "grad_norm": 0.44140625, + "learning_rate": 0.00028124961111311174, + "loss": 3.6488, + "step": 11184 + }, + { + "epoch": 0.4709473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.00028121613120506275, + "loss": 3.1927, + "step": 11185 + }, + { + "epoch": 0.4709894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0002811826507282925, + "loss": 3.3869, + "step": 11186 + }, + { + "epoch": 0.4710315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.000281149169683411, + "loss": 3.2963, + "step": 11187 + }, + { + "epoch": 0.4710736842105263, + "grad_norm": 0.47265625, + "learning_rate": 0.0002811156880710282, + "loss": 3.0905, + "step": 11188 + }, + { + "epoch": 0.4711157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.0002810822058917541, + "loss": 3.5734, + "step": 11189 + }, + { + "epoch": 0.4711578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00028104872314619873, + "loss": 3.2389, + "step": 11190 + }, + { + "epoch": 0.4712, + "grad_norm": 0.451171875, + "learning_rate": 0.0002810152398349721, + "loss": 3.1316, + "step": 11191 + }, + { + "epoch": 0.4712421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.0002809817559586843, + "loss": 2.9237, + "step": 11192 + }, + { + "epoch": 0.4712842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0002809482715179453, + "loss": 3.2665, + "step": 11193 + }, + { + "epoch": 0.4713263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.0002809147865133652, + "loss": 3.4429, + "step": 11194 + }, + { + "epoch": 0.4713684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.00028088130094555393, + "loss": 3.2462, + "step": 11195 + }, + { + "epoch": 0.47141052631578945, + "grad_norm": 0.388671875, + "learning_rate": 0.00028084781481512165, + "loss": 2.5726, + "step": 11196 + }, + { + "epoch": 0.47145263157894735, + "grad_norm": 0.41796875, + "learning_rate": 0.00028081432812267844, + "loss": 3.3107, + "step": 11197 + }, + { + "epoch": 0.47149473684210524, + "grad_norm": 0.419921875, + "learning_rate": 0.0002807808408688343, + "loss": 3.199, + "step": 11198 + }, + { + "epoch": 0.47153684210526314, + "grad_norm": 0.39453125, + "learning_rate": 0.0002807473530541995, + "loss": 3.2169, + "step": 11199 + }, + { + "epoch": 0.47157894736842104, + "grad_norm": 0.42578125, + "learning_rate": 0.000280713864679384, + "loss": 3.3756, + "step": 11200 + }, + { + "epoch": 0.47162105263157894, + "grad_norm": 0.41796875, + "learning_rate": 0.00028068037574499805, + "loss": 2.849, + "step": 11201 + }, + { + "epoch": 0.47166315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.00028064688625165167, + "loss": 3.1861, + "step": 11202 + }, + { + "epoch": 0.47170526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.0002806133961999551, + "loss": 3.4251, + "step": 11203 + }, + { + "epoch": 0.47174736842105264, + "grad_norm": 0.58203125, + "learning_rate": 0.0002805799055905183, + "loss": 3.0857, + "step": 11204 + }, + { + "epoch": 0.47178947368421054, + "grad_norm": 0.42578125, + "learning_rate": 0.00028054641442395154, + "loss": 2.8668, + "step": 11205 + }, + { + "epoch": 0.47183157894736844, + "grad_norm": 0.404296875, + "learning_rate": 0.00028051292270086505, + "loss": 3.4576, + "step": 11206 + }, + { + "epoch": 0.47187368421052633, + "grad_norm": 0.40234375, + "learning_rate": 0.000280479430421869, + "loss": 3.1642, + "step": 11207 + }, + { + "epoch": 0.47191578947368423, + "grad_norm": 0.390625, + "learning_rate": 0.00028044593758757354, + "loss": 2.6202, + "step": 11208 + }, + { + "epoch": 0.47195789473684213, + "grad_norm": 0.42578125, + "learning_rate": 0.0002804124441985888, + "loss": 3.0581, + "step": 11209 + }, + { + "epoch": 0.472, + "grad_norm": 0.439453125, + "learning_rate": 0.0002803789502555251, + "loss": 2.8977, + "step": 11210 + }, + { + "epoch": 0.4720421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00028034545575899254, + "loss": 3.1538, + "step": 11211 + }, + { + "epoch": 0.4720842105263158, + "grad_norm": 0.4140625, + "learning_rate": 0.0002803119607096016, + "loss": 3.2904, + "step": 11212 + }, + { + "epoch": 0.47212631578947367, + "grad_norm": 0.4765625, + "learning_rate": 0.00028027846510796225, + "loss": 3.2363, + "step": 11213 + }, + { + "epoch": 0.47216842105263157, + "grad_norm": 0.416015625, + "learning_rate": 0.00028024496895468474, + "loss": 3.1192, + "step": 11214 + }, + { + "epoch": 0.47221052631578947, + "grad_norm": 0.408203125, + "learning_rate": 0.0002802114722503796, + "loss": 2.9258, + "step": 11215 + }, + { + "epoch": 0.47225263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.0002801779749956569, + "loss": 3.3017, + "step": 11216 + }, + { + "epoch": 0.47229473684210527, + "grad_norm": 0.408203125, + "learning_rate": 0.0002801444771911269, + "loss": 3.1346, + "step": 11217 + }, + { + "epoch": 0.47233684210526317, + "grad_norm": 0.421875, + "learning_rate": 0.00028011097883739994, + "loss": 3.2025, + "step": 11218 + }, + { + "epoch": 0.47237894736842106, + "grad_norm": 0.41796875, + "learning_rate": 0.00028007747993508635, + "loss": 3.4768, + "step": 11219 + }, + { + "epoch": 0.47242105263157896, + "grad_norm": 0.41015625, + "learning_rate": 0.00028004398048479644, + "loss": 2.9053, + "step": 11220 + }, + { + "epoch": 0.47246315789473686, + "grad_norm": 0.41015625, + "learning_rate": 0.0002800104804871405, + "loss": 2.9437, + "step": 11221 + }, + { + "epoch": 0.47250526315789476, + "grad_norm": 0.443359375, + "learning_rate": 0.00027997697994272883, + "loss": 3.2862, + "step": 11222 + }, + { + "epoch": 0.4725473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.0002799434788521718, + "loss": 3.3606, + "step": 11223 + }, + { + "epoch": 0.4725894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0002799099772160798, + "loss": 3.2482, + "step": 11224 + }, + { + "epoch": 0.4726315789473684, + "grad_norm": 0.53515625, + "learning_rate": 0.0002798764750350631, + "loss": 2.9727, + "step": 11225 + }, + { + "epoch": 0.4726736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.0002798429723097322, + "loss": 2.7849, + "step": 11226 + }, + { + "epoch": 0.4727157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00027980946904069745, + "loss": 3.3083, + "step": 11227 + }, + { + "epoch": 0.4727578947368421, + "grad_norm": 0.3984375, + "learning_rate": 0.0002797759652285691, + "loss": 2.9202, + "step": 11228 + }, + { + "epoch": 0.4728, + "grad_norm": 0.43359375, + "learning_rate": 0.00027974246087395774, + "loss": 2.8535, + "step": 11229 + }, + { + "epoch": 0.4728421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.0002797089559774736, + "loss": 2.8304, + "step": 11230 + }, + { + "epoch": 0.4728842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.0002796754505397272, + "loss": 2.9279, + "step": 11231 + }, + { + "epoch": 0.4729263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00027964194456132896, + "loss": 3.1045, + "step": 11232 + }, + { + "epoch": 0.4729684210526316, + "grad_norm": 0.396484375, + "learning_rate": 0.0002796084380428894, + "loss": 2.9357, + "step": 11233 + }, + { + "epoch": 0.4730105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.0002795749309850188, + "loss": 3.2696, + "step": 11234 + }, + { + "epoch": 0.4730526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.0002795414233883278, + "loss": 3.3796, + "step": 11235 + }, + { + "epoch": 0.4730947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.00027950791525342667, + "loss": 3.3979, + "step": 11236 + }, + { + "epoch": 0.47313684210526313, + "grad_norm": 0.421875, + "learning_rate": 0.00027947440658092603, + "loss": 3.1063, + "step": 11237 + }, + { + "epoch": 0.47317894736842103, + "grad_norm": 0.458984375, + "learning_rate": 0.0002794408973714363, + "loss": 3.7891, + "step": 11238 + }, + { + "epoch": 0.47322105263157893, + "grad_norm": 0.423828125, + "learning_rate": 0.00027940738762556805, + "loss": 3.162, + "step": 11239 + }, + { + "epoch": 0.47326315789473683, + "grad_norm": 0.4765625, + "learning_rate": 0.00027937387734393174, + "loss": 3.1884, + "step": 11240 + }, + { + "epoch": 0.47330526315789473, + "grad_norm": 0.412109375, + "learning_rate": 0.0002793403665271379, + "loss": 3.5732, + "step": 11241 + }, + { + "epoch": 0.47334736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0002793068551757971, + "loss": 3.3219, + "step": 11242 + }, + { + "epoch": 0.4733894736842105, + "grad_norm": 0.494140625, + "learning_rate": 0.0002792733432905198, + "loss": 2.9995, + "step": 11243 + }, + { + "epoch": 0.4734315789473684, + "grad_norm": 0.419921875, + "learning_rate": 0.00027923983087191654, + "loss": 2.6831, + "step": 11244 + }, + { + "epoch": 0.4734736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.000279206317920598, + "loss": 2.9121, + "step": 11245 + }, + { + "epoch": 0.4735157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.0002791728044371746, + "loss": 2.9025, + "step": 11246 + }, + { + "epoch": 0.4735578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.000279139290422257, + "loss": 3.5811, + "step": 11247 + }, + { + "epoch": 0.4736, + "grad_norm": 0.40625, + "learning_rate": 0.00027910577587645567, + "loss": 3.4719, + "step": 11248 + }, + { + "epoch": 0.4736421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00027907226080038145, + "loss": 3.3567, + "step": 11249 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.41015625, + "learning_rate": 0.0002790387451946447, + "loss": 3.1155, + "step": 11250 + }, + { + "epoch": 0.47372631578947366, + "grad_norm": 0.490234375, + "learning_rate": 0.00027900522905985624, + "loss": 2.2488, + "step": 11251 + }, + { + "epoch": 0.47376842105263156, + "grad_norm": 0.4375, + "learning_rate": 0.00027897171239662643, + "loss": 3.37, + "step": 11252 + }, + { + "epoch": 0.47381052631578946, + "grad_norm": 0.423828125, + "learning_rate": 0.0002789381952055662, + "loss": 3.3016, + "step": 11253 + }, + { + "epoch": 0.47385263157894736, + "grad_norm": 0.439453125, + "learning_rate": 0.000278904677487286, + "loss": 3.3227, + "step": 11254 + }, + { + "epoch": 0.47389473684210526, + "grad_norm": 0.4765625, + "learning_rate": 0.0002788711592423966, + "loss": 2.9876, + "step": 11255 + }, + { + "epoch": 0.47393684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.00027883764047150844, + "loss": 2.6306, + "step": 11256 + }, + { + "epoch": 0.47397894736842106, + "grad_norm": 0.435546875, + "learning_rate": 0.0002788041211752325, + "loss": 2.464, + "step": 11257 + }, + { + "epoch": 0.47402105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.0002787706013541792, + "loss": 2.8837, + "step": 11258 + }, + { + "epoch": 0.47406315789473685, + "grad_norm": 0.42578125, + "learning_rate": 0.00027873708100895945, + "loss": 3.2407, + "step": 11259 + }, + { + "epoch": 0.47410526315789475, + "grad_norm": 0.4609375, + "learning_rate": 0.00027870356014018385, + "loss": 3.3901, + "step": 11260 + }, + { + "epoch": 0.47414736842105265, + "grad_norm": 0.427734375, + "learning_rate": 0.0002786700387484631, + "loss": 2.7232, + "step": 11261 + }, + { + "epoch": 0.47418947368421055, + "grad_norm": 0.419921875, + "learning_rate": 0.00027863651683440787, + "loss": 2.9739, + "step": 11262 + }, + { + "epoch": 0.47423157894736845, + "grad_norm": 0.427734375, + "learning_rate": 0.000278602994398629, + "loss": 2.8542, + "step": 11263 + }, + { + "epoch": 0.4742736842105263, + "grad_norm": 0.486328125, + "learning_rate": 0.0002785694714417371, + "loss": 2.9496, + "step": 11264 + }, + { + "epoch": 0.4743157894736842, + "grad_norm": 0.478515625, + "learning_rate": 0.00027853594796434304, + "loss": 3.1237, + "step": 11265 + }, + { + "epoch": 0.4743578947368421, + "grad_norm": 0.4609375, + "learning_rate": 0.0002785024239670576, + "loss": 3.4626, + "step": 11266 + }, + { + "epoch": 0.4744, + "grad_norm": 0.42578125, + "learning_rate": 0.00027846889945049144, + "loss": 3.5521, + "step": 11267 + }, + { + "epoch": 0.4744421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.0002784353744152554, + "loss": 3.1624, + "step": 11268 + }, + { + "epoch": 0.4744842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.0002784018488619602, + "loss": 3.1416, + "step": 11269 + }, + { + "epoch": 0.4745263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00027836832279121674, + "loss": 2.9493, + "step": 11270 + }, + { + "epoch": 0.4745684210526316, + "grad_norm": 0.4453125, + "learning_rate": 0.0002783347962036358, + "loss": 3.3596, + "step": 11271 + }, + { + "epoch": 0.4746105263157895, + "grad_norm": 0.439453125, + "learning_rate": 0.00027830126909982804, + "loss": 2.7841, + "step": 11272 + }, + { + "epoch": 0.4746526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.00027826774148040454, + "loss": 3.5661, + "step": 11273 + }, + { + "epoch": 0.4746947368421053, + "grad_norm": 0.40625, + "learning_rate": 0.00027823421334597585, + "loss": 3.0382, + "step": 11274 + }, + { + "epoch": 0.4747368421052632, + "grad_norm": 0.396484375, + "learning_rate": 0.0002782006846971531, + "loss": 2.6756, + "step": 11275 + }, + { + "epoch": 0.4747789473684211, + "grad_norm": 0.416015625, + "learning_rate": 0.00027816715553454695, + "loss": 3.4888, + "step": 11276 + }, + { + "epoch": 0.4748210526315789, + "grad_norm": 0.5546875, + "learning_rate": 0.0002781336258587684, + "loss": 2.9153, + "step": 11277 + }, + { + "epoch": 0.4748631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.0002781000956704282, + "loss": 3.0417, + "step": 11278 + }, + { + "epoch": 0.4749052631578947, + "grad_norm": 0.44921875, + "learning_rate": 0.00027806656497013723, + "loss": 3.6996, + "step": 11279 + }, + { + "epoch": 0.4749473684210526, + "grad_norm": 0.484375, + "learning_rate": 0.0002780330337585065, + "loss": 3.3797, + "step": 11280 + }, + { + "epoch": 0.4749894736842105, + "grad_norm": 0.447265625, + "learning_rate": 0.0002779995020361468, + "loss": 3.0935, + "step": 11281 + }, + { + "epoch": 0.4750315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.00027796596980366893, + "loss": 2.8673, + "step": 11282 + }, + { + "epoch": 0.4750736842105263, + "grad_norm": 0.455078125, + "learning_rate": 0.00027793243706168415, + "loss": 3.3431, + "step": 11283 + }, + { + "epoch": 0.4751157894736842, + "grad_norm": 0.451171875, + "learning_rate": 0.00027789890381080306, + "loss": 2.7922, + "step": 11284 + }, + { + "epoch": 0.4751578947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.00027786537005163676, + "loss": 3.5673, + "step": 11285 + }, + { + "epoch": 0.4752, + "grad_norm": 0.435546875, + "learning_rate": 0.0002778318357847962, + "loss": 2.8825, + "step": 11286 + }, + { + "epoch": 0.4752421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.00027779830101089226, + "loss": 3.315, + "step": 11287 + }, + { + "epoch": 0.4752842105263158, + "grad_norm": 0.4609375, + "learning_rate": 0.00027776476573053595, + "loss": 3.214, + "step": 11288 + }, + { + "epoch": 0.4753263157894737, + "grad_norm": 0.734375, + "learning_rate": 0.00027773122994433823, + "loss": 3.4841, + "step": 11289 + }, + { + "epoch": 0.47536842105263155, + "grad_norm": 0.431640625, + "learning_rate": 0.00027769769365291005, + "loss": 3.425, + "step": 11290 + }, + { + "epoch": 0.47541052631578945, + "grad_norm": 0.46484375, + "learning_rate": 0.00027766415685686246, + "loss": 3.4371, + "step": 11291 + }, + { + "epoch": 0.47545263157894735, + "grad_norm": 0.4453125, + "learning_rate": 0.0002776306195568064, + "loss": 3.7009, + "step": 11292 + }, + { + "epoch": 0.47549473684210525, + "grad_norm": 0.474609375, + "learning_rate": 0.000277597081753353, + "loss": 2.9753, + "step": 11293 + }, + { + "epoch": 0.47553684210526315, + "grad_norm": 0.447265625, + "learning_rate": 0.0002775635434471131, + "loss": 2.8093, + "step": 11294 + }, + { + "epoch": 0.47557894736842105, + "grad_norm": 0.46875, + "learning_rate": 0.0002775300046386979, + "loss": 3.0758, + "step": 11295 + }, + { + "epoch": 0.47562105263157894, + "grad_norm": 0.4375, + "learning_rate": 0.00027749646532871833, + "loss": 3.5004, + "step": 11296 + }, + { + "epoch": 0.47566315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.00027746292551778553, + "loss": 3.3972, + "step": 11297 + }, + { + "epoch": 0.47570526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.0002774293852065104, + "loss": 3.3036, + "step": 11298 + }, + { + "epoch": 0.47574736842105264, + "grad_norm": 0.419921875, + "learning_rate": 0.0002773958443955041, + "loss": 3.2322, + "step": 11299 + }, + { + "epoch": 0.47578947368421054, + "grad_norm": 0.404296875, + "learning_rate": 0.0002773623030853777, + "loss": 3.0336, + "step": 11300 + }, + { + "epoch": 0.47583157894736844, + "grad_norm": 0.421875, + "learning_rate": 0.00027732876127674235, + "loss": 3.303, + "step": 11301 + }, + { + "epoch": 0.47587368421052634, + "grad_norm": 0.4375, + "learning_rate": 0.0002772952189702091, + "loss": 2.9711, + "step": 11302 + }, + { + "epoch": 0.47591578947368424, + "grad_norm": 0.42578125, + "learning_rate": 0.000277261676166389, + "loss": 2.7269, + "step": 11303 + }, + { + "epoch": 0.4759578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00027722813286589317, + "loss": 3.3304, + "step": 11304 + }, + { + "epoch": 0.476, + "grad_norm": 0.45703125, + "learning_rate": 0.0002771945890693328, + "loss": 3.4456, + "step": 11305 + }, + { + "epoch": 0.4760421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00027716104477731884, + "loss": 2.9679, + "step": 11306 + }, + { + "epoch": 0.4760842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0002771274999904626, + "loss": 3.0881, + "step": 11307 + }, + { + "epoch": 0.4761263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.00027709395470937526, + "loss": 2.6589, + "step": 11308 + }, + { + "epoch": 0.4761684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.0002770604089346679, + "loss": 3.2297, + "step": 11309 + }, + { + "epoch": 0.4762105263157895, + "grad_norm": 0.455078125, + "learning_rate": 0.00027702686266695164, + "loss": 3.1835, + "step": 11310 + }, + { + "epoch": 0.47625263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00027699331590683775, + "loss": 3.2934, + "step": 11311 + }, + { + "epoch": 0.47629473684210527, + "grad_norm": 0.453125, + "learning_rate": 0.0002769597686549373, + "loss": 3.6355, + "step": 11312 + }, + { + "epoch": 0.47633684210526317, + "grad_norm": 0.423828125, + "learning_rate": 0.00027692622091186154, + "loss": 3.1638, + "step": 11313 + }, + { + "epoch": 0.47637894736842107, + "grad_norm": 0.41015625, + "learning_rate": 0.0002768926726782217, + "loss": 2.9164, + "step": 11314 + }, + { + "epoch": 0.47642105263157897, + "grad_norm": 0.4140625, + "learning_rate": 0.0002768591239546289, + "loss": 3.1783, + "step": 11315 + }, + { + "epoch": 0.47646315789473687, + "grad_norm": 0.404296875, + "learning_rate": 0.0002768255747416945, + "loss": 3.1852, + "step": 11316 + }, + { + "epoch": 0.4765052631578947, + "grad_norm": 0.416015625, + "learning_rate": 0.00027679202504002966, + "loss": 3.161, + "step": 11317 + }, + { + "epoch": 0.4765473684210526, + "grad_norm": 0.3828125, + "learning_rate": 0.0002767584748502456, + "loss": 2.7965, + "step": 11318 + }, + { + "epoch": 0.4765894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.0002767249241729535, + "loss": 3.2595, + "step": 11319 + }, + { + "epoch": 0.4766315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.0002766913730087647, + "loss": 3.1578, + "step": 11320 + }, + { + "epoch": 0.4766736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0002766578213582905, + "loss": 2.8727, + "step": 11321 + }, + { + "epoch": 0.4767157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.0002766242692221421, + "loss": 3.1883, + "step": 11322 + }, + { + "epoch": 0.4767578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.00027659071660093073, + "loss": 3.4165, + "step": 11323 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4296875, + "learning_rate": 0.00027655716349526775, + "loss": 3.1915, + "step": 11324 + }, + { + "epoch": 0.4768421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00027652360990576455, + "loss": 3.0349, + "step": 11325 + }, + { + "epoch": 0.4768842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.0002764900558330322, + "loss": 3.0085, + "step": 11326 + }, + { + "epoch": 0.4769263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.0002764565012776823, + "loss": 3.1023, + "step": 11327 + }, + { + "epoch": 0.4769684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.00027642294624032597, + "loss": 2.7274, + "step": 11328 + }, + { + "epoch": 0.4770105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00027638939072157466, + "loss": 3.3183, + "step": 11329 + }, + { + "epoch": 0.4770526315789474, + "grad_norm": 0.447265625, + "learning_rate": 0.0002763558347220396, + "loss": 3.506, + "step": 11330 + }, + { + "epoch": 0.47709473684210524, + "grad_norm": 0.41796875, + "learning_rate": 0.00027632227824233213, + "loss": 3.4384, + "step": 11331 + }, + { + "epoch": 0.47713684210526314, + "grad_norm": 0.447265625, + "learning_rate": 0.0002762887212830638, + "loss": 3.1726, + "step": 11332 + }, + { + "epoch": 0.47717894736842104, + "grad_norm": 0.42578125, + "learning_rate": 0.0002762551638448458, + "loss": 3.1624, + "step": 11333 + }, + { + "epoch": 0.47722105263157893, + "grad_norm": 0.3984375, + "learning_rate": 0.00027622160592828953, + "loss": 2.8228, + "step": 11334 + }, + { + "epoch": 0.47726315789473683, + "grad_norm": 0.43359375, + "learning_rate": 0.0002761880475340065, + "loss": 3.2728, + "step": 11335 + }, + { + "epoch": 0.47730526315789473, + "grad_norm": 0.455078125, + "learning_rate": 0.00027615448866260796, + "loss": 3.145, + "step": 11336 + }, + { + "epoch": 0.47734736842105263, + "grad_norm": 0.455078125, + "learning_rate": 0.0002761209293147054, + "loss": 3.2462, + "step": 11337 + }, + { + "epoch": 0.47738947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.0002760873694909102, + "loss": 2.6945, + "step": 11338 + }, + { + "epoch": 0.47743157894736843, + "grad_norm": 0.44140625, + "learning_rate": 0.0002760538091918337, + "loss": 2.8207, + "step": 11339 + }, + { + "epoch": 0.47747368421052633, + "grad_norm": 0.3828125, + "learning_rate": 0.0002760202484180875, + "loss": 2.6326, + "step": 11340 + }, + { + "epoch": 0.4775157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.00027598668717028284, + "loss": 3.2766, + "step": 11341 + }, + { + "epoch": 0.4775578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0002759531254490313, + "loss": 3.0902, + "step": 11342 + }, + { + "epoch": 0.4776, + "grad_norm": 0.39453125, + "learning_rate": 0.0002759195632549444, + "loss": 2.6482, + "step": 11343 + }, + { + "epoch": 0.47764210526315787, + "grad_norm": 0.462890625, + "learning_rate": 0.00027588600058863345, + "loss": 3.321, + "step": 11344 + }, + { + "epoch": 0.47768421052631577, + "grad_norm": 0.455078125, + "learning_rate": 0.00027585243745071003, + "loss": 3.1135, + "step": 11345 + }, + { + "epoch": 0.47772631578947367, + "grad_norm": 0.4375, + "learning_rate": 0.0002758188738417855, + "loss": 2.8064, + "step": 11346 + }, + { + "epoch": 0.47776842105263156, + "grad_norm": 0.4375, + "learning_rate": 0.00027578530976247144, + "loss": 3.0362, + "step": 11347 + }, + { + "epoch": 0.47781052631578946, + "grad_norm": 0.4296875, + "learning_rate": 0.0002757517452133794, + "loss": 3.6248, + "step": 11348 + }, + { + "epoch": 0.47785263157894736, + "grad_norm": 0.39453125, + "learning_rate": 0.00027571818019512075, + "loss": 2.9248, + "step": 11349 + }, + { + "epoch": 0.47789473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.0002756846147083071, + "loss": 3.3977, + "step": 11350 + }, + { + "epoch": 0.47793684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.00027565104875354994, + "loss": 3.2024, + "step": 11351 + }, + { + "epoch": 0.47797894736842106, + "grad_norm": 0.421875, + "learning_rate": 0.0002756174823314608, + "loss": 3.227, + "step": 11352 + }, + { + "epoch": 0.47802105263157896, + "grad_norm": 0.4140625, + "learning_rate": 0.00027558391544265124, + "loss": 3.1085, + "step": 11353 + }, + { + "epoch": 0.47806315789473686, + "grad_norm": 0.408203125, + "learning_rate": 0.00027555034808773285, + "loss": 2.9561, + "step": 11354 + }, + { + "epoch": 0.47810526315789476, + "grad_norm": 0.408203125, + "learning_rate": 0.0002755167802673171, + "loss": 2.9616, + "step": 11355 + }, + { + "epoch": 0.47814736842105265, + "grad_norm": 0.4140625, + "learning_rate": 0.0002754832119820156, + "loss": 3.0042, + "step": 11356 + }, + { + "epoch": 0.47818947368421055, + "grad_norm": 0.400390625, + "learning_rate": 0.0002754496432324399, + "loss": 3.0959, + "step": 11357 + }, + { + "epoch": 0.4782315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.0002754160740192016, + "loss": 2.9433, + "step": 11358 + }, + { + "epoch": 0.4782736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0002753825043429123, + "loss": 3.1231, + "step": 11359 + }, + { + "epoch": 0.4783157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00027534893420418354, + "loss": 3.3139, + "step": 11360 + }, + { + "epoch": 0.4783578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.00027531536360362707, + "loss": 3.1961, + "step": 11361 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4453125, + "learning_rate": 0.0002752817925418544, + "loss": 3.0347, + "step": 11362 + }, + { + "epoch": 0.4784421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0002752482210194772, + "loss": 2.9467, + "step": 11363 + }, + { + "epoch": 0.4784842105263158, + "grad_norm": 0.470703125, + "learning_rate": 0.00027521464903710707, + "loss": 2.8194, + "step": 11364 + }, + { + "epoch": 0.4785263157894737, + "grad_norm": 0.453125, + "learning_rate": 0.00027518107659535565, + "loss": 3.4005, + "step": 11365 + }, + { + "epoch": 0.4785684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.0002751475036948345, + "loss": 3.2878, + "step": 11366 + }, + { + "epoch": 0.4786105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.0002751139303361555, + "loss": 3.4864, + "step": 11367 + }, + { + "epoch": 0.4786526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00027508035651993014, + "loss": 3.2432, + "step": 11368 + }, + { + "epoch": 0.4786947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.0002750467822467702, + "loss": 3.0233, + "step": 11369 + }, + { + "epoch": 0.4787368421052632, + "grad_norm": 0.435546875, + "learning_rate": 0.0002750132075172873, + "loss": 3.1092, + "step": 11370 + }, + { + "epoch": 0.478778947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.00027497963233209316, + "loss": 3.1232, + "step": 11371 + }, + { + "epoch": 0.4788210526315789, + "grad_norm": 0.43359375, + "learning_rate": 0.00027494605669179945, + "loss": 2.6268, + "step": 11372 + }, + { + "epoch": 0.4788631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.0002749124805970179, + "loss": 2.7704, + "step": 11373 + }, + { + "epoch": 0.4789052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.0002748789040483602, + "loss": 3.0863, + "step": 11374 + }, + { + "epoch": 0.4789473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.0002748453270464381, + "loss": 3.7434, + "step": 11375 + }, + { + "epoch": 0.4789894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.0002748117495918634, + "loss": 3.4542, + "step": 11376 + }, + { + "epoch": 0.4790315789473684, + "grad_norm": 0.404296875, + "learning_rate": 0.0002747781716852477, + "loss": 3.398, + "step": 11377 + }, + { + "epoch": 0.4790736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00027474459332720285, + "loss": 3.355, + "step": 11378 + }, + { + "epoch": 0.4791157894736842, + "grad_norm": 0.484375, + "learning_rate": 0.00027471101451834065, + "loss": 3.0224, + "step": 11379 + }, + { + "epoch": 0.4791578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.0002746774352592727, + "loss": 2.6568, + "step": 11380 + }, + { + "epoch": 0.4792, + "grad_norm": 0.453125, + "learning_rate": 0.0002746438555506109, + "loss": 2.9874, + "step": 11381 + }, + { + "epoch": 0.4792421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.000274610275392967, + "loss": 3.3681, + "step": 11382 + }, + { + "epoch": 0.4792842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.0002745766947869528, + "loss": 2.9196, + "step": 11383 + }, + { + "epoch": 0.4793263157894737, + "grad_norm": 0.404296875, + "learning_rate": 0.00027454311373318013, + "loss": 3.179, + "step": 11384 + }, + { + "epoch": 0.47936842105263155, + "grad_norm": 0.43359375, + "learning_rate": 0.0002745095322322608, + "loss": 3.1272, + "step": 11385 + }, + { + "epoch": 0.47941052631578945, + "grad_norm": 0.5, + "learning_rate": 0.0002744759502848065, + "loss": 3.2385, + "step": 11386 + }, + { + "epoch": 0.47945263157894735, + "grad_norm": 0.443359375, + "learning_rate": 0.0002744423678914292, + "loss": 3.8109, + "step": 11387 + }, + { + "epoch": 0.47949473684210525, + "grad_norm": 0.412109375, + "learning_rate": 0.00027440878505274066, + "loss": 2.9637, + "step": 11388 + }, + { + "epoch": 0.47953684210526315, + "grad_norm": 0.404296875, + "learning_rate": 0.0002743752017693528, + "loss": 3.0334, + "step": 11389 + }, + { + "epoch": 0.47957894736842105, + "grad_norm": 0.388671875, + "learning_rate": 0.0002743416180418772, + "loss": 2.7744, + "step": 11390 + }, + { + "epoch": 0.47962105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.0002743080338709261, + "loss": 2.9044, + "step": 11391 + }, + { + "epoch": 0.47966315789473685, + "grad_norm": 0.44921875, + "learning_rate": 0.0002742744492571112, + "loss": 3.008, + "step": 11392 + }, + { + "epoch": 0.47970526315789475, + "grad_norm": 0.4296875, + "learning_rate": 0.0002742408642010442, + "loss": 3.4472, + "step": 11393 + }, + { + "epoch": 0.47974736842105264, + "grad_norm": 0.43359375, + "learning_rate": 0.00027420727870333726, + "loss": 3.4503, + "step": 11394 + }, + { + "epoch": 0.47978947368421054, + "grad_norm": 0.396484375, + "learning_rate": 0.00027417369276460215, + "loss": 2.9477, + "step": 11395 + }, + { + "epoch": 0.47983157894736844, + "grad_norm": 0.46484375, + "learning_rate": 0.0002741401063854507, + "loss": 3.2224, + "step": 11396 + }, + { + "epoch": 0.47987368421052634, + "grad_norm": 0.7265625, + "learning_rate": 0.00027410651956649495, + "loss": 2.95, + "step": 11397 + }, + { + "epoch": 0.4799157894736842, + "grad_norm": 0.40234375, + "learning_rate": 0.00027407293230834667, + "loss": 2.6958, + "step": 11398 + }, + { + "epoch": 0.4799578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0002740393446116179, + "loss": 3.5148, + "step": 11399 + }, + { + "epoch": 0.48, + "grad_norm": 0.423828125, + "learning_rate": 0.00027400575647692046, + "loss": 3.1592, + "step": 11400 + }, + { + "epoch": 0.4800421052631579, + "grad_norm": 0.3984375, + "learning_rate": 0.0002739721679048663, + "loss": 3.054, + "step": 11401 + }, + { + "epoch": 0.4800842105263158, + "grad_norm": 0.4609375, + "learning_rate": 0.0002739385788960675, + "loss": 3.4561, + "step": 11402 + }, + { + "epoch": 0.4801263157894737, + "grad_norm": 0.400390625, + "learning_rate": 0.00027390498945113593, + "loss": 3.1696, + "step": 11403 + }, + { + "epoch": 0.4801684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.00027387139957068353, + "loss": 3.2177, + "step": 11404 + }, + { + "epoch": 0.4802105263157895, + "grad_norm": 0.4140625, + "learning_rate": 0.00027383780925532223, + "loss": 2.7724, + "step": 11405 + }, + { + "epoch": 0.4802526315789474, + "grad_norm": 0.41015625, + "learning_rate": 0.0002738042185056641, + "loss": 3.1676, + "step": 11406 + }, + { + "epoch": 0.4802947368421053, + "grad_norm": 0.4375, + "learning_rate": 0.00027377062732232105, + "loss": 3.0837, + "step": 11407 + }, + { + "epoch": 0.4803368421052632, + "grad_norm": 0.447265625, + "learning_rate": 0.000273737035705905, + "loss": 3.0646, + "step": 11408 + }, + { + "epoch": 0.48037894736842107, + "grad_norm": 0.40625, + "learning_rate": 0.0002737034436570282, + "loss": 2.853, + "step": 11409 + }, + { + "epoch": 0.48042105263157897, + "grad_norm": 0.423828125, + "learning_rate": 0.0002736698511763025, + "loss": 2.9993, + "step": 11410 + }, + { + "epoch": 0.48046315789473687, + "grad_norm": 0.412109375, + "learning_rate": 0.00027363625826433993, + "loss": 3.0365, + "step": 11411 + }, + { + "epoch": 0.4805052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.0002736026649217524, + "loss": 2.3459, + "step": 11412 + }, + { + "epoch": 0.4805473684210526, + "grad_norm": 0.4375, + "learning_rate": 0.0002735690711491521, + "loss": 2.957, + "step": 11413 + }, + { + "epoch": 0.4805894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.00027353547694715103, + "loss": 3.1379, + "step": 11414 + }, + { + "epoch": 0.4806315789473684, + "grad_norm": 0.453125, + "learning_rate": 0.0002735018823163612, + "loss": 3.2797, + "step": 11415 + }, + { + "epoch": 0.4806736842105263, + "grad_norm": 0.39453125, + "learning_rate": 0.0002734682872573947, + "loss": 3.0173, + "step": 11416 + }, + { + "epoch": 0.4807157894736842, + "grad_norm": 0.400390625, + "learning_rate": 0.00027343469177086354, + "loss": 3.4544, + "step": 11417 + }, + { + "epoch": 0.4807578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0002734010958573799, + "loss": 3.2111, + "step": 11418 + }, + { + "epoch": 0.4808, + "grad_norm": 0.3984375, + "learning_rate": 0.0002733674995175558, + "loss": 3.4395, + "step": 11419 + }, + { + "epoch": 0.4808421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0002733339027520032, + "loss": 3.3871, + "step": 11420 + }, + { + "epoch": 0.4808842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.00027330030556133446, + "loss": 3.1829, + "step": 11421 + }, + { + "epoch": 0.4809263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0002732667079461614, + "loss": 2.9954, + "step": 11422 + }, + { + "epoch": 0.4809684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.0002732331099070964, + "loss": 2.6275, + "step": 11423 + }, + { + "epoch": 0.4810105263157895, + "grad_norm": 0.4140625, + "learning_rate": 0.00027319951144475125, + "loss": 3.1068, + "step": 11424 + }, + { + "epoch": 0.48105263157894734, + "grad_norm": 0.41796875, + "learning_rate": 0.0002731659125597384, + "loss": 2.7786, + "step": 11425 + }, + { + "epoch": 0.48109473684210524, + "grad_norm": 0.42578125, + "learning_rate": 0.0002731323132526698, + "loss": 3.0806, + "step": 11426 + }, + { + "epoch": 0.48113684210526314, + "grad_norm": 0.4296875, + "learning_rate": 0.0002730987135241576, + "loss": 2.7643, + "step": 11427 + }, + { + "epoch": 0.48117894736842104, + "grad_norm": 0.427734375, + "learning_rate": 0.000273065113374814, + "loss": 3.2343, + "step": 11428 + }, + { + "epoch": 0.48122105263157894, + "grad_norm": 0.46875, + "learning_rate": 0.0002730315128052512, + "loss": 3.299, + "step": 11429 + }, + { + "epoch": 0.48126315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.0002729979118160812, + "loss": 3.0913, + "step": 11430 + }, + { + "epoch": 0.48130526315789474, + "grad_norm": 0.447265625, + "learning_rate": 0.0002729643104079164, + "loss": 2.9683, + "step": 11431 + }, + { + "epoch": 0.48134736842105263, + "grad_norm": 0.412109375, + "learning_rate": 0.00027293070858136866, + "loss": 2.9203, + "step": 11432 + }, + { + "epoch": 0.48138947368421053, + "grad_norm": 0.390625, + "learning_rate": 0.00027289710633705047, + "loss": 2.8536, + "step": 11433 + }, + { + "epoch": 0.48143157894736843, + "grad_norm": 0.427734375, + "learning_rate": 0.00027286350367557383, + "loss": 3.4811, + "step": 11434 + }, + { + "epoch": 0.48147368421052633, + "grad_norm": 0.4453125, + "learning_rate": 0.0002728299005975511, + "loss": 3.1947, + "step": 11435 + }, + { + "epoch": 0.48151578947368423, + "grad_norm": 0.44140625, + "learning_rate": 0.0002727962971035944, + "loss": 2.8504, + "step": 11436 + }, + { + "epoch": 0.48155789473684213, + "grad_norm": 0.453125, + "learning_rate": 0.00027276269319431593, + "loss": 3.1503, + "step": 11437 + }, + { + "epoch": 0.4816, + "grad_norm": 0.40625, + "learning_rate": 0.0002727290888703279, + "loss": 3.274, + "step": 11438 + }, + { + "epoch": 0.48164210526315787, + "grad_norm": 0.470703125, + "learning_rate": 0.00027269548413224264, + "loss": 2.8836, + "step": 11439 + }, + { + "epoch": 0.48168421052631577, + "grad_norm": 0.431640625, + "learning_rate": 0.0002726618789806723, + "loss": 3.3781, + "step": 11440 + }, + { + "epoch": 0.48172631578947367, + "grad_norm": 0.400390625, + "learning_rate": 0.00027262827341622913, + "loss": 2.7527, + "step": 11441 + }, + { + "epoch": 0.48176842105263157, + "grad_norm": 0.462890625, + "learning_rate": 0.0002725946674395254, + "loss": 3.0498, + "step": 11442 + }, + { + "epoch": 0.48181052631578947, + "grad_norm": 0.404296875, + "learning_rate": 0.0002725610610511734, + "loss": 3.2121, + "step": 11443 + }, + { + "epoch": 0.48185263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.0002725274542517854, + "loss": 3.2327, + "step": 11444 + }, + { + "epoch": 0.48189473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.0002724938470419736, + "loss": 3.3195, + "step": 11445 + }, + { + "epoch": 0.48193684210526316, + "grad_norm": 0.375, + "learning_rate": 0.00027246023942235036, + "loss": 2.9329, + "step": 11446 + }, + { + "epoch": 0.48197894736842106, + "grad_norm": 0.392578125, + "learning_rate": 0.000272426631393528, + "loss": 3.2601, + "step": 11447 + }, + { + "epoch": 0.48202105263157896, + "grad_norm": 0.431640625, + "learning_rate": 0.0002723930229561187, + "loss": 3.3901, + "step": 11448 + }, + { + "epoch": 0.48206315789473686, + "grad_norm": 0.41796875, + "learning_rate": 0.0002723594141107348, + "loss": 3.2576, + "step": 11449 + }, + { + "epoch": 0.48210526315789476, + "grad_norm": 0.419921875, + "learning_rate": 0.0002723258048579887, + "loss": 3.0759, + "step": 11450 + }, + { + "epoch": 0.48214736842105266, + "grad_norm": 0.408203125, + "learning_rate": 0.00027229219519849267, + "loss": 3.1091, + "step": 11451 + }, + { + "epoch": 0.4821894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.000272258585132859, + "loss": 3.2236, + "step": 11452 + }, + { + "epoch": 0.4822315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.00027222497466170014, + "loss": 3.5546, + "step": 11453 + }, + { + "epoch": 0.4822736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00027219136378562827, + "loss": 3.2331, + "step": 11454 + }, + { + "epoch": 0.4823157894736842, + "grad_norm": 0.6484375, + "learning_rate": 0.0002721577525052559, + "loss": 3.1089, + "step": 11455 + }, + { + "epoch": 0.4823578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00027212414082119525, + "loss": 3.2191, + "step": 11456 + }, + { + "epoch": 0.4824, + "grad_norm": 0.427734375, + "learning_rate": 0.00027209052873405877, + "loss": 3.0693, + "step": 11457 + }, + { + "epoch": 0.4824421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0002720569162444588, + "loss": 3.0904, + "step": 11458 + }, + { + "epoch": 0.4824842105263158, + "grad_norm": 0.392578125, + "learning_rate": 0.00027202330335300773, + "loss": 3.1729, + "step": 11459 + }, + { + "epoch": 0.4825263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.0002719896900603179, + "loss": 3.3877, + "step": 11460 + }, + { + "epoch": 0.4825684210526316, + "grad_norm": 0.46875, + "learning_rate": 0.00027195607636700187, + "loss": 2.7472, + "step": 11461 + }, + { + "epoch": 0.4826105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.0002719224622736719, + "loss": 3.1258, + "step": 11462 + }, + { + "epoch": 0.4826526315789474, + "grad_norm": 0.45703125, + "learning_rate": 0.0002718888477809404, + "loss": 3.5352, + "step": 11463 + }, + { + "epoch": 0.4826947368421053, + "grad_norm": 0.447265625, + "learning_rate": 0.0002718552328894198, + "loss": 3.4957, + "step": 11464 + }, + { + "epoch": 0.48273684210526313, + "grad_norm": 0.416015625, + "learning_rate": 0.00027182161759972256, + "loss": 3.4153, + "step": 11465 + }, + { + "epoch": 0.48277894736842103, + "grad_norm": 0.4140625, + "learning_rate": 0.00027178800191246104, + "loss": 2.9373, + "step": 11466 + }, + { + "epoch": 0.48282105263157893, + "grad_norm": 0.416015625, + "learning_rate": 0.00027175438582824774, + "loss": 3.1656, + "step": 11467 + }, + { + "epoch": 0.4828631578947368, + "grad_norm": 0.423828125, + "learning_rate": 0.0002717207693476951, + "loss": 3.4927, + "step": 11468 + }, + { + "epoch": 0.4829052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0002716871524714156, + "loss": 3.4933, + "step": 11469 + }, + { + "epoch": 0.4829473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.0002716535352000217, + "loss": 3.2775, + "step": 11470 + }, + { + "epoch": 0.4829894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.00027161991753412573, + "loss": 2.7777, + "step": 11471 + }, + { + "epoch": 0.4830315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.00027158629947434033, + "loss": 3.3784, + "step": 11472 + }, + { + "epoch": 0.4830736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.0002715526810212779, + "loss": 3.0621, + "step": 11473 + }, + { + "epoch": 0.4831157894736842, + "grad_norm": 0.48046875, + "learning_rate": 0.0002715190621755509, + "loss": 2.7169, + "step": 11474 + }, + { + "epoch": 0.4831578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.0002714854429377718, + "loss": 3.3297, + "step": 11475 + }, + { + "epoch": 0.4832, + "grad_norm": 0.439453125, + "learning_rate": 0.00027145182330855336, + "loss": 3.4379, + "step": 11476 + }, + { + "epoch": 0.4832421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.0002714182032885078, + "loss": 3.0992, + "step": 11477 + }, + { + "epoch": 0.4832842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.0002713845828782478, + "loss": 3.2114, + "step": 11478 + }, + { + "epoch": 0.48332631578947366, + "grad_norm": 0.455078125, + "learning_rate": 0.00027135096207838577, + "loss": 2.8559, + "step": 11479 + }, + { + "epoch": 0.48336842105263156, + "grad_norm": 0.4140625, + "learning_rate": 0.0002713173408895343, + "loss": 3.0378, + "step": 11480 + }, + { + "epoch": 0.48341052631578946, + "grad_norm": 0.419921875, + "learning_rate": 0.00027128371931230594, + "loss": 2.9799, + "step": 11481 + }, + { + "epoch": 0.48345263157894736, + "grad_norm": 0.419921875, + "learning_rate": 0.00027125009734731327, + "loss": 2.6369, + "step": 11482 + }, + { + "epoch": 0.48349473684210525, + "grad_norm": 0.4609375, + "learning_rate": 0.00027121647499516865, + "loss": 2.882, + "step": 11483 + }, + { + "epoch": 0.48353684210526315, + "grad_norm": 0.443359375, + "learning_rate": 0.0002711828522564849, + "loss": 3.2796, + "step": 11484 + }, + { + "epoch": 0.48357894736842105, + "grad_norm": 0.5234375, + "learning_rate": 0.0002711492291318745, + "loss": 3.0068, + "step": 11485 + }, + { + "epoch": 0.48362105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00027111560562195, + "loss": 3.4852, + "step": 11486 + }, + { + "epoch": 0.48366315789473685, + "grad_norm": 0.43359375, + "learning_rate": 0.00027108198172732384, + "loss": 2.7816, + "step": 11487 + }, + { + "epoch": 0.48370526315789475, + "grad_norm": 0.447265625, + "learning_rate": 0.0002710483574486088, + "loss": 3.4087, + "step": 11488 + }, + { + "epoch": 0.48374736842105265, + "grad_norm": 0.412109375, + "learning_rate": 0.0002710147327864175, + "loss": 2.9015, + "step": 11489 + }, + { + "epoch": 0.48378947368421055, + "grad_norm": 0.423828125, + "learning_rate": 0.00027098110774136244, + "loss": 3.228, + "step": 11490 + }, + { + "epoch": 0.48383157894736845, + "grad_norm": 0.447265625, + "learning_rate": 0.00027094748231405616, + "loss": 3.1674, + "step": 11491 + }, + { + "epoch": 0.4838736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0002709138565051114, + "loss": 2.7901, + "step": 11492 + }, + { + "epoch": 0.4839157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.0002708802303151408, + "loss": 3.1453, + "step": 11493 + }, + { + "epoch": 0.4839578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.0002708466037447568, + "loss": 3.2396, + "step": 11494 + }, + { + "epoch": 0.484, + "grad_norm": 0.400390625, + "learning_rate": 0.00027081297679457236, + "loss": 2.8111, + "step": 11495 + }, + { + "epoch": 0.4840421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00027077934946519985, + "loss": 3.3532, + "step": 11496 + }, + { + "epoch": 0.4840842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.000270745721757252, + "loss": 2.8147, + "step": 11497 + }, + { + "epoch": 0.4841263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0002707120936713415, + "loss": 3.1329, + "step": 11498 + }, + { + "epoch": 0.4841684210526316, + "grad_norm": 0.46484375, + "learning_rate": 0.000270678465208081, + "loss": 3.5328, + "step": 11499 + }, + { + "epoch": 0.4842105263157895, + "grad_norm": 0.39453125, + "learning_rate": 0.00027064483636808314, + "loss": 2.9755, + "step": 11500 + }, + { + "epoch": 0.4842526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.00027061120715196054, + "loss": 3.1495, + "step": 11501 + }, + { + "epoch": 0.4842947368421053, + "grad_norm": 0.4140625, + "learning_rate": 0.00027057757756032604, + "loss": 3.3247, + "step": 11502 + }, + { + "epoch": 0.4843368421052632, + "grad_norm": 0.51953125, + "learning_rate": 0.00027054394759379227, + "loss": 2.9325, + "step": 11503 + }, + { + "epoch": 0.4843789473684211, + "grad_norm": 0.46484375, + "learning_rate": 0.00027051031725297195, + "loss": 2.8036, + "step": 11504 + }, + { + "epoch": 0.484421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00027047668653847767, + "loss": 3.2401, + "step": 11505 + }, + { + "epoch": 0.4844631578947368, + "grad_norm": 0.44921875, + "learning_rate": 0.0002704430554509222, + "loss": 3.0875, + "step": 11506 + }, + { + "epoch": 0.4845052631578947, + "grad_norm": 0.4140625, + "learning_rate": 0.0002704094239909183, + "loss": 2.9815, + "step": 11507 + }, + { + "epoch": 0.4845473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.00027037579215907873, + "loss": 3.2354, + "step": 11508 + }, + { + "epoch": 0.4845894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00027034215995601606, + "loss": 3.3104, + "step": 11509 + }, + { + "epoch": 0.4846315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.00027030852738234323, + "loss": 2.9534, + "step": 11510 + }, + { + "epoch": 0.4846736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.0002702748944386729, + "loss": 3.0815, + "step": 11511 + }, + { + "epoch": 0.4847157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0002702412611256178, + "loss": 3.0287, + "step": 11512 + }, + { + "epoch": 0.4847578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.0002702076274437907, + "loss": 3.0993, + "step": 11513 + }, + { + "epoch": 0.4848, + "grad_norm": 0.412109375, + "learning_rate": 0.00027017399339380435, + "loss": 3.2814, + "step": 11514 + }, + { + "epoch": 0.4848421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00027014035897627155, + "loss": 3.2356, + "step": 11515 + }, + { + "epoch": 0.4848842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.0002701067241918051, + "loss": 3.0912, + "step": 11516 + }, + { + "epoch": 0.4849263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.0002700730890410177, + "loss": 3.2995, + "step": 11517 + }, + { + "epoch": 0.4849684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.00027003945352452227, + "loss": 2.6678, + "step": 11518 + }, + { + "epoch": 0.48501052631578945, + "grad_norm": 0.462890625, + "learning_rate": 0.0002700058176429315, + "loss": 3.0785, + "step": 11519 + }, + { + "epoch": 0.48505263157894735, + "grad_norm": 0.421875, + "learning_rate": 0.00026997218139685825, + "loss": 2.8501, + "step": 11520 + }, + { + "epoch": 0.48509473684210525, + "grad_norm": 0.40625, + "learning_rate": 0.00026993854478691534, + "loss": 3.2432, + "step": 11521 + }, + { + "epoch": 0.48513684210526314, + "grad_norm": 0.423828125, + "learning_rate": 0.00026990490781371554, + "loss": 3.3365, + "step": 11522 + }, + { + "epoch": 0.48517894736842104, + "grad_norm": 0.427734375, + "learning_rate": 0.00026987127047787174, + "loss": 3.3956, + "step": 11523 + }, + { + "epoch": 0.48522105263157894, + "grad_norm": 0.4140625, + "learning_rate": 0.00026983763277999673, + "loss": 2.6143, + "step": 11524 + }, + { + "epoch": 0.48526315789473684, + "grad_norm": 0.400390625, + "learning_rate": 0.0002698039947207033, + "loss": 2.7039, + "step": 11525 + }, + { + "epoch": 0.48530526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.00026977035630060443, + "loss": 3.0832, + "step": 11526 + }, + { + "epoch": 0.48534736842105264, + "grad_norm": 0.416015625, + "learning_rate": 0.0002697367175203129, + "loss": 3.0168, + "step": 11527 + }, + { + "epoch": 0.48538947368421054, + "grad_norm": 0.443359375, + "learning_rate": 0.0002697030783804415, + "loss": 2.8895, + "step": 11528 + }, + { + "epoch": 0.48543157894736844, + "grad_norm": 0.419921875, + "learning_rate": 0.00026966943888160315, + "loss": 3.2859, + "step": 11529 + }, + { + "epoch": 0.48547368421052634, + "grad_norm": 0.44140625, + "learning_rate": 0.0002696357990244108, + "loss": 2.8773, + "step": 11530 + }, + { + "epoch": 0.48551578947368423, + "grad_norm": 0.43359375, + "learning_rate": 0.0002696021588094773, + "loss": 3.3982, + "step": 11531 + }, + { + "epoch": 0.48555789473684213, + "grad_norm": 0.44140625, + "learning_rate": 0.00026956851823741547, + "loss": 2.501, + "step": 11532 + }, + { + "epoch": 0.4856, + "grad_norm": 0.41015625, + "learning_rate": 0.00026953487730883814, + "loss": 3.1027, + "step": 11533 + }, + { + "epoch": 0.4856421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.0002695012360243583, + "loss": 3.0517, + "step": 11534 + }, + { + "epoch": 0.4856842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.000269467594384589, + "loss": 3.277, + "step": 11535 + }, + { + "epoch": 0.4857263157894737, + "grad_norm": 0.4921875, + "learning_rate": 0.0002694339523901429, + "loss": 3.3383, + "step": 11536 + }, + { + "epoch": 0.48576842105263157, + "grad_norm": 0.39453125, + "learning_rate": 0.0002694003100416331, + "loss": 2.7619, + "step": 11537 + }, + { + "epoch": 0.48581052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.00026936666733967234, + "loss": 3.0152, + "step": 11538 + }, + { + "epoch": 0.48585263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.0002693330242848737, + "loss": 2.8482, + "step": 11539 + }, + { + "epoch": 0.48589473684210527, + "grad_norm": 0.41015625, + "learning_rate": 0.0002692993808778501, + "loss": 3.1498, + "step": 11540 + }, + { + "epoch": 0.48593684210526317, + "grad_norm": 0.3984375, + "learning_rate": 0.00026926573711921444, + "loss": 2.6681, + "step": 11541 + }, + { + "epoch": 0.48597894736842107, + "grad_norm": 0.419921875, + "learning_rate": 0.00026923209300957964, + "loss": 2.9871, + "step": 11542 + }, + { + "epoch": 0.48602105263157896, + "grad_norm": 0.458984375, + "learning_rate": 0.0002691984485495587, + "loss": 2.7575, + "step": 11543 + }, + { + "epoch": 0.48606315789473686, + "grad_norm": 0.439453125, + "learning_rate": 0.00026916480373976464, + "loss": 3.1203, + "step": 11544 + }, + { + "epoch": 0.48610526315789476, + "grad_norm": 0.412109375, + "learning_rate": 0.0002691311585808104, + "loss": 3.366, + "step": 11545 + }, + { + "epoch": 0.4861473684210526, + "grad_norm": 0.4375, + "learning_rate": 0.00026909751307330887, + "loss": 2.9958, + "step": 11546 + }, + { + "epoch": 0.4861894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0002690638672178732, + "loss": 3.1085, + "step": 11547 + }, + { + "epoch": 0.4862315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.0002690302210151161, + "loss": 3.3156, + "step": 11548 + }, + { + "epoch": 0.4862736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.0002689965744656508, + "loss": 3.2489, + "step": 11549 + }, + { + "epoch": 0.4863157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.0002689629275700902, + "loss": 3.1426, + "step": 11550 + }, + { + "epoch": 0.4863578947368421, + "grad_norm": 0.404296875, + "learning_rate": 0.0002689292803290474, + "loss": 3.2146, + "step": 11551 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5, + "learning_rate": 0.0002688956327431354, + "loss": 3.7328, + "step": 11552 + }, + { + "epoch": 0.4864421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0002688619848129671, + "loss": 3.2254, + "step": 11553 + }, + { + "epoch": 0.4864842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.00026882833653915563, + "loss": 3.2479, + "step": 11554 + }, + { + "epoch": 0.4865263157894737, + "grad_norm": 0.453125, + "learning_rate": 0.0002687946879223139, + "loss": 3.381, + "step": 11555 + }, + { + "epoch": 0.4865684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0002687610389630551, + "loss": 3.5601, + "step": 11556 + }, + { + "epoch": 0.4866105263157895, + "grad_norm": 0.400390625, + "learning_rate": 0.0002687273896619923, + "loss": 2.9476, + "step": 11557 + }, + { + "epoch": 0.4866526315789474, + "grad_norm": 0.447265625, + "learning_rate": 0.0002686937400197383, + "loss": 2.7184, + "step": 11558 + }, + { + "epoch": 0.48669473684210524, + "grad_norm": 0.4453125, + "learning_rate": 0.00026866009003690647, + "loss": 2.8956, + "step": 11559 + }, + { + "epoch": 0.48673684210526313, + "grad_norm": 0.404296875, + "learning_rate": 0.0002686264397141096, + "loss": 2.8442, + "step": 11560 + }, + { + "epoch": 0.48677894736842103, + "grad_norm": 0.4453125, + "learning_rate": 0.00026859278905196093, + "loss": 3.2941, + "step": 11561 + }, + { + "epoch": 0.48682105263157893, + "grad_norm": 0.4296875, + "learning_rate": 0.00026855913805107353, + "loss": 3.2551, + "step": 11562 + }, + { + "epoch": 0.48686315789473683, + "grad_norm": 0.458984375, + "learning_rate": 0.0002685254867120604, + "loss": 3.1075, + "step": 11563 + }, + { + "epoch": 0.48690526315789473, + "grad_norm": 0.42578125, + "learning_rate": 0.0002684918350355347, + "loss": 3.1631, + "step": 11564 + }, + { + "epoch": 0.48694736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.0002684581830221095, + "loss": 3.2481, + "step": 11565 + }, + { + "epoch": 0.4869894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.00026842453067239786, + "loss": 3.2536, + "step": 11566 + }, + { + "epoch": 0.4870315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.00026839087798701287, + "loss": 3.1771, + "step": 11567 + }, + { + "epoch": 0.4870736842105263, + "grad_norm": 0.55078125, + "learning_rate": 0.0002683572249665677, + "loss": 2.5556, + "step": 11568 + }, + { + "epoch": 0.4871157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0002683235716116756, + "loss": 3.1457, + "step": 11569 + }, + { + "epoch": 0.4871578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.0002682899179229494, + "loss": 3.3454, + "step": 11570 + }, + { + "epoch": 0.4872, + "grad_norm": 0.427734375, + "learning_rate": 0.0002682562639010025, + "loss": 3.3969, + "step": 11571 + }, + { + "epoch": 0.4872421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.0002682226095464479, + "loss": 3.3061, + "step": 11572 + }, + { + "epoch": 0.48728421052631576, + "grad_norm": 0.439453125, + "learning_rate": 0.00026818895485989877, + "loss": 3.3871, + "step": 11573 + }, + { + "epoch": 0.48732631578947366, + "grad_norm": 0.46875, + "learning_rate": 0.00026815529984196816, + "loss": 2.8728, + "step": 11574 + }, + { + "epoch": 0.48736842105263156, + "grad_norm": 0.4296875, + "learning_rate": 0.00026812164449326937, + "loss": 3.1885, + "step": 11575 + }, + { + "epoch": 0.48741052631578946, + "grad_norm": 0.4140625, + "learning_rate": 0.00026808798881441546, + "loss": 3.3922, + "step": 11576 + }, + { + "epoch": 0.48745263157894736, + "grad_norm": 0.419921875, + "learning_rate": 0.0002680543328060197, + "loss": 3.0753, + "step": 11577 + }, + { + "epoch": 0.48749473684210526, + "grad_norm": 0.51171875, + "learning_rate": 0.0002680206764686953, + "loss": 3.1138, + "step": 11578 + }, + { + "epoch": 0.48753684210526316, + "grad_norm": 0.466796875, + "learning_rate": 0.0002679870198030552, + "loss": 3.2394, + "step": 11579 + }, + { + "epoch": 0.48757894736842106, + "grad_norm": 0.41015625, + "learning_rate": 0.00026795336280971286, + "loss": 3.2322, + "step": 11580 + }, + { + "epoch": 0.48762105263157896, + "grad_norm": 0.412109375, + "learning_rate": 0.00026791970548928123, + "loss": 2.6476, + "step": 11581 + }, + { + "epoch": 0.48766315789473685, + "grad_norm": 0.404296875, + "learning_rate": 0.0002678860478423737, + "loss": 3.3496, + "step": 11582 + }, + { + "epoch": 0.48770526315789475, + "grad_norm": 0.44921875, + "learning_rate": 0.00026785238986960335, + "loss": 3.3752, + "step": 11583 + }, + { + "epoch": 0.48774736842105265, + "grad_norm": 0.41015625, + "learning_rate": 0.0002678187315715834, + "loss": 2.8132, + "step": 11584 + }, + { + "epoch": 0.48778947368421055, + "grad_norm": 0.44921875, + "learning_rate": 0.0002677850729489271, + "loss": 3.052, + "step": 11585 + }, + { + "epoch": 0.4878315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.0002677514140022478, + "loss": 3.1179, + "step": 11586 + }, + { + "epoch": 0.4878736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.0002677177547321585, + "loss": 3.5733, + "step": 11587 + }, + { + "epoch": 0.4879157894736842, + "grad_norm": 0.392578125, + "learning_rate": 0.00026768409513927246, + "loss": 2.9489, + "step": 11588 + }, + { + "epoch": 0.4879578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00026765043522420306, + "loss": 3.0931, + "step": 11589 + }, + { + "epoch": 0.488, + "grad_norm": 0.453125, + "learning_rate": 0.00026761677498756347, + "loss": 2.8685, + "step": 11590 + }, + { + "epoch": 0.4880421052631579, + "grad_norm": 0.39453125, + "learning_rate": 0.00026758311442996697, + "loss": 2.9735, + "step": 11591 + }, + { + "epoch": 0.4880842105263158, + "grad_norm": 0.392578125, + "learning_rate": 0.0002675494535520267, + "loss": 3.2333, + "step": 11592 + }, + { + "epoch": 0.4881263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.0002675157923543561, + "loss": 2.9443, + "step": 11593 + }, + { + "epoch": 0.4881684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.0002674821308375683, + "loss": 2.9414, + "step": 11594 + }, + { + "epoch": 0.4882105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.0002674484690022765, + "loss": 2.993, + "step": 11595 + }, + { + "epoch": 0.4882526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.00026741480684909426, + "loss": 3.1588, + "step": 11596 + }, + { + "epoch": 0.4882947368421053, + "grad_norm": 0.416015625, + "learning_rate": 0.00026738114437863467, + "loss": 3.072, + "step": 11597 + }, + { + "epoch": 0.4883368421052632, + "grad_norm": 0.453125, + "learning_rate": 0.000267347481591511, + "loss": 3.397, + "step": 11598 + }, + { + "epoch": 0.4883789473684211, + "grad_norm": 0.44140625, + "learning_rate": 0.0002673138184883367, + "loss": 3.0259, + "step": 11599 + }, + { + "epoch": 0.4884210526315789, + "grad_norm": 0.427734375, + "learning_rate": 0.0002672801550697248, + "loss": 3.0678, + "step": 11600 + }, + { + "epoch": 0.4884631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.00026724649133628894, + "loss": 3.4206, + "step": 11601 + }, + { + "epoch": 0.4885052631578947, + "grad_norm": 0.4609375, + "learning_rate": 0.0002672128272886422, + "loss": 3.2407, + "step": 11602 + }, + { + "epoch": 0.4885473684210526, + "grad_norm": 0.392578125, + "learning_rate": 0.0002671791629273979, + "loss": 3.533, + "step": 11603 + }, + { + "epoch": 0.4885894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00026714549825316956, + "loss": 3.5265, + "step": 11604 + }, + { + "epoch": 0.4886315789473684, + "grad_norm": 0.404296875, + "learning_rate": 0.0002671118332665703, + "loss": 3.1144, + "step": 11605 + }, + { + "epoch": 0.4886736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0002670781679682137, + "loss": 3.4031, + "step": 11606 + }, + { + "epoch": 0.4887157894736842, + "grad_norm": 0.392578125, + "learning_rate": 0.0002670445023587128, + "loss": 2.9668, + "step": 11607 + }, + { + "epoch": 0.4887578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00026701083643868105, + "loss": 2.5575, + "step": 11608 + }, + { + "epoch": 0.4888, + "grad_norm": 0.4296875, + "learning_rate": 0.000266977170208732, + "loss": 3.3395, + "step": 11609 + }, + { + "epoch": 0.4888421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.0002669435036694787, + "loss": 2.9966, + "step": 11610 + }, + { + "epoch": 0.4888842105263158, + "grad_norm": 0.392578125, + "learning_rate": 0.00026690983682153477, + "loss": 3.0507, + "step": 11611 + }, + { + "epoch": 0.4889263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00026687616966551345, + "loss": 3.5903, + "step": 11612 + }, + { + "epoch": 0.48896842105263155, + "grad_norm": 0.427734375, + "learning_rate": 0.00026684250220202816, + "loss": 3.5714, + "step": 11613 + }, + { + "epoch": 0.48901052631578945, + "grad_norm": 0.427734375, + "learning_rate": 0.0002668088344316923, + "loss": 3.2942, + "step": 11614 + }, + { + "epoch": 0.48905263157894735, + "grad_norm": 0.4375, + "learning_rate": 0.0002667751663551191, + "loss": 2.7759, + "step": 11615 + }, + { + "epoch": 0.48909473684210525, + "grad_norm": 0.396484375, + "learning_rate": 0.00026674149797292226, + "loss": 2.6355, + "step": 11616 + }, + { + "epoch": 0.48913684210526315, + "grad_norm": 0.42578125, + "learning_rate": 0.0002667078292857149, + "loss": 3.0265, + "step": 11617 + }, + { + "epoch": 0.48917894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00026667416029411045, + "loss": 2.7497, + "step": 11618 + }, + { + "epoch": 0.48922105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.0002666404909987225, + "loss": 3.5654, + "step": 11619 + }, + { + "epoch": 0.48926315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.00026660682140016437, + "loss": 3.1773, + "step": 11620 + }, + { + "epoch": 0.48930526315789474, + "grad_norm": 0.451171875, + "learning_rate": 0.0002665731514990494, + "loss": 2.2933, + "step": 11621 + }, + { + "epoch": 0.48934736842105264, + "grad_norm": 0.4140625, + "learning_rate": 0.0002665394812959911, + "loss": 2.8146, + "step": 11622 + }, + { + "epoch": 0.48938947368421054, + "grad_norm": 0.427734375, + "learning_rate": 0.00026650581079160285, + "loss": 3.4819, + "step": 11623 + }, + { + "epoch": 0.48943157894736844, + "grad_norm": 0.41796875, + "learning_rate": 0.0002664721399864982, + "loss": 3.2617, + "step": 11624 + }, + { + "epoch": 0.48947368421052634, + "grad_norm": 0.435546875, + "learning_rate": 0.0002664384688812905, + "loss": 2.7383, + "step": 11625 + }, + { + "epoch": 0.48951578947368424, + "grad_norm": 0.427734375, + "learning_rate": 0.00026640479747659315, + "loss": 3.0708, + "step": 11626 + }, + { + "epoch": 0.4895578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.0002663711257730197, + "loss": 2.9493, + "step": 11627 + }, + { + "epoch": 0.4896, + "grad_norm": 0.375, + "learning_rate": 0.00026633745377118356, + "loss": 3.0606, + "step": 11628 + }, + { + "epoch": 0.4896421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.0002663037814716982, + "loss": 2.9282, + "step": 11629 + }, + { + "epoch": 0.4896842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.0002662701088751771, + "loss": 3.1941, + "step": 11630 + }, + { + "epoch": 0.4897263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.0002662364359822338, + "loss": 3.7339, + "step": 11631 + }, + { + "epoch": 0.4897684210526316, + "grad_norm": 0.453125, + "learning_rate": 0.00026620276279348164, + "loss": 3.281, + "step": 11632 + }, + { + "epoch": 0.4898105263157895, + "grad_norm": 0.3984375, + "learning_rate": 0.0002661690893095343, + "loss": 2.9923, + "step": 11633 + }, + { + "epoch": 0.4898526315789474, + "grad_norm": 0.482421875, + "learning_rate": 0.000266135415531005, + "loss": 3.0403, + "step": 11634 + }, + { + "epoch": 0.48989473684210527, + "grad_norm": 0.443359375, + "learning_rate": 0.00026610174145850736, + "loss": 3.0099, + "step": 11635 + }, + { + "epoch": 0.48993684210526317, + "grad_norm": 0.421875, + "learning_rate": 0.000266068067092655, + "loss": 3.6336, + "step": 11636 + }, + { + "epoch": 0.48997894736842107, + "grad_norm": 0.416015625, + "learning_rate": 0.0002660343924340614, + "loss": 2.8436, + "step": 11637 + }, + { + "epoch": 0.49002105263157897, + "grad_norm": 0.43359375, + "learning_rate": 0.00026600071748333984, + "loss": 2.9269, + "step": 11638 + }, + { + "epoch": 0.49006315789473687, + "grad_norm": 0.42578125, + "learning_rate": 0.0002659670422411042, + "loss": 2.776, + "step": 11639 + }, + { + "epoch": 0.4901052631578947, + "grad_norm": 0.443359375, + "learning_rate": 0.0002659333667079677, + "loss": 3.1351, + "step": 11640 + }, + { + "epoch": 0.4901473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.000265899690884544, + "loss": 3.032, + "step": 11641 + }, + { + "epoch": 0.4901894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00026586601477144656, + "loss": 3.2799, + "step": 11642 + }, + { + "epoch": 0.4902315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.000265832338369289, + "loss": 2.9265, + "step": 11643 + }, + { + "epoch": 0.4902736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.00026579866167868486, + "loss": 3.6595, + "step": 11644 + }, + { + "epoch": 0.4903157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0002657649847002477, + "loss": 3.282, + "step": 11645 + }, + { + "epoch": 0.4903578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00026573130743459103, + "loss": 3.1675, + "step": 11646 + }, + { + "epoch": 0.4904, + "grad_norm": 0.41796875, + "learning_rate": 0.0002656976298823284, + "loss": 3.0274, + "step": 11647 + }, + { + "epoch": 0.4904421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00026566395204407337, + "loss": 2.991, + "step": 11648 + }, + { + "epoch": 0.4904842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.00026563027392043954, + "loss": 2.6073, + "step": 11649 + }, + { + "epoch": 0.4905263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.0002655965955120405, + "loss": 2.9554, + "step": 11650 + }, + { + "epoch": 0.4905684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.0002655629168194898, + "loss": 2.9446, + "step": 11651 + }, + { + "epoch": 0.4906105263157895, + "grad_norm": 0.48828125, + "learning_rate": 0.0002655292378434011, + "loss": 2.9311, + "step": 11652 + }, + { + "epoch": 0.4906526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.0002654955585843879, + "loss": 3.2881, + "step": 11653 + }, + { + "epoch": 0.49069473684210524, + "grad_norm": 0.400390625, + "learning_rate": 0.00026546187904306386, + "loss": 2.8006, + "step": 11654 + }, + { + "epoch": 0.49073684210526314, + "grad_norm": 0.423828125, + "learning_rate": 0.0002654281992200425, + "loss": 3.1704, + "step": 11655 + }, + { + "epoch": 0.49077894736842104, + "grad_norm": 0.419921875, + "learning_rate": 0.0002653945191159375, + "loss": 3.3464, + "step": 11656 + }, + { + "epoch": 0.49082105263157894, + "grad_norm": 0.408203125, + "learning_rate": 0.00026536083873136243, + "loss": 3.0944, + "step": 11657 + }, + { + "epoch": 0.49086315789473683, + "grad_norm": 0.458984375, + "learning_rate": 0.00026532715806693095, + "loss": 2.3493, + "step": 11658 + }, + { + "epoch": 0.49090526315789473, + "grad_norm": 0.451171875, + "learning_rate": 0.00026529347712325664, + "loss": 3.2935, + "step": 11659 + }, + { + "epoch": 0.49094736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.0002652597959009531, + "loss": 3.1645, + "step": 11660 + }, + { + "epoch": 0.49098947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.0002652261144006341, + "loss": 3.3046, + "step": 11661 + }, + { + "epoch": 0.49103157894736843, + "grad_norm": 0.5546875, + "learning_rate": 0.00026519243262291317, + "loss": 2.8383, + "step": 11662 + }, + { + "epoch": 0.49107368421052633, + "grad_norm": 0.400390625, + "learning_rate": 0.00026515875056840396, + "loss": 3.2705, + "step": 11663 + }, + { + "epoch": 0.49111578947368423, + "grad_norm": 0.443359375, + "learning_rate": 0.0002651250682377201, + "loss": 3.4802, + "step": 11664 + }, + { + "epoch": 0.4911578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.0002650913856314753, + "loss": 2.9501, + "step": 11665 + }, + { + "epoch": 0.4912, + "grad_norm": 0.447265625, + "learning_rate": 0.00026505770275028317, + "loss": 3.1806, + "step": 11666 + }, + { + "epoch": 0.49124210526315787, + "grad_norm": 0.41015625, + "learning_rate": 0.0002650240195947573, + "loss": 3.0393, + "step": 11667 + }, + { + "epoch": 0.49128421052631577, + "grad_norm": 0.4140625, + "learning_rate": 0.00026499033616551155, + "loss": 3.4221, + "step": 11668 + }, + { + "epoch": 0.49132631578947367, + "grad_norm": 0.41796875, + "learning_rate": 0.00026495665246315946, + "loss": 3.484, + "step": 11669 + }, + { + "epoch": 0.49136842105263157, + "grad_norm": 0.427734375, + "learning_rate": 0.0002649229684883148, + "loss": 3.3588, + "step": 11670 + }, + { + "epoch": 0.49141052631578946, + "grad_norm": 0.4375, + "learning_rate": 0.00026488928424159107, + "loss": 3.2207, + "step": 11671 + }, + { + "epoch": 0.49145263157894736, + "grad_norm": 0.3984375, + "learning_rate": 0.0002648555997236022, + "loss": 2.8685, + "step": 11672 + }, + { + "epoch": 0.49149473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.00026482191493496166, + "loss": 3.5406, + "step": 11673 + }, + { + "epoch": 0.49153684210526316, + "grad_norm": 0.470703125, + "learning_rate": 0.00026478822987628333, + "loss": 3.3571, + "step": 11674 + }, + { + "epoch": 0.49157894736842106, + "grad_norm": 0.4609375, + "learning_rate": 0.0002647545445481807, + "loss": 3.335, + "step": 11675 + }, + { + "epoch": 0.49162105263157896, + "grad_norm": 0.423828125, + "learning_rate": 0.00026472085895126767, + "loss": 3.2036, + "step": 11676 + }, + { + "epoch": 0.49166315789473686, + "grad_norm": 0.392578125, + "learning_rate": 0.0002646871730861579, + "loss": 3.0012, + "step": 11677 + }, + { + "epoch": 0.49170526315789476, + "grad_norm": 0.416015625, + "learning_rate": 0.0002646534869534651, + "loss": 3.1464, + "step": 11678 + }, + { + "epoch": 0.49174736842105266, + "grad_norm": 0.419921875, + "learning_rate": 0.00026461980055380294, + "loss": 3.1173, + "step": 11679 + }, + { + "epoch": 0.4917894736842105, + "grad_norm": 0.44921875, + "learning_rate": 0.0002645861138877853, + "loss": 2.8559, + "step": 11680 + }, + { + "epoch": 0.4918315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.0002645524269560257, + "loss": 3.3412, + "step": 11681 + }, + { + "epoch": 0.4918736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.000264518739759138, + "loss": 2.6561, + "step": 11682 + }, + { + "epoch": 0.4919157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.0002644850522977359, + "loss": 2.8107, + "step": 11683 + }, + { + "epoch": 0.4919578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00026445136457243324, + "loss": 3.1022, + "step": 11684 + }, + { + "epoch": 0.492, + "grad_norm": 0.451171875, + "learning_rate": 0.0002644176765838436, + "loss": 3.2419, + "step": 11685 + }, + { + "epoch": 0.4920421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.0002643839883325809, + "loss": 3.4385, + "step": 11686 + }, + { + "epoch": 0.4920842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00026435029981925883, + "loss": 2.5535, + "step": 11687 + }, + { + "epoch": 0.4921263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00026431661104449116, + "loss": 3.0868, + "step": 11688 + }, + { + "epoch": 0.4921684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.0002642829220088916, + "loss": 3.3267, + "step": 11689 + }, + { + "epoch": 0.4922105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.00026424923271307405, + "loss": 3.0876, + "step": 11690 + }, + { + "epoch": 0.4922526315789474, + "grad_norm": 0.390625, + "learning_rate": 0.0002642155431576522, + "loss": 3.0237, + "step": 11691 + }, + { + "epoch": 0.4922947368421053, + "grad_norm": 0.462890625, + "learning_rate": 0.0002641818533432399, + "loss": 2.9536, + "step": 11692 + }, + { + "epoch": 0.4923368421052632, + "grad_norm": 0.404296875, + "learning_rate": 0.00026414816327045084, + "loss": 3.0437, + "step": 11693 + }, + { + "epoch": 0.492378947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00026411447293989886, + "loss": 3.1427, + "step": 11694 + }, + { + "epoch": 0.4924210526315789, + "grad_norm": 0.43359375, + "learning_rate": 0.0002640807823521978, + "loss": 3.5289, + "step": 11695 + }, + { + "epoch": 0.4924631578947368, + "grad_norm": 0.4375, + "learning_rate": 0.00026404709150796137, + "loss": 3.1923, + "step": 11696 + }, + { + "epoch": 0.4925052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0002640134004078035, + "loss": 2.9184, + "step": 11697 + }, + { + "epoch": 0.4925473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.0002639797090523379, + "loss": 3.0645, + "step": 11698 + }, + { + "epoch": 0.4925894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.0002639460174421785, + "loss": 3.2219, + "step": 11699 + }, + { + "epoch": 0.4926315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.000263912325577939, + "loss": 3.1886, + "step": 11700 + }, + { + "epoch": 0.4926736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.0002638786334602332, + "loss": 2.7134, + "step": 11701 + }, + { + "epoch": 0.4927157894736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0002638449410896751, + "loss": 2.821, + "step": 11702 + }, + { + "epoch": 0.4927578947368421, + "grad_norm": 0.404296875, + "learning_rate": 0.0002638112484668784, + "loss": 3.1285, + "step": 11703 + }, + { + "epoch": 0.4928, + "grad_norm": 0.400390625, + "learning_rate": 0.00026377755559245706, + "loss": 2.9273, + "step": 11704 + }, + { + "epoch": 0.4928421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.00026374386246702476, + "loss": 3.5778, + "step": 11705 + }, + { + "epoch": 0.4928842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.0002637101690911954, + "loss": 2.8352, + "step": 11706 + }, + { + "epoch": 0.49292631578947366, + "grad_norm": 0.431640625, + "learning_rate": 0.000263676475465583, + "loss": 3.4082, + "step": 11707 + }, + { + "epoch": 0.49296842105263156, + "grad_norm": 0.431640625, + "learning_rate": 0.0002636427815908012, + "loss": 2.9746, + "step": 11708 + }, + { + "epoch": 0.49301052631578945, + "grad_norm": 0.458984375, + "learning_rate": 0.0002636090874674639, + "loss": 3.482, + "step": 11709 + }, + { + "epoch": 0.49305263157894735, + "grad_norm": 0.400390625, + "learning_rate": 0.00026357539309618505, + "loss": 2.7955, + "step": 11710 + }, + { + "epoch": 0.49309473684210525, + "grad_norm": 0.390625, + "learning_rate": 0.0002635416984775785, + "loss": 2.7271, + "step": 11711 + }, + { + "epoch": 0.49313684210526315, + "grad_norm": 0.4453125, + "learning_rate": 0.0002635080036122582, + "loss": 3.7043, + "step": 11712 + }, + { + "epoch": 0.49317894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.00026347430850083786, + "loss": 3.3319, + "step": 11713 + }, + { + "epoch": 0.49322105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.0002634406131439315, + "loss": 3.0794, + "step": 11714 + }, + { + "epoch": 0.49326315789473685, + "grad_norm": 0.423828125, + "learning_rate": 0.000263406917542153, + "loss": 3.3191, + "step": 11715 + }, + { + "epoch": 0.49330526315789475, + "grad_norm": 0.404296875, + "learning_rate": 0.00026337322169611607, + "loss": 3.4763, + "step": 11716 + }, + { + "epoch": 0.49334736842105265, + "grad_norm": 0.458984375, + "learning_rate": 0.0002633395256064348, + "loss": 3.601, + "step": 11717 + }, + { + "epoch": 0.49338947368421054, + "grad_norm": 0.419921875, + "learning_rate": 0.000263305829273723, + "loss": 2.9146, + "step": 11718 + }, + { + "epoch": 0.49343157894736844, + "grad_norm": 0.43359375, + "learning_rate": 0.00026327213269859473, + "loss": 2.6144, + "step": 11719 + }, + { + "epoch": 0.49347368421052634, + "grad_norm": 0.42578125, + "learning_rate": 0.0002632384358816638, + "loss": 3.3835, + "step": 11720 + }, + { + "epoch": 0.4935157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0002632047388235441, + "loss": 3.0755, + "step": 11721 + }, + { + "epoch": 0.4935578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00026317104152484956, + "loss": 2.9746, + "step": 11722 + }, + { + "epoch": 0.4936, + "grad_norm": 0.427734375, + "learning_rate": 0.0002631373439861941, + "loss": 3.1699, + "step": 11723 + }, + { + "epoch": 0.4936421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00026310364620819166, + "loss": 3.038, + "step": 11724 + }, + { + "epoch": 0.4936842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.00026306994819145623, + "loss": 2.8827, + "step": 11725 + }, + { + "epoch": 0.4937263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0002630362499366017, + "loss": 2.8776, + "step": 11726 + }, + { + "epoch": 0.4937684210526316, + "grad_norm": 0.40234375, + "learning_rate": 0.00026300255144424197, + "loss": 3.3501, + "step": 11727 + }, + { + "epoch": 0.4938105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00026296885271499105, + "loss": 3.0118, + "step": 11728 + }, + { + "epoch": 0.4938526315789474, + "grad_norm": 0.41015625, + "learning_rate": 0.0002629351537494628, + "loss": 3.0965, + "step": 11729 + }, + { + "epoch": 0.4938947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.0002629014545482714, + "loss": 3.1056, + "step": 11730 + }, + { + "epoch": 0.4939368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.00026286775511203054, + "loss": 3.1787, + "step": 11731 + }, + { + "epoch": 0.4939789473684211, + "grad_norm": 0.453125, + "learning_rate": 0.00026283405544135435, + "loss": 3.2477, + "step": 11732 + }, + { + "epoch": 0.49402105263157897, + "grad_norm": 0.4296875, + "learning_rate": 0.0002628003555368568, + "loss": 2.9984, + "step": 11733 + }, + { + "epoch": 0.4940631578947368, + "grad_norm": 0.408203125, + "learning_rate": 0.0002627666553991517, + "loss": 2.4976, + "step": 11734 + }, + { + "epoch": 0.4941052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00026273295502885315, + "loss": 2.9531, + "step": 11735 + }, + { + "epoch": 0.4941473684210526, + "grad_norm": 0.58984375, + "learning_rate": 0.0002626992544265751, + "loss": 3.2128, + "step": 11736 + }, + { + "epoch": 0.4941894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.0002626655535929317, + "loss": 3.0804, + "step": 11737 + }, + { + "epoch": 0.4942315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.0002626318525285367, + "loss": 3.0013, + "step": 11738 + }, + { + "epoch": 0.4942736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.0002625981512340042, + "loss": 3.5655, + "step": 11739 + }, + { + "epoch": 0.4943157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.00026256444970994817, + "loss": 2.5667, + "step": 11740 + }, + { + "epoch": 0.4943578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00026253074795698267, + "loss": 3.4955, + "step": 11741 + }, + { + "epoch": 0.4944, + "grad_norm": 0.421875, + "learning_rate": 0.00026249704597572165, + "loss": 3.1781, + "step": 11742 + }, + { + "epoch": 0.4944421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00026246334376677904, + "loss": 2.9139, + "step": 11743 + }, + { + "epoch": 0.4944842105263158, + "grad_norm": 0.396484375, + "learning_rate": 0.00026242964133076904, + "loss": 2.9561, + "step": 11744 + }, + { + "epoch": 0.4945263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0002623959386683056, + "loss": 3.4379, + "step": 11745 + }, + { + "epoch": 0.4945684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00026236223578000265, + "loss": 3.5519, + "step": 11746 + }, + { + "epoch": 0.4946105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.0002623285326664744, + "loss": 3.1626, + "step": 11747 + }, + { + "epoch": 0.49465263157894734, + "grad_norm": 0.4375, + "learning_rate": 0.00026229482932833473, + "loss": 3.4296, + "step": 11748 + }, + { + "epoch": 0.49469473684210524, + "grad_norm": 0.41015625, + "learning_rate": 0.00026226112576619766, + "loss": 3.5605, + "step": 11749 + }, + { + "epoch": 0.49473684210526314, + "grad_norm": 0.416015625, + "learning_rate": 0.00026222742198067726, + "loss": 3.4966, + "step": 11750 + }, + { + "epoch": 0.49477894736842104, + "grad_norm": 0.451171875, + "learning_rate": 0.0002621937179723877, + "loss": 2.9615, + "step": 11751 + }, + { + "epoch": 0.49482105263157894, + "grad_norm": 0.46484375, + "learning_rate": 0.0002621600137419428, + "loss": 3.2592, + "step": 11752 + }, + { + "epoch": 0.49486315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.0002621263092899568, + "loss": 3.3818, + "step": 11753 + }, + { + "epoch": 0.49490526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.0002620926046170438, + "loss": 3.4676, + "step": 11754 + }, + { + "epoch": 0.49494736842105264, + "grad_norm": 0.46484375, + "learning_rate": 0.00026205889972381766, + "loss": 3.3511, + "step": 11755 + }, + { + "epoch": 0.49498947368421053, + "grad_norm": 0.41796875, + "learning_rate": 0.0002620251946108925, + "loss": 2.9201, + "step": 11756 + }, + { + "epoch": 0.49503157894736843, + "grad_norm": 0.423828125, + "learning_rate": 0.00026199148927888243, + "loss": 2.9879, + "step": 11757 + }, + { + "epoch": 0.49507368421052633, + "grad_norm": 0.470703125, + "learning_rate": 0.00026195778372840157, + "loss": 3.1764, + "step": 11758 + }, + { + "epoch": 0.49511578947368423, + "grad_norm": 0.431640625, + "learning_rate": 0.00026192407796006394, + "loss": 3.0332, + "step": 11759 + }, + { + "epoch": 0.49515789473684213, + "grad_norm": 0.400390625, + "learning_rate": 0.0002618903719744836, + "loss": 3.1114, + "step": 11760 + }, + { + "epoch": 0.4952, + "grad_norm": 0.453125, + "learning_rate": 0.00026185666577227464, + "loss": 3.2835, + "step": 11761 + }, + { + "epoch": 0.4952421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00026182295935405116, + "loss": 3.1296, + "step": 11762 + }, + { + "epoch": 0.49528421052631577, + "grad_norm": 0.392578125, + "learning_rate": 0.00026178925272042717, + "loss": 2.964, + "step": 11763 + }, + { + "epoch": 0.49532631578947367, + "grad_norm": 0.470703125, + "learning_rate": 0.000261755545872017, + "loss": 2.9682, + "step": 11764 + }, + { + "epoch": 0.49536842105263157, + "grad_norm": 0.443359375, + "learning_rate": 0.00026172183880943453, + "loss": 3.1118, + "step": 11765 + }, + { + "epoch": 0.49541052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.00026168813153329396, + "loss": 3.347, + "step": 11766 + }, + { + "epoch": 0.49545263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00026165442404420936, + "loss": 3.2379, + "step": 11767 + }, + { + "epoch": 0.49549473684210527, + "grad_norm": 0.439453125, + "learning_rate": 0.00026162071634279477, + "loss": 2.97, + "step": 11768 + }, + { + "epoch": 0.49553684210526316, + "grad_norm": 0.466796875, + "learning_rate": 0.00026158700842966444, + "loss": 2.6273, + "step": 11769 + }, + { + "epoch": 0.49557894736842106, + "grad_norm": 0.4609375, + "learning_rate": 0.0002615533003054324, + "loss": 2.9531, + "step": 11770 + }, + { + "epoch": 0.49562105263157896, + "grad_norm": 0.421875, + "learning_rate": 0.00026151959197071283, + "loss": 2.8991, + "step": 11771 + }, + { + "epoch": 0.49566315789473686, + "grad_norm": 0.447265625, + "learning_rate": 0.00026148588342611985, + "loss": 3.0404, + "step": 11772 + }, + { + "epoch": 0.49570526315789476, + "grad_norm": 0.421875, + "learning_rate": 0.00026145217467226755, + "loss": 3.4929, + "step": 11773 + }, + { + "epoch": 0.49574736842105266, + "grad_norm": 0.423828125, + "learning_rate": 0.00026141846570977014, + "loss": 2.8661, + "step": 11774 + }, + { + "epoch": 0.4957894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.00026138475653924166, + "loss": 3.3141, + "step": 11775 + }, + { + "epoch": 0.4958315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.0002613510471612962, + "loss": 3.2719, + "step": 11776 + }, + { + "epoch": 0.4958736842105263, + "grad_norm": 0.4609375, + "learning_rate": 0.0002613173375765481, + "loss": 2.9511, + "step": 11777 + }, + { + "epoch": 0.4959157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.0002612836277856113, + "loss": 3.0608, + "step": 11778 + }, + { + "epoch": 0.4959578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00026124991778910016, + "loss": 2.9175, + "step": 11779 + }, + { + "epoch": 0.496, + "grad_norm": 0.48046875, + "learning_rate": 0.00026121620758762877, + "loss": 3.3419, + "step": 11780 + }, + { + "epoch": 0.4960421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.0002611824971818112, + "loss": 3.0833, + "step": 11781 + }, + { + "epoch": 0.4960842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.0002611487865722617, + "loss": 3.0832, + "step": 11782 + }, + { + "epoch": 0.4961263157894737, + "grad_norm": 0.65625, + "learning_rate": 0.0002611150757595943, + "loss": 2.9122, + "step": 11783 + }, + { + "epoch": 0.4961684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.0002610813647444234, + "loss": 2.7323, + "step": 11784 + }, + { + "epoch": 0.4962105263157895, + "grad_norm": 0.404296875, + "learning_rate": 0.000261047653527363, + "loss": 3.015, + "step": 11785 + }, + { + "epoch": 0.4962526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.00026101394210902734, + "loss": 3.2002, + "step": 11786 + }, + { + "epoch": 0.4962947368421053, + "grad_norm": 0.4140625, + "learning_rate": 0.0002609802304900306, + "loss": 3.1196, + "step": 11787 + }, + { + "epoch": 0.49633684210526313, + "grad_norm": 0.42578125, + "learning_rate": 0.000260946518670987, + "loss": 3.1333, + "step": 11788 + }, + { + "epoch": 0.49637894736842103, + "grad_norm": 0.423828125, + "learning_rate": 0.00026091280665251066, + "loss": 3.5919, + "step": 11789 + }, + { + "epoch": 0.49642105263157893, + "grad_norm": 0.439453125, + "learning_rate": 0.0002608790944352157, + "loss": 2.9109, + "step": 11790 + }, + { + "epoch": 0.49646315789473683, + "grad_norm": 0.431640625, + "learning_rate": 0.0002608453820197166, + "loss": 3.1266, + "step": 11791 + }, + { + "epoch": 0.4965052631578947, + "grad_norm": 0.408203125, + "learning_rate": 0.0002608116694066273, + "loss": 3.3968, + "step": 11792 + }, + { + "epoch": 0.4965473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.0002607779565965621, + "loss": 3.098, + "step": 11793 + }, + { + "epoch": 0.4965894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00026074424359013513, + "loss": 3.2611, + "step": 11794 + }, + { + "epoch": 0.4966315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00026071053038796075, + "loss": 3.5618, + "step": 11795 + }, + { + "epoch": 0.4966736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.0002606768169906531, + "loss": 3.5454, + "step": 11796 + }, + { + "epoch": 0.4967157894736842, + "grad_norm": 0.462890625, + "learning_rate": 0.0002606431033988263, + "loss": 2.9509, + "step": 11797 + }, + { + "epoch": 0.4967578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00026060938961309474, + "loss": 3.616, + "step": 11798 + }, + { + "epoch": 0.4968, + "grad_norm": 0.435546875, + "learning_rate": 0.00026057567563407254, + "loss": 2.7526, + "step": 11799 + }, + { + "epoch": 0.4968421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00026054196146237395, + "loss": 3.2414, + "step": 11800 + }, + { + "epoch": 0.4968842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.0002605082470986133, + "loss": 3.4821, + "step": 11801 + }, + { + "epoch": 0.49692631578947366, + "grad_norm": 0.78125, + "learning_rate": 0.00026047453254340457, + "loss": 3.0048, + "step": 11802 + }, + { + "epoch": 0.49696842105263156, + "grad_norm": 0.392578125, + "learning_rate": 0.0002604408177973622, + "loss": 3.2276, + "step": 11803 + }, + { + "epoch": 0.49701052631578946, + "grad_norm": 0.416015625, + "learning_rate": 0.00026040710286110043, + "loss": 3.0849, + "step": 11804 + }, + { + "epoch": 0.49705263157894736, + "grad_norm": 0.4453125, + "learning_rate": 0.0002603733877352335, + "loss": 3.2406, + "step": 11805 + }, + { + "epoch": 0.49709473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.0002603396724203757, + "loss": 3.1194, + "step": 11806 + }, + { + "epoch": 0.49713684210526315, + "grad_norm": 0.45703125, + "learning_rate": 0.0002603059569171411, + "loss": 3.3074, + "step": 11807 + }, + { + "epoch": 0.49717894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.0002602722412261441, + "loss": 3.1868, + "step": 11808 + }, + { + "epoch": 0.49722105263157895, + "grad_norm": 0.44140625, + "learning_rate": 0.000260238525347999, + "loss": 3.376, + "step": 11809 + }, + { + "epoch": 0.49726315789473685, + "grad_norm": 0.44140625, + "learning_rate": 0.0002602048092833199, + "loss": 3.4394, + "step": 11810 + }, + { + "epoch": 0.49730526315789475, + "grad_norm": 0.4453125, + "learning_rate": 0.00026017109303272113, + "loss": 3.0019, + "step": 11811 + }, + { + "epoch": 0.49734736842105265, + "grad_norm": 0.421875, + "learning_rate": 0.00026013737659681706, + "loss": 3.1913, + "step": 11812 + }, + { + "epoch": 0.49738947368421055, + "grad_norm": 0.408203125, + "learning_rate": 0.0002601036599762219, + "loss": 3.1737, + "step": 11813 + }, + { + "epoch": 0.49743157894736845, + "grad_norm": 0.421875, + "learning_rate": 0.00026006994317155, + "loss": 3.2834, + "step": 11814 + }, + { + "epoch": 0.4974736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00026003622618341553, + "loss": 3.6543, + "step": 11815 + }, + { + "epoch": 0.4975157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.0002600025090124328, + "loss": 3.3228, + "step": 11816 + }, + { + "epoch": 0.4975578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00025996879165921606, + "loss": 3.434, + "step": 11817 + }, + { + "epoch": 0.4976, + "grad_norm": 0.412109375, + "learning_rate": 0.0002599350741243797, + "loss": 3.4325, + "step": 11818 + }, + { + "epoch": 0.4976421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.0002599013564085379, + "loss": 3.0194, + "step": 11819 + }, + { + "epoch": 0.4976842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00025986763851230504, + "loss": 2.9731, + "step": 11820 + }, + { + "epoch": 0.4977263157894737, + "grad_norm": 0.484375, + "learning_rate": 0.00025983392043629544, + "loss": 3.3173, + "step": 11821 + }, + { + "epoch": 0.4977684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.0002598002021811234, + "loss": 3.2137, + "step": 11822 + }, + { + "epoch": 0.4978105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.0002597664837474031, + "loss": 3.4264, + "step": 11823 + }, + { + "epoch": 0.4978526315789474, + "grad_norm": 0.455078125, + "learning_rate": 0.00025973276513574896, + "loss": 3.1748, + "step": 11824 + }, + { + "epoch": 0.4978947368421053, + "grad_norm": 0.416015625, + "learning_rate": 0.00025969904634677533, + "loss": 2.7568, + "step": 11825 + }, + { + "epoch": 0.4979368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.0002596653273810964, + "loss": 3.4514, + "step": 11826 + }, + { + "epoch": 0.4979789473684211, + "grad_norm": 0.42578125, + "learning_rate": 0.0002596316082393266, + "loss": 3.0647, + "step": 11827 + }, + { + "epoch": 0.4980210526315789, + "grad_norm": 0.4296875, + "learning_rate": 0.00025959788892208023, + "loss": 3.1129, + "step": 11828 + }, + { + "epoch": 0.4980631578947368, + "grad_norm": 0.41015625, + "learning_rate": 0.00025956416942997157, + "loss": 3.181, + "step": 11829 + }, + { + "epoch": 0.4981052631578947, + "grad_norm": 0.4140625, + "learning_rate": 0.000259530449763615, + "loss": 3.2907, + "step": 11830 + }, + { + "epoch": 0.4981473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.0002594967299236248, + "loss": 3.1404, + "step": 11831 + }, + { + "epoch": 0.4981894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.0002594630099106154, + "loss": 3.0203, + "step": 11832 + }, + { + "epoch": 0.4982315789473684, + "grad_norm": 0.419921875, + "learning_rate": 0.00025942928972520106, + "loss": 3.284, + "step": 11833 + }, + { + "epoch": 0.4982736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.00025939556936799616, + "loss": 2.9613, + "step": 11834 + }, + { + "epoch": 0.4983157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.00025936184883961497, + "loss": 3.2107, + "step": 11835 + }, + { + "epoch": 0.4983578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.0002593281281406719, + "loss": 3.4872, + "step": 11836 + }, + { + "epoch": 0.4984, + "grad_norm": 0.435546875, + "learning_rate": 0.00025929440727178127, + "loss": 2.4908, + "step": 11837 + }, + { + "epoch": 0.4984421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00025926068623355754, + "loss": 3.2222, + "step": 11838 + }, + { + "epoch": 0.4984842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.0002592269650266149, + "loss": 3.5823, + "step": 11839 + }, + { + "epoch": 0.4985263157894737, + "grad_norm": 0.40234375, + "learning_rate": 0.0002591932436515679, + "loss": 3.4019, + "step": 11840 + }, + { + "epoch": 0.4985684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.0002591595221090308, + "loss": 3.2646, + "step": 11841 + }, + { + "epoch": 0.49861052631578945, + "grad_norm": 0.396484375, + "learning_rate": 0.00025912580039961786, + "loss": 3.4698, + "step": 11842 + }, + { + "epoch": 0.49865263157894735, + "grad_norm": 0.39453125, + "learning_rate": 0.00025909207852394363, + "loss": 3.3831, + "step": 11843 + }, + { + "epoch": 0.49869473684210525, + "grad_norm": 0.5078125, + "learning_rate": 0.0002590583564826224, + "loss": 2.6153, + "step": 11844 + }, + { + "epoch": 0.49873684210526315, + "grad_norm": 0.4296875, + "learning_rate": 0.0002590246342762685, + "loss": 2.8707, + "step": 11845 + }, + { + "epoch": 0.49877894736842104, + "grad_norm": 0.443359375, + "learning_rate": 0.00025899091190549644, + "loss": 3.2202, + "step": 11846 + }, + { + "epoch": 0.49882105263157894, + "grad_norm": 0.419921875, + "learning_rate": 0.0002589571893709205, + "loss": 3.2959, + "step": 11847 + }, + { + "epoch": 0.49886315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00025892346667315513, + "loss": 3.4278, + "step": 11848 + }, + { + "epoch": 0.49890526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00025888974381281467, + "loss": 3.1796, + "step": 11849 + }, + { + "epoch": 0.49894736842105264, + "grad_norm": 0.4375, + "learning_rate": 0.00025885602079051353, + "loss": 2.8782, + "step": 11850 + }, + { + "epoch": 0.49898947368421054, + "grad_norm": 0.451171875, + "learning_rate": 0.00025882229760686604, + "loss": 2.8731, + "step": 11851 + }, + { + "epoch": 0.49903157894736844, + "grad_norm": 0.419921875, + "learning_rate": 0.0002587885742624867, + "loss": 3.3547, + "step": 11852 + }, + { + "epoch": 0.49907368421052634, + "grad_norm": 0.431640625, + "learning_rate": 0.0002587548507579898, + "loss": 3.0811, + "step": 11853 + }, + { + "epoch": 0.49911578947368423, + "grad_norm": 0.43359375, + "learning_rate": 0.00025872112709398985, + "loss": 3.2308, + "step": 11854 + }, + { + "epoch": 0.4991578947368421, + "grad_norm": 0.466796875, + "learning_rate": 0.0002586874032711013, + "loss": 3.591, + "step": 11855 + }, + { + "epoch": 0.4992, + "grad_norm": 0.431640625, + "learning_rate": 0.0002586536792899384, + "loss": 2.988, + "step": 11856 + }, + { + "epoch": 0.4992421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00025861995515111556, + "loss": 3.383, + "step": 11857 + }, + { + "epoch": 0.4992842105263158, + "grad_norm": 0.396484375, + "learning_rate": 0.00025858623085524735, + "loss": 2.7262, + "step": 11858 + }, + { + "epoch": 0.4993263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.00025855250640294806, + "loss": 2.8443, + "step": 11859 + }, + { + "epoch": 0.4993684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.0002585187817948322, + "loss": 3.2173, + "step": 11860 + }, + { + "epoch": 0.49941052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0002584850570315141, + "loss": 3.369, + "step": 11861 + }, + { + "epoch": 0.49945263157894737, + "grad_norm": 0.404296875, + "learning_rate": 0.00025845133211360824, + "loss": 3.1172, + "step": 11862 + }, + { + "epoch": 0.49949473684210527, + "grad_norm": 0.376953125, + "learning_rate": 0.00025841760704172905, + "loss": 2.3914, + "step": 11863 + }, + { + "epoch": 0.49953684210526317, + "grad_norm": 0.431640625, + "learning_rate": 0.0002583838818164909, + "loss": 3.3364, + "step": 11864 + }, + { + "epoch": 0.49957894736842107, + "grad_norm": 0.431640625, + "learning_rate": 0.0002583501564385083, + "loss": 3.0707, + "step": 11865 + }, + { + "epoch": 0.49962105263157897, + "grad_norm": 0.43359375, + "learning_rate": 0.00025831643090839574, + "loss": 3.2036, + "step": 11866 + }, + { + "epoch": 0.49966315789473686, + "grad_norm": 0.435546875, + "learning_rate": 0.00025828270522676753, + "loss": 2.9932, + "step": 11867 + }, + { + "epoch": 0.49970526315789476, + "grad_norm": 0.400390625, + "learning_rate": 0.00025824897939423817, + "loss": 3.3564, + "step": 11868 + }, + { + "epoch": 0.4997473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00025821525341142206, + "loss": 2.5034, + "step": 11869 + }, + { + "epoch": 0.4997894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.0002581815272789337, + "loss": 3.0827, + "step": 11870 + }, + { + "epoch": 0.4998315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.00025814780099738747, + "loss": 3.2617, + "step": 11871 + }, + { + "epoch": 0.4998736842105263, + "grad_norm": 0.40234375, + "learning_rate": 0.000258114074567398, + "loss": 3.366, + "step": 11872 + }, + { + "epoch": 0.4999157894736842, + "grad_norm": 0.458984375, + "learning_rate": 0.0002580803479895796, + "loss": 2.9887, + "step": 11873 + }, + { + "epoch": 0.4999578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.0002580466212645467, + "loss": 3.4758, + "step": 11874 + }, + { + "epoch": 0.5, + "grad_norm": 0.41015625, + "learning_rate": 0.00025801289439291385, + "loss": 2.5793, + "step": 11875 + }, + { + "epoch": 0.5000421052631578, + "grad_norm": 0.41796875, + "learning_rate": 0.00025797916737529546, + "loss": 3.5292, + "step": 11876 + }, + { + "epoch": 0.5000842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.000257945440212306, + "loss": 3.0543, + "step": 11877 + }, + { + "epoch": 0.5001263157894736, + "grad_norm": 0.4296875, + "learning_rate": 0.00025791171290455987, + "loss": 3.3211, + "step": 11878 + }, + { + "epoch": 0.5001684210526316, + "grad_norm": 0.50390625, + "learning_rate": 0.0002578779854526718, + "loss": 3.1241, + "step": 11879 + }, + { + "epoch": 0.5002105263157894, + "grad_norm": 0.4609375, + "learning_rate": 0.000257844257857256, + "loss": 3.6102, + "step": 11880 + }, + { + "epoch": 0.5002526315789474, + "grad_norm": 0.45703125, + "learning_rate": 0.0002578105301189271, + "loss": 3.3974, + "step": 11881 + }, + { + "epoch": 0.5002947368421052, + "grad_norm": 0.404296875, + "learning_rate": 0.0002577768022382995, + "loss": 3.3197, + "step": 11882 + }, + { + "epoch": 0.5003368421052632, + "grad_norm": 0.4140625, + "learning_rate": 0.0002577430742159877, + "loss": 2.8323, + "step": 11883 + }, + { + "epoch": 0.500378947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0002577093460526061, + "loss": 2.7986, + "step": 11884 + }, + { + "epoch": 0.500421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00025767561774876936, + "loss": 3.0178, + "step": 11885 + }, + { + "epoch": 0.5004631578947368, + "grad_norm": 0.427734375, + "learning_rate": 0.0002576418893050919, + "loss": 3.3002, + "step": 11886 + }, + { + "epoch": 0.5005052631578948, + "grad_norm": 0.43359375, + "learning_rate": 0.00025760816072218804, + "loss": 3.0102, + "step": 11887 + }, + { + "epoch": 0.5005473684210526, + "grad_norm": 0.43359375, + "learning_rate": 0.0002575744320006726, + "loss": 3.3164, + "step": 11888 + }, + { + "epoch": 0.5005894736842105, + "grad_norm": 0.458984375, + "learning_rate": 0.0002575407031411599, + "loss": 2.9908, + "step": 11889 + }, + { + "epoch": 0.5006315789473684, + "grad_norm": 0.45703125, + "learning_rate": 0.0002575069741442644, + "loss": 3.0685, + "step": 11890 + }, + { + "epoch": 0.5006736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.0002574732450106006, + "loss": 2.9118, + "step": 11891 + }, + { + "epoch": 0.5007157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.00025743951574078313, + "loss": 3.1362, + "step": 11892 + }, + { + "epoch": 0.5007578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.00025740578633542645, + "loss": 2.7773, + "step": 11893 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4296875, + "learning_rate": 0.000257372056795145, + "loss": 3.1506, + "step": 11894 + }, + { + "epoch": 0.5008421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.00025733832712055327, + "loss": 2.9039, + "step": 11895 + }, + { + "epoch": 0.5008842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.000257304597312266, + "loss": 2.9184, + "step": 11896 + }, + { + "epoch": 0.5009263157894737, + "grad_norm": 0.396484375, + "learning_rate": 0.0002572708673708974, + "loss": 2.8301, + "step": 11897 + }, + { + "epoch": 0.5009684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.0002572371372970622, + "loss": 2.8448, + "step": 11898 + }, + { + "epoch": 0.5010105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00025720340709137483, + "loss": 2.9318, + "step": 11899 + }, + { + "epoch": 0.5010526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.0002571696767544499, + "loss": 3.0695, + "step": 11900 + }, + { + "epoch": 0.5010947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.0002571359462869019, + "loss": 3.5978, + "step": 11901 + }, + { + "epoch": 0.5011368421052631, + "grad_norm": 0.400390625, + "learning_rate": 0.00025710221568934534, + "loss": 3.0096, + "step": 11902 + }, + { + "epoch": 0.5011789473684211, + "grad_norm": 0.439453125, + "learning_rate": 0.0002570684849623947, + "loss": 3.462, + "step": 11903 + }, + { + "epoch": 0.5012210526315789, + "grad_norm": 0.4296875, + "learning_rate": 0.0002570347541066645, + "loss": 3.1149, + "step": 11904 + }, + { + "epoch": 0.5012631578947369, + "grad_norm": 0.453125, + "learning_rate": 0.00025700102312276946, + "loss": 2.7461, + "step": 11905 + }, + { + "epoch": 0.5013052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00025696729201132396, + "loss": 3.3615, + "step": 11906 + }, + { + "epoch": 0.5013473684210527, + "grad_norm": 0.39453125, + "learning_rate": 0.0002569335607729426, + "loss": 3.2187, + "step": 11907 + }, + { + "epoch": 0.5013894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.00025689982940823995, + "loss": 2.4131, + "step": 11908 + }, + { + "epoch": 0.5014315789473685, + "grad_norm": 0.4296875, + "learning_rate": 0.0002568660979178305, + "loss": 2.9688, + "step": 11909 + }, + { + "epoch": 0.5014736842105263, + "grad_norm": 0.412109375, + "learning_rate": 0.0002568323663023288, + "loss": 2.9243, + "step": 11910 + }, + { + "epoch": 0.5015157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.00025679863456234937, + "loss": 3.2165, + "step": 11911 + }, + { + "epoch": 0.5015578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.0002567649026985068, + "loss": 3.2529, + "step": 11912 + }, + { + "epoch": 0.5016, + "grad_norm": 0.431640625, + "learning_rate": 0.0002567311707114157, + "loss": 3.3742, + "step": 11913 + }, + { + "epoch": 0.5016421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.00025669743860169057, + "loss": 3.3712, + "step": 11914 + }, + { + "epoch": 0.5016842105263158, + "grad_norm": 0.3984375, + "learning_rate": 0.000256663706369946, + "loss": 2.9002, + "step": 11915 + }, + { + "epoch": 0.5017263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00025662997401679655, + "loss": 3.1564, + "step": 11916 + }, + { + "epoch": 0.5017684210526315, + "grad_norm": 0.40625, + "learning_rate": 0.00025659624154285673, + "loss": 3.456, + "step": 11917 + }, + { + "epoch": 0.5018105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.00025656250894874106, + "loss": 3.2188, + "step": 11918 + }, + { + "epoch": 0.5018526315789473, + "grad_norm": 0.455078125, + "learning_rate": 0.00025652877623506424, + "loss": 3.2334, + "step": 11919 + }, + { + "epoch": 0.5018947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.00025649504340244085, + "loss": 2.9544, + "step": 11920 + }, + { + "epoch": 0.5019368421052631, + "grad_norm": 0.44921875, + "learning_rate": 0.00025646131045148536, + "loss": 3.4514, + "step": 11921 + }, + { + "epoch": 0.5019789473684211, + "grad_norm": 0.416015625, + "learning_rate": 0.00025642757738281234, + "loss": 2.8682, + "step": 11922 + }, + { + "epoch": 0.5020210526315789, + "grad_norm": 0.52734375, + "learning_rate": 0.0002563938441970365, + "loss": 3.3967, + "step": 11923 + }, + { + "epoch": 0.5020631578947369, + "grad_norm": 0.427734375, + "learning_rate": 0.0002563601108947723, + "loss": 2.9892, + "step": 11924 + }, + { + "epoch": 0.5021052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.0002563263774766343, + "loss": 3.3888, + "step": 11925 + }, + { + "epoch": 0.5021473684210527, + "grad_norm": 0.400390625, + "learning_rate": 0.0002562926439432372, + "loss": 2.9315, + "step": 11926 + }, + { + "epoch": 0.5021894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.0002562589102951955, + "loss": 3.1879, + "step": 11927 + }, + { + "epoch": 0.5022315789473685, + "grad_norm": 0.4296875, + "learning_rate": 0.0002562251765331238, + "loss": 3.6553, + "step": 11928 + }, + { + "epoch": 0.5022736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.0002561914426576367, + "loss": 3.1002, + "step": 11929 + }, + { + "epoch": 0.5023157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0002561577086693488, + "loss": 3.1038, + "step": 11930 + }, + { + "epoch": 0.5023578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.0002561239745688746, + "loss": 2.7897, + "step": 11931 + }, + { + "epoch": 0.5024, + "grad_norm": 0.431640625, + "learning_rate": 0.0002560902403568289, + "loss": 3.1365, + "step": 11932 + }, + { + "epoch": 0.5024421052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.0002560565060338261, + "loss": 3.1951, + "step": 11933 + }, + { + "epoch": 0.5024842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00025602277160048095, + "loss": 3.7368, + "step": 11934 + }, + { + "epoch": 0.5025263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.00025598903705740796, + "loss": 2.8116, + "step": 11935 + }, + { + "epoch": 0.5025684210526316, + "grad_norm": 0.451171875, + "learning_rate": 0.0002559553024052216, + "loss": 3.518, + "step": 11936 + }, + { + "epoch": 0.5026105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.00025592156764453684, + "loss": 3.515, + "step": 11937 + }, + { + "epoch": 0.5026526315789474, + "grad_norm": 0.4375, + "learning_rate": 0.0002558878327759679, + "loss": 3.3112, + "step": 11938 + }, + { + "epoch": 0.5026947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.00025585409780012967, + "loss": 3.3916, + "step": 11939 + }, + { + "epoch": 0.5027368421052631, + "grad_norm": 0.3984375, + "learning_rate": 0.0002558203627176367, + "loss": 2.9375, + "step": 11940 + }, + { + "epoch": 0.5027789473684211, + "grad_norm": 0.43359375, + "learning_rate": 0.0002557866275291035, + "loss": 2.9133, + "step": 11941 + }, + { + "epoch": 0.502821052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00025575289223514475, + "loss": 2.917, + "step": 11942 + }, + { + "epoch": 0.5028631578947368, + "grad_norm": 0.419921875, + "learning_rate": 0.0002557191568363751, + "loss": 3.0016, + "step": 11943 + }, + { + "epoch": 0.5029052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.000255685421333409, + "loss": 3.3195, + "step": 11944 + }, + { + "epoch": 0.5029473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.00025565168572686133, + "loss": 3.2903, + "step": 11945 + }, + { + "epoch": 0.5029894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.0002556179500173465, + "loss": 3.6244, + "step": 11946 + }, + { + "epoch": 0.5030315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.0002555842142054793, + "loss": 3.1248, + "step": 11947 + }, + { + "epoch": 0.5030736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.0002555504782918742, + "loss": 3.5486, + "step": 11948 + }, + { + "epoch": 0.5031157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.000255516742277146, + "loss": 2.926, + "step": 11949 + }, + { + "epoch": 0.5031578947368421, + "grad_norm": 0.390625, + "learning_rate": 0.00025548300616190923, + "loss": 2.6506, + "step": 11950 + }, + { + "epoch": 0.5032, + "grad_norm": 0.404296875, + "learning_rate": 0.00025544926994677846, + "loss": 2.9996, + "step": 11951 + }, + { + "epoch": 0.5032421052631579, + "grad_norm": 0.474609375, + "learning_rate": 0.00025541553363236843, + "loss": 2.9144, + "step": 11952 + }, + { + "epoch": 0.5032842105263158, + "grad_norm": 0.50390625, + "learning_rate": 0.0002553817972192937, + "loss": 2.6542, + "step": 11953 + }, + { + "epoch": 0.5033263157894737, + "grad_norm": 0.3984375, + "learning_rate": 0.000255348060708169, + "loss": 3.3758, + "step": 11954 + }, + { + "epoch": 0.5033684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00025531432409960894, + "loss": 3.543, + "step": 11955 + }, + { + "epoch": 0.5034105263157894, + "grad_norm": 0.41015625, + "learning_rate": 0.00025528058739422813, + "loss": 2.9508, + "step": 11956 + }, + { + "epoch": 0.5034526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.0002552468505926411, + "loss": 3.195, + "step": 11957 + }, + { + "epoch": 0.5034947368421052, + "grad_norm": 0.423828125, + "learning_rate": 0.0002552131136954627, + "loss": 3.1182, + "step": 11958 + }, + { + "epoch": 0.5035368421052632, + "grad_norm": 0.435546875, + "learning_rate": 0.00025517937670330756, + "loss": 3.4034, + "step": 11959 + }, + { + "epoch": 0.503578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.0002551456396167902, + "loss": 2.8364, + "step": 11960 + }, + { + "epoch": 0.503621052631579, + "grad_norm": 0.490234375, + "learning_rate": 0.0002551119024365254, + "loss": 3.4364, + "step": 11961 + }, + { + "epoch": 0.5036631578947368, + "grad_norm": 0.44140625, + "learning_rate": 0.0002550781651631276, + "loss": 3.1737, + "step": 11962 + }, + { + "epoch": 0.5037052631578948, + "grad_norm": 0.4375, + "learning_rate": 0.0002550444277972117, + "loss": 2.8238, + "step": 11963 + }, + { + "epoch": 0.5037473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.0002550106903393922, + "loss": 3.046, + "step": 11964 + }, + { + "epoch": 0.5037894736842106, + "grad_norm": 0.419921875, + "learning_rate": 0.0002549769527902838, + "loss": 3.4918, + "step": 11965 + }, + { + "epoch": 0.5038315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.0002549432151505013, + "loss": 3.7213, + "step": 11966 + }, + { + "epoch": 0.5038736842105264, + "grad_norm": 0.419921875, + "learning_rate": 0.0002549094774206591, + "loss": 2.9697, + "step": 11967 + }, + { + "epoch": 0.5039157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.00025487573960137206, + "loss": 3.066, + "step": 11968 + }, + { + "epoch": 0.5039578947368422, + "grad_norm": 0.447265625, + "learning_rate": 0.00025484200169325477, + "loss": 3.2236, + "step": 11969 + }, + { + "epoch": 0.504, + "grad_norm": 0.4140625, + "learning_rate": 0.0002548082636969218, + "loss": 3.4485, + "step": 11970 + }, + { + "epoch": 0.5040421052631578, + "grad_norm": 0.419921875, + "learning_rate": 0.000254774525612988, + "loss": 3.5289, + "step": 11971 + }, + { + "epoch": 0.5040842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.0002547407874420679, + "loss": 3.4352, + "step": 11972 + }, + { + "epoch": 0.5041263157894736, + "grad_norm": 0.419921875, + "learning_rate": 0.0002547070491847763, + "loss": 3.6419, + "step": 11973 + }, + { + "epoch": 0.5041684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.00025467331084172776, + "loss": 3.2991, + "step": 11974 + }, + { + "epoch": 0.5042105263157894, + "grad_norm": 0.43359375, + "learning_rate": 0.00025463957241353695, + "loss": 2.7614, + "step": 11975 + }, + { + "epoch": 0.5042526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.00025460583390081866, + "loss": 3.1424, + "step": 11976 + }, + { + "epoch": 0.5042947368421052, + "grad_norm": 0.41015625, + "learning_rate": 0.00025457209530418745, + "loss": 3.0214, + "step": 11977 + }, + { + "epoch": 0.5043368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.000254538356624258, + "loss": 3.3662, + "step": 11978 + }, + { + "epoch": 0.504378947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00025450461786164495, + "loss": 3.1607, + "step": 11979 + }, + { + "epoch": 0.504421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.0002544708790169632, + "loss": 3.3502, + "step": 11980 + }, + { + "epoch": 0.5044631578947368, + "grad_norm": 0.40625, + "learning_rate": 0.0002544371400908273, + "loss": 2.8075, + "step": 11981 + }, + { + "epoch": 0.5045052631578948, + "grad_norm": 0.431640625, + "learning_rate": 0.00025440340108385184, + "loss": 3.174, + "step": 11982 + }, + { + "epoch": 0.5045473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.0002543696619966516, + "loss": 2.7678, + "step": 11983 + }, + { + "epoch": 0.5045894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.0002543359228298413, + "loss": 3.1831, + "step": 11984 + }, + { + "epoch": 0.5046315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.0002543021835840355, + "loss": 2.8303, + "step": 11985 + }, + { + "epoch": 0.5046736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.00025426844425984905, + "loss": 3.5595, + "step": 11986 + }, + { + "epoch": 0.5047157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00025423470485789643, + "loss": 3.2853, + "step": 11987 + }, + { + "epoch": 0.5047578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00025420096537879264, + "loss": 2.78, + "step": 11988 + }, + { + "epoch": 0.5048, + "grad_norm": 0.3984375, + "learning_rate": 0.0002541672258231521, + "loss": 2.9894, + "step": 11989 + }, + { + "epoch": 0.5048421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.0002541334861915897, + "loss": 3.319, + "step": 11990 + }, + { + "epoch": 0.5048842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.0002540997464847199, + "loss": 3.0665, + "step": 11991 + }, + { + "epoch": 0.5049263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00025406600670315756, + "loss": 3.1564, + "step": 11992 + }, + { + "epoch": 0.5049684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0002540322668475174, + "loss": 3.6432, + "step": 11993 + }, + { + "epoch": 0.5050105263157895, + "grad_norm": 0.44921875, + "learning_rate": 0.0002539985269184141, + "loss": 3.6628, + "step": 11994 + }, + { + "epoch": 0.5050526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.00025396478691646225, + "loss": 3.0249, + "step": 11995 + }, + { + "epoch": 0.5050947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.0002539310468422767, + "loss": 3.1347, + "step": 11996 + }, + { + "epoch": 0.5051368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.0002538973066964721, + "loss": 3.2627, + "step": 11997 + }, + { + "epoch": 0.5051789473684211, + "grad_norm": 0.4140625, + "learning_rate": 0.00025386356647966315, + "loss": 3.1881, + "step": 11998 + }, + { + "epoch": 0.5052210526315789, + "grad_norm": 0.4296875, + "learning_rate": 0.0002538298261924645, + "loss": 3.4062, + "step": 11999 + }, + { + "epoch": 0.5052631578947369, + "grad_norm": 0.421875, + "learning_rate": 0.000253796085835491, + "loss": 3.5201, + "step": 12000 + }, + { + "epoch": 0.5052631578947369, + "eval_loss": 3.1277520656585693, + "eval_runtime": 335.5448, + "eval_samples_per_second": 44.703, + "eval_steps_per_second": 5.588, + "step": 12000 + }, + { + "epoch": 0.5053052631578947, + "grad_norm": 0.416015625, + "learning_rate": 0.00025376234540935724, + "loss": 3.1084, + "step": 12001 + }, + { + "epoch": 0.5053473684210527, + "grad_norm": 0.4140625, + "learning_rate": 0.00025372860491467797, + "loss": 3.5781, + "step": 12002 + }, + { + "epoch": 0.5053894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.00025369486435206784, + "loss": 3.2449, + "step": 12003 + }, + { + "epoch": 0.5054315789473685, + "grad_norm": 0.435546875, + "learning_rate": 0.0002536611237221417, + "loss": 3.1852, + "step": 12004 + }, + { + "epoch": 0.5054736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.0002536273830255141, + "loss": 3.1418, + "step": 12005 + }, + { + "epoch": 0.5055157894736843, + "grad_norm": 0.412109375, + "learning_rate": 0.00025359364226279995, + "loss": 3.1013, + "step": 12006 + }, + { + "epoch": 0.5055578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00025355990143461385, + "loss": 3.0224, + "step": 12007 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4375, + "learning_rate": 0.0002535261605415705, + "loss": 2.8494, + "step": 12008 + }, + { + "epoch": 0.5056421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00025349241958428465, + "loss": 3.1999, + "step": 12009 + }, + { + "epoch": 0.5056842105263157, + "grad_norm": 0.40625, + "learning_rate": 0.000253458678563371, + "loss": 2.7035, + "step": 12010 + }, + { + "epoch": 0.5057263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00025342493747944426, + "loss": 2.6765, + "step": 12011 + }, + { + "epoch": 0.5057684210526315, + "grad_norm": 0.46875, + "learning_rate": 0.00025339119633311925, + "loss": 3.5958, + "step": 12012 + }, + { + "epoch": 0.5058105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00025335745512501055, + "loss": 3.2025, + "step": 12013 + }, + { + "epoch": 0.5058526315789473, + "grad_norm": 0.404296875, + "learning_rate": 0.00025332371385573306, + "loss": 3.2225, + "step": 12014 + }, + { + "epoch": 0.5058947368421053, + "grad_norm": 0.4375, + "learning_rate": 0.0002532899725259014, + "loss": 3.1904, + "step": 12015 + }, + { + "epoch": 0.5059368421052631, + "grad_norm": 0.408203125, + "learning_rate": 0.0002532562311361303, + "loss": 3.1173, + "step": 12016 + }, + { + "epoch": 0.5059789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.0002532224896870344, + "loss": 3.4095, + "step": 12017 + }, + { + "epoch": 0.5060210526315789, + "grad_norm": 0.423828125, + "learning_rate": 0.00025318874817922864, + "loss": 3.4642, + "step": 12018 + }, + { + "epoch": 0.5060631578947369, + "grad_norm": 0.41796875, + "learning_rate": 0.0002531550066133276, + "loss": 3.3437, + "step": 12019 + }, + { + "epoch": 0.5061052631578947, + "grad_norm": 0.4453125, + "learning_rate": 0.00025312126498994603, + "loss": 2.8973, + "step": 12020 + }, + { + "epoch": 0.5061473684210527, + "grad_norm": 0.451171875, + "learning_rate": 0.0002530875233096986, + "loss": 3.1795, + "step": 12021 + }, + { + "epoch": 0.5061894736842105, + "grad_norm": 0.470703125, + "learning_rate": 0.00025305378157320025, + "loss": 2.9435, + "step": 12022 + }, + { + "epoch": 0.5062315789473684, + "grad_norm": 0.4140625, + "learning_rate": 0.0002530200397810655, + "loss": 3.1416, + "step": 12023 + }, + { + "epoch": 0.5062736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0002529862979339093, + "loss": 3.2411, + "step": 12024 + }, + { + "epoch": 0.5063157894736842, + "grad_norm": 0.400390625, + "learning_rate": 0.00025295255603234617, + "loss": 2.8795, + "step": 12025 + }, + { + "epoch": 0.5063578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0002529188140769909, + "loss": 3.1479, + "step": 12026 + }, + { + "epoch": 0.5064, + "grad_norm": 0.439453125, + "learning_rate": 0.00025288507206845835, + "loss": 3.1442, + "step": 12027 + }, + { + "epoch": 0.5064421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.0002528513300073631, + "loss": 2.9922, + "step": 12028 + }, + { + "epoch": 0.5064842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00025281758789432, + "loss": 2.7362, + "step": 12029 + }, + { + "epoch": 0.5065263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.0002527838457299438, + "loss": 2.718, + "step": 12030 + }, + { + "epoch": 0.5065684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0002527501035148492, + "loss": 3.397, + "step": 12031 + }, + { + "epoch": 0.5066105263157895, + "grad_norm": 0.3984375, + "learning_rate": 0.0002527163612496508, + "loss": 2.4836, + "step": 12032 + }, + { + "epoch": 0.5066526315789474, + "grad_norm": 0.4140625, + "learning_rate": 0.0002526826189349636, + "loss": 3.2031, + "step": 12033 + }, + { + "epoch": 0.5066947368421053, + "grad_norm": 0.4375, + "learning_rate": 0.0002526488765714023, + "loss": 3.0282, + "step": 12034 + }, + { + "epoch": 0.5067368421052632, + "grad_norm": 0.40625, + "learning_rate": 0.0002526151341595815, + "loss": 3.2466, + "step": 12035 + }, + { + "epoch": 0.5067789473684211, + "grad_norm": 0.482421875, + "learning_rate": 0.00025258139170011607, + "loss": 3.0334, + "step": 12036 + }, + { + "epoch": 0.506821052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0002525476491936207, + "loss": 2.9378, + "step": 12037 + }, + { + "epoch": 0.5068631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.0002525139066407101, + "loss": 3.3028, + "step": 12038 + }, + { + "epoch": 0.5069052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.0002524801640419991, + "loss": 3.4621, + "step": 12039 + }, + { + "epoch": 0.5069473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.0002524464213981025, + "loss": 3.241, + "step": 12040 + }, + { + "epoch": 0.5069894736842105, + "grad_norm": 0.439453125, + "learning_rate": 0.0002524126787096349, + "loss": 3.3807, + "step": 12041 + }, + { + "epoch": 0.5070315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.0002523789359772112, + "loss": 3.2486, + "step": 12042 + }, + { + "epoch": 0.5070736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.000252345193201446, + "loss": 3.6243, + "step": 12043 + }, + { + "epoch": 0.5071157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.00025231145038295415, + "loss": 3.0721, + "step": 12044 + }, + { + "epoch": 0.5071578947368421, + "grad_norm": 0.498046875, + "learning_rate": 0.0002522777075223504, + "loss": 3.4299, + "step": 12045 + }, + { + "epoch": 0.5072, + "grad_norm": 0.423828125, + "learning_rate": 0.00025224396462024943, + "loss": 3.1958, + "step": 12046 + }, + { + "epoch": 0.5072421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0002522102216772662, + "loss": 3.0929, + "step": 12047 + }, + { + "epoch": 0.5072842105263158, + "grad_norm": 0.55859375, + "learning_rate": 0.00025217647869401526, + "loss": 3.1264, + "step": 12048 + }, + { + "epoch": 0.5073263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00025214273567111144, + "loss": 3.2995, + "step": 12049 + }, + { + "epoch": 0.5073684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.0002521089926091695, + "loss": 2.853, + "step": 12050 + }, + { + "epoch": 0.5074105263157894, + "grad_norm": 0.41796875, + "learning_rate": 0.00025207524950880423, + "loss": 3.5925, + "step": 12051 + }, + { + "epoch": 0.5074526315789474, + "grad_norm": 0.46484375, + "learning_rate": 0.00025204150637063026, + "loss": 3.211, + "step": 12052 + }, + { + "epoch": 0.5074947368421052, + "grad_norm": 0.435546875, + "learning_rate": 0.00025200776319526247, + "loss": 3.6927, + "step": 12053 + }, + { + "epoch": 0.5075368421052632, + "grad_norm": 0.4609375, + "learning_rate": 0.00025197401998331564, + "loss": 2.9643, + "step": 12054 + }, + { + "epoch": 0.507578947368421, + "grad_norm": 0.3984375, + "learning_rate": 0.0002519402767354044, + "loss": 3.0827, + "step": 12055 + }, + { + "epoch": 0.507621052631579, + "grad_norm": 0.490234375, + "learning_rate": 0.0002519065334521437, + "loss": 3.1019, + "step": 12056 + }, + { + "epoch": 0.5076631578947368, + "grad_norm": 0.44140625, + "learning_rate": 0.00025187279013414815, + "loss": 3.094, + "step": 12057 + }, + { + "epoch": 0.5077052631578948, + "grad_norm": 0.470703125, + "learning_rate": 0.00025183904678203263, + "loss": 3.1969, + "step": 12058 + }, + { + "epoch": 0.5077473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.00025180530339641175, + "loss": 3.5149, + "step": 12059 + }, + { + "epoch": 0.5077894736842106, + "grad_norm": 0.5390625, + "learning_rate": 0.00025177155997790034, + "loss": 2.7354, + "step": 12060 + }, + { + "epoch": 0.5078315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.0002517378165271133, + "loss": 3.0681, + "step": 12061 + }, + { + "epoch": 0.5078736842105264, + "grad_norm": 0.431640625, + "learning_rate": 0.00025170407304466526, + "loss": 3.3541, + "step": 12062 + }, + { + "epoch": 0.5079157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.0002516703295311709, + "loss": 2.9952, + "step": 12063 + }, + { + "epoch": 0.507957894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0002516365859872452, + "loss": 2.8894, + "step": 12064 + }, + { + "epoch": 0.508, + "grad_norm": 0.412109375, + "learning_rate": 0.0002516028424135028, + "loss": 3.0108, + "step": 12065 + }, + { + "epoch": 0.5080421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00025156909881055847, + "loss": 3.2818, + "step": 12066 + }, + { + "epoch": 0.5080842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00025153535517902703, + "loss": 2.9561, + "step": 12067 + }, + { + "epoch": 0.5081263157894736, + "grad_norm": 0.42578125, + "learning_rate": 0.00025150161151952323, + "loss": 2.9998, + "step": 12068 + }, + { + "epoch": 0.5081684210526316, + "grad_norm": 0.453125, + "learning_rate": 0.00025146786783266186, + "loss": 3.1814, + "step": 12069 + }, + { + "epoch": 0.5082105263157894, + "grad_norm": 0.427734375, + "learning_rate": 0.0002514341241190576, + "loss": 3.215, + "step": 12070 + }, + { + "epoch": 0.5082526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.0002514003803793253, + "loss": 3.3154, + "step": 12071 + }, + { + "epoch": 0.5082947368421052, + "grad_norm": 0.412109375, + "learning_rate": 0.00025136663661407975, + "loss": 3.0814, + "step": 12072 + }, + { + "epoch": 0.5083368421052632, + "grad_norm": 0.423828125, + "learning_rate": 0.0002513328928239356, + "loss": 3.1449, + "step": 12073 + }, + { + "epoch": 0.508378947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00025129914900950777, + "loss": 3.1711, + "step": 12074 + }, + { + "epoch": 0.508421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.000251265405171411, + "loss": 3.1284, + "step": 12075 + }, + { + "epoch": 0.5084631578947368, + "grad_norm": 0.376953125, + "learning_rate": 0.00025123166131026006, + "loss": 2.1731, + "step": 12076 + }, + { + "epoch": 0.5085052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.00025119791742666967, + "loss": 3.0633, + "step": 12077 + }, + { + "epoch": 0.5085473684210526, + "grad_norm": 0.40625, + "learning_rate": 0.0002511641735212546, + "loss": 3.2886, + "step": 12078 + }, + { + "epoch": 0.5085894736842105, + "grad_norm": 0.38671875, + "learning_rate": 0.00025113042959462963, + "loss": 3.3694, + "step": 12079 + }, + { + "epoch": 0.5086315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.0002510966856474096, + "loss": 3.1678, + "step": 12080 + }, + { + "epoch": 0.5086736842105263, + "grad_norm": 0.453125, + "learning_rate": 0.0002510629416802092, + "loss": 3.2288, + "step": 12081 + }, + { + "epoch": 0.5087157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.0002510291976936433, + "loss": 3.2909, + "step": 12082 + }, + { + "epoch": 0.5087578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00025099545368832666, + "loss": 3.3384, + "step": 12083 + }, + { + "epoch": 0.5088, + "grad_norm": 0.42578125, + "learning_rate": 0.00025096170966487397, + "loss": 3.3906, + "step": 12084 + }, + { + "epoch": 0.5088421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.0002509279656239001, + "loss": 3.2106, + "step": 12085 + }, + { + "epoch": 0.5088842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0002508942215660197, + "loss": 3.0348, + "step": 12086 + }, + { + "epoch": 0.5089263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0002508604774918477, + "loss": 3.1138, + "step": 12087 + }, + { + "epoch": 0.5089684210526316, + "grad_norm": 0.4609375, + "learning_rate": 0.0002508267334019988, + "loss": 3.0862, + "step": 12088 + }, + { + "epoch": 0.5090105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.0002507929892970878, + "loss": 3.2202, + "step": 12089 + }, + { + "epoch": 0.5090526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.0002507592451777295, + "loss": 3.2161, + "step": 12090 + }, + { + "epoch": 0.5090947368421053, + "grad_norm": 0.5078125, + "learning_rate": 0.0002507255010445386, + "loss": 3.1247, + "step": 12091 + }, + { + "epoch": 0.5091368421052631, + "grad_norm": 0.423828125, + "learning_rate": 0.0002506917568981299, + "loss": 2.7256, + "step": 12092 + }, + { + "epoch": 0.5091789473684211, + "grad_norm": 0.4296875, + "learning_rate": 0.0002506580127391182, + "loss": 3.1457, + "step": 12093 + }, + { + "epoch": 0.5092210526315789, + "grad_norm": 0.498046875, + "learning_rate": 0.00025062426856811834, + "loss": 3.0477, + "step": 12094 + }, + { + "epoch": 0.5092631578947369, + "grad_norm": 0.43359375, + "learning_rate": 0.00025059052438574503, + "loss": 3.3582, + "step": 12095 + }, + { + "epoch": 0.5093052631578947, + "grad_norm": 0.4140625, + "learning_rate": 0.00025055678019261297, + "loss": 3.2627, + "step": 12096 + }, + { + "epoch": 0.5093473684210527, + "grad_norm": 0.52734375, + "learning_rate": 0.0002505230359893371, + "loss": 3.0709, + "step": 12097 + }, + { + "epoch": 0.5093894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.0002504892917765322, + "loss": 3.1097, + "step": 12098 + }, + { + "epoch": 0.5094315789473685, + "grad_norm": 0.4375, + "learning_rate": 0.0002504555475548129, + "loss": 3.1494, + "step": 12099 + }, + { + "epoch": 0.5094736842105263, + "grad_norm": 0.458984375, + "learning_rate": 0.000250421803324794, + "loss": 3.3128, + "step": 12100 + }, + { + "epoch": 0.5095157894736843, + "grad_norm": 0.443359375, + "learning_rate": 0.00025038805908709046, + "loss": 3.0236, + "step": 12101 + }, + { + "epoch": 0.5095578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.00025035431484231687, + "loss": 2.6067, + "step": 12102 + }, + { + "epoch": 0.5096, + "grad_norm": 0.4765625, + "learning_rate": 0.0002503205705910881, + "loss": 3.1457, + "step": 12103 + }, + { + "epoch": 0.5096421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.0002502868263340189, + "loss": 3.1816, + "step": 12104 + }, + { + "epoch": 0.5096842105263157, + "grad_norm": 0.412109375, + "learning_rate": 0.000250253082071724, + "loss": 3.1244, + "step": 12105 + }, + { + "epoch": 0.5097263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.00025021933780481834, + "loss": 3.3044, + "step": 12106 + }, + { + "epoch": 0.5097684210526315, + "grad_norm": 0.423828125, + "learning_rate": 0.00025018559353391656, + "loss": 3.0747, + "step": 12107 + }, + { + "epoch": 0.5098105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.00025015184925963354, + "loss": 3.0653, + "step": 12108 + }, + { + "epoch": 0.5098526315789473, + "grad_norm": 0.4453125, + "learning_rate": 0.00025011810498258395, + "loss": 3.2287, + "step": 12109 + }, + { + "epoch": 0.5098947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.00025008436070338266, + "loss": 2.655, + "step": 12110 + }, + { + "epoch": 0.5099368421052631, + "grad_norm": 0.421875, + "learning_rate": 0.0002500506164226444, + "loss": 3.2279, + "step": 12111 + }, + { + "epoch": 0.5099789473684211, + "grad_norm": 0.43359375, + "learning_rate": 0.00025001687214098393, + "loss": 3.1841, + "step": 12112 + }, + { + "epoch": 0.5100210526315789, + "grad_norm": 0.42578125, + "learning_rate": 0.00024998312785901613, + "loss": 3.4688, + "step": 12113 + }, + { + "epoch": 0.5100631578947369, + "grad_norm": 0.4140625, + "learning_rate": 0.0002499493835773557, + "loss": 3.502, + "step": 12114 + }, + { + "epoch": 0.5101052631578947, + "grad_norm": 0.451171875, + "learning_rate": 0.00024991563929661735, + "loss": 3.0315, + "step": 12115 + }, + { + "epoch": 0.5101473684210527, + "grad_norm": 0.44921875, + "learning_rate": 0.00024988189501741606, + "loss": 3.0181, + "step": 12116 + }, + { + "epoch": 0.5101894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00024984815074036653, + "loss": 3.5937, + "step": 12117 + }, + { + "epoch": 0.5102315789473684, + "grad_norm": 0.388671875, + "learning_rate": 0.0002498144064660834, + "loss": 2.9864, + "step": 12118 + }, + { + "epoch": 0.5102736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.00024978066219518173, + "loss": 3.341, + "step": 12119 + }, + { + "epoch": 0.5103157894736842, + "grad_norm": 0.396484375, + "learning_rate": 0.000249746917928276, + "loss": 3.2266, + "step": 12120 + }, + { + "epoch": 0.5103578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00024971317366598116, + "loss": 3.032, + "step": 12121 + }, + { + "epoch": 0.5104, + "grad_norm": 0.419921875, + "learning_rate": 0.0002496794294089119, + "loss": 3.4523, + "step": 12122 + }, + { + "epoch": 0.5104421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.0002496456851576831, + "loss": 3.3017, + "step": 12123 + }, + { + "epoch": 0.5104842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.0002496119409129096, + "loss": 2.8967, + "step": 12124 + }, + { + "epoch": 0.5105263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.00024957819667520593, + "loss": 3.4734, + "step": 12125 + }, + { + "epoch": 0.5105684210526316, + "grad_norm": 0.4453125, + "learning_rate": 0.0002495444524451872, + "loss": 2.9296, + "step": 12126 + }, + { + "epoch": 0.5106105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.00024951070822346787, + "loss": 2.9928, + "step": 12127 + }, + { + "epoch": 0.5106526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.0002494769640106629, + "loss": 3.6764, + "step": 12128 + }, + { + "epoch": 0.5106947368421053, + "grad_norm": 0.416015625, + "learning_rate": 0.00024944321980738704, + "loss": 3.5326, + "step": 12129 + }, + { + "epoch": 0.5107368421052632, + "grad_norm": 0.46484375, + "learning_rate": 0.00024940947561425504, + "loss": 2.9893, + "step": 12130 + }, + { + "epoch": 0.510778947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.0002493757314318817, + "loss": 3.2129, + "step": 12131 + }, + { + "epoch": 0.510821052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.0002493419872608818, + "loss": 3.1903, + "step": 12132 + }, + { + "epoch": 0.5108631578947368, + "grad_norm": 0.400390625, + "learning_rate": 0.0002493082431018702, + "loss": 2.9323, + "step": 12133 + }, + { + "epoch": 0.5109052631578948, + "grad_norm": 0.40625, + "learning_rate": 0.0002492744989554615, + "loss": 2.8906, + "step": 12134 + }, + { + "epoch": 0.5109473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.0002492407548222706, + "loss": 3.4325, + "step": 12135 + }, + { + "epoch": 0.5109894736842105, + "grad_norm": 0.4140625, + "learning_rate": 0.0002492070107029123, + "loss": 3.6045, + "step": 12136 + }, + { + "epoch": 0.5110315789473684, + "grad_norm": 0.4921875, + "learning_rate": 0.0002491732665980012, + "loss": 3.3395, + "step": 12137 + }, + { + "epoch": 0.5110736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0002491395225081523, + "loss": 3.3997, + "step": 12138 + }, + { + "epoch": 0.5111157894736842, + "grad_norm": 0.404296875, + "learning_rate": 0.00024910577843398026, + "loss": 2.8872, + "step": 12139 + }, + { + "epoch": 0.5111578947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.00024907203437610003, + "loss": 3.0892, + "step": 12140 + }, + { + "epoch": 0.5112, + "grad_norm": 0.4453125, + "learning_rate": 0.00024903829033512604, + "loss": 3.5654, + "step": 12141 + }, + { + "epoch": 0.5112421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00024900454631167346, + "loss": 3.155, + "step": 12142 + }, + { + "epoch": 0.5112842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.00024897080230635676, + "loss": 3.1129, + "step": 12143 + }, + { + "epoch": 0.5113263157894736, + "grad_norm": 0.43359375, + "learning_rate": 0.0002489370583197908, + "loss": 3.2347, + "step": 12144 + }, + { + "epoch": 0.5113684210526316, + "grad_norm": 0.484375, + "learning_rate": 0.0002489033143525905, + "loss": 2.69, + "step": 12145 + }, + { + "epoch": 0.5114105263157894, + "grad_norm": 0.404296875, + "learning_rate": 0.00024886957040537043, + "loss": 3.3338, + "step": 12146 + }, + { + "epoch": 0.5114526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.0002488358264787455, + "loss": 2.9988, + "step": 12147 + }, + { + "epoch": 0.5114947368421052, + "grad_norm": 0.439453125, + "learning_rate": 0.0002488020825733304, + "loss": 2.9507, + "step": 12148 + }, + { + "epoch": 0.5115368421052632, + "grad_norm": 0.396484375, + "learning_rate": 0.00024876833868974006, + "loss": 2.901, + "step": 12149 + }, + { + "epoch": 0.511578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.000248734594828589, + "loss": 3.1979, + "step": 12150 + }, + { + "epoch": 0.511621052631579, + "grad_norm": 0.49609375, + "learning_rate": 0.0002487008509904922, + "loss": 3.811, + "step": 12151 + }, + { + "epoch": 0.5116631578947368, + "grad_norm": 0.41015625, + "learning_rate": 0.0002486671071760644, + "loss": 3.4091, + "step": 12152 + }, + { + "epoch": 0.5117052631578948, + "grad_norm": 0.427734375, + "learning_rate": 0.0002486333633859203, + "loss": 3.0215, + "step": 12153 + }, + { + "epoch": 0.5117473684210526, + "grad_norm": 0.46484375, + "learning_rate": 0.0002485996196206747, + "loss": 3.2772, + "step": 12154 + }, + { + "epoch": 0.5117894736842106, + "grad_norm": 0.44921875, + "learning_rate": 0.0002485658758809424, + "loss": 3.1454, + "step": 12155 + }, + { + "epoch": 0.5118315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.00024853213216733826, + "loss": 2.9137, + "step": 12156 + }, + { + "epoch": 0.5118736842105264, + "grad_norm": 0.44140625, + "learning_rate": 0.0002484983884804768, + "loss": 3.0146, + "step": 12157 + }, + { + "epoch": 0.5119157894736842, + "grad_norm": 0.48828125, + "learning_rate": 0.00024846464482097293, + "loss": 3.1763, + "step": 12158 + }, + { + "epoch": 0.5119578947368421, + "grad_norm": 0.46875, + "learning_rate": 0.0002484309011894416, + "loss": 3.2043, + "step": 12159 + }, + { + "epoch": 0.512, + "grad_norm": 0.44921875, + "learning_rate": 0.0002483971575864972, + "loss": 3.3013, + "step": 12160 + }, + { + "epoch": 0.5120421052631579, + "grad_norm": 0.453125, + "learning_rate": 0.00024836341401275494, + "loss": 3.1218, + "step": 12161 + }, + { + "epoch": 0.5120842105263158, + "grad_norm": 0.50390625, + "learning_rate": 0.00024832967046882915, + "loss": 3.0082, + "step": 12162 + }, + { + "epoch": 0.5121263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0002482959269553348, + "loss": 3.3194, + "step": 12163 + }, + { + "epoch": 0.5121684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00024826218347288675, + "loss": 3.3184, + "step": 12164 + }, + { + "epoch": 0.5122105263157894, + "grad_norm": 0.490234375, + "learning_rate": 0.0002482284400220996, + "loss": 2.9509, + "step": 12165 + }, + { + "epoch": 0.5122526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.0002481946966035883, + "loss": 3.5582, + "step": 12166 + }, + { + "epoch": 0.5122947368421052, + "grad_norm": 0.41015625, + "learning_rate": 0.00024816095321796743, + "loss": 3.4749, + "step": 12167 + }, + { + "epoch": 0.5123368421052632, + "grad_norm": 0.4453125, + "learning_rate": 0.0002481272098658519, + "loss": 3.3094, + "step": 12168 + }, + { + "epoch": 0.512378947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00024809346654785635, + "loss": 2.8079, + "step": 12169 + }, + { + "epoch": 0.512421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.00024805972326459553, + "loss": 3.0434, + "step": 12170 + }, + { + "epoch": 0.5124631578947368, + "grad_norm": 0.396484375, + "learning_rate": 0.00024802598001668437, + "loss": 2.9434, + "step": 12171 + }, + { + "epoch": 0.5125052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.00024799223680473754, + "loss": 3.3398, + "step": 12172 + }, + { + "epoch": 0.5125473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.0002479584936293698, + "loss": 3.215, + "step": 12173 + }, + { + "epoch": 0.5125894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.00024792475049119583, + "loss": 2.7238, + "step": 12174 + }, + { + "epoch": 0.5126315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.00024789100739083054, + "loss": 3.1644, + "step": 12175 + }, + { + "epoch": 0.5126736842105263, + "grad_norm": 0.46484375, + "learning_rate": 0.00024785726432888857, + "loss": 2.9208, + "step": 12176 + }, + { + "epoch": 0.5127157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.00024782352130598475, + "loss": 3.4491, + "step": 12177 + }, + { + "epoch": 0.5127578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00024778977832273384, + "loss": 3.524, + "step": 12178 + }, + { + "epoch": 0.5128, + "grad_norm": 0.427734375, + "learning_rate": 0.0002477560353797505, + "loss": 3.1669, + "step": 12179 + }, + { + "epoch": 0.5128421052631579, + "grad_norm": 0.466796875, + "learning_rate": 0.00024772229247764965, + "loss": 3.3603, + "step": 12180 + }, + { + "epoch": 0.5128842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.00024768854961704586, + "loss": 3.387, + "step": 12181 + }, + { + "epoch": 0.5129263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.0002476548067985541, + "loss": 3.3329, + "step": 12182 + }, + { + "epoch": 0.5129684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00024762106402278887, + "loss": 3.0421, + "step": 12183 + }, + { + "epoch": 0.5130105263157895, + "grad_norm": 0.404296875, + "learning_rate": 0.0002475873212903651, + "loss": 3.0694, + "step": 12184 + }, + { + "epoch": 0.5130526315789473, + "grad_norm": 0.455078125, + "learning_rate": 0.0002475535786018976, + "loss": 3.4284, + "step": 12185 + }, + { + "epoch": 0.5130947368421053, + "grad_norm": 0.478515625, + "learning_rate": 0.0002475198359580009, + "loss": 3.0238, + "step": 12186 + }, + { + "epoch": 0.5131368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.00024748609335928997, + "loss": 2.8265, + "step": 12187 + }, + { + "epoch": 0.5131789473684211, + "grad_norm": 0.423828125, + "learning_rate": 0.0002474523508063794, + "loss": 2.8474, + "step": 12188 + }, + { + "epoch": 0.5132210526315789, + "grad_norm": 0.416015625, + "learning_rate": 0.000247418608299884, + "loss": 2.464, + "step": 12189 + }, + { + "epoch": 0.5132631578947369, + "grad_norm": 0.435546875, + "learning_rate": 0.00024738486584041853, + "loss": 3.3961, + "step": 12190 + }, + { + "epoch": 0.5133052631578947, + "grad_norm": 0.390625, + "learning_rate": 0.0002473511234285977, + "loss": 2.8654, + "step": 12191 + }, + { + "epoch": 0.5133473684210527, + "grad_norm": 0.41015625, + "learning_rate": 0.0002473173810650364, + "loss": 3.6859, + "step": 12192 + }, + { + "epoch": 0.5133894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.00024728363875034916, + "loss": 3.2257, + "step": 12193 + }, + { + "epoch": 0.5134315789473685, + "grad_norm": 0.419921875, + "learning_rate": 0.00024724989648515093, + "loss": 2.8332, + "step": 12194 + }, + { + "epoch": 0.5134736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.0002472161542700563, + "loss": 3.198, + "step": 12195 + }, + { + "epoch": 0.5135157894736843, + "grad_norm": 0.427734375, + "learning_rate": 0.00024718241210568, + "loss": 3.609, + "step": 12196 + }, + { + "epoch": 0.5135578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00024714866999263693, + "loss": 3.1794, + "step": 12197 + }, + { + "epoch": 0.5136, + "grad_norm": 0.41015625, + "learning_rate": 0.00024711492793154166, + "loss": 3.2849, + "step": 12198 + }, + { + "epoch": 0.5136421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00024708118592300915, + "loss": 3.2719, + "step": 12199 + }, + { + "epoch": 0.5136842105263157, + "grad_norm": 0.40234375, + "learning_rate": 0.0002470474439676539, + "loss": 2.8864, + "step": 12200 + }, + { + "epoch": 0.5137263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.00024701370206609084, + "loss": 3.0759, + "step": 12201 + }, + { + "epoch": 0.5137684210526315, + "grad_norm": 0.443359375, + "learning_rate": 0.0002469799602189345, + "loss": 2.9301, + "step": 12202 + }, + { + "epoch": 0.5138105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00024694621842679987, + "loss": 2.9565, + "step": 12203 + }, + { + "epoch": 0.5138526315789473, + "grad_norm": 0.439453125, + "learning_rate": 0.00024691247669030145, + "loss": 2.9639, + "step": 12204 + }, + { + "epoch": 0.5138947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.00024687873501005403, + "loss": 3.2476, + "step": 12205 + }, + { + "epoch": 0.5139368421052631, + "grad_norm": 0.41015625, + "learning_rate": 0.0002468449933866725, + "loss": 3.1205, + "step": 12206 + }, + { + "epoch": 0.5139789473684211, + "grad_norm": 0.43359375, + "learning_rate": 0.00024681125182077137, + "loss": 3.1007, + "step": 12207 + }, + { + "epoch": 0.5140210526315789, + "grad_norm": 0.453125, + "learning_rate": 0.00024677751031296566, + "loss": 2.6997, + "step": 12208 + }, + { + "epoch": 0.5140631578947369, + "grad_norm": 0.423828125, + "learning_rate": 0.00024674376886386977, + "loss": 3.1469, + "step": 12209 + }, + { + "epoch": 0.5141052631578947, + "grad_norm": 0.447265625, + "learning_rate": 0.0002467100274740987, + "loss": 2.9534, + "step": 12210 + }, + { + "epoch": 0.5141473684210527, + "grad_norm": 0.52734375, + "learning_rate": 0.000246676286144267, + "loss": 3.0507, + "step": 12211 + }, + { + "epoch": 0.5141894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0002466425448749894, + "loss": 3.4129, + "step": 12212 + }, + { + "epoch": 0.5142315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.00024660880366688076, + "loss": 3.2609, + "step": 12213 + }, + { + "epoch": 0.5142736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00024657506252055575, + "loss": 3.2974, + "step": 12214 + }, + { + "epoch": 0.5143157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.0002465413214366291, + "loss": 3.147, + "step": 12215 + }, + { + "epoch": 0.5143578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0002465075804157154, + "loss": 2.4456, + "step": 12216 + }, + { + "epoch": 0.5144, + "grad_norm": 0.421875, + "learning_rate": 0.0002464738394584295, + "loss": 3.297, + "step": 12217 + }, + { + "epoch": 0.5144421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.0002464400985653862, + "loss": 3.6292, + "step": 12218 + }, + { + "epoch": 0.5144842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.00024640635773720006, + "loss": 3.2386, + "step": 12219 + }, + { + "epoch": 0.5145263157894737, + "grad_norm": 0.451171875, + "learning_rate": 0.0002463726169744859, + "loss": 3.0935, + "step": 12220 + }, + { + "epoch": 0.5145684210526316, + "grad_norm": 0.40234375, + "learning_rate": 0.00024633887627785835, + "loss": 2.9387, + "step": 12221 + }, + { + "epoch": 0.5146105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00024630513564793217, + "loss": 2.6977, + "step": 12222 + }, + { + "epoch": 0.5146526315789474, + "grad_norm": 0.447265625, + "learning_rate": 0.00024627139508532204, + "loss": 3.055, + "step": 12223 + }, + { + "epoch": 0.5146947368421053, + "grad_norm": 0.4140625, + "learning_rate": 0.00024623765459064277, + "loss": 3.5394, + "step": 12224 + }, + { + "epoch": 0.5147368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.0002462039141645091, + "loss": 3.0072, + "step": 12225 + }, + { + "epoch": 0.514778947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00024617017380753546, + "loss": 2.8815, + "step": 12226 + }, + { + "epoch": 0.514821052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00024613643352033697, + "loss": 2.8721, + "step": 12227 + }, + { + "epoch": 0.5148631578947368, + "grad_norm": 0.578125, + "learning_rate": 0.0002461026933035279, + "loss": 2.9788, + "step": 12228 + }, + { + "epoch": 0.5149052631578948, + "grad_norm": 0.427734375, + "learning_rate": 0.0002460689531577233, + "loss": 3.3951, + "step": 12229 + }, + { + "epoch": 0.5149473684210526, + "grad_norm": 0.49609375, + "learning_rate": 0.0002460352130835378, + "loss": 2.9128, + "step": 12230 + }, + { + "epoch": 0.5149894736842106, + "grad_norm": 0.443359375, + "learning_rate": 0.000246001473081586, + "loss": 3.2425, + "step": 12231 + }, + { + "epoch": 0.5150315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.0002459677331524826, + "loss": 3.0483, + "step": 12232 + }, + { + "epoch": 0.5150736842105264, + "grad_norm": 0.41015625, + "learning_rate": 0.0002459339932968424, + "loss": 3.0331, + "step": 12233 + }, + { + "epoch": 0.5151157894736842, + "grad_norm": 0.458984375, + "learning_rate": 0.0002459002535152802, + "loss": 2.8052, + "step": 12234 + }, + { + "epoch": 0.5151578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.0002458665138084104, + "loss": 3.1245, + "step": 12235 + }, + { + "epoch": 0.5152, + "grad_norm": 0.43359375, + "learning_rate": 0.000245832774176848, + "loss": 3.4676, + "step": 12236 + }, + { + "epoch": 0.515242105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.0002457990346212074, + "loss": 3.2111, + "step": 12237 + }, + { + "epoch": 0.5152842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0002457652951421035, + "loss": 3.2141, + "step": 12238 + }, + { + "epoch": 0.5153263157894736, + "grad_norm": 0.453125, + "learning_rate": 0.000245731555740151, + "loss": 2.7014, + "step": 12239 + }, + { + "epoch": 0.5153684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.00024569781641596447, + "loss": 3.2407, + "step": 12240 + }, + { + "epoch": 0.5154105263157894, + "grad_norm": 0.435546875, + "learning_rate": 0.0002456640771701588, + "loss": 3.2367, + "step": 12241 + }, + { + "epoch": 0.5154526315789474, + "grad_norm": 0.4140625, + "learning_rate": 0.0002456303380033484, + "loss": 3.0961, + "step": 12242 + }, + { + "epoch": 0.5154947368421052, + "grad_norm": 0.416015625, + "learning_rate": 0.00024559659891614823, + "loss": 3.3362, + "step": 12243 + }, + { + "epoch": 0.5155368421052632, + "grad_norm": 0.42578125, + "learning_rate": 0.0002455628599091728, + "loss": 3.2345, + "step": 12244 + }, + { + "epoch": 0.515578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00024552912098303677, + "loss": 3.051, + "step": 12245 + }, + { + "epoch": 0.515621052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00024549538213835506, + "loss": 3.188, + "step": 12246 + }, + { + "epoch": 0.5156631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.00024546164337574205, + "loss": 3.2395, + "step": 12247 + }, + { + "epoch": 0.5157052631578948, + "grad_norm": 0.390625, + "learning_rate": 0.0002454279046958126, + "loss": 3.292, + "step": 12248 + }, + { + "epoch": 0.5157473684210526, + "grad_norm": 0.45703125, + "learning_rate": 0.0002453941660991814, + "loss": 3.4256, + "step": 12249 + }, + { + "epoch": 0.5157894736842106, + "grad_norm": 0.416015625, + "learning_rate": 0.0002453604275864631, + "loss": 3.2756, + "step": 12250 + }, + { + "epoch": 0.5158315789473684, + "grad_norm": 0.47265625, + "learning_rate": 0.0002453266891582723, + "loss": 3.3899, + "step": 12251 + }, + { + "epoch": 0.5158736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.0002452929508152237, + "loss": 2.4669, + "step": 12252 + }, + { + "epoch": 0.5159157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.00024525921255793214, + "loss": 3.1027, + "step": 12253 + }, + { + "epoch": 0.5159578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00024522547438701205, + "loss": 3.2144, + "step": 12254 + }, + { + "epoch": 0.516, + "grad_norm": 0.423828125, + "learning_rate": 0.0002451917363030782, + "loss": 3.2645, + "step": 12255 + }, + { + "epoch": 0.5160421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0002451579983067453, + "loss": 3.2968, + "step": 12256 + }, + { + "epoch": 0.5160842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00024512426039862805, + "loss": 3.382, + "step": 12257 + }, + { + "epoch": 0.5161263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.0002450905225793409, + "loss": 3.0617, + "step": 12258 + }, + { + "epoch": 0.5161684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.0002450567848494987, + "loss": 3.0794, + "step": 12259 + }, + { + "epoch": 0.5162105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.0002450230472097162, + "loss": 3.5174, + "step": 12260 + }, + { + "epoch": 0.5162526315789474, + "grad_norm": 0.451171875, + "learning_rate": 0.0002449893096606078, + "loss": 3.0724, + "step": 12261 + }, + { + "epoch": 0.5162947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.0002449555722027884, + "loss": 3.0555, + "step": 12262 + }, + { + "epoch": 0.5163368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.0002449218348368724, + "loss": 3.5196, + "step": 12263 + }, + { + "epoch": 0.516378947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0002448880975634747, + "loss": 3.1938, + "step": 12264 + }, + { + "epoch": 0.516421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00024485436038320984, + "loss": 2.8785, + "step": 12265 + }, + { + "epoch": 0.5164631578947368, + "grad_norm": 0.423828125, + "learning_rate": 0.00024482062329669245, + "loss": 3.3379, + "step": 12266 + }, + { + "epoch": 0.5165052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.00024478688630453734, + "loss": 3.3396, + "step": 12267 + }, + { + "epoch": 0.5165473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.0002447531494073589, + "loss": 3.0865, + "step": 12268 + }, + { + "epoch": 0.5165894736842105, + "grad_norm": 0.396484375, + "learning_rate": 0.00024471941260577204, + "loss": 2.5941, + "step": 12269 + }, + { + "epoch": 0.5166315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.0002446856759003911, + "loss": 2.9426, + "step": 12270 + }, + { + "epoch": 0.5166736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.000244651939291831, + "loss": 3.482, + "step": 12271 + }, + { + "epoch": 0.5167157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0002446182027807063, + "loss": 3.0566, + "step": 12272 + }, + { + "epoch": 0.5167578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00024458446636763163, + "loss": 3.3214, + "step": 12273 + }, + { + "epoch": 0.5168, + "grad_norm": 0.431640625, + "learning_rate": 0.0002445507300532216, + "loss": 3.0885, + "step": 12274 + }, + { + "epoch": 0.5168421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00024451699383809083, + "loss": 2.9159, + "step": 12275 + }, + { + "epoch": 0.5168842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.0002444832577228541, + "loss": 3.3702, + "step": 12276 + }, + { + "epoch": 0.5169263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.0002444495217081258, + "loss": 3.1155, + "step": 12277 + }, + { + "epoch": 0.5169684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.0002444157857945207, + "loss": 2.7856, + "step": 12278 + }, + { + "epoch": 0.5170105263157895, + "grad_norm": 0.46484375, + "learning_rate": 0.00024438204998265356, + "loss": 3.3925, + "step": 12279 + }, + { + "epoch": 0.5170526315789473, + "grad_norm": 0.4453125, + "learning_rate": 0.00024434831427313873, + "loss": 3.1533, + "step": 12280 + }, + { + "epoch": 0.5170947368421053, + "grad_norm": 0.4296875, + "learning_rate": 0.000244314578666591, + "loss": 3.1405, + "step": 12281 + }, + { + "epoch": 0.5171368421052631, + "grad_norm": 0.44140625, + "learning_rate": 0.00024428084316362495, + "loss": 3.144, + "step": 12282 + }, + { + "epoch": 0.5171789473684211, + "grad_norm": 0.42578125, + "learning_rate": 0.0002442471077648553, + "loss": 3.4128, + "step": 12283 + }, + { + "epoch": 0.5172210526315789, + "grad_norm": 0.443359375, + "learning_rate": 0.00024421337247089653, + "loss": 2.9597, + "step": 12284 + }, + { + "epoch": 0.5172631578947369, + "grad_norm": 0.431640625, + "learning_rate": 0.0002441796372823633, + "loss": 3.3521, + "step": 12285 + }, + { + "epoch": 0.5173052631578947, + "grad_norm": 0.416015625, + "learning_rate": 0.00024414590219987034, + "loss": 3.2852, + "step": 12286 + }, + { + "epoch": 0.5173473684210527, + "grad_norm": 0.40234375, + "learning_rate": 0.00024411216722403206, + "loss": 2.9238, + "step": 12287 + }, + { + "epoch": 0.5173894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.0002440784323554632, + "loss": 3.099, + "step": 12288 + }, + { + "epoch": 0.5174315789473685, + "grad_norm": 0.431640625, + "learning_rate": 0.00024404469759477837, + "loss": 3.1167, + "step": 12289 + }, + { + "epoch": 0.5174736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00024401096294259216, + "loss": 3.13, + "step": 12290 + }, + { + "epoch": 0.5175157894736843, + "grad_norm": 0.44921875, + "learning_rate": 0.00024397722839951912, + "loss": 3.5161, + "step": 12291 + }, + { + "epoch": 0.5175578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.00024394349396617386, + "loss": 3.1147, + "step": 12292 + }, + { + "epoch": 0.5176, + "grad_norm": 0.421875, + "learning_rate": 0.00024390975964317117, + "loss": 3.2513, + "step": 12293 + }, + { + "epoch": 0.5176421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00024387602543112538, + "loss": 2.9772, + "step": 12294 + }, + { + "epoch": 0.5176842105263157, + "grad_norm": 0.443359375, + "learning_rate": 0.0002438422913306513, + "loss": 3.2483, + "step": 12295 + }, + { + "epoch": 0.5177263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.00024380855734236338, + "loss": 3.0495, + "step": 12296 + }, + { + "epoch": 0.5177684210526315, + "grad_norm": 0.466796875, + "learning_rate": 0.0002437748234668763, + "loss": 2.714, + "step": 12297 + }, + { + "epoch": 0.5178105263157895, + "grad_norm": 0.390625, + "learning_rate": 0.00024374108970480455, + "loss": 3.358, + "step": 12298 + }, + { + "epoch": 0.5178526315789473, + "grad_norm": 0.4296875, + "learning_rate": 0.0002437073560567628, + "loss": 3.141, + "step": 12299 + }, + { + "epoch": 0.5178947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00024367362252336575, + "loss": 2.8486, + "step": 12300 + }, + { + "epoch": 0.5179368421052631, + "grad_norm": 0.421875, + "learning_rate": 0.00024363988910522776, + "loss": 2.6724, + "step": 12301 + }, + { + "epoch": 0.5179789473684211, + "grad_norm": 0.439453125, + "learning_rate": 0.00024360615580296357, + "loss": 3.2516, + "step": 12302 + }, + { + "epoch": 0.5180210526315789, + "grad_norm": 0.423828125, + "learning_rate": 0.00024357242261718767, + "loss": 3.4128, + "step": 12303 + }, + { + "epoch": 0.5180631578947369, + "grad_norm": 0.4140625, + "learning_rate": 0.0002435386895485147, + "loss": 2.6205, + "step": 12304 + }, + { + "epoch": 0.5181052631578947, + "grad_norm": 0.466796875, + "learning_rate": 0.0002435049565975592, + "loss": 3.1658, + "step": 12305 + }, + { + "epoch": 0.5181473684210526, + "grad_norm": 0.578125, + "learning_rate": 0.00024347122376493574, + "loss": 2.8273, + "step": 12306 + }, + { + "epoch": 0.5181894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00024343749105125895, + "loss": 2.8882, + "step": 12307 + }, + { + "epoch": 0.5182315789473684, + "grad_norm": 0.4453125, + "learning_rate": 0.00024340375845714334, + "loss": 3.1004, + "step": 12308 + }, + { + "epoch": 0.5182736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00024337002598320357, + "loss": 2.9158, + "step": 12309 + }, + { + "epoch": 0.5183157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.00024333629363005403, + "loss": 3.1073, + "step": 12310 + }, + { + "epoch": 0.5183578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.0002433025613983095, + "loss": 3.5573, + "step": 12311 + }, + { + "epoch": 0.5184, + "grad_norm": 0.453125, + "learning_rate": 0.00024326882928858435, + "loss": 3.2337, + "step": 12312 + }, + { + "epoch": 0.5184421052631579, + "grad_norm": 0.55078125, + "learning_rate": 0.00024323509730149318, + "loss": 3.5741, + "step": 12313 + }, + { + "epoch": 0.5184842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.00024320136543765067, + "loss": 3.2324, + "step": 12314 + }, + { + "epoch": 0.5185263157894737, + "grad_norm": 0.46484375, + "learning_rate": 0.00024316763369767127, + "loss": 3.2759, + "step": 12315 + }, + { + "epoch": 0.5185684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.0002431339020821696, + "loss": 3.2133, + "step": 12316 + }, + { + "epoch": 0.5186105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.0002431001705917601, + "loss": 3.2245, + "step": 12317 + }, + { + "epoch": 0.5186526315789474, + "grad_norm": 0.412109375, + "learning_rate": 0.00024306643922705737, + "loss": 2.9696, + "step": 12318 + }, + { + "epoch": 0.5186947368421052, + "grad_norm": 0.47265625, + "learning_rate": 0.00024303270798867608, + "loss": 3.1864, + "step": 12319 + }, + { + "epoch": 0.5187368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.00024299897687723055, + "loss": 3.1686, + "step": 12320 + }, + { + "epoch": 0.518778947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00024296524589333555, + "loss": 3.2667, + "step": 12321 + }, + { + "epoch": 0.518821052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00024293151503760541, + "loss": 3.3006, + "step": 12322 + }, + { + "epoch": 0.5188631578947368, + "grad_norm": 0.4140625, + "learning_rate": 0.00024289778431065478, + "loss": 2.759, + "step": 12323 + }, + { + "epoch": 0.5189052631578948, + "grad_norm": 0.431640625, + "learning_rate": 0.00024286405371309818, + "loss": 3.1951, + "step": 12324 + }, + { + "epoch": 0.5189473684210526, + "grad_norm": 0.439453125, + "learning_rate": 0.00024283032324555007, + "loss": 3.4533, + "step": 12325 + }, + { + "epoch": 0.5189894736842106, + "grad_norm": 0.42578125, + "learning_rate": 0.00024279659290862518, + "loss": 3.0924, + "step": 12326 + }, + { + "epoch": 0.5190315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00024276286270293783, + "loss": 2.7843, + "step": 12327 + }, + { + "epoch": 0.5190736842105264, + "grad_norm": 0.4296875, + "learning_rate": 0.00024272913262910267, + "loss": 3.1106, + "step": 12328 + }, + { + "epoch": 0.5191157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.0002426954026877341, + "loss": 2.5767, + "step": 12329 + }, + { + "epoch": 0.5191578947368422, + "grad_norm": 0.412109375, + "learning_rate": 0.00024266167287944674, + "loss": 2.9513, + "step": 12330 + }, + { + "epoch": 0.5192, + "grad_norm": 0.40234375, + "learning_rate": 0.00024262794320485505, + "loss": 3.1142, + "step": 12331 + }, + { + "epoch": 0.519242105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.00024259421366457364, + "loss": 3.1643, + "step": 12332 + }, + { + "epoch": 0.5192842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.0002425604842592169, + "loss": 3.1648, + "step": 12333 + }, + { + "epoch": 0.5193263157894736, + "grad_norm": 0.435546875, + "learning_rate": 0.00024252675498939938, + "loss": 3.6169, + "step": 12334 + }, + { + "epoch": 0.5193684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00024249302585573572, + "loss": 3.1232, + "step": 12335 + }, + { + "epoch": 0.5194105263157894, + "grad_norm": 1.1953125, + "learning_rate": 0.00024245929685884015, + "loss": 3.4468, + "step": 12336 + }, + { + "epoch": 0.5194526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.0002424255679993275, + "loss": 3.1718, + "step": 12337 + }, + { + "epoch": 0.5194947368421052, + "grad_norm": 0.439453125, + "learning_rate": 0.00024239183927781194, + "loss": 2.8976, + "step": 12338 + }, + { + "epoch": 0.5195368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.00024235811069490818, + "loss": 2.8872, + "step": 12339 + }, + { + "epoch": 0.519578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00024232438225123067, + "loss": 3.3939, + "step": 12340 + }, + { + "epoch": 0.519621052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.0002422906539473939, + "loss": 2.9512, + "step": 12341 + }, + { + "epoch": 0.5196631578947368, + "grad_norm": 0.44921875, + "learning_rate": 0.0002422569257840124, + "loss": 3.4896, + "step": 12342 + }, + { + "epoch": 0.5197052631578948, + "grad_norm": 0.4453125, + "learning_rate": 0.00024222319776170053, + "loss": 3.3663, + "step": 12343 + }, + { + "epoch": 0.5197473684210526, + "grad_norm": 0.44140625, + "learning_rate": 0.00024218946988107298, + "loss": 3.1689, + "step": 12344 + }, + { + "epoch": 0.5197894736842106, + "grad_norm": 0.4296875, + "learning_rate": 0.00024215574214274402, + "loss": 2.9509, + "step": 12345 + }, + { + "epoch": 0.5198315789473684, + "grad_norm": 0.462890625, + "learning_rate": 0.00024212201454732818, + "loss": 3.1362, + "step": 12346 + }, + { + "epoch": 0.5198736842105263, + "grad_norm": 0.453125, + "learning_rate": 0.00024208828709544012, + "loss": 2.6479, + "step": 12347 + }, + { + "epoch": 0.5199157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00024205455978769405, + "loss": 2.8719, + "step": 12348 + }, + { + "epoch": 0.5199578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00024202083262470463, + "loss": 3.0595, + "step": 12349 + }, + { + "epoch": 0.52, + "grad_norm": 0.43359375, + "learning_rate": 0.0002419871056070862, + "loss": 3.6503, + "step": 12350 + }, + { + "epoch": 0.5200421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.0002419533787354534, + "loss": 2.7708, + "step": 12351 + }, + { + "epoch": 0.5200842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.00024191965201042048, + "loss": 2.873, + "step": 12352 + }, + { + "epoch": 0.5201263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00024188592543260203, + "loss": 3.283, + "step": 12353 + }, + { + "epoch": 0.5201684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.00024185219900261254, + "loss": 3.2003, + "step": 12354 + }, + { + "epoch": 0.5202105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.00024181847272106632, + "loss": 3.0545, + "step": 12355 + }, + { + "epoch": 0.5202526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.000241784746588578, + "loss": 2.8742, + "step": 12356 + }, + { + "epoch": 0.5202947368421053, + "grad_norm": 0.46484375, + "learning_rate": 0.00024175102060576187, + "loss": 3.1363, + "step": 12357 + }, + { + "epoch": 0.5203368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.00024171729477323256, + "loss": 2.8911, + "step": 12358 + }, + { + "epoch": 0.520378947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.0002416835690916043, + "loss": 3.1137, + "step": 12359 + }, + { + "epoch": 0.5204210526315789, + "grad_norm": 0.408203125, + "learning_rate": 0.00024164984356149163, + "loss": 2.8964, + "step": 12360 + }, + { + "epoch": 0.5204631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.00024161611818350912, + "loss": 2.9419, + "step": 12361 + }, + { + "epoch": 0.5205052631578947, + "grad_norm": 0.455078125, + "learning_rate": 0.00024158239295827096, + "loss": 3.023, + "step": 12362 + }, + { + "epoch": 0.5205473684210526, + "grad_norm": 4.0, + "learning_rate": 0.00024154866788639185, + "loss": 3.3454, + "step": 12363 + }, + { + "epoch": 0.5205894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.00024151494296848597, + "loss": 2.8844, + "step": 12364 + }, + { + "epoch": 0.5206315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00024148121820516782, + "loss": 3.4306, + "step": 12365 + }, + { + "epoch": 0.5206736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00024144749359705195, + "loss": 3.0017, + "step": 12366 + }, + { + "epoch": 0.5207157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.00024141376914475266, + "loss": 3.5576, + "step": 12367 + }, + { + "epoch": 0.5207578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00024138004484888448, + "loss": 3.3512, + "step": 12368 + }, + { + "epoch": 0.5208, + "grad_norm": 0.41015625, + "learning_rate": 0.00024134632071006165, + "loss": 2.8676, + "step": 12369 + }, + { + "epoch": 0.5208421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00024131259672889883, + "loss": 2.8283, + "step": 12370 + }, + { + "epoch": 0.5208842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.00024127887290601013, + "loss": 3.3267, + "step": 12371 + }, + { + "epoch": 0.5209263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00024124514924201015, + "loss": 3.4884, + "step": 12372 + }, + { + "epoch": 0.5209684210526315, + "grad_norm": 0.466796875, + "learning_rate": 0.00024121142573751335, + "loss": 3.0178, + "step": 12373 + }, + { + "epoch": 0.5210105263157895, + "grad_norm": 0.458984375, + "learning_rate": 0.000241177702393134, + "loss": 3.0593, + "step": 12374 + }, + { + "epoch": 0.5210526315789473, + "grad_norm": 0.412109375, + "learning_rate": 0.00024114397920948656, + "loss": 2.8292, + "step": 12375 + }, + { + "epoch": 0.5210947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00024111025618718534, + "loss": 3.2152, + "step": 12376 + }, + { + "epoch": 0.5211368421052631, + "grad_norm": 0.41796875, + "learning_rate": 0.00024107653332684496, + "loss": 3.0594, + "step": 12377 + }, + { + "epoch": 0.5211789473684211, + "grad_norm": 0.4453125, + "learning_rate": 0.00024104281062907953, + "loss": 3.4461, + "step": 12378 + }, + { + "epoch": 0.5212210526315789, + "grad_norm": 0.408203125, + "learning_rate": 0.00024100908809450357, + "loss": 3.1054, + "step": 12379 + }, + { + "epoch": 0.5212631578947369, + "grad_norm": 0.423828125, + "learning_rate": 0.00024097536572373156, + "loss": 3.2267, + "step": 12380 + }, + { + "epoch": 0.5213052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.00024094164351737765, + "loss": 3.2725, + "step": 12381 + }, + { + "epoch": 0.5213473684210527, + "grad_norm": 0.41796875, + "learning_rate": 0.00024090792147605646, + "loss": 2.9615, + "step": 12382 + }, + { + "epoch": 0.5213894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00024087419960038215, + "loss": 3.0452, + "step": 12383 + }, + { + "epoch": 0.5214315789473685, + "grad_norm": 0.439453125, + "learning_rate": 0.00024084047789096937, + "loss": 2.9175, + "step": 12384 + }, + { + "epoch": 0.5214736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.00024080675634843217, + "loss": 2.9063, + "step": 12385 + }, + { + "epoch": 0.5215157894736843, + "grad_norm": 0.48046875, + "learning_rate": 0.00024077303497338506, + "loss": 3.0822, + "step": 12386 + }, + { + "epoch": 0.5215578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.00024073931376644253, + "loss": 2.9695, + "step": 12387 + }, + { + "epoch": 0.5216, + "grad_norm": 0.423828125, + "learning_rate": 0.00024070559272821874, + "loss": 3.2468, + "step": 12388 + }, + { + "epoch": 0.5216421052631579, + "grad_norm": 0.455078125, + "learning_rate": 0.00024067187185932814, + "loss": 3.3241, + "step": 12389 + }, + { + "epoch": 0.5216842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.00024063815116038512, + "loss": 3.1056, + "step": 12390 + }, + { + "epoch": 0.5217263157894737, + "grad_norm": 2.421875, + "learning_rate": 0.00024060443063200396, + "loss": 2.5764, + "step": 12391 + }, + { + "epoch": 0.5217684210526315, + "grad_norm": 0.4609375, + "learning_rate": 0.000240570710274799, + "loss": 3.4007, + "step": 12392 + }, + { + "epoch": 0.5218105263157895, + "grad_norm": 0.490234375, + "learning_rate": 0.00024053699008938457, + "loss": 3.2498, + "step": 12393 + }, + { + "epoch": 0.5218526315789473, + "grad_norm": 0.447265625, + "learning_rate": 0.0002405032700763752, + "loss": 3.559, + "step": 12394 + }, + { + "epoch": 0.5218947368421053, + "grad_norm": 0.455078125, + "learning_rate": 0.000240469550236385, + "loss": 2.9602, + "step": 12395 + }, + { + "epoch": 0.5219368421052631, + "grad_norm": 0.42578125, + "learning_rate": 0.0002404358305700285, + "loss": 2.6052, + "step": 12396 + }, + { + "epoch": 0.5219789473684211, + "grad_norm": 0.435546875, + "learning_rate": 0.0002404021110779198, + "loss": 3.1265, + "step": 12397 + }, + { + "epoch": 0.5220210526315789, + "grad_norm": 0.419921875, + "learning_rate": 0.00024036839176067343, + "loss": 3.3766, + "step": 12398 + }, + { + "epoch": 0.5220631578947369, + "grad_norm": 0.43359375, + "learning_rate": 0.0002403346726189036, + "loss": 2.6975, + "step": 12399 + }, + { + "epoch": 0.5221052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.00024030095365322465, + "loss": 3.1492, + "step": 12400 + }, + { + "epoch": 0.5221473684210526, + "grad_norm": 0.41015625, + "learning_rate": 0.00024026723486425105, + "loss": 3.1906, + "step": 12401 + }, + { + "epoch": 0.5221894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.0002402335162525969, + "loss": 3.0686, + "step": 12402 + }, + { + "epoch": 0.5222315789473684, + "grad_norm": 0.447265625, + "learning_rate": 0.00024019979781887671, + "loss": 3.06, + "step": 12403 + }, + { + "epoch": 0.5222736842105263, + "grad_norm": 0.46484375, + "learning_rate": 0.0002401660795637046, + "loss": 3.5952, + "step": 12404 + }, + { + "epoch": 0.5223157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.000240132361487695, + "loss": 2.8188, + "step": 12405 + }, + { + "epoch": 0.5223578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00024009864359146216, + "loss": 2.7708, + "step": 12406 + }, + { + "epoch": 0.5224, + "grad_norm": 0.44921875, + "learning_rate": 0.00024006492587562038, + "loss": 3.3041, + "step": 12407 + }, + { + "epoch": 0.5224421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.000240031208340784, + "loss": 3.1785, + "step": 12408 + }, + { + "epoch": 0.5224842105263158, + "grad_norm": 0.45703125, + "learning_rate": 0.00023999749098756727, + "loss": 3.3944, + "step": 12409 + }, + { + "epoch": 0.5225263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.0002399637738165846, + "loss": 3.2215, + "step": 12410 + }, + { + "epoch": 0.5225684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.00023993005682845003, + "loss": 3.2354, + "step": 12411 + }, + { + "epoch": 0.5226105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.00023989634002377804, + "loss": 2.4986, + "step": 12412 + }, + { + "epoch": 0.5226526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00023986262340318297, + "loss": 2.9684, + "step": 12413 + }, + { + "epoch": 0.5226947368421052, + "grad_norm": 0.419921875, + "learning_rate": 0.00023982890696727885, + "loss": 2.9437, + "step": 12414 + }, + { + "epoch": 0.5227368421052632, + "grad_norm": 0.42578125, + "learning_rate": 0.0002397951907166802, + "loss": 3.2296, + "step": 12415 + }, + { + "epoch": 0.522778947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00023976147465200106, + "loss": 3.078, + "step": 12416 + }, + { + "epoch": 0.522821052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.000239727758773856, + "loss": 3.1443, + "step": 12417 + }, + { + "epoch": 0.5228631578947368, + "grad_norm": 0.435546875, + "learning_rate": 0.00023969404308285893, + "loss": 3.0736, + "step": 12418 + }, + { + "epoch": 0.5229052631578948, + "grad_norm": 0.3984375, + "learning_rate": 0.00023966032757962433, + "loss": 3.089, + "step": 12419 + }, + { + "epoch": 0.5229473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00023962661226476653, + "loss": 2.7387, + "step": 12420 + }, + { + "epoch": 0.5229894736842106, + "grad_norm": 0.41015625, + "learning_rate": 0.00023959289713889953, + "loss": 3.1779, + "step": 12421 + }, + { + "epoch": 0.5230315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.00023955918220263783, + "loss": 3.1233, + "step": 12422 + }, + { + "epoch": 0.5230736842105264, + "grad_norm": 0.423828125, + "learning_rate": 0.00023952546745659547, + "loss": 3.515, + "step": 12423 + }, + { + "epoch": 0.5231157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00023949175290138684, + "loss": 3.2101, + "step": 12424 + }, + { + "epoch": 0.5231578947368422, + "grad_norm": 0.53125, + "learning_rate": 0.00023945803853762606, + "loss": 2.875, + "step": 12425 + }, + { + "epoch": 0.5232, + "grad_norm": 0.41796875, + "learning_rate": 0.00023942432436592742, + "loss": 3.1816, + "step": 12426 + }, + { + "epoch": 0.5232421052631578, + "grad_norm": 0.47265625, + "learning_rate": 0.00023939061038690532, + "loss": 2.8526, + "step": 12427 + }, + { + "epoch": 0.5232842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00023935689660117368, + "loss": 2.7425, + "step": 12428 + }, + { + "epoch": 0.5233263157894736, + "grad_norm": 0.435546875, + "learning_rate": 0.000239323183009347, + "loss": 3.0416, + "step": 12429 + }, + { + "epoch": 0.5233684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.00023928946961203926, + "loss": 3.2021, + "step": 12430 + }, + { + "epoch": 0.5234105263157894, + "grad_norm": 0.474609375, + "learning_rate": 0.00023925575640986488, + "loss": 3.0565, + "step": 12431 + }, + { + "epoch": 0.5234526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.00023922204340343792, + "loss": 3.5443, + "step": 12432 + }, + { + "epoch": 0.5234947368421052, + "grad_norm": 0.419921875, + "learning_rate": 0.00023918833059337273, + "loss": 3.4299, + "step": 12433 + }, + { + "epoch": 0.5235368421052632, + "grad_norm": 0.462890625, + "learning_rate": 0.00023915461798028345, + "loss": 3.169, + "step": 12434 + }, + { + "epoch": 0.523578947368421, + "grad_norm": 0.390625, + "learning_rate": 0.0002391209055647842, + "loss": 2.9163, + "step": 12435 + }, + { + "epoch": 0.523621052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00023908719334748944, + "loss": 2.7838, + "step": 12436 + }, + { + "epoch": 0.5236631578947368, + "grad_norm": 0.419921875, + "learning_rate": 0.00023905348132901305, + "loss": 2.5605, + "step": 12437 + }, + { + "epoch": 0.5237052631578948, + "grad_norm": 0.4296875, + "learning_rate": 0.00023901976950996948, + "loss": 3.4295, + "step": 12438 + }, + { + "epoch": 0.5237473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.00023898605789097273, + "loss": 2.8216, + "step": 12439 + }, + { + "epoch": 0.5237894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.00023895234647263702, + "loss": 3.7811, + "step": 12440 + }, + { + "epoch": 0.5238315789473684, + "grad_norm": 0.5078125, + "learning_rate": 0.00023891863525557665, + "loss": 2.6695, + "step": 12441 + }, + { + "epoch": 0.5238736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00023888492424040567, + "loss": 3.2999, + "step": 12442 + }, + { + "epoch": 0.5239157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00023885121342773842, + "loss": 3.2658, + "step": 12443 + }, + { + "epoch": 0.5239578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00023881750281818883, + "loss": 3.54, + "step": 12444 + }, + { + "epoch": 0.524, + "grad_norm": 0.44140625, + "learning_rate": 0.00023878379241237135, + "loss": 3.2714, + "step": 12445 + }, + { + "epoch": 0.5240421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00023875008221089985, + "loss": 3.3491, + "step": 12446 + }, + { + "epoch": 0.5240842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00023871637221438866, + "loss": 2.7296, + "step": 12447 + }, + { + "epoch": 0.5241263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.00023868266242345196, + "loss": 3.15, + "step": 12448 + }, + { + "epoch": 0.5241684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.00023864895283870382, + "loss": 2.7114, + "step": 12449 + }, + { + "epoch": 0.5242105263157895, + "grad_norm": 0.466796875, + "learning_rate": 0.00023861524346075846, + "loss": 3.1248, + "step": 12450 + }, + { + "epoch": 0.5242526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.00023858153429022995, + "loss": 3.1649, + "step": 12451 + }, + { + "epoch": 0.5242947368421053, + "grad_norm": 0.451171875, + "learning_rate": 0.00023854782532773254, + "loss": 3.5739, + "step": 12452 + }, + { + "epoch": 0.5243368421052632, + "grad_norm": 0.44140625, + "learning_rate": 0.0002385141165738802, + "loss": 2.9997, + "step": 12453 + }, + { + "epoch": 0.5243789473684211, + "grad_norm": 0.453125, + "learning_rate": 0.00023848040802928718, + "loss": 2.8612, + "step": 12454 + }, + { + "epoch": 0.5244210526315789, + "grad_norm": 0.4140625, + "learning_rate": 0.00023844669969456765, + "loss": 2.985, + "step": 12455 + }, + { + "epoch": 0.5244631578947369, + "grad_norm": 0.41796875, + "learning_rate": 0.0002384129915703356, + "loss": 3.0751, + "step": 12456 + }, + { + "epoch": 0.5245052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.0002383792836572053, + "loss": 3.1573, + "step": 12457 + }, + { + "epoch": 0.5245473684210527, + "grad_norm": 0.43359375, + "learning_rate": 0.0002383455759557907, + "loss": 3.4282, + "step": 12458 + }, + { + "epoch": 0.5245894736842105, + "grad_norm": 0.439453125, + "learning_rate": 0.00023831186846670616, + "loss": 2.9263, + "step": 12459 + }, + { + "epoch": 0.5246315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.0002382781611905655, + "loss": 3.4646, + "step": 12460 + }, + { + "epoch": 0.5246736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00023824445412798303, + "loss": 3.1308, + "step": 12461 + }, + { + "epoch": 0.5247157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.00023821074727957284, + "loss": 3.3393, + "step": 12462 + }, + { + "epoch": 0.5247578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00023817704064594888, + "loss": 3.2434, + "step": 12463 + }, + { + "epoch": 0.5248, + "grad_norm": 0.439453125, + "learning_rate": 0.00023814333422772545, + "loss": 3.036, + "step": 12464 + }, + { + "epoch": 0.5248421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.0002381096280255165, + "loss": 3.1238, + "step": 12465 + }, + { + "epoch": 0.5248842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.0002380759220399361, + "loss": 2.7108, + "step": 12466 + }, + { + "epoch": 0.5249263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.00023804221627159847, + "loss": 3.4026, + "step": 12467 + }, + { + "epoch": 0.5249684210526315, + "grad_norm": 0.412109375, + "learning_rate": 0.00023800851072111753, + "loss": 3.3602, + "step": 12468 + }, + { + "epoch": 0.5250105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.00023797480538910757, + "loss": 2.8494, + "step": 12469 + }, + { + "epoch": 0.5250526315789473, + "grad_norm": 0.404296875, + "learning_rate": 0.0002379411002761824, + "loss": 3.1315, + "step": 12470 + }, + { + "epoch": 0.5250947368421053, + "grad_norm": 0.439453125, + "learning_rate": 0.00023790739538295634, + "loss": 3.1601, + "step": 12471 + }, + { + "epoch": 0.5251368421052631, + "grad_norm": 0.43359375, + "learning_rate": 0.0002378736907100432, + "loss": 2.8565, + "step": 12472 + }, + { + "epoch": 0.5251789473684211, + "grad_norm": 0.427734375, + "learning_rate": 0.00023783998625805715, + "loss": 3.1007, + "step": 12473 + }, + { + "epoch": 0.5252210526315789, + "grad_norm": 0.40625, + "learning_rate": 0.00023780628202761233, + "loss": 2.9499, + "step": 12474 + }, + { + "epoch": 0.5252631578947369, + "grad_norm": 0.41796875, + "learning_rate": 0.0002377725780193227, + "loss": 2.7723, + "step": 12475 + }, + { + "epoch": 0.5253052631578947, + "grad_norm": 0.41015625, + "learning_rate": 0.0002377388742338024, + "loss": 2.924, + "step": 12476 + }, + { + "epoch": 0.5253473684210527, + "grad_norm": 0.40625, + "learning_rate": 0.00023770517067166533, + "loss": 2.8851, + "step": 12477 + }, + { + "epoch": 0.5253894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0002376714673335257, + "loss": 3.3263, + "step": 12478 + }, + { + "epoch": 0.5254315789473685, + "grad_norm": 0.4375, + "learning_rate": 0.00023763776421999736, + "loss": 3.231, + "step": 12479 + }, + { + "epoch": 0.5254736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.0002376040613316944, + "loss": 3.1015, + "step": 12480 + }, + { + "epoch": 0.5255157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.00023757035866923102, + "loss": 3.1895, + "step": 12481 + }, + { + "epoch": 0.5255578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00023753665623322097, + "loss": 2.3863, + "step": 12482 + }, + { + "epoch": 0.5256, + "grad_norm": 0.50390625, + "learning_rate": 0.00023750295402427847, + "loss": 2.8034, + "step": 12483 + }, + { + "epoch": 0.5256421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00023746925204301737, + "loss": 3.6204, + "step": 12484 + }, + { + "epoch": 0.5256842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.00023743555029005192, + "loss": 3.2046, + "step": 12485 + }, + { + "epoch": 0.5257263157894737, + "grad_norm": 0.45703125, + "learning_rate": 0.00023740184876599584, + "loss": 3.0912, + "step": 12486 + }, + { + "epoch": 0.5257684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.00023736814747146333, + "loss": 2.7349, + "step": 12487 + }, + { + "epoch": 0.5258105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.0002373344464070684, + "loss": 3.0671, + "step": 12488 + }, + { + "epoch": 0.5258526315789473, + "grad_norm": 0.423828125, + "learning_rate": 0.00023730074557342485, + "loss": 3.4956, + "step": 12489 + }, + { + "epoch": 0.5258947368421053, + "grad_norm": 0.68359375, + "learning_rate": 0.00023726704497114686, + "loss": 3.4693, + "step": 12490 + }, + { + "epoch": 0.5259368421052631, + "grad_norm": 0.44140625, + "learning_rate": 0.00023723334460084834, + "loss": 3.1823, + "step": 12491 + }, + { + "epoch": 0.5259789473684211, + "grad_norm": 0.439453125, + "learning_rate": 0.00023719964446314334, + "loss": 2.8025, + "step": 12492 + }, + { + "epoch": 0.526021052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0002371659445586457, + "loss": 2.956, + "step": 12493 + }, + { + "epoch": 0.5260631578947368, + "grad_norm": 0.62890625, + "learning_rate": 0.00023713224488796945, + "loss": 2.7892, + "step": 12494 + }, + { + "epoch": 0.5261052631578947, + "grad_norm": 0.4453125, + "learning_rate": 0.0002370985454517287, + "loss": 3.2175, + "step": 12495 + }, + { + "epoch": 0.5261473684210526, + "grad_norm": 0.458984375, + "learning_rate": 0.00023706484625053714, + "loss": 3.4431, + "step": 12496 + }, + { + "epoch": 0.5261894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00023703114728500905, + "loss": 3.3788, + "step": 12497 + }, + { + "epoch": 0.5262315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00023699744855575807, + "loss": 3.214, + "step": 12498 + }, + { + "epoch": 0.5262736842105263, + "grad_norm": 0.462890625, + "learning_rate": 0.0002369637500633984, + "loss": 3.044, + "step": 12499 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.0002369300518085438, + "loss": 3.1763, + "step": 12500 + }, + { + "epoch": 0.5263578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.00023689635379180833, + "loss": 3.1968, + "step": 12501 + }, + { + "epoch": 0.5264, + "grad_norm": 0.423828125, + "learning_rate": 0.00023686265601380598, + "loss": 3.2779, + "step": 12502 + }, + { + "epoch": 0.5264421052631579, + "grad_norm": 0.65234375, + "learning_rate": 0.00023682895847515047, + "loss": 3.3576, + "step": 12503 + }, + { + "epoch": 0.5264842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.000236795261176456, + "loss": 3.3213, + "step": 12504 + }, + { + "epoch": 0.5265263157894737, + "grad_norm": 0.404296875, + "learning_rate": 0.00023676156411833624, + "loss": 2.7958, + "step": 12505 + }, + { + "epoch": 0.5265684210526316, + "grad_norm": 0.453125, + "learning_rate": 0.0002367278673014053, + "loss": 2.6401, + "step": 12506 + }, + { + "epoch": 0.5266105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.000236694170726277, + "loss": 3.4331, + "step": 12507 + }, + { + "epoch": 0.5266526315789474, + "grad_norm": 0.3984375, + "learning_rate": 0.00023666047439356523, + "loss": 2.7475, + "step": 12508 + }, + { + "epoch": 0.5266947368421052, + "grad_norm": 0.404296875, + "learning_rate": 0.00023662677830388402, + "loss": 3.0667, + "step": 12509 + }, + { + "epoch": 0.5267368421052632, + "grad_norm": 0.52734375, + "learning_rate": 0.0002365930824578471, + "loss": 3.4498, + "step": 12510 + }, + { + "epoch": 0.526778947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0002365593868560686, + "loss": 2.9331, + "step": 12511 + }, + { + "epoch": 0.526821052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00023652569149916217, + "loss": 2.9081, + "step": 12512 + }, + { + "epoch": 0.5268631578947368, + "grad_norm": 0.443359375, + "learning_rate": 0.0002364919963877418, + "loss": 3.2116, + "step": 12513 + }, + { + "epoch": 0.5269052631578948, + "grad_norm": 0.42578125, + "learning_rate": 0.00023645830152242147, + "loss": 3.2564, + "step": 12514 + }, + { + "epoch": 0.5269473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.00023642460690381494, + "loss": 3.0385, + "step": 12515 + }, + { + "epoch": 0.5269894736842106, + "grad_norm": 0.46484375, + "learning_rate": 0.0002363909125325361, + "loss": 2.4723, + "step": 12516 + }, + { + "epoch": 0.5270315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00023635721840919882, + "loss": 3.1462, + "step": 12517 + }, + { + "epoch": 0.5270736842105264, + "grad_norm": 0.421875, + "learning_rate": 0.0002363235245344171, + "loss": 3.2589, + "step": 12518 + }, + { + "epoch": 0.5271157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.0002362898309088046, + "loss": 2.8462, + "step": 12519 + }, + { + "epoch": 0.5271578947368422, + "grad_norm": 0.39453125, + "learning_rate": 0.00023625613753297523, + "loss": 2.7998, + "step": 12520 + }, + { + "epoch": 0.5272, + "grad_norm": 0.3984375, + "learning_rate": 0.00023622244440754304, + "loss": 3.1943, + "step": 12521 + }, + { + "epoch": 0.5272421052631578, + "grad_norm": 0.419921875, + "learning_rate": 0.0002361887515331216, + "loss": 3.0822, + "step": 12522 + }, + { + "epoch": 0.5272842105263158, + "grad_norm": 0.51171875, + "learning_rate": 0.00023615505891032496, + "loss": 3.4414, + "step": 12523 + }, + { + "epoch": 0.5273263157894736, + "grad_norm": 0.4140625, + "learning_rate": 0.00023612136653976683, + "loss": 2.8385, + "step": 12524 + }, + { + "epoch": 0.5273684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.0002360876744220611, + "loss": 3.2219, + "step": 12525 + }, + { + "epoch": 0.5274105263157894, + "grad_norm": 0.416015625, + "learning_rate": 0.00023605398255782155, + "loss": 3.537, + "step": 12526 + }, + { + "epoch": 0.5274526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00023602029094766205, + "loss": 3.1937, + "step": 12527 + }, + { + "epoch": 0.5274947368421052, + "grad_norm": 0.419921875, + "learning_rate": 0.00023598659959219654, + "loss": 3.4697, + "step": 12528 + }, + { + "epoch": 0.5275368421052632, + "grad_norm": 0.451171875, + "learning_rate": 0.00023595290849203862, + "loss": 3.3632, + "step": 12529 + }, + { + "epoch": 0.527578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0002359192176478023, + "loss": 3.4115, + "step": 12530 + }, + { + "epoch": 0.527621052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00023588552706010118, + "loss": 3.1885, + "step": 12531 + }, + { + "epoch": 0.5276631578947368, + "grad_norm": 0.416015625, + "learning_rate": 0.00023585183672954923, + "loss": 3.1806, + "step": 12532 + }, + { + "epoch": 0.5277052631578948, + "grad_norm": 0.416015625, + "learning_rate": 0.00023581814665676017, + "loss": 3.2838, + "step": 12533 + }, + { + "epoch": 0.5277473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.0002357844568423478, + "loss": 3.2951, + "step": 12534 + }, + { + "epoch": 0.5277894736842105, + "grad_norm": 0.46484375, + "learning_rate": 0.00023575076728692598, + "loss": 3.1688, + "step": 12535 + }, + { + "epoch": 0.5278315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.00023571707799110837, + "loss": 3.0596, + "step": 12536 + }, + { + "epoch": 0.5278736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00023568338895550893, + "loss": 3.3758, + "step": 12537 + }, + { + "epoch": 0.5279157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.0002356497001807412, + "loss": 3.3384, + "step": 12538 + }, + { + "epoch": 0.5279578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0002356160116674192, + "loss": 3.1275, + "step": 12539 + }, + { + "epoch": 0.528, + "grad_norm": 0.427734375, + "learning_rate": 0.00023558232341615643, + "loss": 3.7641, + "step": 12540 + }, + { + "epoch": 0.5280421052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.0002355486354275668, + "loss": 3.3112, + "step": 12541 + }, + { + "epoch": 0.5280842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.0002355149477022641, + "loss": 3.35, + "step": 12542 + }, + { + "epoch": 0.5281263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.000235481260240862, + "loss": 3.3637, + "step": 12543 + }, + { + "epoch": 0.5281684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.00023544757304397437, + "loss": 3.4667, + "step": 12544 + }, + { + "epoch": 0.5282105263157895, + "grad_norm": 0.388671875, + "learning_rate": 0.00023541388611221476, + "loss": 2.8021, + "step": 12545 + }, + { + "epoch": 0.5282526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.0002353801994461971, + "loss": 3.2508, + "step": 12546 + }, + { + "epoch": 0.5282947368421053, + "grad_norm": 0.3984375, + "learning_rate": 0.00023534651304653496, + "loss": 2.4988, + "step": 12547 + }, + { + "epoch": 0.5283368421052631, + "grad_norm": 0.408203125, + "learning_rate": 0.0002353128269138421, + "loss": 3.3198, + "step": 12548 + }, + { + "epoch": 0.5283789473684211, + "grad_norm": 0.431640625, + "learning_rate": 0.0002352791410487323, + "loss": 3.0241, + "step": 12549 + }, + { + "epoch": 0.5284210526315789, + "grad_norm": 0.419921875, + "learning_rate": 0.00023524545545181933, + "loss": 2.8186, + "step": 12550 + }, + { + "epoch": 0.5284631578947369, + "grad_norm": 0.41015625, + "learning_rate": 0.0002352117701237168, + "loss": 2.9601, + "step": 12551 + }, + { + "epoch": 0.5285052631578947, + "grad_norm": 0.41015625, + "learning_rate": 0.00023517808506503835, + "loss": 3.2577, + "step": 12552 + }, + { + "epoch": 0.5285473684210527, + "grad_norm": 0.40625, + "learning_rate": 0.00023514440027639792, + "loss": 2.7615, + "step": 12553 + }, + { + "epoch": 0.5285894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00023511071575840894, + "loss": 3.4114, + "step": 12554 + }, + { + "epoch": 0.5286315789473685, + "grad_norm": 0.408203125, + "learning_rate": 0.00023507703151168526, + "loss": 3.2768, + "step": 12555 + }, + { + "epoch": 0.5286736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00023504334753684058, + "loss": 2.928, + "step": 12556 + }, + { + "epoch": 0.5287157894736843, + "grad_norm": 0.42578125, + "learning_rate": 0.00023500966383448846, + "loss": 3.1946, + "step": 12557 + }, + { + "epoch": 0.5287578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.0002349759804052427, + "loss": 3.11, + "step": 12558 + }, + { + "epoch": 0.5288, + "grad_norm": 0.400390625, + "learning_rate": 0.0002349422972497169, + "loss": 3.0096, + "step": 12559 + }, + { + "epoch": 0.5288421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.0002349086143685247, + "loss": 3.206, + "step": 12560 + }, + { + "epoch": 0.5288842105263157, + "grad_norm": 0.41796875, + "learning_rate": 0.00023487493176227995, + "loss": 3.1264, + "step": 12561 + }, + { + "epoch": 0.5289263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.00023484124943159607, + "loss": 3.5353, + "step": 12562 + }, + { + "epoch": 0.5289684210526315, + "grad_norm": 0.42578125, + "learning_rate": 0.00023480756737708693, + "loss": 3.3653, + "step": 12563 + }, + { + "epoch": 0.5290105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.00023477388559936592, + "loss": 3.0333, + "step": 12564 + }, + { + "epoch": 0.5290526315789473, + "grad_norm": 0.396484375, + "learning_rate": 0.0002347402040990469, + "loss": 3.2067, + "step": 12565 + }, + { + "epoch": 0.5290947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.00023470652287674342, + "loss": 3.1623, + "step": 12566 + }, + { + "epoch": 0.5291368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.00023467284193306908, + "loss": 3.2997, + "step": 12567 + }, + { + "epoch": 0.5291789473684211, + "grad_norm": 0.609375, + "learning_rate": 0.0002346391612686376, + "loss": 2.574, + "step": 12568 + }, + { + "epoch": 0.5292210526315789, + "grad_norm": 0.421875, + "learning_rate": 0.00023460548088406253, + "loss": 3.3146, + "step": 12569 + }, + { + "epoch": 0.5292631578947369, + "grad_norm": 0.419921875, + "learning_rate": 0.00023457180077995758, + "loss": 3.1554, + "step": 12570 + }, + { + "epoch": 0.5293052631578947, + "grad_norm": 0.478515625, + "learning_rate": 0.00023453812095693618, + "loss": 2.8136, + "step": 12571 + }, + { + "epoch": 0.5293473684210527, + "grad_norm": 0.482421875, + "learning_rate": 0.0002345044414156122, + "loss": 2.8707, + "step": 12572 + }, + { + "epoch": 0.5293894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.00023447076215659895, + "loss": 3.3357, + "step": 12573 + }, + { + "epoch": 0.5294315789473685, + "grad_norm": 0.44921875, + "learning_rate": 0.00023443708318051016, + "loss": 3.4162, + "step": 12574 + }, + { + "epoch": 0.5294736842105263, + "grad_norm": 0.44140625, + "learning_rate": 0.0002344034044879595, + "loss": 2.9379, + "step": 12575 + }, + { + "epoch": 0.5295157894736842, + "grad_norm": 0.390625, + "learning_rate": 0.00023436972607956044, + "loss": 2.9, + "step": 12576 + }, + { + "epoch": 0.5295578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.0002343360479559267, + "loss": 2.9898, + "step": 12577 + }, + { + "epoch": 0.5296, + "grad_norm": 0.4140625, + "learning_rate": 0.00023430237011767165, + "loss": 3.0563, + "step": 12578 + }, + { + "epoch": 0.5296421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.0002342686925654091, + "loss": 3.2241, + "step": 12579 + }, + { + "epoch": 0.5296842105263158, + "grad_norm": 0.466796875, + "learning_rate": 0.00023423501529975236, + "loss": 3.4646, + "step": 12580 + }, + { + "epoch": 0.5297263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00023420133832131513, + "loss": 3.0999, + "step": 12581 + }, + { + "epoch": 0.5297684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.00023416766163071104, + "loss": 3.2165, + "step": 12582 + }, + { + "epoch": 0.5298105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.00023413398522855347, + "loss": 3.371, + "step": 12583 + }, + { + "epoch": 0.5298526315789474, + "grad_norm": 0.4765625, + "learning_rate": 0.0002341003091154561, + "loss": 2.5699, + "step": 12584 + }, + { + "epoch": 0.5298947368421053, + "grad_norm": 0.40625, + "learning_rate": 0.00023406663329203235, + "loss": 2.6565, + "step": 12585 + }, + { + "epoch": 0.5299368421052632, + "grad_norm": 0.4453125, + "learning_rate": 0.00023403295775889594, + "loss": 3.0076, + "step": 12586 + }, + { + "epoch": 0.5299789473684211, + "grad_norm": 0.392578125, + "learning_rate": 0.00023399928251666014, + "loss": 3.0702, + "step": 12587 + }, + { + "epoch": 0.530021052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00023396560756593865, + "loss": 3.1357, + "step": 12588 + }, + { + "epoch": 0.5300631578947368, + "grad_norm": 0.4453125, + "learning_rate": 0.00023393193290734504, + "loss": 3.0796, + "step": 12589 + }, + { + "epoch": 0.5301052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.00023389825854149262, + "loss": 2.7889, + "step": 12590 + }, + { + "epoch": 0.5301473684210526, + "grad_norm": 0.482421875, + "learning_rate": 0.0002338645844689951, + "loss": 2.6947, + "step": 12591 + }, + { + "epoch": 0.5301894736842105, + "grad_norm": 0.44921875, + "learning_rate": 0.0002338309106904658, + "loss": 2.9049, + "step": 12592 + }, + { + "epoch": 0.5302315789473684, + "grad_norm": 0.439453125, + "learning_rate": 0.00023379723720651845, + "loss": 2.9446, + "step": 12593 + }, + { + "epoch": 0.5302736842105263, + "grad_norm": 0.455078125, + "learning_rate": 0.00023376356401776625, + "loss": 3.0439, + "step": 12594 + }, + { + "epoch": 0.5303157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.00023372989112482287, + "loss": 3.2283, + "step": 12595 + }, + { + "epoch": 0.5303578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.00023369621852830185, + "loss": 3.1895, + "step": 12596 + }, + { + "epoch": 0.5304, + "grad_norm": 0.62890625, + "learning_rate": 0.00023366254622881648, + "loss": 2.9715, + "step": 12597 + }, + { + "epoch": 0.5304421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00023362887422698043, + "loss": 3.0714, + "step": 12598 + }, + { + "epoch": 0.5304842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00023359520252340691, + "loss": 3.1533, + "step": 12599 + }, + { + "epoch": 0.5305263157894737, + "grad_norm": 0.4921875, + "learning_rate": 0.0002335615311187096, + "loss": 2.7955, + "step": 12600 + }, + { + "epoch": 0.5305684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.00023352786001350185, + "loss": 2.9551, + "step": 12601 + }, + { + "epoch": 0.5306105263157894, + "grad_norm": 0.4140625, + "learning_rate": 0.0002334941892083971, + "loss": 2.6214, + "step": 12602 + }, + { + "epoch": 0.5306526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.00023346051870400898, + "loss": 3.1395, + "step": 12603 + }, + { + "epoch": 0.5306947368421052, + "grad_norm": 0.427734375, + "learning_rate": 0.00023342684850095064, + "loss": 3.1144, + "step": 12604 + }, + { + "epoch": 0.5307368421052632, + "grad_norm": 0.41796875, + "learning_rate": 0.00023339317859983575, + "loss": 3.4439, + "step": 12605 + }, + { + "epoch": 0.530778947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00023335950900127753, + "loss": 2.9315, + "step": 12606 + }, + { + "epoch": 0.530821052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00023332583970588956, + "loss": 3.504, + "step": 12607 + }, + { + "epoch": 0.5308631578947368, + "grad_norm": 0.44140625, + "learning_rate": 0.00023329217071428514, + "loss": 2.6561, + "step": 12608 + }, + { + "epoch": 0.5309052631578948, + "grad_norm": 0.431640625, + "learning_rate": 0.00023325850202707784, + "loss": 3.4882, + "step": 12609 + }, + { + "epoch": 0.5309473684210526, + "grad_norm": 0.435546875, + "learning_rate": 0.00023322483364488088, + "loss": 3.1438, + "step": 12610 + }, + { + "epoch": 0.5309894736842106, + "grad_norm": 0.474609375, + "learning_rate": 0.00023319116556830773, + "loss": 2.9365, + "step": 12611 + }, + { + "epoch": 0.5310315789473684, + "grad_norm": 0.392578125, + "learning_rate": 0.00023315749779797193, + "loss": 3.1366, + "step": 12612 + }, + { + "epoch": 0.5310736842105264, + "grad_norm": 0.4375, + "learning_rate": 0.0002331238303344866, + "loss": 3.1568, + "step": 12613 + }, + { + "epoch": 0.5311157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.00023309016317846524, + "loss": 3.5282, + "step": 12614 + }, + { + "epoch": 0.531157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00023305649633052135, + "loss": 2.6739, + "step": 12615 + }, + { + "epoch": 0.5312, + "grad_norm": 0.427734375, + "learning_rate": 0.0002330228297912681, + "loss": 3.6034, + "step": 12616 + }, + { + "epoch": 0.5312421052631578, + "grad_norm": 0.427734375, + "learning_rate": 0.00023298916356131896, + "loss": 3.169, + "step": 12617 + }, + { + "epoch": 0.5312842105263158, + "grad_norm": 0.4140625, + "learning_rate": 0.00023295549764128725, + "loss": 3.2674, + "step": 12618 + }, + { + "epoch": 0.5313263157894736, + "grad_norm": 0.419921875, + "learning_rate": 0.00023292183203178646, + "loss": 2.5441, + "step": 12619 + }, + { + "epoch": 0.5313684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.00023288816673342969, + "loss": 2.7802, + "step": 12620 + }, + { + "epoch": 0.5314105263157894, + "grad_norm": 0.416015625, + "learning_rate": 0.00023285450174683042, + "loss": 3.0206, + "step": 12621 + }, + { + "epoch": 0.5314526315789474, + "grad_norm": 0.404296875, + "learning_rate": 0.0002328208370726021, + "loss": 2.464, + "step": 12622 + }, + { + "epoch": 0.5314947368421052, + "grad_norm": 0.419921875, + "learning_rate": 0.00023278717271135786, + "loss": 2.9625, + "step": 12623 + }, + { + "epoch": 0.5315368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.00023275350866371113, + "loss": 3.0688, + "step": 12624 + }, + { + "epoch": 0.531578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.0002327198449302752, + "loss": 3.3764, + "step": 12625 + }, + { + "epoch": 0.531621052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00023268618151166343, + "loss": 3.4584, + "step": 12626 + }, + { + "epoch": 0.5316631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.000232652518408489, + "loss": 3.0768, + "step": 12627 + }, + { + "epoch": 0.5317052631578948, + "grad_norm": 0.427734375, + "learning_rate": 0.00023261885562136535, + "loss": 3.1833, + "step": 12628 + }, + { + "epoch": 0.5317473684210526, + "grad_norm": 0.453125, + "learning_rate": 0.00023258519315090578, + "loss": 3.4424, + "step": 12629 + }, + { + "epoch": 0.5317894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.00023255153099772344, + "loss": 3.1627, + "step": 12630 + }, + { + "epoch": 0.5318315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.00023251786916243184, + "loss": 3.2133, + "step": 12631 + }, + { + "epoch": 0.5318736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.00023248420764564398, + "loss": 3.3195, + "step": 12632 + }, + { + "epoch": 0.5319157894736842, + "grad_norm": 0.515625, + "learning_rate": 0.00023245054644797334, + "loss": 2.9517, + "step": 12633 + }, + { + "epoch": 0.5319578947368421, + "grad_norm": 0.46484375, + "learning_rate": 0.00023241688557003307, + "loss": 3.0739, + "step": 12634 + }, + { + "epoch": 0.532, + "grad_norm": 0.443359375, + "learning_rate": 0.00023238322501243649, + "loss": 3.0601, + "step": 12635 + }, + { + "epoch": 0.5320421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00023234956477579698, + "loss": 2.9509, + "step": 12636 + }, + { + "epoch": 0.5320842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0002323159048607275, + "loss": 3.0394, + "step": 12637 + }, + { + "epoch": 0.5321263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00023228224526784162, + "loss": 2.9287, + "step": 12638 + }, + { + "epoch": 0.5321684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.00023224858599775228, + "loss": 3.185, + "step": 12639 + }, + { + "epoch": 0.5322105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.00023221492705107293, + "loss": 2.8927, + "step": 12640 + }, + { + "epoch": 0.5322526315789474, + "grad_norm": 0.40625, + "learning_rate": 0.00023218126842841663, + "loss": 3.2685, + "step": 12641 + }, + { + "epoch": 0.5322947368421053, + "grad_norm": 0.4140625, + "learning_rate": 0.0002321476101303967, + "loss": 3.0433, + "step": 12642 + }, + { + "epoch": 0.5323368421052631, + "grad_norm": 0.421875, + "learning_rate": 0.00023211395215762636, + "loss": 3.1253, + "step": 12643 + }, + { + "epoch": 0.5323789473684211, + "grad_norm": 0.427734375, + "learning_rate": 0.00023208029451071878, + "loss": 3.5283, + "step": 12644 + }, + { + "epoch": 0.5324210526315789, + "grad_norm": 0.408203125, + "learning_rate": 0.00023204663719028726, + "loss": 3.4994, + "step": 12645 + }, + { + "epoch": 0.5324631578947369, + "grad_norm": 0.408203125, + "learning_rate": 0.0002320129801969448, + "loss": 3.1769, + "step": 12646 + }, + { + "epoch": 0.5325052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.00023197932353130485, + "loss": 2.718, + "step": 12647 + }, + { + "epoch": 0.5325473684210527, + "grad_norm": 0.404296875, + "learning_rate": 0.0002319456671939803, + "loss": 2.8629, + "step": 12648 + }, + { + "epoch": 0.5325894736842105, + "grad_norm": 0.396484375, + "learning_rate": 0.00023191201118558452, + "loss": 2.8033, + "step": 12649 + }, + { + "epoch": 0.5326315789473685, + "grad_norm": 0.423828125, + "learning_rate": 0.00023187835550673067, + "loss": 3.1805, + "step": 12650 + }, + { + "epoch": 0.5326736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.00023184470015803185, + "loss": 3.4089, + "step": 12651 + }, + { + "epoch": 0.5327157894736843, + "grad_norm": 0.423828125, + "learning_rate": 0.00023181104514010138, + "loss": 3.0306, + "step": 12652 + }, + { + "epoch": 0.5327578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00023177739045355215, + "loss": 3.2424, + "step": 12653 + }, + { + "epoch": 0.5328, + "grad_norm": 0.42578125, + "learning_rate": 0.00023174373609899759, + "loss": 2.9711, + "step": 12654 + }, + { + "epoch": 0.5328421052631579, + "grad_norm": 0.462890625, + "learning_rate": 0.0002317100820770506, + "loss": 3.0502, + "step": 12655 + }, + { + "epoch": 0.5328842105263157, + "grad_norm": 0.455078125, + "learning_rate": 0.00023167642838832443, + "loss": 2.8089, + "step": 12656 + }, + { + "epoch": 0.5329263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.00023164277503343232, + "loss": 3.4106, + "step": 12657 + }, + { + "epoch": 0.5329684210526315, + "grad_norm": 0.4296875, + "learning_rate": 0.00023160912201298714, + "loss": 3.6643, + "step": 12658 + }, + { + "epoch": 0.5330105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.0002315754693276022, + "loss": 2.6841, + "step": 12659 + }, + { + "epoch": 0.5330526315789473, + "grad_norm": 0.462890625, + "learning_rate": 0.00023154181697789054, + "loss": 3.1348, + "step": 12660 + }, + { + "epoch": 0.5330947368421053, + "grad_norm": 0.447265625, + "learning_rate": 0.00023150816496446527, + "loss": 3.4332, + "step": 12661 + }, + { + "epoch": 0.5331368421052631, + "grad_norm": 0.45703125, + "learning_rate": 0.00023147451328793962, + "loss": 3.7398, + "step": 12662 + }, + { + "epoch": 0.5331789473684211, + "grad_norm": 0.443359375, + "learning_rate": 0.00023144086194892648, + "loss": 2.7761, + "step": 12663 + }, + { + "epoch": 0.5332210526315789, + "grad_norm": 0.41015625, + "learning_rate": 0.00023140721094803914, + "loss": 2.8294, + "step": 12664 + }, + { + "epoch": 0.5332631578947369, + "grad_norm": 0.439453125, + "learning_rate": 0.0002313735602858904, + "loss": 2.7632, + "step": 12665 + }, + { + "epoch": 0.5333052631578947, + "grad_norm": 0.439453125, + "learning_rate": 0.00023133990996309362, + "loss": 2.5546, + "step": 12666 + }, + { + "epoch": 0.5333473684210527, + "grad_norm": 0.455078125, + "learning_rate": 0.00023130625998026174, + "loss": 3.4901, + "step": 12667 + }, + { + "epoch": 0.5333894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.00023127261033800779, + "loss": 3.3508, + "step": 12668 + }, + { + "epoch": 0.5334315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00023123896103694491, + "loss": 2.7997, + "step": 12669 + }, + { + "epoch": 0.5334736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.00023120531207768607, + "loss": 2.8876, + "step": 12670 + }, + { + "epoch": 0.5335157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.0002311716634608445, + "loss": 3.0089, + "step": 12671 + }, + { + "epoch": 0.5335578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00023113801518703295, + "loss": 3.1059, + "step": 12672 + }, + { + "epoch": 0.5336, + "grad_norm": 0.4140625, + "learning_rate": 0.00023110436725686475, + "loss": 3.026, + "step": 12673 + }, + { + "epoch": 0.5336421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00023107071967095263, + "loss": 3.6347, + "step": 12674 + }, + { + "epoch": 0.5336842105263158, + "grad_norm": 0.482421875, + "learning_rate": 0.00023103707242990977, + "loss": 2.9436, + "step": 12675 + }, + { + "epoch": 0.5337263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0002310034255343492, + "loss": 3.2199, + "step": 12676 + }, + { + "epoch": 0.5337684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.0002309697789848839, + "loss": 2.837, + "step": 12677 + }, + { + "epoch": 0.5338105263157895, + "grad_norm": 0.44921875, + "learning_rate": 0.00023093613278212693, + "loss": 3.2399, + "step": 12678 + }, + { + "epoch": 0.5338526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00023090248692669114, + "loss": 2.9832, + "step": 12679 + }, + { + "epoch": 0.5338947368421053, + "grad_norm": 0.453125, + "learning_rate": 0.00023086884141918968, + "loss": 2.8075, + "step": 12680 + }, + { + "epoch": 0.5339368421052632, + "grad_norm": 0.412109375, + "learning_rate": 0.00023083519626023537, + "loss": 2.996, + "step": 12681 + }, + { + "epoch": 0.533978947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00023080155145044126, + "loss": 3.3807, + "step": 12682 + }, + { + "epoch": 0.534021052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00023076790699042045, + "loss": 3.3397, + "step": 12683 + }, + { + "epoch": 0.5340631578947368, + "grad_norm": 0.47265625, + "learning_rate": 0.00023073426288078562, + "loss": 2.9738, + "step": 12684 + }, + { + "epoch": 0.5341052631578947, + "grad_norm": 0.40625, + "learning_rate": 0.00023070061912214995, + "loss": 3.1122, + "step": 12685 + }, + { + "epoch": 0.5341473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.0002306669757151263, + "loss": 3.2446, + "step": 12686 + }, + { + "epoch": 0.5341894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00023063333266032775, + "loss": 3.2109, + "step": 12687 + }, + { + "epoch": 0.5342315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00023059968995836698, + "loss": 3.0674, + "step": 12688 + }, + { + "epoch": 0.5342736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00023056604760985708, + "loss": 2.9957, + "step": 12689 + }, + { + "epoch": 0.5343157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.00023053240561541106, + "loss": 3.1843, + "step": 12690 + }, + { + "epoch": 0.5343578947368421, + "grad_norm": 0.39453125, + "learning_rate": 0.00023049876397564166, + "loss": 3.1221, + "step": 12691 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4296875, + "learning_rate": 0.00023046512269116187, + "loss": 3.128, + "step": 12692 + }, + { + "epoch": 0.5344421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00023043148176258457, + "loss": 3.1448, + "step": 12693 + }, + { + "epoch": 0.5344842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.00023039784119052278, + "loss": 2.991, + "step": 12694 + }, + { + "epoch": 0.5345263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.0002303642009755892, + "loss": 3.4573, + "step": 12695 + }, + { + "epoch": 0.5345684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.00023033056111839678, + "loss": 3.2706, + "step": 12696 + }, + { + "epoch": 0.5346105263157894, + "grad_norm": 0.41796875, + "learning_rate": 0.00023029692161955856, + "loss": 2.9666, + "step": 12697 + }, + { + "epoch": 0.5346526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.00023026328247968716, + "loss": 3.0784, + "step": 12698 + }, + { + "epoch": 0.5346947368421052, + "grad_norm": 0.435546875, + "learning_rate": 0.00023022964369939566, + "loss": 3.332, + "step": 12699 + }, + { + "epoch": 0.5347368421052632, + "grad_norm": 0.416015625, + "learning_rate": 0.00023019600527929675, + "loss": 3.3824, + "step": 12700 + }, + { + "epoch": 0.534778947368421, + "grad_norm": 0.4609375, + "learning_rate": 0.00023016236722000338, + "loss": 3.0983, + "step": 12701 + }, + { + "epoch": 0.534821052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00023012872952212832, + "loss": 2.9692, + "step": 12702 + }, + { + "epoch": 0.5348631578947368, + "grad_norm": 0.416015625, + "learning_rate": 0.00023009509218628444, + "loss": 3.6116, + "step": 12703 + }, + { + "epoch": 0.5349052631578948, + "grad_norm": 0.416015625, + "learning_rate": 0.00023006145521308475, + "loss": 3.3333, + "step": 12704 + }, + { + "epoch": 0.5349473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.00023002781860314176, + "loss": 2.5794, + "step": 12705 + }, + { + "epoch": 0.5349894736842106, + "grad_norm": 0.44140625, + "learning_rate": 0.0002299941823570686, + "loss": 3.3274, + "step": 12706 + }, + { + "epoch": 0.5350315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.0002299605464754778, + "loss": 3.0704, + "step": 12707 + }, + { + "epoch": 0.5350736842105264, + "grad_norm": 0.400390625, + "learning_rate": 0.00022992691095898235, + "loss": 2.479, + "step": 12708 + }, + { + "epoch": 0.5351157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.00022989327580819494, + "loss": 3.4943, + "step": 12709 + }, + { + "epoch": 0.535157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.0002298596410237285, + "loss": 3.3364, + "step": 12710 + }, + { + "epoch": 0.5352, + "grad_norm": 0.45703125, + "learning_rate": 0.0002298260066061957, + "loss": 2.7743, + "step": 12711 + }, + { + "epoch": 0.5352421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00022979237255620934, + "loss": 3.2902, + "step": 12712 + }, + { + "epoch": 0.5352842105263158, + "grad_norm": 0.54296875, + "learning_rate": 0.0002297587388743823, + "loss": 3.3277, + "step": 12713 + }, + { + "epoch": 0.5353263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.00022972510556132714, + "loss": 3.544, + "step": 12714 + }, + { + "epoch": 0.5353684210526316, + "grad_norm": 0.462890625, + "learning_rate": 0.00022969147261765676, + "loss": 3.2444, + "step": 12715 + }, + { + "epoch": 0.5354105263157894, + "grad_norm": 0.3984375, + "learning_rate": 0.00022965784004398397, + "loss": 3.186, + "step": 12716 + }, + { + "epoch": 0.5354526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.0002296242078409213, + "loss": 3.4802, + "step": 12717 + }, + { + "epoch": 0.5354947368421052, + "grad_norm": 0.43359375, + "learning_rate": 0.00022959057600908174, + "loss": 3.1814, + "step": 12718 + }, + { + "epoch": 0.5355368421052632, + "grad_norm": 0.44140625, + "learning_rate": 0.0002295569445490778, + "loss": 2.8028, + "step": 12719 + }, + { + "epoch": 0.535578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.00022952331346152245, + "loss": 2.7376, + "step": 12720 + }, + { + "epoch": 0.535621052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.00022948968274702814, + "loss": 2.7118, + "step": 12721 + }, + { + "epoch": 0.5356631578947368, + "grad_norm": 0.453125, + "learning_rate": 0.00022945605240620774, + "loss": 2.7902, + "step": 12722 + }, + { + "epoch": 0.5357052631578947, + "grad_norm": 0.44140625, + "learning_rate": 0.000229422422439674, + "loss": 3.1385, + "step": 12723 + }, + { + "epoch": 0.5357473684210526, + "grad_norm": 0.435546875, + "learning_rate": 0.00022938879284803947, + "loss": 3.6393, + "step": 12724 + }, + { + "epoch": 0.5357894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.00022935516363191695, + "loss": 3.3796, + "step": 12725 + }, + { + "epoch": 0.5358315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.00022932153479191908, + "loss": 3.0666, + "step": 12726 + }, + { + "epoch": 0.5358736842105263, + "grad_norm": 0.44921875, + "learning_rate": 0.00022928790632865856, + "loss": 2.6989, + "step": 12727 + }, + { + "epoch": 0.5359157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.00022925427824274804, + "loss": 3.2232, + "step": 12728 + }, + { + "epoch": 0.5359578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00022922065053480013, + "loss": 3.3304, + "step": 12729 + }, + { + "epoch": 0.536, + "grad_norm": 0.44140625, + "learning_rate": 0.0002291870232054277, + "loss": 3.2385, + "step": 12730 + }, + { + "epoch": 0.5360421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00022915339625524312, + "loss": 3.0788, + "step": 12731 + }, + { + "epoch": 0.5360842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.0002291197696848593, + "loss": 3.4092, + "step": 12732 + }, + { + "epoch": 0.5361263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00022908614349488861, + "loss": 3.0481, + "step": 12733 + }, + { + "epoch": 0.5361684210526316, + "grad_norm": 0.453125, + "learning_rate": 0.0002290525176859439, + "loss": 2.987, + "step": 12734 + }, + { + "epoch": 0.5362105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.00022901889225863763, + "loss": 3.137, + "step": 12735 + }, + { + "epoch": 0.5362526315789473, + "grad_norm": 0.4375, + "learning_rate": 0.00022898526721358247, + "loss": 2.9314, + "step": 12736 + }, + { + "epoch": 0.5362947368421053, + "grad_norm": 0.451171875, + "learning_rate": 0.0002289516425513912, + "loss": 3.1312, + "step": 12737 + }, + { + "epoch": 0.5363368421052631, + "grad_norm": 0.416015625, + "learning_rate": 0.00022891801827267614, + "loss": 2.8295, + "step": 12738 + }, + { + "epoch": 0.5363789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.00022888439437805015, + "loss": 2.8614, + "step": 12739 + }, + { + "epoch": 0.5364210526315789, + "grad_norm": 0.42578125, + "learning_rate": 0.00022885077086812552, + "loss": 3.3308, + "step": 12740 + }, + { + "epoch": 0.5364631578947369, + "grad_norm": 0.4140625, + "learning_rate": 0.00022881714774351512, + "loss": 3.3051, + "step": 12741 + }, + { + "epoch": 0.5365052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.00022878352500483134, + "loss": 2.389, + "step": 12742 + }, + { + "epoch": 0.5365473684210527, + "grad_norm": 0.482421875, + "learning_rate": 0.0002287499026526868, + "loss": 3.3163, + "step": 12743 + }, + { + "epoch": 0.5365894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.0002287162806876941, + "loss": 3.2768, + "step": 12744 + }, + { + "epoch": 0.5366315789473685, + "grad_norm": 0.4453125, + "learning_rate": 0.00022868265911046569, + "loss": 3.2537, + "step": 12745 + }, + { + "epoch": 0.5366736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.0002286490379216143, + "loss": 3.0788, + "step": 12746 + }, + { + "epoch": 0.5367157894736843, + "grad_norm": 0.466796875, + "learning_rate": 0.00022861541712175228, + "loss": 2.7111, + "step": 12747 + }, + { + "epoch": 0.5367578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.0002285817967114923, + "loss": 3.0657, + "step": 12748 + }, + { + "epoch": 0.5368, + "grad_norm": 0.412109375, + "learning_rate": 0.0002285481766914467, + "loss": 2.9346, + "step": 12749 + }, + { + "epoch": 0.5368421052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.00022851455706222816, + "loss": 3.2789, + "step": 12750 + }, + { + "epoch": 0.5368842105263157, + "grad_norm": 0.427734375, + "learning_rate": 0.00022848093782444916, + "loss": 3.2632, + "step": 12751 + }, + { + "epoch": 0.5369263157894737, + "grad_norm": 0.482421875, + "learning_rate": 0.00022844731897872216, + "loss": 3.3419, + "step": 12752 + }, + { + "epoch": 0.5369684210526315, + "grad_norm": 0.4375, + "learning_rate": 0.00022841370052565976, + "loss": 3.1362, + "step": 12753 + }, + { + "epoch": 0.5370105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.0002283800824658743, + "loss": 3.3425, + "step": 12754 + }, + { + "epoch": 0.5370526315789473, + "grad_norm": 0.423828125, + "learning_rate": 0.00022834646479997844, + "loss": 3.169, + "step": 12755 + }, + { + "epoch": 0.5370947368421053, + "grad_norm": 0.4609375, + "learning_rate": 0.00022831284752858444, + "loss": 2.8892, + "step": 12756 + }, + { + "epoch": 0.5371368421052631, + "grad_norm": 0.435546875, + "learning_rate": 0.0002282792306523049, + "loss": 3.0912, + "step": 12757 + }, + { + "epoch": 0.5371789473684211, + "grad_norm": 0.443359375, + "learning_rate": 0.0002282456141717523, + "loss": 3.2636, + "step": 12758 + }, + { + "epoch": 0.5372210526315789, + "grad_norm": 0.431640625, + "learning_rate": 0.00022821199808753897, + "loss": 3.4331, + "step": 12759 + }, + { + "epoch": 0.5372631578947369, + "grad_norm": 0.43359375, + "learning_rate": 0.00022817838240027754, + "loss": 3.2414, + "step": 12760 + }, + { + "epoch": 0.5373052631578947, + "grad_norm": 0.8671875, + "learning_rate": 0.00022814476711058022, + "loss": 2.6413, + "step": 12761 + }, + { + "epoch": 0.5373473684210527, + "grad_norm": 0.42578125, + "learning_rate": 0.0002281111522190596, + "loss": 2.685, + "step": 12762 + }, + { + "epoch": 0.5373894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.00022807753772632818, + "loss": 3.3057, + "step": 12763 + }, + { + "epoch": 0.5374315789473684, + "grad_norm": 0.47265625, + "learning_rate": 0.0002280439236329981, + "loss": 3.4439, + "step": 12764 + }, + { + "epoch": 0.5374736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.0002280103099396821, + "loss": 2.6983, + "step": 12765 + }, + { + "epoch": 0.5375157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.0002279766966469923, + "loss": 3.5582, + "step": 12766 + }, + { + "epoch": 0.5375578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.00022794308375554128, + "loss": 3.3419, + "step": 12767 + }, + { + "epoch": 0.5376, + "grad_norm": 0.42578125, + "learning_rate": 0.00022790947126594124, + "loss": 2.8263, + "step": 12768 + }, + { + "epoch": 0.5376421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.00022787585917880479, + "loss": 3.1683, + "step": 12769 + }, + { + "epoch": 0.5376842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.0002278422474947442, + "loss": 2.8922, + "step": 12770 + }, + { + "epoch": 0.5377263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00022780863621437174, + "loss": 2.9278, + "step": 12771 + }, + { + "epoch": 0.5377684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.00022777502533829997, + "loss": 3.5117, + "step": 12772 + }, + { + "epoch": 0.5378105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.000227741414867141, + "loss": 3.3157, + "step": 12773 + }, + { + "epoch": 0.5378526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.00022770780480150743, + "loss": 2.7496, + "step": 12774 + }, + { + "epoch": 0.5378947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00022767419514201136, + "loss": 2.9335, + "step": 12775 + }, + { + "epoch": 0.5379368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.0002276405858892652, + "loss": 2.9917, + "step": 12776 + }, + { + "epoch": 0.537978947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00022760697704388136, + "loss": 3.0267, + "step": 12777 + }, + { + "epoch": 0.538021052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00022757336860647204, + "loss": 3.0119, + "step": 12778 + }, + { + "epoch": 0.5380631578947368, + "grad_norm": 0.431640625, + "learning_rate": 0.00022753976057764968, + "loss": 3.0938, + "step": 12779 + }, + { + "epoch": 0.5381052631578948, + "grad_norm": 0.447265625, + "learning_rate": 0.00022750615295802644, + "loss": 3.0561, + "step": 12780 + }, + { + "epoch": 0.5381473684210526, + "grad_norm": 0.458984375, + "learning_rate": 0.0002274725457482147, + "loss": 3.4779, + "step": 12781 + }, + { + "epoch": 0.5381894736842106, + "grad_norm": 0.419921875, + "learning_rate": 0.00022743893894882665, + "loss": 2.5003, + "step": 12782 + }, + { + "epoch": 0.5382315789473684, + "grad_norm": 0.439453125, + "learning_rate": 0.0002274053325604746, + "loss": 3.4628, + "step": 12783 + }, + { + "epoch": 0.5382736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00022737172658377093, + "loss": 2.8963, + "step": 12784 + }, + { + "epoch": 0.5383157894736842, + "grad_norm": 0.376953125, + "learning_rate": 0.00022733812101932778, + "loss": 2.941, + "step": 12785 + }, + { + "epoch": 0.5383578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00022730451586775745, + "loss": 2.7317, + "step": 12786 + }, + { + "epoch": 0.5384, + "grad_norm": 0.404296875, + "learning_rate": 0.0002272709111296721, + "loss": 2.7255, + "step": 12787 + }, + { + "epoch": 0.5384421052631579, + "grad_norm": 0.5078125, + "learning_rate": 0.00022723730680568416, + "loss": 3.1659, + "step": 12788 + }, + { + "epoch": 0.5384842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.00022720370289640564, + "loss": 3.1836, + "step": 12789 + }, + { + "epoch": 0.5385263157894736, + "grad_norm": 0.451171875, + "learning_rate": 0.0002271700994024489, + "loss": 3.4736, + "step": 12790 + }, + { + "epoch": 0.5385684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.0002271364963244262, + "loss": 3.1953, + "step": 12791 + }, + { + "epoch": 0.5386105263157894, + "grad_norm": 0.392578125, + "learning_rate": 0.0002271028936629496, + "loss": 2.4013, + "step": 12792 + }, + { + "epoch": 0.5386526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.0002270692914186314, + "loss": 2.7199, + "step": 12793 + }, + { + "epoch": 0.5386947368421052, + "grad_norm": 0.453125, + "learning_rate": 0.0002270356895920837, + "loss": 3.281, + "step": 12794 + }, + { + "epoch": 0.5387368421052632, + "grad_norm": 0.412109375, + "learning_rate": 0.00022700208818391887, + "loss": 3.1711, + "step": 12795 + }, + { + "epoch": 0.538778947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00022696848719474887, + "loss": 3.2292, + "step": 12796 + }, + { + "epoch": 0.538821052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00022693488662518595, + "loss": 3.1128, + "step": 12797 + }, + { + "epoch": 0.5388631578947368, + "grad_norm": 0.40234375, + "learning_rate": 0.00022690128647584245, + "loss": 3.1284, + "step": 12798 + }, + { + "epoch": 0.5389052631578948, + "grad_norm": 0.439453125, + "learning_rate": 0.00022686768674733024, + "loss": 3.068, + "step": 12799 + }, + { + "epoch": 0.5389473684210526, + "grad_norm": 0.400390625, + "learning_rate": 0.00022683408744026171, + "loss": 2.8425, + "step": 12800 + }, + { + "epoch": 0.5389894736842106, + "grad_norm": 0.427734375, + "learning_rate": 0.00022680048855524878, + "loss": 3.1166, + "step": 12801 + }, + { + "epoch": 0.5390315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.00022676689009290375, + "loss": 3.3533, + "step": 12802 + }, + { + "epoch": 0.5390736842105263, + "grad_norm": 0.388671875, + "learning_rate": 0.00022673329205383862, + "loss": 2.9523, + "step": 12803 + }, + { + "epoch": 0.5391157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00022669969443866555, + "loss": 3.4934, + "step": 12804 + }, + { + "epoch": 0.5391578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0002266660972479968, + "loss": 3.3707, + "step": 12805 + }, + { + "epoch": 0.5392, + "grad_norm": 0.546875, + "learning_rate": 0.00022663250048244425, + "loss": 2.6656, + "step": 12806 + }, + { + "epoch": 0.5392421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00022659890414262018, + "loss": 3.4155, + "step": 12807 + }, + { + "epoch": 0.5392842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00022656530822913647, + "loss": 3.1296, + "step": 12808 + }, + { + "epoch": 0.5393263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.0002265317127426053, + "loss": 2.7463, + "step": 12809 + }, + { + "epoch": 0.5393684210526316, + "grad_norm": 0.470703125, + "learning_rate": 0.00022649811768363882, + "loss": 2.7752, + "step": 12810 + }, + { + "epoch": 0.5394105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.00022646452305284898, + "loss": 3.2148, + "step": 12811 + }, + { + "epoch": 0.5394526315789474, + "grad_norm": 0.412109375, + "learning_rate": 0.00022643092885084795, + "loss": 3.0524, + "step": 12812 + }, + { + "epoch": 0.5394947368421052, + "grad_norm": 0.390625, + "learning_rate": 0.0002263973350782476, + "loss": 2.7869, + "step": 12813 + }, + { + "epoch": 0.5395368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.0002263637417356602, + "loss": 2.6607, + "step": 12814 + }, + { + "epoch": 0.539578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00022633014882369757, + "loss": 3.1635, + "step": 12815 + }, + { + "epoch": 0.539621052631579, + "grad_norm": 0.384765625, + "learning_rate": 0.0002262965563429718, + "loss": 2.4586, + "step": 12816 + }, + { + "epoch": 0.5396631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.00022626296429409498, + "loss": 3.3586, + "step": 12817 + }, + { + "epoch": 0.5397052631578947, + "grad_norm": 0.427734375, + "learning_rate": 0.00022622937267767902, + "loss": 3.0057, + "step": 12818 + }, + { + "epoch": 0.5397473684210526, + "grad_norm": 0.4609375, + "learning_rate": 0.00022619578149433599, + "loss": 3.1835, + "step": 12819 + }, + { + "epoch": 0.5397894736842105, + "grad_norm": 0.4453125, + "learning_rate": 0.00022616219074467778, + "loss": 2.8604, + "step": 12820 + }, + { + "epoch": 0.5398315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.00022612860042931656, + "loss": 3.3622, + "step": 12821 + }, + { + "epoch": 0.5398736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0002260950105488641, + "loss": 3.2495, + "step": 12822 + }, + { + "epoch": 0.5399157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.00022606142110393246, + "loss": 3.1095, + "step": 12823 + }, + { + "epoch": 0.5399578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.0002260278320951337, + "loss": 2.8223, + "step": 12824 + }, + { + "epoch": 0.54, + "grad_norm": 0.431640625, + "learning_rate": 0.00022599424352307955, + "loss": 3.3127, + "step": 12825 + }, + { + "epoch": 0.5400421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00022596065538838217, + "loss": 3.2161, + "step": 12826 + }, + { + "epoch": 0.5400842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.0002259270676916534, + "loss": 3.2006, + "step": 12827 + }, + { + "epoch": 0.5401263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.00022589348043350514, + "loss": 3.3813, + "step": 12828 + }, + { + "epoch": 0.5401684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.0002258598936145493, + "loss": 3.3803, + "step": 12829 + }, + { + "epoch": 0.5402105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.0002258263072353978, + "loss": 3.2231, + "step": 12830 + }, + { + "epoch": 0.5402526315789473, + "grad_norm": 0.42578125, + "learning_rate": 0.00022579272129666275, + "loss": 2.9005, + "step": 12831 + }, + { + "epoch": 0.5402947368421053, + "grad_norm": 0.41015625, + "learning_rate": 0.00022575913579895576, + "loss": 3.5052, + "step": 12832 + }, + { + "epoch": 0.5403368421052631, + "grad_norm": 0.42578125, + "learning_rate": 0.00022572555074288893, + "loss": 3.1204, + "step": 12833 + }, + { + "epoch": 0.5403789473684211, + "grad_norm": 0.419921875, + "learning_rate": 0.0002256919661290739, + "loss": 3.284, + "step": 12834 + }, + { + "epoch": 0.5404210526315789, + "grad_norm": 0.423828125, + "learning_rate": 0.0002256583819581228, + "loss": 3.4327, + "step": 12835 + }, + { + "epoch": 0.5404631578947369, + "grad_norm": 0.404296875, + "learning_rate": 0.0002256247982306473, + "loss": 3.0934, + "step": 12836 + }, + { + "epoch": 0.5405052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.00022559121494725932, + "loss": 3.1555, + "step": 12837 + }, + { + "epoch": 0.5405473684210527, + "grad_norm": 0.423828125, + "learning_rate": 0.00022555763210857087, + "loss": 3.2322, + "step": 12838 + }, + { + "epoch": 0.5405894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.0002255240497151935, + "loss": 3.1816, + "step": 12839 + }, + { + "epoch": 0.5406315789473685, + "grad_norm": 0.435546875, + "learning_rate": 0.00022549046776773932, + "loss": 3.1325, + "step": 12840 + }, + { + "epoch": 0.5406736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.0002254568862668199, + "loss": 3.2035, + "step": 12841 + }, + { + "epoch": 0.5407157894736843, + "grad_norm": 0.423828125, + "learning_rate": 0.0002254233052130472, + "loss": 2.8648, + "step": 12842 + }, + { + "epoch": 0.5407578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00022538972460703306, + "loss": 3.1046, + "step": 12843 + }, + { + "epoch": 0.5408, + "grad_norm": 0.421875, + "learning_rate": 0.00022535614444938912, + "loss": 2.9757, + "step": 12844 + }, + { + "epoch": 0.5408421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.00022532256474072735, + "loss": 2.9846, + "step": 12845 + }, + { + "epoch": 0.5408842105263157, + "grad_norm": 0.447265625, + "learning_rate": 0.0002252889854816594, + "loss": 2.9782, + "step": 12846 + }, + { + "epoch": 0.5409263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00022525540667279721, + "loss": 3.0351, + "step": 12847 + }, + { + "epoch": 0.5409684210526315, + "grad_norm": 0.447265625, + "learning_rate": 0.00022522182831475232, + "loss": 2.6855, + "step": 12848 + }, + { + "epoch": 0.5410105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.00022518825040813672, + "loss": 3.3166, + "step": 12849 + }, + { + "epoch": 0.5410526315789473, + "grad_norm": 0.4609375, + "learning_rate": 0.00022515467295356194, + "loss": 3.3633, + "step": 12850 + }, + { + "epoch": 0.5410947368421053, + "grad_norm": 0.404296875, + "learning_rate": 0.0002251210959516398, + "loss": 3.0377, + "step": 12851 + }, + { + "epoch": 0.5411368421052631, + "grad_norm": 0.494140625, + "learning_rate": 0.00022508751940298217, + "loss": 3.2266, + "step": 12852 + }, + { + "epoch": 0.5411789473684211, + "grad_norm": 0.42578125, + "learning_rate": 0.00022505394330820056, + "loss": 2.6111, + "step": 12853 + }, + { + "epoch": 0.5412210526315789, + "grad_norm": 0.4296875, + "learning_rate": 0.00022502036766790693, + "loss": 2.9188, + "step": 12854 + }, + { + "epoch": 0.5412631578947369, + "grad_norm": 0.419921875, + "learning_rate": 0.00022498679248271272, + "loss": 3.301, + "step": 12855 + }, + { + "epoch": 0.5413052631578947, + "grad_norm": 0.4140625, + "learning_rate": 0.00022495321775322988, + "loss": 2.6922, + "step": 12856 + }, + { + "epoch": 0.5413473684210526, + "grad_norm": 0.453125, + "learning_rate": 0.00022491964348006987, + "loss": 2.8302, + "step": 12857 + }, + { + "epoch": 0.5413894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0002248860696638445, + "loss": 3.3484, + "step": 12858 + }, + { + "epoch": 0.5414315789473684, + "grad_norm": 0.458984375, + "learning_rate": 0.00022485249630516555, + "loss": 2.8995, + "step": 12859 + }, + { + "epoch": 0.5414736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.00022481892340464444, + "loss": 2.9962, + "step": 12860 + }, + { + "epoch": 0.5415157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00022478535096289302, + "loss": 3.1667, + "step": 12861 + }, + { + "epoch": 0.5415578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00022475177898052283, + "loss": 3.3313, + "step": 12862 + }, + { + "epoch": 0.5416, + "grad_norm": 0.412109375, + "learning_rate": 0.00022471820745814555, + "loss": 3.1258, + "step": 12863 + }, + { + "epoch": 0.5416421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00022468463639637294, + "loss": 2.9057, + "step": 12864 + }, + { + "epoch": 0.5416842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.0002246510657958164, + "loss": 2.7129, + "step": 12865 + }, + { + "epoch": 0.5417263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00022461749565708778, + "loss": 3.1935, + "step": 12866 + }, + { + "epoch": 0.5417684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.00022458392598079843, + "loss": 3.1722, + "step": 12867 + }, + { + "epoch": 0.5418105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00022455035676756017, + "loss": 3.047, + "step": 12868 + }, + { + "epoch": 0.5418526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00022451678801798446, + "loss": 3.0507, + "step": 12869 + }, + { + "epoch": 0.5418947368421053, + "grad_norm": 0.5390625, + "learning_rate": 0.00022448321973268293, + "loss": 3.0461, + "step": 12870 + }, + { + "epoch": 0.5419368421052632, + "grad_norm": 0.443359375, + "learning_rate": 0.00022444965191226722, + "loss": 3.2732, + "step": 12871 + }, + { + "epoch": 0.541978947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00022441608455734874, + "loss": 3.188, + "step": 12872 + }, + { + "epoch": 0.542021052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00022438251766853927, + "loss": 3.0988, + "step": 12873 + }, + { + "epoch": 0.5420631578947368, + "grad_norm": 0.4609375, + "learning_rate": 0.00022434895124645007, + "loss": 3.1495, + "step": 12874 + }, + { + "epoch": 0.5421052631578948, + "grad_norm": 0.482421875, + "learning_rate": 0.000224315385291693, + "loss": 2.893, + "step": 12875 + }, + { + "epoch": 0.5421473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.0002242818198048793, + "loss": 3.1744, + "step": 12876 + }, + { + "epoch": 0.5421894736842106, + "grad_norm": 0.412109375, + "learning_rate": 0.00022424825478662065, + "loss": 2.8689, + "step": 12877 + }, + { + "epoch": 0.5422315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00022421469023752857, + "loss": 3.0869, + "step": 12878 + }, + { + "epoch": 0.5422736842105264, + "grad_norm": 0.421875, + "learning_rate": 0.0002241811261582145, + "loss": 3.2633, + "step": 12879 + }, + { + "epoch": 0.5423157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.00022414756254929006, + "loss": 3.3491, + "step": 12880 + }, + { + "epoch": 0.5423578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00022411399941136656, + "loss": 3.4902, + "step": 12881 + }, + { + "epoch": 0.5424, + "grad_norm": 0.4375, + "learning_rate": 0.00022408043674505568, + "loss": 3.3735, + "step": 12882 + }, + { + "epoch": 0.542442105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.0002240468745509687, + "loss": 2.8697, + "step": 12883 + }, + { + "epoch": 0.5424842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00022401331282971717, + "loss": 3.3974, + "step": 12884 + }, + { + "epoch": 0.5425263157894736, + "grad_norm": 0.451171875, + "learning_rate": 0.00022397975158191258, + "loss": 3.6307, + "step": 12885 + }, + { + "epoch": 0.5425684210526316, + "grad_norm": 0.435546875, + "learning_rate": 0.00022394619080816637, + "loss": 2.809, + "step": 12886 + }, + { + "epoch": 0.5426105263157894, + "grad_norm": 0.42578125, + "learning_rate": 0.00022391263050908992, + "loss": 2.9324, + "step": 12887 + }, + { + "epoch": 0.5426526315789474, + "grad_norm": 0.4765625, + "learning_rate": 0.00022387907068529466, + "loss": 2.7167, + "step": 12888 + }, + { + "epoch": 0.5426947368421052, + "grad_norm": 0.416015625, + "learning_rate": 0.00022384551133739213, + "loss": 3.0064, + "step": 12889 + }, + { + "epoch": 0.5427368421052632, + "grad_norm": 0.435546875, + "learning_rate": 0.00022381195246599356, + "loss": 3.0278, + "step": 12890 + }, + { + "epoch": 0.542778947368421, + "grad_norm": 0.458984375, + "learning_rate": 0.00022377839407171043, + "loss": 2.9579, + "step": 12891 + }, + { + "epoch": 0.542821052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.0002237448361551543, + "loss": 2.8737, + "step": 12892 + }, + { + "epoch": 0.5428631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.00022371127871693625, + "loss": 3.0868, + "step": 12893 + }, + { + "epoch": 0.5429052631578948, + "grad_norm": 0.4296875, + "learning_rate": 0.00022367772175766788, + "loss": 3.3868, + "step": 12894 + }, + { + "epoch": 0.5429473684210526, + "grad_norm": 0.404296875, + "learning_rate": 0.00022364416527796046, + "loss": 2.7747, + "step": 12895 + }, + { + "epoch": 0.5429894736842106, + "grad_norm": 0.443359375, + "learning_rate": 0.00022361060927842546, + "loss": 3.3441, + "step": 12896 + }, + { + "epoch": 0.5430315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.00022357705375967407, + "loss": 3.1039, + "step": 12897 + }, + { + "epoch": 0.5430736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00022354349872231769, + "loss": 3.1572, + "step": 12898 + }, + { + "epoch": 0.5431157894736842, + "grad_norm": 0.46875, + "learning_rate": 0.00022350994416696783, + "loss": 4.0993, + "step": 12899 + }, + { + "epoch": 0.5431578947368421, + "grad_norm": 0.462890625, + "learning_rate": 0.00022347639009423552, + "loss": 3.196, + "step": 12900 + }, + { + "epoch": 0.5432, + "grad_norm": 0.42578125, + "learning_rate": 0.00022344283650473226, + "loss": 3.2129, + "step": 12901 + }, + { + "epoch": 0.5432421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00022340928339906936, + "loss": 2.7864, + "step": 12902 + }, + { + "epoch": 0.5432842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00022337573077785804, + "loss": 2.785, + "step": 12903 + }, + { + "epoch": 0.5433263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00022334217864170954, + "loss": 3.0181, + "step": 12904 + }, + { + "epoch": 0.5433684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.00022330862699123527, + "loss": 3.2711, + "step": 12905 + }, + { + "epoch": 0.5434105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.00022327507582704657, + "loss": 3.0973, + "step": 12906 + }, + { + "epoch": 0.5434526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.00022324152514975447, + "loss": 3.5134, + "step": 12907 + }, + { + "epoch": 0.5434947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.00022320797495997046, + "loss": 3.1497, + "step": 12908 + }, + { + "epoch": 0.5435368421052632, + "grad_norm": 0.4296875, + "learning_rate": 0.0002231744252583055, + "loss": 3.5638, + "step": 12909 + }, + { + "epoch": 0.543578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00022314087604537102, + "loss": 3.4367, + "step": 12910 + }, + { + "epoch": 0.5436210526315789, + "grad_norm": 0.443359375, + "learning_rate": 0.00022310732732177833, + "loss": 3.1139, + "step": 12911 + }, + { + "epoch": 0.5436631578947368, + "grad_norm": 0.431640625, + "learning_rate": 0.00022307377908813841, + "loss": 2.6976, + "step": 12912 + }, + { + "epoch": 0.5437052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.00022304023134506277, + "loss": 3.1863, + "step": 12913 + }, + { + "epoch": 0.5437473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.0002230066840931623, + "loss": 2.6206, + "step": 12914 + }, + { + "epoch": 0.5437894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.00022297313733304845, + "loss": 3.1941, + "step": 12915 + }, + { + "epoch": 0.5438315789473684, + "grad_norm": 0.447265625, + "learning_rate": 0.00022293959106533216, + "loss": 2.7666, + "step": 12916 + }, + { + "epoch": 0.5438736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00022290604529062472, + "loss": 2.9412, + "step": 12917 + }, + { + "epoch": 0.5439157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.0002228725000095374, + "loss": 3.2147, + "step": 12918 + }, + { + "epoch": 0.5439578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00022283895522268117, + "loss": 2.8983, + "step": 12919 + }, + { + "epoch": 0.544, + "grad_norm": 0.412109375, + "learning_rate": 0.0002228054109306673, + "loss": 2.9777, + "step": 12920 + }, + { + "epoch": 0.5440421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.00022277186713410684, + "loss": 3.1171, + "step": 12921 + }, + { + "epoch": 0.5440842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.00022273832383361113, + "loss": 3.5142, + "step": 12922 + }, + { + "epoch": 0.5441263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00022270478102979097, + "loss": 3.3884, + "step": 12923 + }, + { + "epoch": 0.5441684210526315, + "grad_norm": 0.431640625, + "learning_rate": 0.00022267123872325763, + "loss": 3.2416, + "step": 12924 + }, + { + "epoch": 0.5442105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.00022263769691462231, + "loss": 3.2345, + "step": 12925 + }, + { + "epoch": 0.5442526315789473, + "grad_norm": 0.3984375, + "learning_rate": 0.00022260415560449592, + "loss": 2.8929, + "step": 12926 + }, + { + "epoch": 0.5442947368421053, + "grad_norm": 0.404296875, + "learning_rate": 0.00022257061479348967, + "loss": 3.526, + "step": 12927 + }, + { + "epoch": 0.5443368421052631, + "grad_norm": 0.458984375, + "learning_rate": 0.00022253707448221456, + "loss": 3.3578, + "step": 12928 + }, + { + "epoch": 0.5443789473684211, + "grad_norm": 0.421875, + "learning_rate": 0.00022250353467128176, + "loss": 3.1546, + "step": 12929 + }, + { + "epoch": 0.5444210526315789, + "grad_norm": 0.3984375, + "learning_rate": 0.00022246999536130213, + "loss": 3.1805, + "step": 12930 + }, + { + "epoch": 0.5444631578947369, + "grad_norm": 0.451171875, + "learning_rate": 0.00022243645655288687, + "loss": 2.9529, + "step": 12931 + }, + { + "epoch": 0.5445052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.00022240291824664708, + "loss": 3.0559, + "step": 12932 + }, + { + "epoch": 0.5445473684210527, + "grad_norm": 0.4375, + "learning_rate": 0.00022236938044319362, + "loss": 3.3177, + "step": 12933 + }, + { + "epoch": 0.5445894736842105, + "grad_norm": 0.439453125, + "learning_rate": 0.00022233584314313766, + "loss": 3.3355, + "step": 12934 + }, + { + "epoch": 0.5446315789473685, + "grad_norm": 0.443359375, + "learning_rate": 0.00022230230634709002, + "loss": 2.9472, + "step": 12935 + }, + { + "epoch": 0.5446736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.00022226877005566186, + "loss": 3.1843, + "step": 12936 + }, + { + "epoch": 0.5447157894736843, + "grad_norm": 0.416015625, + "learning_rate": 0.0002222352342694641, + "loss": 2.5554, + "step": 12937 + }, + { + "epoch": 0.5447578947368421, + "grad_norm": 0.46484375, + "learning_rate": 0.00022220169898910772, + "loss": 3.4046, + "step": 12938 + }, + { + "epoch": 0.5448, + "grad_norm": 0.41796875, + "learning_rate": 0.00022216816421520387, + "loss": 3.0626, + "step": 12939 + }, + { + "epoch": 0.5448421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00022213462994836322, + "loss": 3.0481, + "step": 12940 + }, + { + "epoch": 0.5448842105263157, + "grad_norm": 0.431640625, + "learning_rate": 0.000222101096189197, + "loss": 3.3538, + "step": 12941 + }, + { + "epoch": 0.5449263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.00022206756293831592, + "loss": 3.3664, + "step": 12942 + }, + { + "epoch": 0.5449684210526315, + "grad_norm": 0.431640625, + "learning_rate": 0.00022203403019633108, + "loss": 3.3199, + "step": 12943 + }, + { + "epoch": 0.5450105263157895, + "grad_norm": 0.439453125, + "learning_rate": 0.0002220004979638533, + "loss": 2.788, + "step": 12944 + }, + { + "epoch": 0.5450526315789473, + "grad_norm": 0.41796875, + "learning_rate": 0.0002219669662414936, + "loss": 3.4296, + "step": 12945 + }, + { + "epoch": 0.5450947368421053, + "grad_norm": 0.427734375, + "learning_rate": 0.00022193343502986283, + "loss": 2.6046, + "step": 12946 + }, + { + "epoch": 0.5451368421052631, + "grad_norm": 0.451171875, + "learning_rate": 0.00022189990432957185, + "loss": 3.1761, + "step": 12947 + }, + { + "epoch": 0.5451789473684211, + "grad_norm": 0.41015625, + "learning_rate": 0.0002218663741412317, + "loss": 2.9329, + "step": 12948 + }, + { + "epoch": 0.5452210526315789, + "grad_norm": 0.447265625, + "learning_rate": 0.00022183284446545304, + "loss": 3.317, + "step": 12949 + }, + { + "epoch": 0.5452631578947369, + "grad_norm": 0.412109375, + "learning_rate": 0.000221799315302847, + "loss": 2.9712, + "step": 12950 + }, + { + "epoch": 0.5453052631578947, + "grad_norm": 0.41015625, + "learning_rate": 0.00022176578665402416, + "loss": 2.8621, + "step": 12951 + }, + { + "epoch": 0.5453473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00022173225851959553, + "loss": 3.1188, + "step": 12952 + }, + { + "epoch": 0.5453894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00022169873090017197, + "loss": 3.3152, + "step": 12953 + }, + { + "epoch": 0.5454315789473684, + "grad_norm": 0.478515625, + "learning_rate": 0.00022166520379636424, + "loss": 2.4804, + "step": 12954 + }, + { + "epoch": 0.5454736842105263, + "grad_norm": 0.47265625, + "learning_rate": 0.00022163167720878332, + "loss": 3.3844, + "step": 12955 + }, + { + "epoch": 0.5455157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.0002215981511380398, + "loss": 2.8548, + "step": 12956 + }, + { + "epoch": 0.5455578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00022156462558474456, + "loss": 2.9641, + "step": 12957 + }, + { + "epoch": 0.5456, + "grad_norm": 0.416015625, + "learning_rate": 0.0002215311005495086, + "loss": 2.8783, + "step": 12958 + }, + { + "epoch": 0.5456421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.0002214975760329424, + "loss": 2.9841, + "step": 12959 + }, + { + "epoch": 0.5456842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.00022146405203565697, + "loss": 3.2366, + "step": 12960 + }, + { + "epoch": 0.5457263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.0002214305285582629, + "loss": 3.3482, + "step": 12961 + }, + { + "epoch": 0.5457684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.0002213970056013711, + "loss": 3.2304, + "step": 12962 + }, + { + "epoch": 0.5458105263157895, + "grad_norm": 0.458984375, + "learning_rate": 0.00022136348316559215, + "loss": 3.3084, + "step": 12963 + }, + { + "epoch": 0.5458526315789474, + "grad_norm": 0.400390625, + "learning_rate": 0.00022132996125153693, + "loss": 3.0206, + "step": 12964 + }, + { + "epoch": 0.5458947368421052, + "grad_norm": 0.421875, + "learning_rate": 0.00022129643985981622, + "loss": 3.0426, + "step": 12965 + }, + { + "epoch": 0.5459368421052632, + "grad_norm": 0.44140625, + "learning_rate": 0.00022126291899104053, + "loss": 3.0565, + "step": 12966 + }, + { + "epoch": 0.545978947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00022122939864582081, + "loss": 3.2017, + "step": 12967 + }, + { + "epoch": 0.546021052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00022119587882476756, + "loss": 3.1762, + "step": 12968 + }, + { + "epoch": 0.5460631578947368, + "grad_norm": 0.416015625, + "learning_rate": 0.00022116235952849157, + "loss": 2.8913, + "step": 12969 + }, + { + "epoch": 0.5461052631578948, + "grad_norm": 0.49609375, + "learning_rate": 0.00022112884075760347, + "loss": 3.1523, + "step": 12970 + }, + { + "epoch": 0.5461473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.00022109532251271397, + "loss": 2.9942, + "step": 12971 + }, + { + "epoch": 0.5461894736842106, + "grad_norm": 0.419921875, + "learning_rate": 0.00022106180479443383, + "loss": 3.0992, + "step": 12972 + }, + { + "epoch": 0.5462315789473684, + "grad_norm": 0.4765625, + "learning_rate": 0.0002210282876033735, + "loss": 3.1492, + "step": 12973 + }, + { + "epoch": 0.5462736842105264, + "grad_norm": 0.41796875, + "learning_rate": 0.00022099477094014386, + "loss": 3.43, + "step": 12974 + }, + { + "epoch": 0.5463157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00022096125480535527, + "loss": 2.8924, + "step": 12975 + }, + { + "epoch": 0.5463578947368422, + "grad_norm": 0.431640625, + "learning_rate": 0.00022092773919961864, + "loss": 3.3664, + "step": 12976 + }, + { + "epoch": 0.5464, + "grad_norm": 0.439453125, + "learning_rate": 0.0002208942241235443, + "loss": 3.2815, + "step": 12977 + }, + { + "epoch": 0.5464421052631578, + "grad_norm": 0.451171875, + "learning_rate": 0.00022086070957774305, + "loss": 3.0154, + "step": 12978 + }, + { + "epoch": 0.5464842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00022082719556282547, + "loss": 2.691, + "step": 12979 + }, + { + "epoch": 0.5465263157894736, + "grad_norm": 0.42578125, + "learning_rate": 0.00022079368207940204, + "loss": 2.9776, + "step": 12980 + }, + { + "epoch": 0.5465684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.0002207601691280835, + "loss": 3.3093, + "step": 12981 + }, + { + "epoch": 0.5466105263157894, + "grad_norm": 0.447265625, + "learning_rate": 0.00022072665670948025, + "loss": 3.3581, + "step": 12982 + }, + { + "epoch": 0.5466526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.000220693144824203, + "loss": 3.0329, + "step": 12983 + }, + { + "epoch": 0.5466947368421052, + "grad_norm": 0.4375, + "learning_rate": 0.00022065963347286211, + "loss": 3.4624, + "step": 12984 + }, + { + "epoch": 0.5467368421052632, + "grad_norm": 0.44140625, + "learning_rate": 0.00022062612265606824, + "loss": 3.4156, + "step": 12985 + }, + { + "epoch": 0.546778947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00022059261237443196, + "loss": 3.272, + "step": 12986 + }, + { + "epoch": 0.546821052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00022055910262856372, + "loss": 2.9962, + "step": 12987 + }, + { + "epoch": 0.5468631578947368, + "grad_norm": 0.431640625, + "learning_rate": 0.00022052559341907406, + "loss": 3.574, + "step": 12988 + }, + { + "epoch": 0.5469052631578948, + "grad_norm": 0.4375, + "learning_rate": 0.00022049208474657337, + "loss": 2.7729, + "step": 12989 + }, + { + "epoch": 0.5469473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.00022045857661167235, + "loss": 3.0983, + "step": 12990 + }, + { + "epoch": 0.5469894736842106, + "grad_norm": 0.4921875, + "learning_rate": 0.00022042506901498123, + "loss": 3.4261, + "step": 12991 + }, + { + "epoch": 0.5470315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00022039156195711063, + "loss": 3.0474, + "step": 12992 + }, + { + "epoch": 0.5470736842105263, + "grad_norm": 0.39453125, + "learning_rate": 0.00022035805543867108, + "loss": 3.1227, + "step": 12993 + }, + { + "epoch": 0.5471157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0002203245494602728, + "loss": 3.6214, + "step": 12994 + }, + { + "epoch": 0.5471578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00022029104402252644, + "loss": 3.3922, + "step": 12995 + }, + { + "epoch": 0.5472, + "grad_norm": 0.458984375, + "learning_rate": 0.0002202575391260423, + "loss": 2.772, + "step": 12996 + }, + { + "epoch": 0.5472421052631579, + "grad_norm": 0.400390625, + "learning_rate": 0.00022022403477143097, + "loss": 2.9054, + "step": 12997 + }, + { + "epoch": 0.5472842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.0002201905309593026, + "loss": 2.9836, + "step": 12998 + }, + { + "epoch": 0.5473263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.00022015702769026774, + "loss": 3.1555, + "step": 12999 + }, + { + "epoch": 0.5473684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.0002201235249649369, + "loss": 3.1737, + "step": 13000 + }, + { + "epoch": 0.5474105263157895, + "grad_norm": 0.4453125, + "learning_rate": 0.00022009002278392022, + "loss": 3.1369, + "step": 13001 + }, + { + "epoch": 0.5474526315789474, + "grad_norm": 0.40625, + "learning_rate": 0.0002200565211478282, + "loss": 3.1964, + "step": 13002 + }, + { + "epoch": 0.5474947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.00022002302005727124, + "loss": 2.7732, + "step": 13003 + }, + { + "epoch": 0.5475368421052632, + "grad_norm": 0.443359375, + "learning_rate": 0.0002199895195128596, + "loss": 3.3481, + "step": 13004 + }, + { + "epoch": 0.547578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00021995601951520357, + "loss": 3.5979, + "step": 13005 + }, + { + "epoch": 0.5476210526315789, + "grad_norm": 0.44140625, + "learning_rate": 0.0002199225200649136, + "loss": 3.057, + "step": 13006 + }, + { + "epoch": 0.5476631578947369, + "grad_norm": 0.412109375, + "learning_rate": 0.0002198890211626001, + "loss": 2.7924, + "step": 13007 + }, + { + "epoch": 0.5477052631578947, + "grad_norm": 0.416015625, + "learning_rate": 0.0002198555228088731, + "loss": 3.0095, + "step": 13008 + }, + { + "epoch": 0.5477473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.00021982202500434324, + "loss": 3.2217, + "step": 13009 + }, + { + "epoch": 0.5477894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.00021978852774962043, + "loss": 3.042, + "step": 13010 + }, + { + "epoch": 0.5478315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.0002197550310453152, + "loss": 3.1071, + "step": 13011 + }, + { + "epoch": 0.5478736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.0002197215348920378, + "loss": 3.0826, + "step": 13012 + }, + { + "epoch": 0.5479157894736842, + "grad_norm": 0.462890625, + "learning_rate": 0.00021968803929039844, + "loss": 3.2932, + "step": 13013 + }, + { + "epoch": 0.5479578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00021965454424100744, + "loss": 3.1672, + "step": 13014 + }, + { + "epoch": 0.548, + "grad_norm": 0.4140625, + "learning_rate": 0.00021962104974447493, + "loss": 3.3085, + "step": 13015 + }, + { + "epoch": 0.5480421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00021958755580141128, + "loss": 2.853, + "step": 13016 + }, + { + "epoch": 0.5480842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.00021955406241242652, + "loss": 3.1304, + "step": 13017 + }, + { + "epoch": 0.5481263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00021952056957813098, + "loss": 3.2672, + "step": 13018 + }, + { + "epoch": 0.5481684210526315, + "grad_norm": 0.451171875, + "learning_rate": 0.00021948707729913498, + "loss": 3.1367, + "step": 13019 + }, + { + "epoch": 0.5482105263157895, + "grad_norm": 0.470703125, + "learning_rate": 0.00021945358557604844, + "loss": 3.4987, + "step": 13020 + }, + { + "epoch": 0.5482526315789473, + "grad_norm": 0.392578125, + "learning_rate": 0.00021942009440948176, + "loss": 3.2441, + "step": 13021 + }, + { + "epoch": 0.5482947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00021938660380004497, + "loss": 2.8075, + "step": 13022 + }, + { + "epoch": 0.5483368421052631, + "grad_norm": 0.423828125, + "learning_rate": 0.00021935311374834837, + "loss": 3.238, + "step": 13023 + }, + { + "epoch": 0.5483789473684211, + "grad_norm": 0.421875, + "learning_rate": 0.00021931962425500196, + "loss": 2.8012, + "step": 13024 + }, + { + "epoch": 0.5484210526315789, + "grad_norm": 0.443359375, + "learning_rate": 0.00021928613532061592, + "loss": 3.6176, + "step": 13025 + }, + { + "epoch": 0.5484631578947369, + "grad_norm": 0.404296875, + "learning_rate": 0.00021925264694580055, + "loss": 2.7136, + "step": 13026 + }, + { + "epoch": 0.5485052631578947, + "grad_norm": 0.4140625, + "learning_rate": 0.0002192191591311657, + "loss": 2.8669, + "step": 13027 + }, + { + "epoch": 0.5485473684210527, + "grad_norm": 0.416015625, + "learning_rate": 0.00021918567187732162, + "loss": 2.7375, + "step": 13028 + }, + { + "epoch": 0.5485894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.00021915218518487838, + "loss": 3.2815, + "step": 13029 + }, + { + "epoch": 0.5486315789473685, + "grad_norm": 0.451171875, + "learning_rate": 0.00021911869905444616, + "loss": 3.1672, + "step": 13030 + }, + { + "epoch": 0.5486736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.0002190852134866349, + "loss": 3.0757, + "step": 13031 + }, + { + "epoch": 0.5487157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.00021905172848205468, + "loss": 2.9415, + "step": 13032 + }, + { + "epoch": 0.5487578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00021901824404131574, + "loss": 2.8345, + "step": 13033 + }, + { + "epoch": 0.5488, + "grad_norm": 0.39453125, + "learning_rate": 0.00021898476016502788, + "loss": 3.1218, + "step": 13034 + }, + { + "epoch": 0.5488421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.00021895127685380133, + "loss": 3.0361, + "step": 13035 + }, + { + "epoch": 0.5488842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00021891779410824596, + "loss": 3.1933, + "step": 13036 + }, + { + "epoch": 0.5489263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.0002188843119289719, + "loss": 3.0485, + "step": 13037 + }, + { + "epoch": 0.5489684210526315, + "grad_norm": 0.4296875, + "learning_rate": 0.0002188508303165891, + "loss": 3.4201, + "step": 13038 + }, + { + "epoch": 0.5490105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.0002188173492717075, + "loss": 2.993, + "step": 13039 + }, + { + "epoch": 0.5490526315789473, + "grad_norm": 0.423828125, + "learning_rate": 0.00021878386879493731, + "loss": 2.7195, + "step": 13040 + }, + { + "epoch": 0.5490947368421053, + "grad_norm": 0.427734375, + "learning_rate": 0.00021875038888688825, + "loss": 3.0586, + "step": 13041 + }, + { + "epoch": 0.5491368421052631, + "grad_norm": 0.42578125, + "learning_rate": 0.00021871690954817046, + "loss": 3.5585, + "step": 13042 + }, + { + "epoch": 0.5491789473684211, + "grad_norm": 0.43359375, + "learning_rate": 0.00021868343077939377, + "loss": 2.8942, + "step": 13043 + }, + { + "epoch": 0.5492210526315789, + "grad_norm": 0.431640625, + "learning_rate": 0.0002186499525811682, + "loss": 3.2324, + "step": 13044 + }, + { + "epoch": 0.5492631578947369, + "grad_norm": 0.451171875, + "learning_rate": 0.0002186164749541036, + "loss": 3.1854, + "step": 13045 + }, + { + "epoch": 0.5493052631578947, + "grad_norm": 0.447265625, + "learning_rate": 0.00021858299789881001, + "loss": 3.2194, + "step": 13046 + }, + { + "epoch": 0.5493473684210526, + "grad_norm": 0.462890625, + "learning_rate": 0.00021854952141589729, + "loss": 3.2336, + "step": 13047 + }, + { + "epoch": 0.5493894736842105, + "grad_norm": 0.4140625, + "learning_rate": 0.00021851604550597527, + "loss": 2.9783, + "step": 13048 + }, + { + "epoch": 0.5494315789473684, + "grad_norm": 0.5, + "learning_rate": 0.00021848257016965404, + "loss": 3.5081, + "step": 13049 + }, + { + "epoch": 0.5494736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.0002184490954075432, + "loss": 3.31, + "step": 13050 + }, + { + "epoch": 0.5495157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00021841562122025293, + "loss": 2.9811, + "step": 13051 + }, + { + "epoch": 0.5495578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00021838214760839278, + "loss": 3.1103, + "step": 13052 + }, + { + "epoch": 0.5496, + "grad_norm": 0.4140625, + "learning_rate": 0.00021834867457257279, + "loss": 3.4132, + "step": 13053 + }, + { + "epoch": 0.5496421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00021831520211340277, + "loss": 3.1326, + "step": 13054 + }, + { + "epoch": 0.5496842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.00021828173023149254, + "loss": 2.724, + "step": 13055 + }, + { + "epoch": 0.5497263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.00021824825892745198, + "loss": 3.5347, + "step": 13056 + }, + { + "epoch": 0.5497684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.00021821478820189074, + "loss": 3.0379, + "step": 13057 + }, + { + "epoch": 0.5498105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.00021818131805541874, + "loss": 3.6, + "step": 13058 + }, + { + "epoch": 0.5498526315789474, + "grad_norm": 0.453125, + "learning_rate": 0.00021814784848864582, + "loss": 3.241, + "step": 13059 + }, + { + "epoch": 0.5498947368421052, + "grad_norm": 0.470703125, + "learning_rate": 0.00021811437950218154, + "loss": 2.9015, + "step": 13060 + }, + { + "epoch": 0.5499368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.0002180809110966359, + "loss": 3.4148, + "step": 13061 + }, + { + "epoch": 0.549978947368421, + "grad_norm": 0.3984375, + "learning_rate": 0.00021804744327261856, + "loss": 2.6951, + "step": 13062 + }, + { + "epoch": 0.550021052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00021801397603073925, + "loss": 3.2257, + "step": 13063 + }, + { + "epoch": 0.5500631578947368, + "grad_norm": 0.423828125, + "learning_rate": 0.00021798050937160766, + "loss": 3.0777, + "step": 13064 + }, + { + "epoch": 0.5501052631578948, + "grad_norm": 0.423828125, + "learning_rate": 0.00021794704329583354, + "loss": 3.4933, + "step": 13065 + }, + { + "epoch": 0.5501473684210526, + "grad_norm": 0.458984375, + "learning_rate": 0.00021791357780402676, + "loss": 3.3695, + "step": 13066 + }, + { + "epoch": 0.5501894736842106, + "grad_norm": 0.46875, + "learning_rate": 0.0002178801128967968, + "loss": 2.7132, + "step": 13067 + }, + { + "epoch": 0.5502315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.00021784664857475353, + "loss": 3.1979, + "step": 13068 + }, + { + "epoch": 0.5502736842105264, + "grad_norm": 0.423828125, + "learning_rate": 0.00021781318483850645, + "loss": 3.114, + "step": 13069 + }, + { + "epoch": 0.5503157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.0002177797216886654, + "loss": 3.1233, + "step": 13070 + }, + { + "epoch": 0.5503578947368422, + "grad_norm": 0.427734375, + "learning_rate": 0.00021774625912583984, + "loss": 3.4415, + "step": 13071 + }, + { + "epoch": 0.5504, + "grad_norm": 0.42578125, + "learning_rate": 0.0002177127971506396, + "loss": 2.7947, + "step": 13072 + }, + { + "epoch": 0.5504421052631578, + "grad_norm": 0.427734375, + "learning_rate": 0.00021767933576367435, + "loss": 3.2409, + "step": 13073 + }, + { + "epoch": 0.5504842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00021764587496555352, + "loss": 3.2877, + "step": 13074 + }, + { + "epoch": 0.5505263157894736, + "grad_norm": 0.419921875, + "learning_rate": 0.00021761241475688696, + "loss": 3.4383, + "step": 13075 + }, + { + "epoch": 0.5505684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.000217578955138284, + "loss": 3.3968, + "step": 13076 + }, + { + "epoch": 0.5506105263157894, + "grad_norm": 0.419921875, + "learning_rate": 0.0002175454961103545, + "loss": 2.9002, + "step": 13077 + }, + { + "epoch": 0.5506526315789474, + "grad_norm": 0.390625, + "learning_rate": 0.00021751203767370788, + "loss": 2.8556, + "step": 13078 + }, + { + "epoch": 0.5506947368421052, + "grad_norm": 0.400390625, + "learning_rate": 0.00021747857982895368, + "loss": 3.1114, + "step": 13079 + }, + { + "epoch": 0.5507368421052632, + "grad_norm": 0.40625, + "learning_rate": 0.00021744512257670167, + "loss": 3.2508, + "step": 13080 + }, + { + "epoch": 0.550778947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00021741166591756117, + "loss": 3.3409, + "step": 13081 + }, + { + "epoch": 0.550821052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00021737820985214196, + "loss": 3.1382, + "step": 13082 + }, + { + "epoch": 0.5508631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.00021734475438105332, + "loss": 2.769, + "step": 13083 + }, + { + "epoch": 0.5509052631578948, + "grad_norm": 0.400390625, + "learning_rate": 0.00021731129950490501, + "loss": 3.1677, + "step": 13084 + }, + { + "epoch": 0.5509473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.0002172778452243063, + "loss": 3.457, + "step": 13085 + }, + { + "epoch": 0.5509894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.0002172443915398668, + "loss": 3.3314, + "step": 13086 + }, + { + "epoch": 0.5510315789473684, + "grad_norm": 0.447265625, + "learning_rate": 0.00021721093845219604, + "loss": 3.4098, + "step": 13087 + }, + { + "epoch": 0.5510736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00021717748596190343, + "loss": 3.3239, + "step": 13088 + }, + { + "epoch": 0.5511157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00021714403406959855, + "loss": 2.972, + "step": 13089 + }, + { + "epoch": 0.5511578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.00021711058277589066, + "loss": 2.827, + "step": 13090 + }, + { + "epoch": 0.5512, + "grad_norm": 0.443359375, + "learning_rate": 0.0002170771320813894, + "loss": 3.5012, + "step": 13091 + }, + { + "epoch": 0.5512421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00021704368198670405, + "loss": 2.9355, + "step": 13092 + }, + { + "epoch": 0.5512842105263158, + "grad_norm": 0.53515625, + "learning_rate": 0.00021701023249244406, + "loss": 2.9697, + "step": 13093 + }, + { + "epoch": 0.5513263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00021697678359921895, + "loss": 2.8756, + "step": 13094 + }, + { + "epoch": 0.5513684210526316, + "grad_norm": 0.54296875, + "learning_rate": 0.00021694333530763798, + "loss": 3.2732, + "step": 13095 + }, + { + "epoch": 0.5514105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.00021690988761831066, + "loss": 3.3246, + "step": 13096 + }, + { + "epoch": 0.5514526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.00021687644053184623, + "loss": 3.0394, + "step": 13097 + }, + { + "epoch": 0.5514947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.00021684299404885427, + "loss": 2.6014, + "step": 13098 + }, + { + "epoch": 0.5515368421052631, + "grad_norm": 0.458984375, + "learning_rate": 0.00021680954816994386, + "loss": 2.7838, + "step": 13099 + }, + { + "epoch": 0.5515789473684211, + "grad_norm": 0.431640625, + "learning_rate": 0.0002167761028957245, + "loss": 3.5711, + "step": 13100 + }, + { + "epoch": 0.5516210526315789, + "grad_norm": 0.408203125, + "learning_rate": 0.0002167426582268056, + "loss": 2.8942, + "step": 13101 + }, + { + "epoch": 0.5516631578947369, + "grad_norm": 0.44921875, + "learning_rate": 0.0002167092141637963, + "loss": 3.3247, + "step": 13102 + }, + { + "epoch": 0.5517052631578947, + "grad_norm": 0.4453125, + "learning_rate": 0.00021667577070730605, + "loss": 3.5109, + "step": 13103 + }, + { + "epoch": 0.5517473684210527, + "grad_norm": 0.40234375, + "learning_rate": 0.00021664232785794404, + "loss": 3.1419, + "step": 13104 + }, + { + "epoch": 0.5517894736842105, + "grad_norm": 0.5546875, + "learning_rate": 0.00021660888561631975, + "loss": 2.8525, + "step": 13105 + }, + { + "epoch": 0.5518315789473685, + "grad_norm": 0.455078125, + "learning_rate": 0.00021657544398304224, + "loss": 2.774, + "step": 13106 + }, + { + "epoch": 0.5518736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00021654200295872085, + "loss": 2.8581, + "step": 13107 + }, + { + "epoch": 0.5519157894736842, + "grad_norm": 0.451171875, + "learning_rate": 0.00021650856254396494, + "loss": 3.122, + "step": 13108 + }, + { + "epoch": 0.5519578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0002164751227393836, + "loss": 3.2237, + "step": 13109 + }, + { + "epoch": 0.552, + "grad_norm": 0.427734375, + "learning_rate": 0.0002164416835455862, + "loss": 3.129, + "step": 13110 + }, + { + "epoch": 0.5520421052631579, + "grad_norm": 0.46875, + "learning_rate": 0.0002164082449631818, + "loss": 3.2775, + "step": 13111 + }, + { + "epoch": 0.5520842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00021637480699277973, + "loss": 3.2526, + "step": 13112 + }, + { + "epoch": 0.5521263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00021634136963498922, + "loss": 3.0274, + "step": 13113 + }, + { + "epoch": 0.5521684210526315, + "grad_norm": 0.435546875, + "learning_rate": 0.00021630793289041935, + "loss": 3.6067, + "step": 13114 + }, + { + "epoch": 0.5522105263157895, + "grad_norm": 0.4140625, + "learning_rate": 0.00021627449675967945, + "loss": 2.595, + "step": 13115 + }, + { + "epoch": 0.5522526315789473, + "grad_norm": 0.42578125, + "learning_rate": 0.0002162410612433785, + "loss": 3.5471, + "step": 13116 + }, + { + "epoch": 0.5522947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00021620762634212584, + "loss": 3.3747, + "step": 13117 + }, + { + "epoch": 0.5523368421052631, + "grad_norm": 0.421875, + "learning_rate": 0.00021617419205653045, + "loss": 3.3234, + "step": 13118 + }, + { + "epoch": 0.5523789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.00021614075838720148, + "loss": 3.3878, + "step": 13119 + }, + { + "epoch": 0.5524210526315789, + "grad_norm": 0.416015625, + "learning_rate": 0.00021610732533474826, + "loss": 3.4799, + "step": 13120 + }, + { + "epoch": 0.5524631578947369, + "grad_norm": 0.44921875, + "learning_rate": 0.00021607389289977964, + "loss": 2.7874, + "step": 13121 + }, + { + "epoch": 0.5525052631578947, + "grad_norm": 0.40234375, + "learning_rate": 0.00021604046108290487, + "loss": 3.0119, + "step": 13122 + }, + { + "epoch": 0.5525473684210527, + "grad_norm": 0.419921875, + "learning_rate": 0.00021600702988473295, + "loss": 3.2949, + "step": 13123 + }, + { + "epoch": 0.5525894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.00021597359930587313, + "loss": 3.185, + "step": 13124 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.43359375, + "learning_rate": 0.00021594016934693423, + "loss": 3.4764, + "step": 13125 + }, + { + "epoch": 0.5526736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00021590674000852538, + "loss": 3.2228, + "step": 13126 + }, + { + "epoch": 0.5527157894736842, + "grad_norm": 0.388671875, + "learning_rate": 0.00021587331129125585, + "loss": 2.9108, + "step": 13127 + }, + { + "epoch": 0.5527578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00021583988319573435, + "loss": 3.4249, + "step": 13128 + }, + { + "epoch": 0.5528, + "grad_norm": 0.4375, + "learning_rate": 0.0002158064557225701, + "loss": 2.8893, + "step": 13129 + }, + { + "epoch": 0.5528421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.000215773028872372, + "loss": 3.0339, + "step": 13130 + }, + { + "epoch": 0.5528842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00021573960264574922, + "loss": 3.1813, + "step": 13131 + }, + { + "epoch": 0.5529263157894737, + "grad_norm": 0.400390625, + "learning_rate": 0.00021570617704331046, + "loss": 3.1033, + "step": 13132 + }, + { + "epoch": 0.5529684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.00021567275206566493, + "loss": 3.1489, + "step": 13133 + }, + { + "epoch": 0.5530105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.00021563932771342159, + "loss": 3.0849, + "step": 13134 + }, + { + "epoch": 0.5530526315789474, + "grad_norm": 0.396484375, + "learning_rate": 0.00021560590398718923, + "loss": 2.319, + "step": 13135 + }, + { + "epoch": 0.5530947368421053, + "grad_norm": 0.3984375, + "learning_rate": 0.00021557248088757702, + "loss": 2.8203, + "step": 13136 + }, + { + "epoch": 0.5531368421052631, + "grad_norm": 0.42578125, + "learning_rate": 0.0002155390584151936, + "loss": 3.0241, + "step": 13137 + }, + { + "epoch": 0.5531789473684211, + "grad_norm": 0.3984375, + "learning_rate": 0.00021550563657064814, + "loss": 3.1614, + "step": 13138 + }, + { + "epoch": 0.553221052631579, + "grad_norm": 0.453125, + "learning_rate": 0.00021547221535454938, + "loss": 3.0889, + "step": 13139 + }, + { + "epoch": 0.5532631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.0002154387947675063, + "loss": 3.2377, + "step": 13140 + }, + { + "epoch": 0.5533052631578947, + "grad_norm": 0.44140625, + "learning_rate": 0.0002154053748101279, + "loss": 2.959, + "step": 13141 + }, + { + "epoch": 0.5533473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00021537195548302276, + "loss": 3.2823, + "step": 13142 + }, + { + "epoch": 0.5533894736842105, + "grad_norm": 0.39453125, + "learning_rate": 0.00021533853678680004, + "loss": 3.052, + "step": 13143 + }, + { + "epoch": 0.5534315789473684, + "grad_norm": 0.447265625, + "learning_rate": 0.00021530511872206836, + "loss": 3.2303, + "step": 13144 + }, + { + "epoch": 0.5534736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.0002152717012894367, + "loss": 3.2891, + "step": 13145 + }, + { + "epoch": 0.5535157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00021523828448951376, + "loss": 2.7642, + "step": 13146 + }, + { + "epoch": 0.5535578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00021520486832290854, + "loss": 3.2216, + "step": 13147 + }, + { + "epoch": 0.5536, + "grad_norm": 0.412109375, + "learning_rate": 0.0002151714527902297, + "loss": 2.9512, + "step": 13148 + }, + { + "epoch": 0.5536421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00021513803789208603, + "loss": 3.574, + "step": 13149 + }, + { + "epoch": 0.5536842105263158, + "grad_norm": 0.48828125, + "learning_rate": 0.00021510462362908642, + "loss": 3.1369, + "step": 13150 + }, + { + "epoch": 0.5537263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.0002150712100018395, + "loss": 3.0302, + "step": 13151 + }, + { + "epoch": 0.5537684210526316, + "grad_norm": 0.46484375, + "learning_rate": 0.00021503779701095415, + "loss": 3.2992, + "step": 13152 + }, + { + "epoch": 0.5538105263157894, + "grad_norm": 0.435546875, + "learning_rate": 0.000215004384657039, + "loss": 3.0181, + "step": 13153 + }, + { + "epoch": 0.5538526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.00021497097294070284, + "loss": 2.7274, + "step": 13154 + }, + { + "epoch": 0.5538947368421052, + "grad_norm": 0.42578125, + "learning_rate": 0.00021493756186255445, + "loss": 3.0258, + "step": 13155 + }, + { + "epoch": 0.5539368421052632, + "grad_norm": 0.41015625, + "learning_rate": 0.0002149041514232024, + "loss": 3.3478, + "step": 13156 + }, + { + "epoch": 0.553978947368421, + "grad_norm": 0.474609375, + "learning_rate": 0.00021487074162325563, + "loss": 3.3587, + "step": 13157 + }, + { + "epoch": 0.554021052631579, + "grad_norm": 0.3984375, + "learning_rate": 0.00021483733246332255, + "loss": 3.0506, + "step": 13158 + }, + { + "epoch": 0.5540631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.00021480392394401194, + "loss": 2.8549, + "step": 13159 + }, + { + "epoch": 0.5541052631578948, + "grad_norm": 0.44140625, + "learning_rate": 0.0002147705160659326, + "loss": 2.9636, + "step": 13160 + }, + { + "epoch": 0.5541473684210526, + "grad_norm": 0.45703125, + "learning_rate": 0.00021473710882969295, + "loss": 3.3841, + "step": 13161 + }, + { + "epoch": 0.5541894736842106, + "grad_norm": 0.423828125, + "learning_rate": 0.0002147037022359018, + "loss": 3.3301, + "step": 13162 + }, + { + "epoch": 0.5542315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.0002146702962851677, + "loss": 3.0933, + "step": 13163 + }, + { + "epoch": 0.5542736842105264, + "grad_norm": 0.447265625, + "learning_rate": 0.00021463689097809933, + "loss": 2.9962, + "step": 13164 + }, + { + "epoch": 0.5543157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.0002146034863153052, + "loss": 3.4274, + "step": 13165 + }, + { + "epoch": 0.5543578947368422, + "grad_norm": 0.439453125, + "learning_rate": 0.00021457008229739394, + "loss": 3.4441, + "step": 13166 + }, + { + "epoch": 0.5544, + "grad_norm": 0.42578125, + "learning_rate": 0.00021453667892497427, + "loss": 2.9326, + "step": 13167 + }, + { + "epoch": 0.5544421052631578, + "grad_norm": 0.40625, + "learning_rate": 0.00021450327619865453, + "loss": 3.173, + "step": 13168 + }, + { + "epoch": 0.5544842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00021446987411904351, + "loss": 3.3385, + "step": 13169 + }, + { + "epoch": 0.5545263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.00021443647268674953, + "loss": 3.0514, + "step": 13170 + }, + { + "epoch": 0.5545684210526316, + "grad_norm": 0.447265625, + "learning_rate": 0.0002144030719023813, + "loss": 3.0018, + "step": 13171 + }, + { + "epoch": 0.5546105263157894, + "grad_norm": 0.40625, + "learning_rate": 0.00021436967176654718, + "loss": 3.1869, + "step": 13172 + }, + { + "epoch": 0.5546526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.0002143362722798558, + "loss": 3.1425, + "step": 13173 + }, + { + "epoch": 0.5546947368421052, + "grad_norm": 0.53125, + "learning_rate": 0.00021430287344291574, + "loss": 2.9003, + "step": 13174 + }, + { + "epoch": 0.5547368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.00021426947525633527, + "loss": 2.9766, + "step": 13175 + }, + { + "epoch": 0.554778947368421, + "grad_norm": 0.453125, + "learning_rate": 0.0002142360777207231, + "loss": 3.4352, + "step": 13176 + }, + { + "epoch": 0.554821052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00021420268083668746, + "loss": 3.0748, + "step": 13177 + }, + { + "epoch": 0.5548631578947368, + "grad_norm": 0.408203125, + "learning_rate": 0.000214169284604837, + "loss": 3.0347, + "step": 13178 + }, + { + "epoch": 0.5549052631578948, + "grad_norm": 0.40234375, + "learning_rate": 0.00021413588902578, + "loss": 3.0456, + "step": 13179 + }, + { + "epoch": 0.5549473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.00021410249410012494, + "loss": 3.3114, + "step": 13180 + }, + { + "epoch": 0.5549894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.00021406909982848033, + "loss": 3.1245, + "step": 13181 + }, + { + "epoch": 0.5550315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.00021403570621145447, + "loss": 3.5848, + "step": 13182 + }, + { + "epoch": 0.5550736842105263, + "grad_norm": 0.4765625, + "learning_rate": 0.00021400231324965584, + "loss": 3.3441, + "step": 13183 + }, + { + "epoch": 0.5551157894736842, + "grad_norm": 0.455078125, + "learning_rate": 0.0002139689209436927, + "loss": 3.2104, + "step": 13184 + }, + { + "epoch": 0.5551578947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.0002139355292941736, + "loss": 2.8626, + "step": 13185 + }, + { + "epoch": 0.5552, + "grad_norm": 0.419921875, + "learning_rate": 0.00021390213830170665, + "loss": 2.9954, + "step": 13186 + }, + { + "epoch": 0.5552421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00021386874796690034, + "loss": 3.1758, + "step": 13187 + }, + { + "epoch": 0.5552842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.00021383535829036304, + "loss": 2.4312, + "step": 13188 + }, + { + "epoch": 0.5553263157894737, + "grad_norm": 0.412109375, + "learning_rate": 0.00021380196927270296, + "loss": 2.9523, + "step": 13189 + }, + { + "epoch": 0.5553684210526316, + "grad_norm": 0.474609375, + "learning_rate": 0.0002137685809145286, + "loss": 2.8671, + "step": 13190 + }, + { + "epoch": 0.5554105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.00021373519321644797, + "loss": 2.7751, + "step": 13191 + }, + { + "epoch": 0.5554526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.00021370180617906963, + "loss": 3.5387, + "step": 13192 + }, + { + "epoch": 0.5554947368421053, + "grad_norm": 0.41796875, + "learning_rate": 0.00021366841980300166, + "loss": 3.1991, + "step": 13193 + }, + { + "epoch": 0.5555368421052631, + "grad_norm": 0.40625, + "learning_rate": 0.00021363503408885237, + "loss": 2.947, + "step": 13194 + }, + { + "epoch": 0.5555789473684211, + "grad_norm": 0.416015625, + "learning_rate": 0.00021360164903723013, + "loss": 3.5365, + "step": 13195 + }, + { + "epoch": 0.5556210526315789, + "grad_norm": 0.40625, + "learning_rate": 0.00021356826464874302, + "loss": 3.1408, + "step": 13196 + }, + { + "epoch": 0.5556631578947369, + "grad_norm": 0.43359375, + "learning_rate": 0.00021353488092399933, + "loss": 3.1382, + "step": 13197 + }, + { + "epoch": 0.5557052631578947, + "grad_norm": 0.578125, + "learning_rate": 0.00021350149786360722, + "loss": 2.6871, + "step": 13198 + }, + { + "epoch": 0.5557473684210527, + "grad_norm": 0.4609375, + "learning_rate": 0.00021346811546817506, + "loss": 3.373, + "step": 13199 + }, + { + "epoch": 0.5557894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.00021343473373831078, + "loss": 3.1158, + "step": 13200 + }, + { + "epoch": 0.5558315789473685, + "grad_norm": 0.439453125, + "learning_rate": 0.00021340135267462269, + "loss": 2.678, + "step": 13201 + }, + { + "epoch": 0.5558736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.0002133679722777191, + "loss": 3.304, + "step": 13202 + }, + { + "epoch": 0.5559157894736843, + "grad_norm": 0.423828125, + "learning_rate": 0.00021333459254820785, + "loss": 3.0153, + "step": 13203 + }, + { + "epoch": 0.5559578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00021330121348669734, + "loss": 3.1213, + "step": 13204 + }, + { + "epoch": 0.556, + "grad_norm": 0.404296875, + "learning_rate": 0.00021326783509379552, + "loss": 2.9011, + "step": 13205 + }, + { + "epoch": 0.5560421052631579, + "grad_norm": 0.458984375, + "learning_rate": 0.00021323445737011067, + "loss": 3.4657, + "step": 13206 + }, + { + "epoch": 0.5560842105263157, + "grad_norm": 0.43359375, + "learning_rate": 0.00021320108031625077, + "loss": 3.085, + "step": 13207 + }, + { + "epoch": 0.5561263157894737, + "grad_norm": 0.453125, + "learning_rate": 0.00021316770393282392, + "loss": 3.4868, + "step": 13208 + }, + { + "epoch": 0.5561684210526315, + "grad_norm": 0.421875, + "learning_rate": 0.00021313432822043832, + "loss": 3.2705, + "step": 13209 + }, + { + "epoch": 0.5562105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.00021310095317970184, + "loss": 3.2855, + "step": 13210 + }, + { + "epoch": 0.5562526315789473, + "grad_norm": 0.41015625, + "learning_rate": 0.00021306757881122277, + "loss": 2.8415, + "step": 13211 + }, + { + "epoch": 0.5562947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.0002130342051156089, + "loss": 2.9086, + "step": 13212 + }, + { + "epoch": 0.5563368421052631, + "grad_norm": 0.40234375, + "learning_rate": 0.00021300083209346837, + "loss": 3.3894, + "step": 13213 + }, + { + "epoch": 0.5563789473684211, + "grad_norm": 0.453125, + "learning_rate": 0.00021296745974540927, + "loss": 2.801, + "step": 13214 + }, + { + "epoch": 0.5564210526315789, + "grad_norm": 0.408203125, + "learning_rate": 0.00021293408807203948, + "loss": 3.3947, + "step": 13215 + }, + { + "epoch": 0.5564631578947369, + "grad_norm": 0.44921875, + "learning_rate": 0.00021290071707396718, + "loss": 2.9878, + "step": 13216 + }, + { + "epoch": 0.5565052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.0002128673467518001, + "loss": 3.0717, + "step": 13217 + }, + { + "epoch": 0.5565473684210527, + "grad_norm": 0.41015625, + "learning_rate": 0.00021283397710614643, + "loss": 3.0413, + "step": 13218 + }, + { + "epoch": 0.5565894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00021280060813761397, + "loss": 3.104, + "step": 13219 + }, + { + "epoch": 0.5566315789473684, + "grad_norm": 0.466796875, + "learning_rate": 0.00021276723984681067, + "loss": 3.3639, + "step": 13220 + }, + { + "epoch": 0.5566736842105263, + "grad_norm": 0.44140625, + "learning_rate": 0.0002127338722343446, + "loss": 3.316, + "step": 13221 + }, + { + "epoch": 0.5567157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.00021270050530082358, + "loss": 3.1257, + "step": 13222 + }, + { + "epoch": 0.5567578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00021266713904685553, + "loss": 3.0225, + "step": 13223 + }, + { + "epoch": 0.5568, + "grad_norm": 0.419921875, + "learning_rate": 0.00021263377347304827, + "loss": 3.2181, + "step": 13224 + }, + { + "epoch": 0.5568421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.00021260040858000988, + "loss": 3.7013, + "step": 13225 + }, + { + "epoch": 0.5568842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.000212567044368348, + "loss": 3.1312, + "step": 13226 + }, + { + "epoch": 0.5569263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0002125336808386706, + "loss": 2.6004, + "step": 13227 + }, + { + "epoch": 0.5569684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.00021250031799158558, + "loss": 3.2772, + "step": 13228 + }, + { + "epoch": 0.5570105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.00021246695582770062, + "loss": 3.3376, + "step": 13229 + }, + { + "epoch": 0.5570526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00021243359434762368, + "loss": 2.8049, + "step": 13230 + }, + { + "epoch": 0.5570947368421053, + "grad_norm": 0.375, + "learning_rate": 0.00021240023355196248, + "loss": 2.6925, + "step": 13231 + }, + { + "epoch": 0.5571368421052632, + "grad_norm": 0.404296875, + "learning_rate": 0.0002123668734413249, + "loss": 3.4016, + "step": 13232 + }, + { + "epoch": 0.5571789473684211, + "grad_norm": 0.412109375, + "learning_rate": 0.00021233351401631862, + "loss": 2.9465, + "step": 13233 + }, + { + "epoch": 0.557221052631579, + "grad_norm": 0.4609375, + "learning_rate": 0.00021230015527755143, + "loss": 2.9792, + "step": 13234 + }, + { + "epoch": 0.5572631578947368, + "grad_norm": 0.50390625, + "learning_rate": 0.00021226679722563123, + "loss": 2.9459, + "step": 13235 + }, + { + "epoch": 0.5573052631578947, + "grad_norm": 0.416015625, + "learning_rate": 0.00021223343986116552, + "loss": 2.9992, + "step": 13236 + }, + { + "epoch": 0.5573473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.00021220008318476225, + "loss": 2.8997, + "step": 13237 + }, + { + "epoch": 0.5573894736842105, + "grad_norm": 0.4453125, + "learning_rate": 0.00021216672719702904, + "loss": 2.4982, + "step": 13238 + }, + { + "epoch": 0.5574315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.0002121333718985736, + "loss": 3.0885, + "step": 13239 + }, + { + "epoch": 0.5574736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00021210001729000355, + "loss": 3.1118, + "step": 13240 + }, + { + "epoch": 0.5575157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.0002120666633719267, + "loss": 3.0469, + "step": 13241 + }, + { + "epoch": 0.5575578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00021203331014495077, + "loss": 2.8936, + "step": 13242 + }, + { + "epoch": 0.5576, + "grad_norm": 0.416015625, + "learning_rate": 0.00021199995760968321, + "loss": 2.9686, + "step": 13243 + }, + { + "epoch": 0.5576421052631579, + "grad_norm": 0.455078125, + "learning_rate": 0.00021196660576673187, + "loss": 3.6513, + "step": 13244 + }, + { + "epoch": 0.5576842105263158, + "grad_norm": 0.53515625, + "learning_rate": 0.00021193325461670422, + "loss": 3.2334, + "step": 13245 + }, + { + "epoch": 0.5577263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00021189990416020795, + "loss": 3.0281, + "step": 13246 + }, + { + "epoch": 0.5577684210526316, + "grad_norm": 0.455078125, + "learning_rate": 0.00021186655439785063, + "loss": 3.295, + "step": 13247 + }, + { + "epoch": 0.5578105263157894, + "grad_norm": 0.62109375, + "learning_rate": 0.00021183320533023987, + "loss": 3.3307, + "step": 13248 + }, + { + "epoch": 0.5578526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.0002117998569579834, + "loss": 3.3982, + "step": 13249 + }, + { + "epoch": 0.5578947368421052, + "grad_norm": 0.48046875, + "learning_rate": 0.0002117665092816885, + "loss": 2.8229, + "step": 13250 + }, + { + "epoch": 0.5579368421052632, + "grad_norm": 0.4296875, + "learning_rate": 0.00021173316230196305, + "loss": 3.106, + "step": 13251 + }, + { + "epoch": 0.557978947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.00021169981601941427, + "loss": 2.9214, + "step": 13252 + }, + { + "epoch": 0.558021052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00021166647043464992, + "loss": 3.4026, + "step": 13253 + }, + { + "epoch": 0.5580631578947368, + "grad_norm": 0.408203125, + "learning_rate": 0.0002116331255482774, + "loss": 2.9294, + "step": 13254 + }, + { + "epoch": 0.5581052631578948, + "grad_norm": 0.4296875, + "learning_rate": 0.0002115997813609042, + "loss": 3.4222, + "step": 13255 + }, + { + "epoch": 0.5581473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.00021156643787313795, + "loss": 2.8854, + "step": 13256 + }, + { + "epoch": 0.5581894736842106, + "grad_norm": 0.400390625, + "learning_rate": 0.000211533095085586, + "loss": 2.9743, + "step": 13257 + }, + { + "epoch": 0.5582315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00021149975299885592, + "loss": 3.3284, + "step": 13258 + }, + { + "epoch": 0.5582736842105264, + "grad_norm": 0.39453125, + "learning_rate": 0.000211466411613555, + "loss": 2.5828, + "step": 13259 + }, + { + "epoch": 0.5583157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00021143307093029085, + "loss": 3.3491, + "step": 13260 + }, + { + "epoch": 0.558357894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.00021139973094967088, + "loss": 3.2432, + "step": 13261 + }, + { + "epoch": 0.5584, + "grad_norm": 0.44140625, + "learning_rate": 0.00021136639167230236, + "loss": 3.2038, + "step": 13262 + }, + { + "epoch": 0.5584421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00021133305309879285, + "loss": 2.6996, + "step": 13263 + }, + { + "epoch": 0.5584842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.00021129971522974966, + "loss": 3.6287, + "step": 13264 + }, + { + "epoch": 0.5585263157894736, + "grad_norm": 0.478515625, + "learning_rate": 0.00021126637806578025, + "loss": 3.115, + "step": 13265 + }, + { + "epoch": 0.5585684210526316, + "grad_norm": 0.4453125, + "learning_rate": 0.00021123304160749185, + "loss": 3.419, + "step": 13266 + }, + { + "epoch": 0.5586105263157894, + "grad_norm": 0.419921875, + "learning_rate": 0.00021119970585549186, + "loss": 3.1656, + "step": 13267 + }, + { + "epoch": 0.5586526315789474, + "grad_norm": 0.400390625, + "learning_rate": 0.00021116637081038776, + "loss": 2.9235, + "step": 13268 + }, + { + "epoch": 0.5586947368421052, + "grad_norm": 0.4140625, + "learning_rate": 0.00021113303647278664, + "loss": 3.0173, + "step": 13269 + }, + { + "epoch": 0.5587368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.00021109970284329606, + "loss": 3.3627, + "step": 13270 + }, + { + "epoch": 0.558778947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.00021106636992252305, + "loss": 3.0347, + "step": 13271 + }, + { + "epoch": 0.558821052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00021103303771107508, + "loss": 3.2351, + "step": 13272 + }, + { + "epoch": 0.5588631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.00021099970620955933, + "loss": 3.0244, + "step": 13273 + }, + { + "epoch": 0.5589052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.00021096637541858314, + "loss": 3.1168, + "step": 13274 + }, + { + "epoch": 0.5589473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.0002109330453387538, + "loss": 2.8197, + "step": 13275 + }, + { + "epoch": 0.5589894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.0002108997159706784, + "loss": 3.0742, + "step": 13276 + }, + { + "epoch": 0.5590315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00021086638731496432, + "loss": 3.2655, + "step": 13277 + }, + { + "epoch": 0.5590736842105263, + "grad_norm": 0.48046875, + "learning_rate": 0.00021083305937221857, + "loss": 2.867, + "step": 13278 + }, + { + "epoch": 0.5591157894736842, + "grad_norm": 0.4765625, + "learning_rate": 0.00021079973214304854, + "loss": 3.1748, + "step": 13279 + }, + { + "epoch": 0.5591578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00021076640562806129, + "loss": 3.2145, + "step": 13280 + }, + { + "epoch": 0.5592, + "grad_norm": 0.421875, + "learning_rate": 0.000210733079827864, + "loss": 3.2894, + "step": 13281 + }, + { + "epoch": 0.5592421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.0002106997547430639, + "loss": 3.2616, + "step": 13282 + }, + { + "epoch": 0.5592842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00021066643037426808, + "loss": 3.1266, + "step": 13283 + }, + { + "epoch": 0.5593263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00021063310672208375, + "loss": 3.0293, + "step": 13284 + }, + { + "epoch": 0.5593684210526316, + "grad_norm": 0.455078125, + "learning_rate": 0.0002105997837871179, + "loss": 3.1444, + "step": 13285 + }, + { + "epoch": 0.5594105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.00021056646156997774, + "loss": 3.3858, + "step": 13286 + }, + { + "epoch": 0.5594526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.00021053314007127025, + "loss": 3.1317, + "step": 13287 + }, + { + "epoch": 0.5594947368421053, + "grad_norm": 1.2734375, + "learning_rate": 0.00021049981929160256, + "loss": 2.911, + "step": 13288 + }, + { + "epoch": 0.5595368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.0002104664992315818, + "loss": 3.145, + "step": 13289 + }, + { + "epoch": 0.5595789473684211, + "grad_norm": 0.42578125, + "learning_rate": 0.00021043317989181495, + "loss": 3.3867, + "step": 13290 + }, + { + "epoch": 0.5596210526315789, + "grad_norm": 0.412109375, + "learning_rate": 0.00021039986127290917, + "loss": 2.7112, + "step": 13291 + }, + { + "epoch": 0.5596631578947369, + "grad_norm": 0.451171875, + "learning_rate": 0.0002103665433754713, + "loss": 2.9776, + "step": 13292 + }, + { + "epoch": 0.5597052631578947, + "grad_norm": 0.41015625, + "learning_rate": 0.0002103332262001085, + "loss": 3.252, + "step": 13293 + }, + { + "epoch": 0.5597473684210527, + "grad_norm": 0.4375, + "learning_rate": 0.00021029990974742762, + "loss": 3.6021, + "step": 13294 + }, + { + "epoch": 0.5597894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00021026659401803576, + "loss": 2.9986, + "step": 13295 + }, + { + "epoch": 0.5598315789473685, + "grad_norm": 0.4296875, + "learning_rate": 0.00021023327901253998, + "loss": 3.4741, + "step": 13296 + }, + { + "epoch": 0.5598736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.000210199964731547, + "loss": 3.3788, + "step": 13297 + }, + { + "epoch": 0.5599157894736843, + "grad_norm": 0.416015625, + "learning_rate": 0.00021016665117566398, + "loss": 3.2551, + "step": 13298 + }, + { + "epoch": 0.5599578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00021013333834549769, + "loss": 3.3287, + "step": 13299 + }, + { + "epoch": 0.56, + "grad_norm": 0.41796875, + "learning_rate": 0.00021010002624165526, + "loss": 3.3162, + "step": 13300 + }, + { + "epoch": 0.5600421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00021006671486474337, + "loss": 3.3367, + "step": 13301 + }, + { + "epoch": 0.5600842105263157, + "grad_norm": 0.4140625, + "learning_rate": 0.00021003340421536898, + "loss": 2.652, + "step": 13302 + }, + { + "epoch": 0.5601263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00021000009429413916, + "loss": 3.1295, + "step": 13303 + }, + { + "epoch": 0.5601684210526315, + "grad_norm": 0.41796875, + "learning_rate": 0.0002099667851016605, + "loss": 3.0544, + "step": 13304 + }, + { + "epoch": 0.5602105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.00020993347663854004, + "loss": 3.033, + "step": 13305 + }, + { + "epoch": 0.5602526315789473, + "grad_norm": 0.439453125, + "learning_rate": 0.0002099001689053845, + "loss": 3.1356, + "step": 13306 + }, + { + "epoch": 0.5602947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.00020986686190280074, + "loss": 3.076, + "step": 13307 + }, + { + "epoch": 0.5603368421052631, + "grad_norm": 0.43359375, + "learning_rate": 0.00020983355563139572, + "loss": 2.8803, + "step": 13308 + }, + { + "epoch": 0.5603789473684211, + "grad_norm": 0.427734375, + "learning_rate": 0.00020980025009177607, + "loss": 3.1783, + "step": 13309 + }, + { + "epoch": 0.5604210526315789, + "grad_norm": 0.453125, + "learning_rate": 0.00020976694528454867, + "loss": 3.2449, + "step": 13310 + }, + { + "epoch": 0.5604631578947369, + "grad_norm": 0.44140625, + "learning_rate": 0.00020973364121032017, + "loss": 3.416, + "step": 13311 + }, + { + "epoch": 0.5605052631578947, + "grad_norm": 0.453125, + "learning_rate": 0.00020970033786969753, + "loss": 3.5466, + "step": 13312 + }, + { + "epoch": 0.5605473684210527, + "grad_norm": 0.412109375, + "learning_rate": 0.0002096670352632873, + "loss": 3.234, + "step": 13313 + }, + { + "epoch": 0.5605894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.00020963373339169623, + "loss": 2.6963, + "step": 13314 + }, + { + "epoch": 0.5606315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.00020960043225553123, + "loss": 3.085, + "step": 13315 + }, + { + "epoch": 0.5606736842105263, + "grad_norm": 0.48046875, + "learning_rate": 0.00020956713185539877, + "loss": 3.2071, + "step": 13316 + }, + { + "epoch": 0.5607157894736842, + "grad_norm": 0.47265625, + "learning_rate": 0.00020953383219190583, + "loss": 3.2566, + "step": 13317 + }, + { + "epoch": 0.5607578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.0002095005332656588, + "loss": 3.4362, + "step": 13318 + }, + { + "epoch": 0.5608, + "grad_norm": 0.44140625, + "learning_rate": 0.00020946723507726456, + "loss": 3.3992, + "step": 13319 + }, + { + "epoch": 0.5608421052631579, + "grad_norm": 0.5234375, + "learning_rate": 0.00020943393762732956, + "loss": 2.8092, + "step": 13320 + }, + { + "epoch": 0.5608842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.00020940064091646055, + "loss": 2.942, + "step": 13321 + }, + { + "epoch": 0.5609263157894737, + "grad_norm": 0.478515625, + "learning_rate": 0.00020936734494526424, + "loss": 2.9566, + "step": 13322 + }, + { + "epoch": 0.5609684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00020933404971434715, + "loss": 2.6166, + "step": 13323 + }, + { + "epoch": 0.5610105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.0002093007552243159, + "loss": 3.0619, + "step": 13324 + }, + { + "epoch": 0.5610526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.000209267461475777, + "loss": 3.3991, + "step": 13325 + }, + { + "epoch": 0.5610947368421053, + "grad_norm": 0.455078125, + "learning_rate": 0.00020923416846933723, + "loss": 2.6836, + "step": 13326 + }, + { + "epoch": 0.5611368421052632, + "grad_norm": 0.46875, + "learning_rate": 0.00020920087620560288, + "loss": 3.4332, + "step": 13327 + }, + { + "epoch": 0.561178947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.00020916758468518065, + "loss": 2.9093, + "step": 13328 + }, + { + "epoch": 0.561221052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00020913429390867713, + "loss": 2.8755, + "step": 13329 + }, + { + "epoch": 0.5612631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.00020910100387669866, + "loss": 3.0418, + "step": 13330 + }, + { + "epoch": 0.5613052631578948, + "grad_norm": 0.43359375, + "learning_rate": 0.0002090677145898519, + "loss": 3.116, + "step": 13331 + }, + { + "epoch": 0.5613473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.00020903442604874325, + "loss": 3.3644, + "step": 13332 + }, + { + "epoch": 0.5613894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0002090011382539793, + "loss": 3.4315, + "step": 13333 + }, + { + "epoch": 0.5614315789473684, + "grad_norm": 0.466796875, + "learning_rate": 0.00020896785120616638, + "loss": 2.8542, + "step": 13334 + }, + { + "epoch": 0.5614736842105263, + "grad_norm": 0.40625, + "learning_rate": 0.00020893456490591098, + "loss": 3.2463, + "step": 13335 + }, + { + "epoch": 0.5615157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00020890127935381964, + "loss": 3.2973, + "step": 13336 + }, + { + "epoch": 0.5615578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00020886799455049863, + "loss": 2.9064, + "step": 13337 + }, + { + "epoch": 0.5616, + "grad_norm": 0.41796875, + "learning_rate": 0.00020883471049655444, + "loss": 3.1614, + "step": 13338 + }, + { + "epoch": 0.5616421052631579, + "grad_norm": 0.396484375, + "learning_rate": 0.00020880142719259352, + "loss": 3.0627, + "step": 13339 + }, + { + "epoch": 0.5616842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.00020876814463922214, + "loss": 2.9105, + "step": 13340 + }, + { + "epoch": 0.5617263157894736, + "grad_norm": 0.4296875, + "learning_rate": 0.00020873486283704668, + "loss": 3.061, + "step": 13341 + }, + { + "epoch": 0.5617684210526316, + "grad_norm": 0.4453125, + "learning_rate": 0.00020870158178667354, + "loss": 3.2333, + "step": 13342 + }, + { + "epoch": 0.5618105263157894, + "grad_norm": 0.421875, + "learning_rate": 0.00020866830148870915, + "loss": 3.0262, + "step": 13343 + }, + { + "epoch": 0.5618526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.00020863502194375967, + "loss": 2.8341, + "step": 13344 + }, + { + "epoch": 0.5618947368421052, + "grad_norm": 0.44140625, + "learning_rate": 0.00020860174315243154, + "loss": 3.0954, + "step": 13345 + }, + { + "epoch": 0.5619368421052632, + "grad_norm": 0.42578125, + "learning_rate": 0.00020856846511533095, + "loss": 2.6248, + "step": 13346 + }, + { + "epoch": 0.561978947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00020853518783306427, + "loss": 3.5041, + "step": 13347 + }, + { + "epoch": 0.562021052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0002085019113062377, + "loss": 2.7965, + "step": 13348 + }, + { + "epoch": 0.5620631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.00020846863553545755, + "loss": 2.8155, + "step": 13349 + }, + { + "epoch": 0.5621052631578948, + "grad_norm": 0.423828125, + "learning_rate": 0.0002084353605213302, + "loss": 3.212, + "step": 13350 + }, + { + "epoch": 0.5621473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.0002084020862644616, + "loss": 3.1989, + "step": 13351 + }, + { + "epoch": 0.5621894736842106, + "grad_norm": 0.412109375, + "learning_rate": 0.00020836881276545822, + "loss": 3.0888, + "step": 13352 + }, + { + "epoch": 0.5622315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.00020833554002492607, + "loss": 3.1422, + "step": 13353 + }, + { + "epoch": 0.5622736842105264, + "grad_norm": 0.431640625, + "learning_rate": 0.00020830226804347142, + "loss": 3.2071, + "step": 13354 + }, + { + "epoch": 0.5623157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.00020826899682170055, + "loss": 3.0148, + "step": 13355 + }, + { + "epoch": 0.5623578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00020823572636021942, + "loss": 3.335, + "step": 13356 + }, + { + "epoch": 0.5624, + "grad_norm": 0.419921875, + "learning_rate": 0.00020820245665963437, + "loss": 3.038, + "step": 13357 + }, + { + "epoch": 0.5624421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.0002081691877205514, + "loss": 3.4129, + "step": 13358 + }, + { + "epoch": 0.5624842105263158, + "grad_norm": 0.4140625, + "learning_rate": 0.00020813591954357675, + "loss": 3.0938, + "step": 13359 + }, + { + "epoch": 0.5625263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00020810265212931633, + "loss": 3.2146, + "step": 13360 + }, + { + "epoch": 0.5625684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.0002080693854783764, + "loss": 3.0276, + "step": 13361 + }, + { + "epoch": 0.5626105263157894, + "grad_norm": 0.4296875, + "learning_rate": 0.00020803611959136308, + "loss": 2.8753, + "step": 13362 + }, + { + "epoch": 0.5626526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.00020800285446888228, + "loss": 3.0602, + "step": 13363 + }, + { + "epoch": 0.5626947368421052, + "grad_norm": 0.4453125, + "learning_rate": 0.00020796959011154019, + "loss": 3.2112, + "step": 13364 + }, + { + "epoch": 0.5627368421052632, + "grad_norm": 0.4609375, + "learning_rate": 0.00020793632651994269, + "loss": 3.4673, + "step": 13365 + }, + { + "epoch": 0.562778947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00020790306369469603, + "loss": 3.3115, + "step": 13366 + }, + { + "epoch": 0.562821052631579, + "grad_norm": 0.3984375, + "learning_rate": 0.00020786980163640595, + "loss": 2.5923, + "step": 13367 + }, + { + "epoch": 0.5628631578947368, + "grad_norm": 0.4453125, + "learning_rate": 0.0002078365403456786, + "loss": 3.4835, + "step": 13368 + }, + { + "epoch": 0.5629052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.00020780327982312008, + "loss": 3.0649, + "step": 13369 + }, + { + "epoch": 0.5629473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.00020777002006933608, + "loss": 3.021, + "step": 13370 + }, + { + "epoch": 0.5629894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00020773676108493285, + "loss": 2.9052, + "step": 13371 + }, + { + "epoch": 0.5630315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.00020770350287051603, + "loss": 3.2196, + "step": 13372 + }, + { + "epoch": 0.5630736842105263, + "grad_norm": 0.44140625, + "learning_rate": 0.00020767024542669174, + "loss": 3.1854, + "step": 13373 + }, + { + "epoch": 0.5631157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.00020763698875406582, + "loss": 2.9665, + "step": 13374 + }, + { + "epoch": 0.5631578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00020760373285324415, + "loss": 2.7489, + "step": 13375 + }, + { + "epoch": 0.5632, + "grad_norm": 0.42578125, + "learning_rate": 0.00020757047772483278, + "loss": 3.5204, + "step": 13376 + }, + { + "epoch": 0.5632421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00020753722336943736, + "loss": 2.9893, + "step": 13377 + }, + { + "epoch": 0.5632842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00020750396978766396, + "loss": 2.6112, + "step": 13378 + }, + { + "epoch": 0.5633263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.00020747071698011817, + "loss": 3.3848, + "step": 13379 + }, + { + "epoch": 0.5633684210526316, + "grad_norm": 0.388671875, + "learning_rate": 0.00020743746494740597, + "loss": 2.846, + "step": 13380 + }, + { + "epoch": 0.5634105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.00020740421369013312, + "loss": 3.4135, + "step": 13381 + }, + { + "epoch": 0.5634526315789473, + "grad_norm": 0.435546875, + "learning_rate": 0.0002073709632089055, + "loss": 3.1989, + "step": 13382 + }, + { + "epoch": 0.5634947368421053, + "grad_norm": 0.427734375, + "learning_rate": 0.00020733771350432885, + "loss": 3.1562, + "step": 13383 + }, + { + "epoch": 0.5635368421052631, + "grad_norm": 0.390625, + "learning_rate": 0.0002073044645770089, + "loss": 2.7636, + "step": 13384 + }, + { + "epoch": 0.5635789473684211, + "grad_norm": 0.427734375, + "learning_rate": 0.0002072712164275515, + "loss": 2.8281, + "step": 13385 + }, + { + "epoch": 0.5636210526315789, + "grad_norm": 0.404296875, + "learning_rate": 0.00020723796905656224, + "loss": 2.9482, + "step": 13386 + }, + { + "epoch": 0.5636631578947369, + "grad_norm": 0.443359375, + "learning_rate": 0.00020720472246464708, + "loss": 3.2882, + "step": 13387 + }, + { + "epoch": 0.5637052631578947, + "grad_norm": 0.44140625, + "learning_rate": 0.0002071714766524115, + "loss": 3.0299, + "step": 13388 + }, + { + "epoch": 0.5637473684210527, + "grad_norm": 0.423828125, + "learning_rate": 0.00020713823162046122, + "loss": 2.6648, + "step": 13389 + }, + { + "epoch": 0.5637894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.0002071049873694021, + "loss": 3.2732, + "step": 13390 + }, + { + "epoch": 0.5638315789473685, + "grad_norm": 0.427734375, + "learning_rate": 0.00020707174389983965, + "loss": 2.9669, + "step": 13391 + }, + { + "epoch": 0.5638736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0002070385012123797, + "loss": 3.3212, + "step": 13392 + }, + { + "epoch": 0.5639157894736843, + "grad_norm": 0.427734375, + "learning_rate": 0.00020700525930762766, + "loss": 3.5706, + "step": 13393 + }, + { + "epoch": 0.5639578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.0002069720181861894, + "loss": 3.2916, + "step": 13394 + }, + { + "epoch": 0.564, + "grad_norm": 0.41015625, + "learning_rate": 0.00020693877784867028, + "loss": 3.0355, + "step": 13395 + }, + { + "epoch": 0.5640421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00020690553829567606, + "loss": 3.3719, + "step": 13396 + }, + { + "epoch": 0.5640842105263157, + "grad_norm": 0.7265625, + "learning_rate": 0.00020687229952781236, + "loss": 3.3088, + "step": 13397 + }, + { + "epoch": 0.5641263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00020683906154568465, + "loss": 3.159, + "step": 13398 + }, + { + "epoch": 0.5641684210526315, + "grad_norm": 0.4296875, + "learning_rate": 0.00020680582434989857, + "loss": 3.0954, + "step": 13399 + }, + { + "epoch": 0.5642105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.00020677258794105956, + "loss": 3.2769, + "step": 13400 + }, + { + "epoch": 0.5642526315789473, + "grad_norm": 0.43359375, + "learning_rate": 0.0002067393523197733, + "loss": 3.5096, + "step": 13401 + }, + { + "epoch": 0.5642947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00020670611748664508, + "loss": 3.0812, + "step": 13402 + }, + { + "epoch": 0.5643368421052631, + "grad_norm": 0.498046875, + "learning_rate": 0.00020667288344228057, + "loss": 2.9125, + "step": 13403 + }, + { + "epoch": 0.5643789473684211, + "grad_norm": 0.52734375, + "learning_rate": 0.0002066396501872853, + "loss": 3.1512, + "step": 13404 + }, + { + "epoch": 0.5644210526315789, + "grad_norm": 0.45703125, + "learning_rate": 0.00020660641772226457, + "loss": 2.9029, + "step": 13405 + }, + { + "epoch": 0.5644631578947369, + "grad_norm": 0.41796875, + "learning_rate": 0.00020657318604782397, + "loss": 3.4337, + "step": 13406 + }, + { + "epoch": 0.5645052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.00020653995516456883, + "loss": 2.7975, + "step": 13407 + }, + { + "epoch": 0.5645473684210527, + "grad_norm": 0.431640625, + "learning_rate": 0.00020650672507310463, + "loss": 2.9489, + "step": 13408 + }, + { + "epoch": 0.5645894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00020647349577403696, + "loss": 2.9926, + "step": 13409 + }, + { + "epoch": 0.5646315789473684, + "grad_norm": 0.46484375, + "learning_rate": 0.00020644026726797092, + "loss": 2.9842, + "step": 13410 + }, + { + "epoch": 0.5646736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00020640703955551214, + "loss": 3.0026, + "step": 13411 + }, + { + "epoch": 0.5647157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00020637381263726573, + "loss": 3.5585, + "step": 13412 + }, + { + "epoch": 0.5647578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.00020634058651383737, + "loss": 3.1831, + "step": 13413 + }, + { + "epoch": 0.5648, + "grad_norm": 0.443359375, + "learning_rate": 0.00020630736118583206, + "loss": 3.178, + "step": 13414 + }, + { + "epoch": 0.5648421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00020627413665385533, + "loss": 3.0248, + "step": 13415 + }, + { + "epoch": 0.5648842105263158, + "grad_norm": 0.4140625, + "learning_rate": 0.0002062409129185125, + "loss": 3.0945, + "step": 13416 + }, + { + "epoch": 0.5649263157894737, + "grad_norm": 0.52734375, + "learning_rate": 0.00020620768998040874, + "loss": 3.2937, + "step": 13417 + }, + { + "epoch": 0.5649684210526316, + "grad_norm": 0.44140625, + "learning_rate": 0.00020617446784014956, + "loss": 3.4015, + "step": 13418 + }, + { + "epoch": 0.5650105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.00020614124649833996, + "loss": 2.9013, + "step": 13419 + }, + { + "epoch": 0.5650526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00020610802595558543, + "loss": 2.8324, + "step": 13420 + }, + { + "epoch": 0.5650947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.00020607480621249097, + "loss": 3.1197, + "step": 13421 + }, + { + "epoch": 0.5651368421052632, + "grad_norm": 0.46875, + "learning_rate": 0.00020604158726966195, + "loss": 3.2631, + "step": 13422 + }, + { + "epoch": 0.565178947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00020600836912770362, + "loss": 2.8708, + "step": 13423 + }, + { + "epoch": 0.565221052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00020597515178722106, + "loss": 2.5407, + "step": 13424 + }, + { + "epoch": 0.5652631578947368, + "grad_norm": 0.38671875, + "learning_rate": 0.0002059419352488196, + "loss": 2.6628, + "step": 13425 + }, + { + "epoch": 0.5653052631578948, + "grad_norm": 0.4609375, + "learning_rate": 0.00020590871951310422, + "loss": 3.3524, + "step": 13426 + }, + { + "epoch": 0.5653473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.00020587550458068027, + "loss": 3.0766, + "step": 13427 + }, + { + "epoch": 0.5653894736842106, + "grad_norm": 0.4609375, + "learning_rate": 0.00020584229045215267, + "loss": 3.2858, + "step": 13428 + }, + { + "epoch": 0.5654315789473684, + "grad_norm": 0.53125, + "learning_rate": 0.00020580907712812666, + "loss": 3.3141, + "step": 13429 + }, + { + "epoch": 0.5654736842105264, + "grad_norm": 0.443359375, + "learning_rate": 0.00020577586460920745, + "loss": 3.1747, + "step": 13430 + }, + { + "epoch": 0.5655157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.00020574265289599993, + "loss": 3.3121, + "step": 13431 + }, + { + "epoch": 0.5655578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.00020570944198910928, + "loss": 2.8888, + "step": 13432 + }, + { + "epoch": 0.5656, + "grad_norm": 0.443359375, + "learning_rate": 0.00020567623188914055, + "loss": 3.4697, + "step": 13433 + }, + { + "epoch": 0.565642105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.0002056430225966989, + "loss": 3.2755, + "step": 13434 + }, + { + "epoch": 0.5656842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.00020560981411238915, + "loss": 2.9606, + "step": 13435 + }, + { + "epoch": 0.5657263157894736, + "grad_norm": 0.47265625, + "learning_rate": 0.0002055766064368164, + "loss": 2.9596, + "step": 13436 + }, + { + "epoch": 0.5657684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.00020554339957058583, + "loss": 2.8059, + "step": 13437 + }, + { + "epoch": 0.5658105263157894, + "grad_norm": 0.4375, + "learning_rate": 0.00020551019351430216, + "loss": 2.9254, + "step": 13438 + }, + { + "epoch": 0.5658526315789474, + "grad_norm": 0.447265625, + "learning_rate": 0.00020547698826857056, + "loss": 3.0345, + "step": 13439 + }, + { + "epoch": 0.5658947368421052, + "grad_norm": 0.423828125, + "learning_rate": 0.00020544378383399592, + "loss": 3.2259, + "step": 13440 + }, + { + "epoch": 0.5659368421052632, + "grad_norm": 0.470703125, + "learning_rate": 0.00020541058021118316, + "loss": 3.4507, + "step": 13441 + }, + { + "epoch": 0.565978947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.0002053773774007372, + "loss": 3.0356, + "step": 13442 + }, + { + "epoch": 0.566021052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.000205344175403263, + "loss": 3.0072, + "step": 13443 + }, + { + "epoch": 0.5660631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.00020531097421936558, + "loss": 3.2416, + "step": 13444 + }, + { + "epoch": 0.5661052631578948, + "grad_norm": 0.431640625, + "learning_rate": 0.00020527777384964958, + "loss": 3.0714, + "step": 13445 + }, + { + "epoch": 0.5661473684210526, + "grad_norm": 0.40625, + "learning_rate": 0.0002052445742947201, + "loss": 2.8334, + "step": 13446 + }, + { + "epoch": 0.5661894736842106, + "grad_norm": 0.4375, + "learning_rate": 0.00020521137555518177, + "loss": 3.6101, + "step": 13447 + }, + { + "epoch": 0.5662315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.0002051781776316396, + "loss": 2.8802, + "step": 13448 + }, + { + "epoch": 0.5662736842105263, + "grad_norm": 0.396484375, + "learning_rate": 0.00020514498052469834, + "loss": 3.4606, + "step": 13449 + }, + { + "epoch": 0.5663157894736842, + "grad_norm": 0.39453125, + "learning_rate": 0.0002051117842349628, + "loss": 2.4606, + "step": 13450 + }, + { + "epoch": 0.5663578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.00020507858876303797, + "loss": 2.9818, + "step": 13451 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4765625, + "learning_rate": 0.00020504539410952832, + "loss": 3.2285, + "step": 13452 + }, + { + "epoch": 0.5664421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00020501220027503893, + "loss": 3.2174, + "step": 13453 + }, + { + "epoch": 0.5664842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00020497900726017425, + "loss": 3.2938, + "step": 13454 + }, + { + "epoch": 0.5665263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00020494581506553915, + "loss": 2.8193, + "step": 13455 + }, + { + "epoch": 0.5665684210526316, + "grad_norm": 0.453125, + "learning_rate": 0.00020491262369173847, + "loss": 2.9598, + "step": 13456 + }, + { + "epoch": 0.5666105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.00020487943313937673, + "loss": 2.7091, + "step": 13457 + }, + { + "epoch": 0.5666526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.0002048462434090587, + "loss": 2.9085, + "step": 13458 + }, + { + "epoch": 0.5666947368421053, + "grad_norm": 0.388671875, + "learning_rate": 0.00020481305450138908, + "loss": 3.0619, + "step": 13459 + }, + { + "epoch": 0.5667368421052632, + "grad_norm": 0.4296875, + "learning_rate": 0.0002047798664169726, + "loss": 3.0536, + "step": 13460 + }, + { + "epoch": 0.566778947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.0002047466791564137, + "loss": 2.9131, + "step": 13461 + }, + { + "epoch": 0.5668210526315789, + "grad_norm": 0.41796875, + "learning_rate": 0.00020471349272031713, + "loss": 2.8271, + "step": 13462 + }, + { + "epoch": 0.5668631578947368, + "grad_norm": 0.490234375, + "learning_rate": 0.00020468030710928765, + "loss": 2.9499, + "step": 13463 + }, + { + "epoch": 0.5669052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.0002046471223239296, + "loss": 3.1213, + "step": 13464 + }, + { + "epoch": 0.5669473684210526, + "grad_norm": 0.435546875, + "learning_rate": 0.00020461393836484777, + "loss": 3.49, + "step": 13465 + }, + { + "epoch": 0.5669894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00020458075523264655, + "loss": 3.5118, + "step": 13466 + }, + { + "epoch": 0.5670315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.00020454757292793076, + "loss": 3.5503, + "step": 13467 + }, + { + "epoch": 0.5670736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.00020451439145130465, + "loss": 3.2772, + "step": 13468 + }, + { + "epoch": 0.5671157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0002044812108033729, + "loss": 2.7237, + "step": 13469 + }, + { + "epoch": 0.5671578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00020444803098474011, + "loss": 2.7521, + "step": 13470 + }, + { + "epoch": 0.5672, + "grad_norm": 0.41796875, + "learning_rate": 0.00020441485199601057, + "loss": 2.8323, + "step": 13471 + }, + { + "epoch": 0.5672421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.00020438167383778896, + "loss": 3.7848, + "step": 13472 + }, + { + "epoch": 0.5672842105263158, + "grad_norm": 0.466796875, + "learning_rate": 0.00020434849651067958, + "loss": 3.0707, + "step": 13473 + }, + { + "epoch": 0.5673263157894737, + "grad_norm": 0.447265625, + "learning_rate": 0.00020431532001528697, + "loss": 2.8751, + "step": 13474 + }, + { + "epoch": 0.5673684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.0002042821443522155, + "loss": 3.2152, + "step": 13475 + }, + { + "epoch": 0.5674105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00020424896952206967, + "loss": 2.8275, + "step": 13476 + }, + { + "epoch": 0.5674526315789473, + "grad_norm": 0.451171875, + "learning_rate": 0.00020421579552545397, + "loss": 3.0393, + "step": 13477 + }, + { + "epoch": 0.5674947368421053, + "grad_norm": 0.453125, + "learning_rate": 0.00020418262236297256, + "loss": 3.7078, + "step": 13478 + }, + { + "epoch": 0.5675368421052631, + "grad_norm": 0.40234375, + "learning_rate": 0.00020414945003523004, + "loss": 3.1412, + "step": 13479 + }, + { + "epoch": 0.5675789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.00020411627854283057, + "loss": 3.2693, + "step": 13480 + }, + { + "epoch": 0.5676210526315789, + "grad_norm": 0.451171875, + "learning_rate": 0.00020408310788637863, + "loss": 3.086, + "step": 13481 + }, + { + "epoch": 0.5676631578947369, + "grad_norm": 0.412109375, + "learning_rate": 0.00020404993806647848, + "loss": 2.898, + "step": 13482 + }, + { + "epoch": 0.5677052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0002040167690837345, + "loss": 3.0452, + "step": 13483 + }, + { + "epoch": 0.5677473684210527, + "grad_norm": 0.40625, + "learning_rate": 0.000203983600938751, + "loss": 3.1846, + "step": 13484 + }, + { + "epoch": 0.5677894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00020395043363213215, + "loss": 3.0197, + "step": 13485 + }, + { + "epoch": 0.5678315789473685, + "grad_norm": 0.412109375, + "learning_rate": 0.0002039172671644824, + "loss": 3.1228, + "step": 13486 + }, + { + "epoch": 0.5678736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00020388410153640576, + "loss": 3.2838, + "step": 13487 + }, + { + "epoch": 0.5679157894736843, + "grad_norm": 0.4296875, + "learning_rate": 0.00020385093674850675, + "loss": 3.1748, + "step": 13488 + }, + { + "epoch": 0.5679578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.00020381777280138933, + "loss": 3.039, + "step": 13489 + }, + { + "epoch": 0.568, + "grad_norm": 0.478515625, + "learning_rate": 0.00020378460969565782, + "loss": 2.3598, + "step": 13490 + }, + { + "epoch": 0.5680421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00020375144743191648, + "loss": 3.4648, + "step": 13491 + }, + { + "epoch": 0.5680842105263157, + "grad_norm": 0.4375, + "learning_rate": 0.00020371828601076938, + "loss": 3.5287, + "step": 13492 + }, + { + "epoch": 0.5681263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00020368512543282087, + "loss": 3.3216, + "step": 13493 + }, + { + "epoch": 0.5681684210526315, + "grad_norm": 0.421875, + "learning_rate": 0.00020365196569867479, + "loss": 3.1009, + "step": 13494 + }, + { + "epoch": 0.5682105263157895, + "grad_norm": 0.44140625, + "learning_rate": 0.00020361880680893558, + "loss": 3.4478, + "step": 13495 + }, + { + "epoch": 0.5682526315789473, + "grad_norm": 0.421875, + "learning_rate": 0.0002035856487642071, + "loss": 3.0307, + "step": 13496 + }, + { + "epoch": 0.5682947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00020355249156509355, + "loss": 2.9178, + "step": 13497 + }, + { + "epoch": 0.5683368421052631, + "grad_norm": 0.482421875, + "learning_rate": 0.00020351933521219913, + "loss": 3.0776, + "step": 13498 + }, + { + "epoch": 0.5683789473684211, + "grad_norm": 0.44921875, + "learning_rate": 0.00020348617970612776, + "loss": 3.2554, + "step": 13499 + }, + { + "epoch": 0.5684210526315789, + "grad_norm": 0.421875, + "learning_rate": 0.00020345302504748357, + "loss": 2.7334, + "step": 13500 + }, + { + "epoch": 0.5684631578947369, + "grad_norm": 0.4296875, + "learning_rate": 0.0002034198712368705, + "loss": 3.3914, + "step": 13501 + }, + { + "epoch": 0.5685052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.00020338671827489275, + "loss": 2.8601, + "step": 13502 + }, + { + "epoch": 0.5685473684210526, + "grad_norm": 0.44921875, + "learning_rate": 0.00020335356616215414, + "loss": 3.218, + "step": 13503 + }, + { + "epoch": 0.5685894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.00020332041489925872, + "loss": 3.3275, + "step": 13504 + }, + { + "epoch": 0.5686315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.00020328726448681062, + "loss": 3.2204, + "step": 13505 + }, + { + "epoch": 0.5686736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.00020325411492541358, + "loss": 2.5485, + "step": 13506 + }, + { + "epoch": 0.5687157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.00020322096621567168, + "loss": 2.8415, + "step": 13507 + }, + { + "epoch": 0.5687578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.00020318781835818877, + "loss": 2.9219, + "step": 13508 + }, + { + "epoch": 0.5688, + "grad_norm": 0.4453125, + "learning_rate": 0.0002031546713535688, + "loss": 3.1259, + "step": 13509 + }, + { + "epoch": 0.5688421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00020312152520241577, + "loss": 2.7639, + "step": 13510 + }, + { + "epoch": 0.5688842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.00020308837990533338, + "loss": 3.5421, + "step": 13511 + }, + { + "epoch": 0.5689263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.00020305523546292571, + "loss": 2.8726, + "step": 13512 + }, + { + "epoch": 0.5689684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.00020302209187579638, + "loss": 2.8697, + "step": 13513 + }, + { + "epoch": 0.5690105263157895, + "grad_norm": 0.44140625, + "learning_rate": 0.0002029889491445494, + "loss": 3.183, + "step": 13514 + }, + { + "epoch": 0.5690526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.00020295580726978848, + "loss": 3.3963, + "step": 13515 + }, + { + "epoch": 0.5690947368421052, + "grad_norm": 0.435546875, + "learning_rate": 0.00020292266625211746, + "loss": 2.8119, + "step": 13516 + }, + { + "epoch": 0.5691368421052632, + "grad_norm": 0.42578125, + "learning_rate": 0.0002028895260921402, + "loss": 2.8115, + "step": 13517 + }, + { + "epoch": 0.569178947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00020285638679046038, + "loss": 3.203, + "step": 13518 + }, + { + "epoch": 0.569221052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.00020282324834768186, + "loss": 3.0444, + "step": 13519 + }, + { + "epoch": 0.5692631578947368, + "grad_norm": 0.43359375, + "learning_rate": 0.00020279011076440825, + "loss": 3.3087, + "step": 13520 + }, + { + "epoch": 0.5693052631578948, + "grad_norm": 0.4296875, + "learning_rate": 0.0002027569740412435, + "loss": 3.3042, + "step": 13521 + }, + { + "epoch": 0.5693473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.000202723838178791, + "loss": 2.9935, + "step": 13522 + }, + { + "epoch": 0.5693894736842106, + "grad_norm": 0.419921875, + "learning_rate": 0.00020269070317765464, + "loss": 3.1422, + "step": 13523 + }, + { + "epoch": 0.5694315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.0002026575690384382, + "loss": 3.0019, + "step": 13524 + }, + { + "epoch": 0.5694736842105264, + "grad_norm": 0.435546875, + "learning_rate": 0.0002026244357617451, + "loss": 2.9676, + "step": 13525 + }, + { + "epoch": 0.5695157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.00020259130334817922, + "loss": 3.4595, + "step": 13526 + }, + { + "epoch": 0.5695578947368422, + "grad_norm": 0.4609375, + "learning_rate": 0.000202558171798344, + "loss": 3.3711, + "step": 13527 + }, + { + "epoch": 0.5696, + "grad_norm": 0.439453125, + "learning_rate": 0.00020252504111284328, + "loss": 3.2391, + "step": 13528 + }, + { + "epoch": 0.569642105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.0002024919112922804, + "loss": 3.2737, + "step": 13529 + }, + { + "epoch": 0.5696842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00020245878233725908, + "loss": 3.2462, + "step": 13530 + }, + { + "epoch": 0.5697263157894736, + "grad_norm": 0.447265625, + "learning_rate": 0.00020242565424838298, + "loss": 3.1948, + "step": 13531 + }, + { + "epoch": 0.5697684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.00020239252702625548, + "loss": 2.7891, + "step": 13532 + }, + { + "epoch": 0.5698105263157894, + "grad_norm": 0.390625, + "learning_rate": 0.00020235940067148025, + "loss": 3.0025, + "step": 13533 + }, + { + "epoch": 0.5698526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.00020232627518466072, + "loss": 2.8918, + "step": 13534 + }, + { + "epoch": 0.5698947368421052, + "grad_norm": 0.4296875, + "learning_rate": 0.00020229315056640053, + "loss": 2.9107, + "step": 13535 + }, + { + "epoch": 0.5699368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.00020226002681730297, + "loss": 3.2809, + "step": 13536 + }, + { + "epoch": 0.569978947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.00020222690393797163, + "loss": 2.9792, + "step": 13537 + }, + { + "epoch": 0.570021052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.0002021937819290101, + "loss": 3.0968, + "step": 13538 + }, + { + "epoch": 0.5700631578947368, + "grad_norm": 0.455078125, + "learning_rate": 0.00020216066079102157, + "loss": 3.0045, + "step": 13539 + }, + { + "epoch": 0.5701052631578948, + "grad_norm": 0.48046875, + "learning_rate": 0.00020212754052460963, + "loss": 3.0371, + "step": 13540 + }, + { + "epoch": 0.5701473684210526, + "grad_norm": 0.400390625, + "learning_rate": 0.00020209442113037763, + "loss": 2.9851, + "step": 13541 + }, + { + "epoch": 0.5701894736842106, + "grad_norm": 0.44140625, + "learning_rate": 0.0002020613026089291, + "loss": 2.9925, + "step": 13542 + }, + { + "epoch": 0.5702315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.00020202818496086716, + "loss": 3.1685, + "step": 13543 + }, + { + "epoch": 0.5702736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00020199506818679536, + "loss": 3.0505, + "step": 13544 + }, + { + "epoch": 0.5703157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.00020196195228731713, + "loss": 2.7883, + "step": 13545 + }, + { + "epoch": 0.5703578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00020192883726303558, + "loss": 2.734, + "step": 13546 + }, + { + "epoch": 0.5704, + "grad_norm": 0.4140625, + "learning_rate": 0.00020189572311455421, + "loss": 3.1089, + "step": 13547 + }, + { + "epoch": 0.5704421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00020186260984247616, + "loss": 3.1768, + "step": 13548 + }, + { + "epoch": 0.5704842105263158, + "grad_norm": 0.49609375, + "learning_rate": 0.00020182949744740482, + "loss": 3.0338, + "step": 13549 + }, + { + "epoch": 0.5705263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.00020179638592994343, + "loss": 2.866, + "step": 13550 + }, + { + "epoch": 0.5705684210526316, + "grad_norm": 0.396484375, + "learning_rate": 0.0002017632752906952, + "loss": 3.2152, + "step": 13551 + }, + { + "epoch": 0.5706105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.00020173016553026358, + "loss": 3.4708, + "step": 13552 + }, + { + "epoch": 0.5706526315789474, + "grad_norm": 0.412109375, + "learning_rate": 0.0002016970566492515, + "loss": 3.3534, + "step": 13553 + }, + { + "epoch": 0.5706947368421053, + "grad_norm": 0.4609375, + "learning_rate": 0.00020166394864826244, + "loss": 3.2921, + "step": 13554 + }, + { + "epoch": 0.5707368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.00020163084152789926, + "loss": 3.3696, + "step": 13555 + }, + { + "epoch": 0.570778947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.00020159773528876534, + "loss": 3.0982, + "step": 13556 + }, + { + "epoch": 0.5708210526315789, + "grad_norm": 0.455078125, + "learning_rate": 0.0002015646299314639, + "loss": 3.3727, + "step": 13557 + }, + { + "epoch": 0.5708631578947369, + "grad_norm": 0.42578125, + "learning_rate": 0.00020153152545659798, + "loss": 2.8047, + "step": 13558 + }, + { + "epoch": 0.5709052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.00020149842186477072, + "loss": 3.143, + "step": 13559 + }, + { + "epoch": 0.5709473684210526, + "grad_norm": 0.40625, + "learning_rate": 0.00020146531915658518, + "loss": 3.2787, + "step": 13560 + }, + { + "epoch": 0.5709894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.00020143221733264456, + "loss": 3.1574, + "step": 13561 + }, + { + "epoch": 0.5710315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.0002013991163935518, + "loss": 3.353, + "step": 13562 + }, + { + "epoch": 0.5710736842105263, + "grad_norm": 0.3984375, + "learning_rate": 0.00020136601633991003, + "loss": 3.1434, + "step": 13563 + }, + { + "epoch": 0.5711157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00020133291717232244, + "loss": 2.7114, + "step": 13564 + }, + { + "epoch": 0.5711578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00020129981889139176, + "loss": 3.0325, + "step": 13565 + }, + { + "epoch": 0.5712, + "grad_norm": 0.45703125, + "learning_rate": 0.00020126672149772123, + "loss": 3.1738, + "step": 13566 + }, + { + "epoch": 0.5712421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00020123362499191373, + "loss": 3.1487, + "step": 13567 + }, + { + "epoch": 0.5712842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0002012005293745724, + "loss": 2.931, + "step": 13568 + }, + { + "epoch": 0.5713263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.00020116743464629997, + "loss": 3.248, + "step": 13569 + }, + { + "epoch": 0.5713684210526315, + "grad_norm": 0.4375, + "learning_rate": 0.0002011343408076995, + "loss": 3.3934, + "step": 13570 + }, + { + "epoch": 0.5714105263157895, + "grad_norm": 0.53125, + "learning_rate": 0.00020110124785937403, + "loss": 3.0199, + "step": 13571 + }, + { + "epoch": 0.5714526315789473, + "grad_norm": 0.3984375, + "learning_rate": 0.00020106815580192629, + "loss": 3.3724, + "step": 13572 + }, + { + "epoch": 0.5714947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.00020103506463595933, + "loss": 2.919, + "step": 13573 + }, + { + "epoch": 0.5715368421052631, + "grad_norm": 0.39453125, + "learning_rate": 0.00020100197436207588, + "loss": 3.3387, + "step": 13574 + }, + { + "epoch": 0.5715789473684211, + "grad_norm": 0.427734375, + "learning_rate": 0.00020096888498087893, + "loss": 2.9242, + "step": 13575 + }, + { + "epoch": 0.5716210526315789, + "grad_norm": 0.435546875, + "learning_rate": 0.00020093579649297123, + "loss": 3.3454, + "step": 13576 + }, + { + "epoch": 0.5716631578947369, + "grad_norm": 0.4375, + "learning_rate": 0.00020090270889895569, + "loss": 2.5785, + "step": 13577 + }, + { + "epoch": 0.5717052631578947, + "grad_norm": 0.443359375, + "learning_rate": 0.00020086962219943515, + "loss": 3.1434, + "step": 13578 + }, + { + "epoch": 0.5717473684210527, + "grad_norm": 0.41796875, + "learning_rate": 0.0002008365363950123, + "loss": 3.1326, + "step": 13579 + }, + { + "epoch": 0.5717894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.00020080345148629013, + "loss": 3.085, + "step": 13580 + }, + { + "epoch": 0.5718315789473685, + "grad_norm": 0.419921875, + "learning_rate": 0.00020077036747387115, + "loss": 3.3156, + "step": 13581 + }, + { + "epoch": 0.5718736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00020073728435835827, + "loss": 3.3149, + "step": 13582 + }, + { + "epoch": 0.5719157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00020070420214035416, + "loss": 2.9619, + "step": 13583 + }, + { + "epoch": 0.5719578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.00020067112082046152, + "loss": 2.7903, + "step": 13584 + }, + { + "epoch": 0.572, + "grad_norm": 0.400390625, + "learning_rate": 0.00020063804039928323, + "loss": 3.2269, + "step": 13585 + }, + { + "epoch": 0.5720421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00020060496087742174, + "loss": 2.8271, + "step": 13586 + }, + { + "epoch": 0.5720842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.00020057188225547996, + "loss": 3.1412, + "step": 13587 + }, + { + "epoch": 0.5721263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.00020053880453406027, + "loss": 3.0548, + "step": 13588 + }, + { + "epoch": 0.5721684210526315, + "grad_norm": 0.412109375, + "learning_rate": 0.00020050572771376557, + "loss": 3.0698, + "step": 13589 + }, + { + "epoch": 0.5722105263157895, + "grad_norm": 0.40234375, + "learning_rate": 0.00020047265179519826, + "loss": 2.918, + "step": 13590 + }, + { + "epoch": 0.5722526315789473, + "grad_norm": 0.41015625, + "learning_rate": 0.000200439576778961, + "loss": 2.994, + "step": 13591 + }, + { + "epoch": 0.5722947368421053, + "grad_norm": 0.44140625, + "learning_rate": 0.0002004065026656565, + "loss": 3.0818, + "step": 13592 + }, + { + "epoch": 0.5723368421052631, + "grad_norm": 0.421875, + "learning_rate": 0.0002003734294558872, + "loss": 2.8728, + "step": 13593 + }, + { + "epoch": 0.5723789473684211, + "grad_norm": 0.42578125, + "learning_rate": 0.00020034035715025582, + "loss": 3.0596, + "step": 13594 + }, + { + "epoch": 0.5724210526315789, + "grad_norm": 0.404296875, + "learning_rate": 0.00020030728574936467, + "loss": 2.934, + "step": 13595 + }, + { + "epoch": 0.5724631578947369, + "grad_norm": 0.44140625, + "learning_rate": 0.00020027421525381647, + "loss": 3.0167, + "step": 13596 + }, + { + "epoch": 0.5725052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.00020024114566421356, + "loss": 3.0947, + "step": 13597 + }, + { + "epoch": 0.5725473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.0002002080769811585, + "loss": 3.2543, + "step": 13598 + }, + { + "epoch": 0.5725894736842105, + "grad_norm": 0.498046875, + "learning_rate": 0.00020017500920525383, + "loss": 3.1688, + "step": 13599 + }, + { + "epoch": 0.5726315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.0002001419423371019, + "loss": 3.0509, + "step": 13600 + }, + { + "epoch": 0.5726736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.0002001088763773053, + "loss": 3.1073, + "step": 13601 + }, + { + "epoch": 0.5727157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0002000758113264663, + "loss": 3.0342, + "step": 13602 + }, + { + "epoch": 0.5727578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0002000427471851873, + "loss": 2.999, + "step": 13603 + }, + { + "epoch": 0.5728, + "grad_norm": 0.396484375, + "learning_rate": 0.00020000968395407089, + "loss": 3.1953, + "step": 13604 + }, + { + "epoch": 0.5728421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00019997662163371915, + "loss": 3.1256, + "step": 13605 + }, + { + "epoch": 0.5728842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00019994356022473472, + "loss": 3.0893, + "step": 13606 + }, + { + "epoch": 0.5729263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.0001999104997277197, + "loss": 3.2867, + "step": 13607 + }, + { + "epoch": 0.5729684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.0001998774401432766, + "loss": 2.9404, + "step": 13608 + }, + { + "epoch": 0.5730105263157895, + "grad_norm": 0.453125, + "learning_rate": 0.00019984438147200757, + "loss": 2.9398, + "step": 13609 + }, + { + "epoch": 0.5730526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.00019981132371451498, + "loss": 3.4143, + "step": 13610 + }, + { + "epoch": 0.5730947368421052, + "grad_norm": 0.431640625, + "learning_rate": 0.0001997782668714012, + "loss": 3.0714, + "step": 13611 + }, + { + "epoch": 0.5731368421052632, + "grad_norm": 0.4296875, + "learning_rate": 0.0001997452109432683, + "loss": 3.3885, + "step": 13612 + }, + { + "epoch": 0.573178947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.00019971215593071872, + "loss": 3.4412, + "step": 13613 + }, + { + "epoch": 0.573221052631579, + "grad_norm": 0.455078125, + "learning_rate": 0.00019967910183435448, + "loss": 2.8344, + "step": 13614 + }, + { + "epoch": 0.5732631578947368, + "grad_norm": 0.435546875, + "learning_rate": 0.0001996460486547779, + "loss": 3.0669, + "step": 13615 + }, + { + "epoch": 0.5733052631578948, + "grad_norm": 0.484375, + "learning_rate": 0.00019961299639259118, + "loss": 3.5075, + "step": 13616 + }, + { + "epoch": 0.5733473684210526, + "grad_norm": 0.4375, + "learning_rate": 0.00019957994504839639, + "loss": 3.4828, + "step": 13617 + }, + { + "epoch": 0.5733894736842106, + "grad_norm": 0.423828125, + "learning_rate": 0.00019954689462279578, + "loss": 2.9475, + "step": 13618 + }, + { + "epoch": 0.5734315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00019951384511639149, + "loss": 3.3577, + "step": 13619 + }, + { + "epoch": 0.5734736842105264, + "grad_norm": 0.55078125, + "learning_rate": 0.00019948079652978569, + "loss": 3.1781, + "step": 13620 + }, + { + "epoch": 0.5735157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0001994477488635803, + "loss": 3.4171, + "step": 13621 + }, + { + "epoch": 0.5735578947368422, + "grad_norm": 0.427734375, + "learning_rate": 0.00019941470211837768, + "loss": 3.2132, + "step": 13622 + }, + { + "epoch": 0.5736, + "grad_norm": 0.43359375, + "learning_rate": 0.0001993816562947796, + "loss": 2.7517, + "step": 13623 + }, + { + "epoch": 0.5736421052631578, + "grad_norm": 0.41796875, + "learning_rate": 0.00019934861139338827, + "loss": 3.0417, + "step": 13624 + }, + { + "epoch": 0.5736842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.0001993155674148058, + "loss": 3.4557, + "step": 13625 + }, + { + "epoch": 0.5737263157894736, + "grad_norm": 0.4375, + "learning_rate": 0.0001992825243596341, + "loss": 2.835, + "step": 13626 + }, + { + "epoch": 0.5737684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0001992494822284753, + "loss": 3.0413, + "step": 13627 + }, + { + "epoch": 0.5738105263157894, + "grad_norm": 0.421875, + "learning_rate": 0.0001992164410219312, + "loss": 3.423, + "step": 13628 + }, + { + "epoch": 0.5738526315789474, + "grad_norm": 0.45703125, + "learning_rate": 0.00019918340074060397, + "loss": 3.0479, + "step": 13629 + }, + { + "epoch": 0.5738947368421052, + "grad_norm": 0.44140625, + "learning_rate": 0.0001991503613850954, + "loss": 3.2468, + "step": 13630 + }, + { + "epoch": 0.5739368421052632, + "grad_norm": 0.41796875, + "learning_rate": 0.0001991173229560075, + "loss": 3.0716, + "step": 13631 + }, + { + "epoch": 0.573978947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00019908428545394226, + "loss": 3.3137, + "step": 13632 + }, + { + "epoch": 0.574021052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00019905124887950143, + "loss": 2.8699, + "step": 13633 + }, + { + "epoch": 0.5740631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.00019901821323328704, + "loss": 3.4341, + "step": 13634 + }, + { + "epoch": 0.5741052631578948, + "grad_norm": 0.4140625, + "learning_rate": 0.00019898517851590084, + "loss": 2.8827, + "step": 13635 + }, + { + "epoch": 0.5741473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.00019895214472794484, + "loss": 3.211, + "step": 13636 + }, + { + "epoch": 0.5741894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.00019891911187002072, + "loss": 3.2151, + "step": 13637 + }, + { + "epoch": 0.5742315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.00019888607994273034, + "loss": 3.1404, + "step": 13638 + }, + { + "epoch": 0.5742736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.0001988530489466756, + "loss": 2.8683, + "step": 13639 + }, + { + "epoch": 0.5743157894736842, + "grad_norm": 0.396484375, + "learning_rate": 0.00019882001888245815, + "loss": 2.3499, + "step": 13640 + }, + { + "epoch": 0.5743578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.00019878698975067988, + "loss": 3.0614, + "step": 13641 + }, + { + "epoch": 0.5744, + "grad_norm": 0.4140625, + "learning_rate": 0.00019875396155194242, + "loss": 2.8017, + "step": 13642 + }, + { + "epoch": 0.5744421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00019872093428684767, + "loss": 3.169, + "step": 13643 + }, + { + "epoch": 0.5744842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.00019868790795599713, + "loss": 3.2016, + "step": 13644 + }, + { + "epoch": 0.5745263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00019865488255999266, + "loss": 2.6418, + "step": 13645 + }, + { + "epoch": 0.5745684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.00019862185809943594, + "loss": 3.6396, + "step": 13646 + }, + { + "epoch": 0.5746105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.00019858883457492854, + "loss": 2.9667, + "step": 13647 + }, + { + "epoch": 0.5746526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.00019855581198707225, + "loss": 2.6119, + "step": 13648 + }, + { + "epoch": 0.5746947368421053, + "grad_norm": 0.4296875, + "learning_rate": 0.00019852279033646852, + "loss": 3.2095, + "step": 13649 + }, + { + "epoch": 0.5747368421052632, + "grad_norm": 0.478515625, + "learning_rate": 0.00019848976962371916, + "loss": 3.0505, + "step": 13650 + }, + { + "epoch": 0.5747789473684211, + "grad_norm": 0.404296875, + "learning_rate": 0.00019845674984942557, + "loss": 3.4954, + "step": 13651 + }, + { + "epoch": 0.5748210526315789, + "grad_norm": 0.431640625, + "learning_rate": 0.00019842373101418948, + "loss": 3.6447, + "step": 13652 + }, + { + "epoch": 0.5748631578947369, + "grad_norm": 0.48828125, + "learning_rate": 0.0001983907131186125, + "loss": 3.1914, + "step": 13653 + }, + { + "epoch": 0.5749052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.00019835769616329595, + "loss": 2.9919, + "step": 13654 + }, + { + "epoch": 0.5749473684210527, + "grad_norm": 0.443359375, + "learning_rate": 0.00019832468014884165, + "loss": 3.0797, + "step": 13655 + }, + { + "epoch": 0.5749894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.00019829166507585083, + "loss": 3.087, + "step": 13656 + }, + { + "epoch": 0.5750315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.00019825865094492512, + "loss": 2.8431, + "step": 13657 + }, + { + "epoch": 0.5750736842105263, + "grad_norm": 0.4453125, + "learning_rate": 0.0001982256377566661, + "loss": 3.5012, + "step": 13658 + }, + { + "epoch": 0.5751157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.00019819262551167504, + "loss": 3.4376, + "step": 13659 + }, + { + "epoch": 0.5751578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00019815961421055354, + "loss": 3.0873, + "step": 13660 + }, + { + "epoch": 0.5752, + "grad_norm": 0.439453125, + "learning_rate": 0.00019812660385390288, + "loss": 2.9918, + "step": 13661 + }, + { + "epoch": 0.5752421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00019809359444232467, + "loss": 2.7084, + "step": 13662 + }, + { + "epoch": 0.5752842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00019806058597642004, + "loss": 3.4384, + "step": 13663 + }, + { + "epoch": 0.5753263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00019802757845679053, + "loss": 3.2723, + "step": 13664 + }, + { + "epoch": 0.5753684210526315, + "grad_norm": 0.41796875, + "learning_rate": 0.00019799457188403758, + "loss": 3.1454, + "step": 13665 + }, + { + "epoch": 0.5754105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.00019796156625876232, + "loss": 2.7084, + "step": 13666 + }, + { + "epoch": 0.5754526315789473, + "grad_norm": 0.453125, + "learning_rate": 0.00019792856158156624, + "loss": 3.069, + "step": 13667 + }, + { + "epoch": 0.5754947368421053, + "grad_norm": 0.40234375, + "learning_rate": 0.00019789555785305053, + "loss": 3.2069, + "step": 13668 + }, + { + "epoch": 0.5755368421052631, + "grad_norm": 0.4375, + "learning_rate": 0.00019786255507381665, + "loss": 3.3254, + "step": 13669 + }, + { + "epoch": 0.5755789473684211, + "grad_norm": 0.4375, + "learning_rate": 0.00019782955324446563, + "loss": 2.7682, + "step": 13670 + }, + { + "epoch": 0.5756210526315789, + "grad_norm": 0.439453125, + "learning_rate": 0.00019779655236559885, + "loss": 3.0039, + "step": 13671 + }, + { + "epoch": 0.5756631578947369, + "grad_norm": 0.412109375, + "learning_rate": 0.00019776355243781768, + "loss": 3.0551, + "step": 13672 + }, + { + "epoch": 0.5757052631578947, + "grad_norm": 0.466796875, + "learning_rate": 0.00019773055346172309, + "loss": 3.0055, + "step": 13673 + }, + { + "epoch": 0.5757473684210527, + "grad_norm": 0.44140625, + "learning_rate": 0.00019769755543791646, + "loss": 3.3821, + "step": 13674 + }, + { + "epoch": 0.5757894736842105, + "grad_norm": 0.4140625, + "learning_rate": 0.00019766455836699891, + "loss": 2.7475, + "step": 13675 + }, + { + "epoch": 0.5758315789473685, + "grad_norm": 0.4609375, + "learning_rate": 0.00019763156224957164, + "loss": 3.2862, + "step": 13676 + }, + { + "epoch": 0.5758736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.0001975985670862357, + "loss": 2.9635, + "step": 13677 + }, + { + "epoch": 0.5759157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.0001975655728775923, + "loss": 2.8314, + "step": 13678 + }, + { + "epoch": 0.5759578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.00019753257962424264, + "loss": 3.2541, + "step": 13679 + }, + { + "epoch": 0.576, + "grad_norm": 0.4296875, + "learning_rate": 0.00019749958732678767, + "loss": 3.2719, + "step": 13680 + }, + { + "epoch": 0.5760421052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.00019746659598582862, + "loss": 3.1393, + "step": 13681 + }, + { + "epoch": 0.5760842105263158, + "grad_norm": 0.494140625, + "learning_rate": 0.00019743360560196634, + "loss": 2.8229, + "step": 13682 + }, + { + "epoch": 0.5761263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.00019740061617580206, + "loss": 2.6648, + "step": 13683 + }, + { + "epoch": 0.5761684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.00019736762770793673, + "loss": 3.2121, + "step": 13684 + }, + { + "epoch": 0.5762105263157895, + "grad_norm": 0.4140625, + "learning_rate": 0.00019733464019897136, + "loss": 2.9435, + "step": 13685 + }, + { + "epoch": 0.5762526315789473, + "grad_norm": 0.412109375, + "learning_rate": 0.0001973016536495071, + "loss": 3.3146, + "step": 13686 + }, + { + "epoch": 0.5762947368421053, + "grad_norm": 0.5078125, + "learning_rate": 0.00019726866806014465, + "loss": 2.9521, + "step": 13687 + }, + { + "epoch": 0.5763368421052631, + "grad_norm": 0.43359375, + "learning_rate": 0.00019723568343148524, + "loss": 3.2617, + "step": 13688 + }, + { + "epoch": 0.5763789473684211, + "grad_norm": 0.421875, + "learning_rate": 0.00019720269976412956, + "loss": 3.2851, + "step": 13689 + }, + { + "epoch": 0.576421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.0001971697170586787, + "loss": 3.0564, + "step": 13690 + }, + { + "epoch": 0.5764631578947368, + "grad_norm": 0.455078125, + "learning_rate": 0.00019713673531573353, + "loss": 3.2995, + "step": 13691 + }, + { + "epoch": 0.5765052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0001971037545358949, + "loss": 3.3089, + "step": 13692 + }, + { + "epoch": 0.5765473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00019707077471976375, + "loss": 3.106, + "step": 13693 + }, + { + "epoch": 0.5765894736842105, + "grad_norm": 0.458984375, + "learning_rate": 0.00019703779586794086, + "loss": 2.9998, + "step": 13694 + }, + { + "epoch": 0.5766315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00019700481798102718, + "loss": 2.9926, + "step": 13695 + }, + { + "epoch": 0.5766736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.00019697184105962334, + "loss": 3.4573, + "step": 13696 + }, + { + "epoch": 0.5767157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.00019693886510433037, + "loss": 2.6345, + "step": 13697 + }, + { + "epoch": 0.5767578947368421, + "grad_norm": 0.3984375, + "learning_rate": 0.00019690589011574884, + "loss": 3.0796, + "step": 13698 + }, + { + "epoch": 0.5768, + "grad_norm": 0.431640625, + "learning_rate": 0.0001968729160944796, + "loss": 3.4196, + "step": 13699 + }, + { + "epoch": 0.5768421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00019683994304112347, + "loss": 3.1593, + "step": 13700 + }, + { + "epoch": 0.5768842105263158, + "grad_norm": 0.400390625, + "learning_rate": 0.00019680697095628103, + "loss": 2.9102, + "step": 13701 + }, + { + "epoch": 0.5769263157894737, + "grad_norm": 0.451171875, + "learning_rate": 0.00019677399984055323, + "loss": 3.1848, + "step": 13702 + }, + { + "epoch": 0.5769684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.00019674102969454048, + "loss": 2.7659, + "step": 13703 + }, + { + "epoch": 0.5770105263157894, + "grad_norm": 0.416015625, + "learning_rate": 0.00019670806051884357, + "loss": 2.865, + "step": 13704 + }, + { + "epoch": 0.5770526315789474, + "grad_norm": 0.447265625, + "learning_rate": 0.00019667509231406332, + "loss": 3.0852, + "step": 13705 + }, + { + "epoch": 0.5770947368421052, + "grad_norm": 0.431640625, + "learning_rate": 0.00019664212508080012, + "loss": 2.9547, + "step": 13706 + }, + { + "epoch": 0.5771368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.00019660915881965482, + "loss": 3.28, + "step": 13707 + }, + { + "epoch": 0.577178947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.0001965761935312278, + "loss": 2.5273, + "step": 13708 + }, + { + "epoch": 0.577221052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00019654322921611983, + "loss": 3.3062, + "step": 13709 + }, + { + "epoch": 0.5772631578947368, + "grad_norm": 0.423828125, + "learning_rate": 0.00019651026587493137, + "loss": 2.8975, + "step": 13710 + }, + { + "epoch": 0.5773052631578948, + "grad_norm": 0.416015625, + "learning_rate": 0.00019647730350826298, + "loss": 3.2323, + "step": 13711 + }, + { + "epoch": 0.5773473684210526, + "grad_norm": 0.419921875, + "learning_rate": 0.00019644434211671537, + "loss": 2.9513, + "step": 13712 + }, + { + "epoch": 0.5773894736842106, + "grad_norm": 0.404296875, + "learning_rate": 0.00019641138170088882, + "loss": 3.2068, + "step": 13713 + }, + { + "epoch": 0.5774315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.00019637842226138402, + "loss": 3.1636, + "step": 13714 + }, + { + "epoch": 0.5774736842105264, + "grad_norm": 0.4375, + "learning_rate": 0.00019634546379880126, + "loss": 3.102, + "step": 13715 + }, + { + "epoch": 0.5775157894736842, + "grad_norm": 0.400390625, + "learning_rate": 0.00019631250631374118, + "loss": 3.246, + "step": 13716 + }, + { + "epoch": 0.5775578947368422, + "grad_norm": 0.421875, + "learning_rate": 0.00019627954980680406, + "loss": 3.2551, + "step": 13717 + }, + { + "epoch": 0.5776, + "grad_norm": 0.43359375, + "learning_rate": 0.00019624659427859052, + "loss": 3.3713, + "step": 13718 + }, + { + "epoch": 0.5776421052631578, + "grad_norm": 0.4453125, + "learning_rate": 0.00019621363972970087, + "loss": 3.1488, + "step": 13719 + }, + { + "epoch": 0.5776842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00019618068616073542, + "loss": 3.0424, + "step": 13720 + }, + { + "epoch": 0.5777263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.00019614773357229476, + "loss": 2.7422, + "step": 13721 + }, + { + "epoch": 0.5777684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00019611478196497902, + "loss": 2.8786, + "step": 13722 + }, + { + "epoch": 0.5778105263157894, + "grad_norm": 0.423828125, + "learning_rate": 0.00019608183133938874, + "loss": 2.9028, + "step": 13723 + }, + { + "epoch": 0.5778526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.00019604888169612403, + "loss": 3.0107, + "step": 13724 + }, + { + "epoch": 0.5778947368421052, + "grad_norm": 0.43359375, + "learning_rate": 0.00019601593303578534, + "loss": 3.3451, + "step": 13725 + }, + { + "epoch": 0.5779368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.00019598298535897292, + "loss": 3.1659, + "step": 13726 + }, + { + "epoch": 0.577978947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00019595003866628702, + "loss": 3.0619, + "step": 13727 + }, + { + "epoch": 0.578021052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00019591709295832798, + "loss": 3.0275, + "step": 13728 + }, + { + "epoch": 0.5780631578947368, + "grad_norm": 0.447265625, + "learning_rate": 0.00019588414823569592, + "loss": 3.1347, + "step": 13729 + }, + { + "epoch": 0.5781052631578948, + "grad_norm": 0.427734375, + "learning_rate": 0.00019585120449899115, + "loss": 3.3909, + "step": 13730 + }, + { + "epoch": 0.5781473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.00019581826174881373, + "loss": 3.0728, + "step": 13731 + }, + { + "epoch": 0.5781894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.0001957853199857639, + "loss": 3.0892, + "step": 13732 + }, + { + "epoch": 0.5782315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.00019575237921044198, + "loss": 3.2125, + "step": 13733 + }, + { + "epoch": 0.5782736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.00019571943942344784, + "loss": 3.2148, + "step": 13734 + }, + { + "epoch": 0.5783157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.0001956865006253818, + "loss": 3.2875, + "step": 13735 + }, + { + "epoch": 0.5783578947368421, + "grad_norm": 0.53515625, + "learning_rate": 0.00019565356281684384, + "loss": 2.9895, + "step": 13736 + }, + { + "epoch": 0.5784, + "grad_norm": 0.45703125, + "learning_rate": 0.00019562062599843422, + "loss": 2.9685, + "step": 13737 + }, + { + "epoch": 0.5784421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.0001955876901707528, + "loss": 3.3921, + "step": 13738 + }, + { + "epoch": 0.5784842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.0001955547553343997, + "loss": 3.2356, + "step": 13739 + }, + { + "epoch": 0.5785263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00019552182148997513, + "loss": 3.1637, + "step": 13740 + }, + { + "epoch": 0.5785684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.00019548888863807885, + "loss": 3.0512, + "step": 13741 + }, + { + "epoch": 0.5786105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.00019545595677931103, + "loss": 3.1227, + "step": 13742 + }, + { + "epoch": 0.5786526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.00019542302591427153, + "loss": 2.885, + "step": 13743 + }, + { + "epoch": 0.5786947368421053, + "grad_norm": 0.455078125, + "learning_rate": 0.00019539009604356049, + "loss": 3.1261, + "step": 13744 + }, + { + "epoch": 0.5787368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.00019535716716777762, + "loss": 2.8873, + "step": 13745 + }, + { + "epoch": 0.5787789473684211, + "grad_norm": 0.44921875, + "learning_rate": 0.00019532423928752296, + "loss": 3.0553, + "step": 13746 + }, + { + "epoch": 0.5788210526315789, + "grad_norm": 0.443359375, + "learning_rate": 0.00019529131240339653, + "loss": 3.3112, + "step": 13747 + }, + { + "epoch": 0.5788631578947369, + "grad_norm": 0.451171875, + "learning_rate": 0.00019525838651599797, + "loss": 3.3617, + "step": 13748 + }, + { + "epoch": 0.5789052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.00019522546162592744, + "loss": 3.3629, + "step": 13749 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.494140625, + "learning_rate": 0.00019519253773378454, + "loss": 3.076, + "step": 13750 + }, + { + "epoch": 0.5789894736842105, + "grad_norm": 0.4765625, + "learning_rate": 0.00019515961484016918, + "loss": 2.8628, + "step": 13751 + }, + { + "epoch": 0.5790315789473685, + "grad_norm": 0.435546875, + "learning_rate": 0.00019512669294568125, + "loss": 3.2886, + "step": 13752 + }, + { + "epoch": 0.5790736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00019509377205092045, + "loss": 3.3293, + "step": 13753 + }, + { + "epoch": 0.5791157894736843, + "grad_norm": 0.412109375, + "learning_rate": 0.00019506085215648672, + "loss": 2.9399, + "step": 13754 + }, + { + "epoch": 0.5791578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.00019502793326297963, + "loss": 2.8272, + "step": 13755 + }, + { + "epoch": 0.5792, + "grad_norm": 0.431640625, + "learning_rate": 0.0001949950153709991, + "loss": 3.0928, + "step": 13756 + }, + { + "epoch": 0.5792421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00019496209848114465, + "loss": 3.1687, + "step": 13757 + }, + { + "epoch": 0.5792842105263157, + "grad_norm": 0.42578125, + "learning_rate": 0.0001949291825940161, + "loss": 3.147, + "step": 13758 + }, + { + "epoch": 0.5793263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.0001948962677102132, + "loss": 3.055, + "step": 13759 + }, + { + "epoch": 0.5793684210526315, + "grad_norm": 0.427734375, + "learning_rate": 0.00019486335383033548, + "loss": 2.8733, + "step": 13760 + }, + { + "epoch": 0.5794105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.00019483044095498282, + "loss": 2.8951, + "step": 13761 + }, + { + "epoch": 0.5794526315789473, + "grad_norm": 0.46484375, + "learning_rate": 0.00019479752908475455, + "loss": 3.1999, + "step": 13762 + }, + { + "epoch": 0.5794947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00019476461822025057, + "loss": 3.1258, + "step": 13763 + }, + { + "epoch": 0.5795368421052631, + "grad_norm": 0.41796875, + "learning_rate": 0.00019473170836207024, + "loss": 3.0004, + "step": 13764 + }, + { + "epoch": 0.5795789473684211, + "grad_norm": 0.42578125, + "learning_rate": 0.00019469879951081322, + "loss": 3.4338, + "step": 13765 + }, + { + "epoch": 0.5796210526315789, + "grad_norm": 0.5078125, + "learning_rate": 0.00019466589166707924, + "loss": 3.0753, + "step": 13766 + }, + { + "epoch": 0.5796631578947369, + "grad_norm": 0.455078125, + "learning_rate": 0.00019463298483146757, + "loss": 2.815, + "step": 13767 + }, + { + "epoch": 0.5797052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00019460007900457793, + "loss": 3.2149, + "step": 13768 + }, + { + "epoch": 0.5797473684210527, + "grad_norm": 0.3984375, + "learning_rate": 0.0001945671741870097, + "loss": 3.1791, + "step": 13769 + }, + { + "epoch": 0.5797894736842105, + "grad_norm": 0.466796875, + "learning_rate": 0.00019453427037936257, + "loss": 3.16, + "step": 13770 + }, + { + "epoch": 0.5798315789473685, + "grad_norm": 0.4453125, + "learning_rate": 0.0001945013675822357, + "loss": 3.202, + "step": 13771 + }, + { + "epoch": 0.5798736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.0001944684657962287, + "loss": 2.7253, + "step": 13772 + }, + { + "epoch": 0.5799157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00019443556502194114, + "loss": 2.9746, + "step": 13773 + }, + { + "epoch": 0.5799578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00019440266525997218, + "loss": 2.7486, + "step": 13774 + }, + { + "epoch": 0.58, + "grad_norm": 0.431640625, + "learning_rate": 0.00019436976651092142, + "loss": 3.0587, + "step": 13775 + }, + { + "epoch": 0.5800421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.00019433686877538814, + "loss": 3.3073, + "step": 13776 + }, + { + "epoch": 0.5800842105263158, + "grad_norm": 0.462890625, + "learning_rate": 0.00019430397205397168, + "loss": 3.2184, + "step": 13777 + }, + { + "epoch": 0.5801263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00019427107634727137, + "loss": 3.1135, + "step": 13778 + }, + { + "epoch": 0.5801684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.00019423818165588657, + "loss": 3.0284, + "step": 13779 + }, + { + "epoch": 0.5802105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.0001942052879804167, + "loss": 3.1657, + "step": 13780 + }, + { + "epoch": 0.5802526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.00019417239532146084, + "loss": 2.7498, + "step": 13781 + }, + { + "epoch": 0.5802947368421053, + "grad_norm": 0.4296875, + "learning_rate": 0.0001941395036796184, + "loss": 3.1181, + "step": 13782 + }, + { + "epoch": 0.5803368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.0001941066130554885, + "loss": 3.1329, + "step": 13783 + }, + { + "epoch": 0.5803789473684211, + "grad_norm": 0.43359375, + "learning_rate": 0.0001940737234496705, + "loss": 2.8607, + "step": 13784 + }, + { + "epoch": 0.580421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00019404083486276345, + "loss": 2.8194, + "step": 13785 + }, + { + "epoch": 0.5804631578947368, + "grad_norm": 0.419921875, + "learning_rate": 0.00019400794729536666, + "loss": 2.9065, + "step": 13786 + }, + { + "epoch": 0.5805052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.00019397506074807938, + "loss": 2.9228, + "step": 13787 + }, + { + "epoch": 0.5805473684210526, + "grad_norm": 0.41015625, + "learning_rate": 0.0001939421752215006, + "loss": 3.3125, + "step": 13788 + }, + { + "epoch": 0.5805894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0001939092907162296, + "loss": 3.2819, + "step": 13789 + }, + { + "epoch": 0.5806315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00019387640723286532, + "loss": 3.1888, + "step": 13790 + }, + { + "epoch": 0.5806736842105263, + "grad_norm": 0.412109375, + "learning_rate": 0.00019384352477200698, + "loss": 3.503, + "step": 13791 + }, + { + "epoch": 0.5807157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.00019381064333425367, + "loss": 3.3177, + "step": 13792 + }, + { + "epoch": 0.5807578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00019377776292020435, + "loss": 2.8954, + "step": 13793 + }, + { + "epoch": 0.5808, + "grad_norm": 0.390625, + "learning_rate": 0.0001937448835304582, + "loss": 3.0454, + "step": 13794 + }, + { + "epoch": 0.5808421052631579, + "grad_norm": 0.48828125, + "learning_rate": 0.0001937120051656141, + "loss": 3.3597, + "step": 13795 + }, + { + "epoch": 0.5808842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00019367912782627126, + "loss": 2.838, + "step": 13796 + }, + { + "epoch": 0.5809263157894737, + "grad_norm": 0.4609375, + "learning_rate": 0.00019364625151302843, + "loss": 3.1315, + "step": 13797 + }, + { + "epoch": 0.5809684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.00019361337622648479, + "loss": 2.9543, + "step": 13798 + }, + { + "epoch": 0.5810105263157894, + "grad_norm": 0.44140625, + "learning_rate": 0.00019358050196723904, + "loss": 2.5844, + "step": 13799 + }, + { + "epoch": 0.5810526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.0001935476287358903, + "loss": 3.595, + "step": 13800 + }, + { + "epoch": 0.5810947368421052, + "grad_norm": 0.482421875, + "learning_rate": 0.00019351475653303746, + "loss": 3.1196, + "step": 13801 + }, + { + "epoch": 0.5811368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.00019348188535927937, + "loss": 3.3868, + "step": 13802 + }, + { + "epoch": 0.581178947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.000193449015215215, + "loss": 3.2094, + "step": 13803 + }, + { + "epoch": 0.581221052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.000193416146101443, + "loss": 2.9218, + "step": 13804 + }, + { + "epoch": 0.5812631578947368, + "grad_norm": 0.419921875, + "learning_rate": 0.00019338327801856237, + "loss": 2.6842, + "step": 13805 + }, + { + "epoch": 0.5813052631578948, + "grad_norm": 0.43359375, + "learning_rate": 0.00019335041096717198, + "loss": 3.0885, + "step": 13806 + }, + { + "epoch": 0.5813473684210526, + "grad_norm": 0.44921875, + "learning_rate": 0.00019331754494787047, + "loss": 2.9682, + "step": 13807 + }, + { + "epoch": 0.5813894736842106, + "grad_norm": 0.51171875, + "learning_rate": 0.00019328467996125676, + "loss": 3.0242, + "step": 13808 + }, + { + "epoch": 0.5814315789473684, + "grad_norm": 0.466796875, + "learning_rate": 0.00019325181600792946, + "loss": 2.9842, + "step": 13809 + }, + { + "epoch": 0.5814736842105264, + "grad_norm": 0.43359375, + "learning_rate": 0.00019321895308848744, + "loss": 2.784, + "step": 13810 + }, + { + "epoch": 0.5815157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.00019318609120352932, + "loss": 3.1663, + "step": 13811 + }, + { + "epoch": 0.581557894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00019315323035365388, + "loss": 3.3585, + "step": 13812 + }, + { + "epoch": 0.5816, + "grad_norm": 0.453125, + "learning_rate": 0.0001931203705394599, + "loss": 2.9644, + "step": 13813 + }, + { + "epoch": 0.5816421052631578, + "grad_norm": 0.416015625, + "learning_rate": 0.00019308751176154578, + "loss": 2.9935, + "step": 13814 + }, + { + "epoch": 0.5816842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.00019305465402051048, + "loss": 3.4496, + "step": 13815 + }, + { + "epoch": 0.5817263157894736, + "grad_norm": 0.42578125, + "learning_rate": 0.00019302179731695238, + "loss": 3.1577, + "step": 13816 + }, + { + "epoch": 0.5817684210526316, + "grad_norm": 0.447265625, + "learning_rate": 0.0001929889416514702, + "loss": 2.6438, + "step": 13817 + }, + { + "epoch": 0.5818105263157894, + "grad_norm": 0.435546875, + "learning_rate": 0.0001929560870246625, + "loss": 3.1945, + "step": 13818 + }, + { + "epoch": 0.5818526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.00019292323343712793, + "loss": 3.381, + "step": 13819 + }, + { + "epoch": 0.5818947368421052, + "grad_norm": 0.455078125, + "learning_rate": 0.00019289038088946494, + "loss": 3.0547, + "step": 13820 + }, + { + "epoch": 0.5819368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.00019285752938227207, + "loss": 3.0595, + "step": 13821 + }, + { + "epoch": 0.581978947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00019282467891614801, + "loss": 3.0542, + "step": 13822 + }, + { + "epoch": 0.582021052631579, + "grad_norm": 0.498046875, + "learning_rate": 0.00019279182949169098, + "loss": 2.9763, + "step": 13823 + }, + { + "epoch": 0.5820631578947368, + "grad_norm": 0.43359375, + "learning_rate": 0.00019275898110949975, + "loss": 3.2207, + "step": 13824 + }, + { + "epoch": 0.5821052631578948, + "grad_norm": 0.447265625, + "learning_rate": 0.0001927261337701725, + "loss": 2.6718, + "step": 13825 + }, + { + "epoch": 0.5821473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00019269328747430778, + "loss": 3.2863, + "step": 13826 + }, + { + "epoch": 0.5821894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00019266044222250413, + "loss": 3.1867, + "step": 13827 + }, + { + "epoch": 0.5822315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00019262759801535974, + "loss": 2.7265, + "step": 13828 + }, + { + "epoch": 0.5822736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00019259475485347325, + "loss": 3.2155, + "step": 13829 + }, + { + "epoch": 0.5823157894736842, + "grad_norm": 0.48046875, + "learning_rate": 0.00019256191273744276, + "loss": 3.4406, + "step": 13830 + }, + { + "epoch": 0.5823578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00019252907166786683, + "loss": 2.801, + "step": 13831 + }, + { + "epoch": 0.5824, + "grad_norm": 0.427734375, + "learning_rate": 0.00019249623164534357, + "loss": 3.2797, + "step": 13832 + }, + { + "epoch": 0.5824421052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.00019246339267047145, + "loss": 3.0109, + "step": 13833 + }, + { + "epoch": 0.5824842105263158, + "grad_norm": 0.439453125, + "learning_rate": 0.00019243055474384873, + "loss": 3.1749, + "step": 13834 + }, + { + "epoch": 0.5825263157894737, + "grad_norm": 0.44921875, + "learning_rate": 0.00019239771786607368, + "loss": 3.1003, + "step": 13835 + }, + { + "epoch": 0.5825684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.0001923648820377445, + "loss": 3.1046, + "step": 13836 + }, + { + "epoch": 0.5826105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.00019233204725945945, + "loss": 2.919, + "step": 13837 + }, + { + "epoch": 0.5826526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.00019229921353181682, + "loss": 2.955, + "step": 13838 + }, + { + "epoch": 0.5826947368421053, + "grad_norm": 0.42578125, + "learning_rate": 0.00019226638085541458, + "loss": 3.0983, + "step": 13839 + }, + { + "epoch": 0.5827368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.0001922335492308511, + "loss": 3.0569, + "step": 13840 + }, + { + "epoch": 0.5827789473684211, + "grad_norm": 0.443359375, + "learning_rate": 0.00019220071865872458, + "loss": 3.2637, + "step": 13841 + }, + { + "epoch": 0.5828210526315789, + "grad_norm": 0.447265625, + "learning_rate": 0.00019216788913963294, + "loss": 3.421, + "step": 13842 + }, + { + "epoch": 0.5828631578947369, + "grad_norm": 0.43359375, + "learning_rate": 0.00019213506067417447, + "loss": 2.7806, + "step": 13843 + }, + { + "epoch": 0.5829052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.00019210223326294714, + "loss": 3.4063, + "step": 13844 + }, + { + "epoch": 0.5829473684210527, + "grad_norm": 0.435546875, + "learning_rate": 0.0001920694069065492, + "loss": 3.3458, + "step": 13845 + }, + { + "epoch": 0.5829894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.00019203658160557848, + "loss": 3.3923, + "step": 13846 + }, + { + "epoch": 0.5830315789473685, + "grad_norm": 0.416015625, + "learning_rate": 0.00019200375736063316, + "loss": 3.1868, + "step": 13847 + }, + { + "epoch": 0.5830736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.00019197093417231136, + "loss": 3.0917, + "step": 13848 + }, + { + "epoch": 0.5831157894736843, + "grad_norm": 0.6015625, + "learning_rate": 0.0001919381120412108, + "loss": 3.1577, + "step": 13849 + }, + { + "epoch": 0.5831578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.0001919052909679297, + "loss": 2.9812, + "step": 13850 + }, + { + "epoch": 0.5832, + "grad_norm": 0.466796875, + "learning_rate": 0.00019187247095306598, + "loss": 3.6194, + "step": 13851 + }, + { + "epoch": 0.5832421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00019183965199721746, + "loss": 2.9838, + "step": 13852 + }, + { + "epoch": 0.5832842105263157, + "grad_norm": 0.41796875, + "learning_rate": 0.00019180683410098217, + "loss": 2.6956, + "step": 13853 + }, + { + "epoch": 0.5833263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.00019177401726495796, + "loss": 2.8155, + "step": 13854 + }, + { + "epoch": 0.5833684210526315, + "grad_norm": 0.41015625, + "learning_rate": 0.00019174120148974282, + "loss": 2.8291, + "step": 13855 + }, + { + "epoch": 0.5834105263157895, + "grad_norm": 0.439453125, + "learning_rate": 0.00019170838677593448, + "loss": 3.076, + "step": 13856 + }, + { + "epoch": 0.5834526315789473, + "grad_norm": 0.451171875, + "learning_rate": 0.00019167557312413093, + "loss": 3.3579, + "step": 13857 + }, + { + "epoch": 0.5834947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.0001916427605349298, + "loss": 2.6468, + "step": 13858 + }, + { + "epoch": 0.5835368421052631, + "grad_norm": 0.41015625, + "learning_rate": 0.000191609949008929, + "loss": 3.21, + "step": 13859 + }, + { + "epoch": 0.5835789473684211, + "grad_norm": 0.43359375, + "learning_rate": 0.0001915771385467264, + "loss": 3.4829, + "step": 13860 + }, + { + "epoch": 0.5836210526315789, + "grad_norm": 0.4296875, + "learning_rate": 0.0001915443291489196, + "loss": 2.8703, + "step": 13861 + }, + { + "epoch": 0.5836631578947369, + "grad_norm": 0.443359375, + "learning_rate": 0.00019151152081610655, + "loss": 3.3593, + "step": 13862 + }, + { + "epoch": 0.5837052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.0001914787135488848, + "loss": 3.2309, + "step": 13863 + }, + { + "epoch": 0.5837473684210527, + "grad_norm": 0.4375, + "learning_rate": 0.00019144590734785223, + "loss": 3.3431, + "step": 13864 + }, + { + "epoch": 0.5837894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.00019141310221360632, + "loss": 3.0165, + "step": 13865 + }, + { + "epoch": 0.5838315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.00019138029814674481, + "loss": 3.5003, + "step": 13866 + }, + { + "epoch": 0.5838736842105263, + "grad_norm": 0.470703125, + "learning_rate": 0.00019134749514786555, + "loss": 3.181, + "step": 13867 + }, + { + "epoch": 0.5839157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0001913146932175659, + "loss": 3.1319, + "step": 13868 + }, + { + "epoch": 0.5839578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0001912818923564436, + "loss": 2.715, + "step": 13869 + }, + { + "epoch": 0.584, + "grad_norm": 0.37890625, + "learning_rate": 0.0001912490925650962, + "loss": 3.1688, + "step": 13870 + }, + { + "epoch": 0.5840421052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.00019121629384412142, + "loss": 3.3224, + "step": 13871 + }, + { + "epoch": 0.5840842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.0001911834961941166, + "loss": 2.9578, + "step": 13872 + }, + { + "epoch": 0.5841263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.0001911506996156794, + "loss": 3.3652, + "step": 13873 + }, + { + "epoch": 0.5841684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.00019111790410940733, + "loss": 3.2564, + "step": 13874 + }, + { + "epoch": 0.5842105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.00019108510967589783, + "loss": 3.3644, + "step": 13875 + }, + { + "epoch": 0.5842526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00019105231631574843, + "loss": 3.3257, + "step": 13876 + }, + { + "epoch": 0.5842947368421053, + "grad_norm": 0.4375, + "learning_rate": 0.00019101952402955652, + "loss": 2.8879, + "step": 13877 + }, + { + "epoch": 0.5843368421052632, + "grad_norm": 0.44140625, + "learning_rate": 0.00019098673281791969, + "loss": 3.1453, + "step": 13878 + }, + { + "epoch": 0.584378947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.0001909539426814351, + "loss": 3.2128, + "step": 13879 + }, + { + "epoch": 0.584421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00019092115362070037, + "loss": 3.2869, + "step": 13880 + }, + { + "epoch": 0.5844631578947368, + "grad_norm": 0.470703125, + "learning_rate": 0.00019088836563631287, + "loss": 3.3193, + "step": 13881 + }, + { + "epoch": 0.5845052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.0001908555787288698, + "loss": 2.8254, + "step": 13882 + }, + { + "epoch": 0.5845473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.0001908227928989687, + "loss": 3.1346, + "step": 13883 + }, + { + "epoch": 0.5845894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.00019079000814720669, + "loss": 2.9287, + "step": 13884 + }, + { + "epoch": 0.5846315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.0001907572244741812, + "loss": 3.2555, + "step": 13885 + }, + { + "epoch": 0.5846736842105263, + "grad_norm": 0.455078125, + "learning_rate": 0.00019072444188048942, + "loss": 3.1583, + "step": 13886 + }, + { + "epoch": 0.5847157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00019069166036672866, + "loss": 3.4248, + "step": 13887 + }, + { + "epoch": 0.5847578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00019065887993349628, + "loss": 3.4637, + "step": 13888 + }, + { + "epoch": 0.5848, + "grad_norm": 0.431640625, + "learning_rate": 0.00019062610058138927, + "loss": 2.6711, + "step": 13889 + }, + { + "epoch": 0.5848421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00019059332231100506, + "loss": 3.3968, + "step": 13890 + }, + { + "epoch": 0.5848842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.0001905605451229406, + "loss": 3.2266, + "step": 13891 + }, + { + "epoch": 0.5849263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00019052776901779323, + "loss": 3.2517, + "step": 13892 + }, + { + "epoch": 0.5849684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00019049499399616, + "loss": 2.68, + "step": 13893 + }, + { + "epoch": 0.5850105263157894, + "grad_norm": 0.494140625, + "learning_rate": 0.00019046222005863808, + "loss": 3.0779, + "step": 13894 + }, + { + "epoch": 0.5850526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00019042944720582455, + "loss": 2.8453, + "step": 13895 + }, + { + "epoch": 0.5850947368421052, + "grad_norm": 0.41796875, + "learning_rate": 0.00019039667543831648, + "loss": 3.3318, + "step": 13896 + }, + { + "epoch": 0.5851368421052632, + "grad_norm": 0.443359375, + "learning_rate": 0.00019036390475671107, + "loss": 3.3269, + "step": 13897 + }, + { + "epoch": 0.585178947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0001903311351616051, + "loss": 3.3125, + "step": 13898 + }, + { + "epoch": 0.585221052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00019029836665359587, + "loss": 3.172, + "step": 13899 + }, + { + "epoch": 0.5852631578947368, + "grad_norm": 0.4453125, + "learning_rate": 0.00019026559923328014, + "loss": 2.8946, + "step": 13900 + }, + { + "epoch": 0.5853052631578948, + "grad_norm": 0.419921875, + "learning_rate": 0.00019023283290125497, + "loss": 3.0234, + "step": 13901 + }, + { + "epoch": 0.5853473684210526, + "grad_norm": 0.3828125, + "learning_rate": 0.0001902000676581175, + "loss": 2.5206, + "step": 13902 + }, + { + "epoch": 0.5853894736842106, + "grad_norm": 0.4375, + "learning_rate": 0.0001901673035044644, + "loss": 2.8556, + "step": 13903 + }, + { + "epoch": 0.5854315789473684, + "grad_norm": 0.5078125, + "learning_rate": 0.00019013454044089286, + "loss": 3.3084, + "step": 13904 + }, + { + "epoch": 0.5854736842105264, + "grad_norm": 0.458984375, + "learning_rate": 0.0001901017784679996, + "loss": 3.2, + "step": 13905 + }, + { + "epoch": 0.5855157894736842, + "grad_norm": 0.578125, + "learning_rate": 0.00019006901758638147, + "loss": 3.023, + "step": 13906 + }, + { + "epoch": 0.585557894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00019003625779663558, + "loss": 3.0901, + "step": 13907 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4296875, + "learning_rate": 0.00019000349909935853, + "loss": 3.022, + "step": 13908 + }, + { + "epoch": 0.5856421052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.00018997074149514732, + "loss": 2.5702, + "step": 13909 + }, + { + "epoch": 0.5856842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00018993798498459854, + "loss": 3.3927, + "step": 13910 + }, + { + "epoch": 0.5857263157894737, + "grad_norm": 0.44921875, + "learning_rate": 0.0001899052295683092, + "loss": 3.46, + "step": 13911 + }, + { + "epoch": 0.5857684210526316, + "grad_norm": 0.435546875, + "learning_rate": 0.00018987247524687587, + "loss": 3.0356, + "step": 13912 + }, + { + "epoch": 0.5858105263157894, + "grad_norm": 0.435546875, + "learning_rate": 0.0001898397220208954, + "loss": 3.3943, + "step": 13913 + }, + { + "epoch": 0.5858526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00018980696989096463, + "loss": 3.3657, + "step": 13914 + }, + { + "epoch": 0.5858947368421052, + "grad_norm": 0.4609375, + "learning_rate": 0.00018977421885768003, + "loss": 3.5954, + "step": 13915 + }, + { + "epoch": 0.5859368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.00018974146892163855, + "loss": 3.2644, + "step": 13916 + }, + { + "epoch": 0.585978947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00018970872008343653, + "loss": 3.1405, + "step": 13917 + }, + { + "epoch": 0.586021052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.0001896759723436709, + "loss": 3.1546, + "step": 13918 + }, + { + "epoch": 0.5860631578947368, + "grad_norm": 0.4140625, + "learning_rate": 0.0001896432257029381, + "loss": 3.055, + "step": 13919 + }, + { + "epoch": 0.5861052631578947, + "grad_norm": 0.458984375, + "learning_rate": 0.0001896104801618348, + "loss": 3.0267, + "step": 13920 + }, + { + "epoch": 0.5861473684210526, + "grad_norm": 0.41015625, + "learning_rate": 0.00018957773572095773, + "loss": 3.4306, + "step": 13921 + }, + { + "epoch": 0.5861894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.0001895449923809032, + "loss": 3.4683, + "step": 13922 + }, + { + "epoch": 0.5862315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.00018951225014226798, + "loss": 3.274, + "step": 13923 + }, + { + "epoch": 0.5862736842105263, + "grad_norm": 0.404296875, + "learning_rate": 0.0001894795090056484, + "loss": 2.6348, + "step": 13924 + }, + { + "epoch": 0.5863157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.00018944676897164115, + "loss": 3.2133, + "step": 13925 + }, + { + "epoch": 0.5863578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.0001894140300408425, + "loss": 3.1667, + "step": 13926 + }, + { + "epoch": 0.5864, + "grad_norm": 0.458984375, + "learning_rate": 0.0001893812922138491, + "loss": 2.7082, + "step": 13927 + }, + { + "epoch": 0.5864421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.0001893485554912573, + "loss": 2.8815, + "step": 13928 + }, + { + "epoch": 0.5864842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.00018931581987366355, + "loss": 3.5537, + "step": 13929 + }, + { + "epoch": 0.5865263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00018928308536166438, + "loss": 3.0808, + "step": 13930 + }, + { + "epoch": 0.5865684210526316, + "grad_norm": 0.44140625, + "learning_rate": 0.00018925035195585594, + "loss": 3.0665, + "step": 13931 + }, + { + "epoch": 0.5866105263157895, + "grad_norm": 0.470703125, + "learning_rate": 0.00018921761965683478, + "loss": 2.887, + "step": 13932 + }, + { + "epoch": 0.5866526315789473, + "grad_norm": 0.439453125, + "learning_rate": 0.0001891848884651971, + "loss": 2.6216, + "step": 13933 + }, + { + "epoch": 0.5866947368421053, + "grad_norm": 0.400390625, + "learning_rate": 0.00018915215838153932, + "loss": 2.9539, + "step": 13934 + }, + { + "epoch": 0.5867368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.00018911942940645773, + "loss": 2.6452, + "step": 13935 + }, + { + "epoch": 0.5867789473684211, + "grad_norm": 0.439453125, + "learning_rate": 0.00018908670154054862, + "loss": 3.0001, + "step": 13936 + }, + { + "epoch": 0.5868210526315789, + "grad_norm": 0.39453125, + "learning_rate": 0.00018905397478440823, + "loss": 2.9021, + "step": 13937 + }, + { + "epoch": 0.5868631578947369, + "grad_norm": 0.455078125, + "learning_rate": 0.00018902124913863278, + "loss": 3.0226, + "step": 13938 + }, + { + "epoch": 0.5869052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.0001889885246038186, + "loss": 2.8644, + "step": 13939 + }, + { + "epoch": 0.5869473684210527, + "grad_norm": 0.4140625, + "learning_rate": 0.00018895580118056174, + "loss": 2.5821, + "step": 13940 + }, + { + "epoch": 0.5869894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00018892307886945846, + "loss": 2.953, + "step": 13941 + }, + { + "epoch": 0.5870315789473685, + "grad_norm": 0.416015625, + "learning_rate": 0.000188890357671105, + "loss": 3.3639, + "step": 13942 + }, + { + "epoch": 0.5870736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00018885763758609732, + "loss": 3.2949, + "step": 13943 + }, + { + "epoch": 0.5871157894736843, + "grad_norm": 0.44140625, + "learning_rate": 0.0001888249186150317, + "loss": 2.6449, + "step": 13944 + }, + { + "epoch": 0.5871578947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.00018879220075850416, + "loss": 2.9494, + "step": 13945 + }, + { + "epoch": 0.5872, + "grad_norm": 0.4296875, + "learning_rate": 0.00018875948401711086, + "loss": 3.1981, + "step": 13946 + }, + { + "epoch": 0.5872421052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.00018872676839144773, + "loss": 3.3015, + "step": 13947 + }, + { + "epoch": 0.5872842105263157, + "grad_norm": 0.41015625, + "learning_rate": 0.0001886940538821109, + "loss": 3.6655, + "step": 13948 + }, + { + "epoch": 0.5873263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.00018866134048969646, + "loss": 3.2806, + "step": 13949 + }, + { + "epoch": 0.5873684210526315, + "grad_norm": 0.4140625, + "learning_rate": 0.00018862862821480023, + "loss": 3.1046, + "step": 13950 + }, + { + "epoch": 0.5874105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00018859591705801832, + "loss": 3.1942, + "step": 13951 + }, + { + "epoch": 0.5874526315789473, + "grad_norm": 0.40234375, + "learning_rate": 0.00018856320701994667, + "loss": 3.3292, + "step": 13952 + }, + { + "epoch": 0.5874947368421053, + "grad_norm": 0.42578125, + "learning_rate": 0.00018853049810118116, + "loss": 3.3935, + "step": 13953 + }, + { + "epoch": 0.5875368421052631, + "grad_norm": 0.44140625, + "learning_rate": 0.0001884977903023178, + "loss": 2.9382, + "step": 13954 + }, + { + "epoch": 0.5875789473684211, + "grad_norm": 0.4609375, + "learning_rate": 0.00018846508362395237, + "loss": 3.0085, + "step": 13955 + }, + { + "epoch": 0.5876210526315789, + "grad_norm": 0.4296875, + "learning_rate": 0.00018843237806668095, + "loss": 2.9271, + "step": 13956 + }, + { + "epoch": 0.5876631578947369, + "grad_norm": 0.384765625, + "learning_rate": 0.00018839967363109913, + "loss": 3.158, + "step": 13957 + }, + { + "epoch": 0.5877052631578947, + "grad_norm": 0.412109375, + "learning_rate": 0.000188366970317803, + "loss": 2.9561, + "step": 13958 + }, + { + "epoch": 0.5877473684210527, + "grad_norm": 0.54296875, + "learning_rate": 0.00018833426812738812, + "loss": 2.8339, + "step": 13959 + }, + { + "epoch": 0.5877894736842105, + "grad_norm": 0.439453125, + "learning_rate": 0.00018830156706045045, + "loss": 2.9765, + "step": 13960 + }, + { + "epoch": 0.5878315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00018826886711758578, + "loss": 2.9063, + "step": 13961 + }, + { + "epoch": 0.5878736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00018823616829938977, + "loss": 2.951, + "step": 13962 + }, + { + "epoch": 0.5879157894736842, + "grad_norm": 1.15625, + "learning_rate": 0.0001882034706064583, + "loss": 2.8705, + "step": 13963 + }, + { + "epoch": 0.5879578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.00018817077403938693, + "loss": 3.7721, + "step": 13964 + }, + { + "epoch": 0.588, + "grad_norm": 0.439453125, + "learning_rate": 0.00018813807859877147, + "loss": 3.269, + "step": 13965 + }, + { + "epoch": 0.5880421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00018810538428520743, + "loss": 3.1864, + "step": 13966 + }, + { + "epoch": 0.5880842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.00018807269109929057, + "loss": 3.3164, + "step": 13967 + }, + { + "epoch": 0.5881263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.0001880399990416166, + "loss": 3.0805, + "step": 13968 + }, + { + "epoch": 0.5881684210526316, + "grad_norm": 0.44140625, + "learning_rate": 0.000188007308112781, + "loss": 2.9917, + "step": 13969 + }, + { + "epoch": 0.5882105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.00018797461831337943, + "loss": 2.8359, + "step": 13970 + }, + { + "epoch": 0.5882526315789474, + "grad_norm": 0.4453125, + "learning_rate": 0.00018794192964400735, + "loss": 3.1658, + "step": 13971 + }, + { + "epoch": 0.5882947368421053, + "grad_norm": 0.4296875, + "learning_rate": 0.00018790924210526056, + "loss": 3.0243, + "step": 13972 + }, + { + "epoch": 0.5883368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.00018787655569773426, + "loss": 3.083, + "step": 13973 + }, + { + "epoch": 0.588378947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00018784387042202415, + "loss": 3.137, + "step": 13974 + }, + { + "epoch": 0.588421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.0001878111862787258, + "loss": 2.7689, + "step": 13975 + }, + { + "epoch": 0.5884631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.00018777850326843448, + "loss": 2.8201, + "step": 13976 + }, + { + "epoch": 0.5885052631578948, + "grad_norm": 0.416015625, + "learning_rate": 0.00018774582139174576, + "loss": 3.4679, + "step": 13977 + }, + { + "epoch": 0.5885473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.000187713140649255, + "loss": 3.1368, + "step": 13978 + }, + { + "epoch": 0.5885894736842106, + "grad_norm": 0.431640625, + "learning_rate": 0.00018768046104155768, + "loss": 3.3511, + "step": 13979 + }, + { + "epoch": 0.5886315789473684, + "grad_norm": 0.3984375, + "learning_rate": 0.00018764778256924908, + "loss": 2.6194, + "step": 13980 + }, + { + "epoch": 0.5886736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.00018761510523292459, + "loss": 3.1724, + "step": 13981 + }, + { + "epoch": 0.5887157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00018758242903317972, + "loss": 3.2743, + "step": 13982 + }, + { + "epoch": 0.5887578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00018754975397060951, + "loss": 3.3447, + "step": 13983 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4296875, + "learning_rate": 0.00018751708004580955, + "loss": 3.3171, + "step": 13984 + }, + { + "epoch": 0.5888421052631579, + "grad_norm": 0.470703125, + "learning_rate": 0.00018748440725937485, + "loss": 3.805, + "step": 13985 + }, + { + "epoch": 0.5888842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00018745173561190087, + "loss": 3.2026, + "step": 13986 + }, + { + "epoch": 0.5889263157894736, + "grad_norm": 0.4453125, + "learning_rate": 0.0001874190651039827, + "loss": 3.4327, + "step": 13987 + }, + { + "epoch": 0.5889684210526316, + "grad_norm": 0.46875, + "learning_rate": 0.00018738639573621563, + "loss": 2.8129, + "step": 13988 + }, + { + "epoch": 0.5890105263157894, + "grad_norm": 0.443359375, + "learning_rate": 0.000187353727509195, + "loss": 3.4215, + "step": 13989 + }, + { + "epoch": 0.5890526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.00018732106042351572, + "loss": 2.9689, + "step": 13990 + }, + { + "epoch": 0.5890947368421052, + "grad_norm": 0.4375, + "learning_rate": 0.0001872883944797732, + "loss": 3.0947, + "step": 13991 + }, + { + "epoch": 0.5891368421052632, + "grad_norm": 0.48046875, + "learning_rate": 0.0001872557296785623, + "loss": 2.9188, + "step": 13992 + }, + { + "epoch": 0.589178947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0001872230660204784, + "loss": 3.483, + "step": 13993 + }, + { + "epoch": 0.589221052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00018719040350611638, + "loss": 2.8715, + "step": 13994 + }, + { + "epoch": 0.5892631578947368, + "grad_norm": 0.427734375, + "learning_rate": 0.00018715774213607145, + "loss": 2.9801, + "step": 13995 + }, + { + "epoch": 0.5893052631578948, + "grad_norm": 0.4296875, + "learning_rate": 0.00018712508191093865, + "loss": 3.3026, + "step": 13996 + }, + { + "epoch": 0.5893473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.00018709242283131294, + "loss": 3.444, + "step": 13997 + }, + { + "epoch": 0.5893894736842106, + "grad_norm": 0.435546875, + "learning_rate": 0.00018705976489778947, + "loss": 3.209, + "step": 13998 + }, + { + "epoch": 0.5894315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.000187027108110963, + "loss": 3.3482, + "step": 13999 + }, + { + "epoch": 0.5894736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.00018699445247142866, + "loss": 3.333, + "step": 14000 + }, + { + "epoch": 0.5895157894736842, + "grad_norm": 0.455078125, + "learning_rate": 0.00018696179797978146, + "loss": 3.0912, + "step": 14001 + }, + { + "epoch": 0.5895578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.00018692914463661615, + "loss": 3.3171, + "step": 14002 + }, + { + "epoch": 0.5896, + "grad_norm": 0.423828125, + "learning_rate": 0.00018689649244252774, + "loss": 3.2943, + "step": 14003 + }, + { + "epoch": 0.5896421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00018686384139811105, + "loss": 2.6557, + "step": 14004 + }, + { + "epoch": 0.5896842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.0001868311915039611, + "loss": 3.171, + "step": 14005 + }, + { + "epoch": 0.5897263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.00018679854276067252, + "loss": 3.1507, + "step": 14006 + }, + { + "epoch": 0.5897684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.0001867658951688402, + "loss": 2.9782, + "step": 14007 + }, + { + "epoch": 0.5898105263157895, + "grad_norm": 0.4453125, + "learning_rate": 0.00018673324872905912, + "loss": 2.9733, + "step": 14008 + }, + { + "epoch": 0.5898526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.0001867006034419238, + "loss": 3.087, + "step": 14009 + }, + { + "epoch": 0.5898947368421052, + "grad_norm": 0.451171875, + "learning_rate": 0.00018666795930802915, + "loss": 3.3937, + "step": 14010 + }, + { + "epoch": 0.5899368421052632, + "grad_norm": 0.396484375, + "learning_rate": 0.00018663531632796989, + "loss": 2.9664, + "step": 14011 + }, + { + "epoch": 0.589978947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00018660267450234066, + "loss": 3.0335, + "step": 14012 + }, + { + "epoch": 0.590021052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0001865700338317362, + "loss": 2.774, + "step": 14013 + }, + { + "epoch": 0.5900631578947368, + "grad_norm": 0.427734375, + "learning_rate": 0.0001865373943167512, + "loss": 3.3908, + "step": 14014 + }, + { + "epoch": 0.5901052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.00018650475595798042, + "loss": 2.8827, + "step": 14015 + }, + { + "epoch": 0.5901473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.00018647211875601828, + "loss": 2.8737, + "step": 14016 + }, + { + "epoch": 0.5901894736842105, + "grad_norm": 0.443359375, + "learning_rate": 0.00018643948271145958, + "loss": 2.3997, + "step": 14017 + }, + { + "epoch": 0.5902315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.0001864068478248987, + "loss": 3.5256, + "step": 14018 + }, + { + "epoch": 0.5902736842105263, + "grad_norm": 0.45703125, + "learning_rate": 0.00018637421409693045, + "loss": 2.7102, + "step": 14019 + }, + { + "epoch": 0.5903157894736842, + "grad_norm": 0.396484375, + "learning_rate": 0.00018634158152814917, + "loss": 3.1982, + "step": 14020 + }, + { + "epoch": 0.5903578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.00018630895011914947, + "loss": 2.8601, + "step": 14021 + }, + { + "epoch": 0.5904, + "grad_norm": 0.427734375, + "learning_rate": 0.00018627631987052598, + "loss": 3.4766, + "step": 14022 + }, + { + "epoch": 0.5904421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00018624369078287296, + "loss": 2.8024, + "step": 14023 + }, + { + "epoch": 0.5904842105263158, + "grad_norm": 0.40234375, + "learning_rate": 0.0001862110628567851, + "loss": 2.7287, + "step": 14024 + }, + { + "epoch": 0.5905263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.00018617843609285663, + "loss": 3.0883, + "step": 14025 + }, + { + "epoch": 0.5905684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.00018614581049168218, + "loss": 3.0341, + "step": 14026 + }, + { + "epoch": 0.5906105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.00018611318605385585, + "loss": 3.2233, + "step": 14027 + }, + { + "epoch": 0.5906526315789473, + "grad_norm": 0.439453125, + "learning_rate": 0.0001860805627799723, + "loss": 3.2316, + "step": 14028 + }, + { + "epoch": 0.5906947368421053, + "grad_norm": 0.40625, + "learning_rate": 0.00018604794067062581, + "loss": 3.2186, + "step": 14029 + }, + { + "epoch": 0.5907368421052631, + "grad_norm": 0.453125, + "learning_rate": 0.00018601531972641064, + "loss": 3.1209, + "step": 14030 + }, + { + "epoch": 0.5907789473684211, + "grad_norm": 0.4296875, + "learning_rate": 0.00018598269994792127, + "loss": 3.4186, + "step": 14031 + }, + { + "epoch": 0.5908210526315789, + "grad_norm": 0.419921875, + "learning_rate": 0.00018595008133575182, + "loss": 2.9376, + "step": 14032 + }, + { + "epoch": 0.5908631578947369, + "grad_norm": 0.40625, + "learning_rate": 0.0001859174638904967, + "loss": 3.4776, + "step": 14033 + }, + { + "epoch": 0.5909052631578947, + "grad_norm": 0.400390625, + "learning_rate": 0.00018588484761275, + "loss": 2.7281, + "step": 14034 + }, + { + "epoch": 0.5909473684210527, + "grad_norm": 0.416015625, + "learning_rate": 0.00018585223250310606, + "loss": 3.1854, + "step": 14035 + }, + { + "epoch": 0.5909894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.0001858196185621591, + "loss": 2.9249, + "step": 14036 + }, + { + "epoch": 0.5910315789473685, + "grad_norm": 0.453125, + "learning_rate": 0.00018578700579050322, + "loss": 3.1483, + "step": 14037 + }, + { + "epoch": 0.5910736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.0001857543941887328, + "loss": 3.0011, + "step": 14038 + }, + { + "epoch": 0.5911157894736843, + "grad_norm": 0.419921875, + "learning_rate": 0.0001857217837574417, + "loss": 3.1335, + "step": 14039 + }, + { + "epoch": 0.5911578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.0001856891744972243, + "loss": 3.5151, + "step": 14040 + }, + { + "epoch": 0.5912, + "grad_norm": 0.412109375, + "learning_rate": 0.00018565656640867447, + "loss": 3.0257, + "step": 14041 + }, + { + "epoch": 0.5912421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00018562395949238636, + "loss": 3.2532, + "step": 14042 + }, + { + "epoch": 0.5912842105263157, + "grad_norm": 0.4375, + "learning_rate": 0.0001855913537489542, + "loss": 3.2271, + "step": 14043 + }, + { + "epoch": 0.5913263157894737, + "grad_norm": 0.5078125, + "learning_rate": 0.0001855587491789718, + "loss": 3.1459, + "step": 14044 + }, + { + "epoch": 0.5913684210526315, + "grad_norm": 0.439453125, + "learning_rate": 0.00018552614578303328, + "loss": 2.7192, + "step": 14045 + }, + { + "epoch": 0.5914105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.00018549354356173266, + "loss": 2.8455, + "step": 14046 + }, + { + "epoch": 0.5914526315789473, + "grad_norm": 0.40625, + "learning_rate": 0.00018546094251566393, + "loss": 2.9592, + "step": 14047 + }, + { + "epoch": 0.5914947368421053, + "grad_norm": 0.3984375, + "learning_rate": 0.00018542834264542092, + "loss": 3.1317, + "step": 14048 + }, + { + "epoch": 0.5915368421052631, + "grad_norm": 0.44921875, + "learning_rate": 0.0001853957439515976, + "loss": 3.6409, + "step": 14049 + }, + { + "epoch": 0.5915789473684211, + "grad_norm": 0.5234375, + "learning_rate": 0.0001853631464347881, + "loss": 2.8056, + "step": 14050 + }, + { + "epoch": 0.5916210526315789, + "grad_norm": 0.427734375, + "learning_rate": 0.0001853305500955859, + "loss": 3.4607, + "step": 14051 + }, + { + "epoch": 0.5916631578947369, + "grad_norm": 0.439453125, + "learning_rate": 0.00018529795493458522, + "loss": 3.1425, + "step": 14052 + }, + { + "epoch": 0.5917052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.00018526536095237974, + "loss": 3.2086, + "step": 14053 + }, + { + "epoch": 0.5917473684210526, + "grad_norm": 0.44140625, + "learning_rate": 0.00018523276814956337, + "loss": 3.8333, + "step": 14054 + }, + { + "epoch": 0.5917894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00018520017652672984, + "loss": 3.4118, + "step": 14055 + }, + { + "epoch": 0.5918315789473684, + "grad_norm": 0.431640625, + "learning_rate": 0.00018516758608447292, + "loss": 2.9002, + "step": 14056 + }, + { + "epoch": 0.5918736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.00018513499682338652, + "loss": 2.9023, + "step": 14057 + }, + { + "epoch": 0.5919157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.00018510240874406415, + "loss": 3.3603, + "step": 14058 + }, + { + "epoch": 0.5919578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.00018506982184709976, + "loss": 3.4403, + "step": 14059 + }, + { + "epoch": 0.592, + "grad_norm": 0.427734375, + "learning_rate": 0.0001850372361330868, + "loss": 3.006, + "step": 14060 + }, + { + "epoch": 0.5920421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.0001850046516026191, + "loss": 3.1424, + "step": 14061 + }, + { + "epoch": 0.5920842105263158, + "grad_norm": 0.46875, + "learning_rate": 0.0001849720682562903, + "loss": 2.6964, + "step": 14062 + }, + { + "epoch": 0.5921263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.00018493948609469397, + "loss": 3.0775, + "step": 14063 + }, + { + "epoch": 0.5921684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.00018490690511842388, + "loss": 2.7162, + "step": 14064 + }, + { + "epoch": 0.5922105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.00018487432532807336, + "loss": 3.132, + "step": 14065 + }, + { + "epoch": 0.5922526315789474, + "grad_norm": 0.40625, + "learning_rate": 0.00018484174672423625, + "loss": 2.975, + "step": 14066 + }, + { + "epoch": 0.5922947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.00018480916930750587, + "loss": 3.3118, + "step": 14067 + }, + { + "epoch": 0.5923368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.00018477659307847578, + "loss": 3.5919, + "step": 14068 + }, + { + "epoch": 0.592378947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00018474401803773966, + "loss": 3.3821, + "step": 14069 + }, + { + "epoch": 0.592421052631579, + "grad_norm": 0.455078125, + "learning_rate": 0.00018471144418589073, + "loss": 3.0293, + "step": 14070 + }, + { + "epoch": 0.5924631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.00018467887152352264, + "loss": 3.1185, + "step": 14071 + }, + { + "epoch": 0.5925052631578948, + "grad_norm": 0.4140625, + "learning_rate": 0.00018464630005122874, + "loss": 3.027, + "step": 14072 + }, + { + "epoch": 0.5925473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.00018461372976960252, + "loss": 3.3056, + "step": 14073 + }, + { + "epoch": 0.5925894736842106, + "grad_norm": 0.4453125, + "learning_rate": 0.00018458116067923724, + "loss": 2.9311, + "step": 14074 + }, + { + "epoch": 0.5926315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00018454859278072637, + "loss": 3.242, + "step": 14075 + }, + { + "epoch": 0.5926736842105264, + "grad_norm": 0.40234375, + "learning_rate": 0.0001845160260746633, + "loss": 3.3032, + "step": 14076 + }, + { + "epoch": 0.5927157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00018448346056164122, + "loss": 3.4215, + "step": 14077 + }, + { + "epoch": 0.5927578947368422, + "grad_norm": 0.416015625, + "learning_rate": 0.00018445089624225352, + "loss": 2.9424, + "step": 14078 + }, + { + "epoch": 0.5928, + "grad_norm": 0.423828125, + "learning_rate": 0.0001844183331170935, + "loss": 2.9477, + "step": 14079 + }, + { + "epoch": 0.592842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.00018438577118675442, + "loss": 3.2466, + "step": 14080 + }, + { + "epoch": 0.5928842105263158, + "grad_norm": 0.5, + "learning_rate": 0.00018435321045182944, + "loss": 2.7994, + "step": 14081 + }, + { + "epoch": 0.5929263157894736, + "grad_norm": 0.400390625, + "learning_rate": 0.0001843206509129118, + "loss": 2.945, + "step": 14082 + }, + { + "epoch": 0.5929684210526316, + "grad_norm": 0.4609375, + "learning_rate": 0.00018428809257059486, + "loss": 2.9767, + "step": 14083 + }, + { + "epoch": 0.5930105263157894, + "grad_norm": 0.416015625, + "learning_rate": 0.00018425553542547154, + "loss": 3.0225, + "step": 14084 + }, + { + "epoch": 0.5930526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.00018422297947813526, + "loss": 3.1694, + "step": 14085 + }, + { + "epoch": 0.5930947368421052, + "grad_norm": 0.3984375, + "learning_rate": 0.00018419042472917884, + "loss": 2.7224, + "step": 14086 + }, + { + "epoch": 0.5931368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.00018415787117919568, + "loss": 2.8289, + "step": 14087 + }, + { + "epoch": 0.593178947368421, + "grad_norm": 0.462890625, + "learning_rate": 0.00018412531882877863, + "loss": 3.5368, + "step": 14088 + }, + { + "epoch": 0.593221052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00018409276767852094, + "loss": 3.0918, + "step": 14089 + }, + { + "epoch": 0.5932631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.00018406021772901567, + "loss": 2.9173, + "step": 14090 + }, + { + "epoch": 0.5933052631578948, + "grad_norm": 0.447265625, + "learning_rate": 0.0001840276689808556, + "loss": 3.6685, + "step": 14091 + }, + { + "epoch": 0.5933473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.00018399512143463403, + "loss": 3.4018, + "step": 14092 + }, + { + "epoch": 0.5933894736842106, + "grad_norm": 0.5078125, + "learning_rate": 0.00018396257509094372, + "loss": 3.2447, + "step": 14093 + }, + { + "epoch": 0.5934315789473684, + "grad_norm": 0.46484375, + "learning_rate": 0.00018393002995037772, + "loss": 3.2238, + "step": 14094 + }, + { + "epoch": 0.5934736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.0001838974860135289, + "loss": 2.8675, + "step": 14095 + }, + { + "epoch": 0.5935157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.00018386494328099024, + "loss": 2.8125, + "step": 14096 + }, + { + "epoch": 0.5935578947368421, + "grad_norm": 0.404296875, + "learning_rate": 0.00018383240175335464, + "loss": 2.7587, + "step": 14097 + }, + { + "epoch": 0.5936, + "grad_norm": 0.419921875, + "learning_rate": 0.0001837998614312149, + "loss": 2.9503, + "step": 14098 + }, + { + "epoch": 0.5936421052631579, + "grad_norm": 0.4609375, + "learning_rate": 0.00018376732231516397, + "loss": 2.7711, + "step": 14099 + }, + { + "epoch": 0.5936842105263158, + "grad_norm": 0.45703125, + "learning_rate": 0.0001837347844057945, + "loss": 2.7605, + "step": 14100 + }, + { + "epoch": 0.5937263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.00018370224770369943, + "loss": 3.0096, + "step": 14101 + }, + { + "epoch": 0.5937684210526316, + "grad_norm": 0.404296875, + "learning_rate": 0.0001836697122094716, + "loss": 3.3535, + "step": 14102 + }, + { + "epoch": 0.5938105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.00018363717792370358, + "loss": 3.5111, + "step": 14103 + }, + { + "epoch": 0.5938526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.00018360464484698825, + "loss": 3.2068, + "step": 14104 + }, + { + "epoch": 0.5938947368421053, + "grad_norm": 0.39453125, + "learning_rate": 0.0001835721129799182, + "loss": 3.2665, + "step": 14105 + }, + { + "epoch": 0.5939368421052632, + "grad_norm": 0.4453125, + "learning_rate": 0.00018353958232308638, + "loss": 3.4318, + "step": 14106 + }, + { + "epoch": 0.593978947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00018350705287708513, + "loss": 3.0891, + "step": 14107 + }, + { + "epoch": 0.5940210526315789, + "grad_norm": 0.421875, + "learning_rate": 0.00018347452464250724, + "loss": 3.3439, + "step": 14108 + }, + { + "epoch": 0.5940631578947368, + "grad_norm": 0.419921875, + "learning_rate": 0.00018344199761994546, + "loss": 2.7869, + "step": 14109 + }, + { + "epoch": 0.5941052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0001834094718099922, + "loss": 3.3625, + "step": 14110 + }, + { + "epoch": 0.5941473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.00018337694721324018, + "loss": 3.3111, + "step": 14111 + }, + { + "epoch": 0.5941894736842105, + "grad_norm": 0.439453125, + "learning_rate": 0.00018334442383028185, + "loss": 3.2014, + "step": 14112 + }, + { + "epoch": 0.5942315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.00018331190166170983, + "loss": 2.9869, + "step": 14113 + }, + { + "epoch": 0.5942736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00018327938070811657, + "loss": 3.1566, + "step": 14114 + }, + { + "epoch": 0.5943157894736842, + "grad_norm": 0.451171875, + "learning_rate": 0.00018324686097009456, + "loss": 3.0855, + "step": 14115 + }, + { + "epoch": 0.5943578947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.00018321434244823645, + "loss": 2.6995, + "step": 14116 + }, + { + "epoch": 0.5944, + "grad_norm": 0.51171875, + "learning_rate": 0.00018318182514313444, + "loss": 3.3796, + "step": 14117 + }, + { + "epoch": 0.5944421052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.00018314930905538119, + "loss": 2.9373, + "step": 14118 + }, + { + "epoch": 0.5944842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00018311679418556883, + "loss": 2.9837, + "step": 14119 + }, + { + "epoch": 0.5945263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.00018308428053429, + "loss": 2.9227, + "step": 14120 + }, + { + "epoch": 0.5945684210526315, + "grad_norm": 0.48046875, + "learning_rate": 0.00018305176810213687, + "loss": 2.7834, + "step": 14121 + }, + { + "epoch": 0.5946105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.00018301925688970187, + "loss": 2.7403, + "step": 14122 + }, + { + "epoch": 0.5946526315789473, + "grad_norm": 0.41796875, + "learning_rate": 0.00018298674689757745, + "loss": 2.8032, + "step": 14123 + }, + { + "epoch": 0.5946947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00018295423812635559, + "loss": 2.873, + "step": 14124 + }, + { + "epoch": 0.5947368421052631, + "grad_norm": 0.455078125, + "learning_rate": 0.0001829217305766289, + "loss": 3.2694, + "step": 14125 + }, + { + "epoch": 0.5947789473684211, + "grad_norm": 0.46484375, + "learning_rate": 0.00018288922424898932, + "loss": 3.4606, + "step": 14126 + }, + { + "epoch": 0.5948210526315789, + "grad_norm": 0.447265625, + "learning_rate": 0.0001828567191440293, + "loss": 2.9249, + "step": 14127 + }, + { + "epoch": 0.5948631578947369, + "grad_norm": 0.42578125, + "learning_rate": 0.00018282421526234093, + "loss": 2.5941, + "step": 14128 + }, + { + "epoch": 0.5949052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.00018279171260451643, + "loss": 3.0786, + "step": 14129 + }, + { + "epoch": 0.5949473684210527, + "grad_norm": 0.392578125, + "learning_rate": 0.000182759211171148, + "loss": 3.0047, + "step": 14130 + }, + { + "epoch": 0.5949894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.00018272671096282767, + "loss": 3.0047, + "step": 14131 + }, + { + "epoch": 0.5950315789473685, + "grad_norm": 0.40234375, + "learning_rate": 0.00018269421198014775, + "loss": 2.9246, + "step": 14132 + }, + { + "epoch": 0.5950736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.0001826617142237001, + "loss": 3.2797, + "step": 14133 + }, + { + "epoch": 0.5951157894736843, + "grad_norm": 0.4296875, + "learning_rate": 0.00018262921769407697, + "loss": 3.1566, + "step": 14134 + }, + { + "epoch": 0.5951578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00018259672239187024, + "loss": 2.2889, + "step": 14135 + }, + { + "epoch": 0.5952, + "grad_norm": 0.431640625, + "learning_rate": 0.00018256422831767206, + "loss": 3.2808, + "step": 14136 + }, + { + "epoch": 0.5952421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.0001825317354720744, + "loss": 2.9301, + "step": 14137 + }, + { + "epoch": 0.5952842105263157, + "grad_norm": 0.4453125, + "learning_rate": 0.00018249924385566928, + "loss": 3.2682, + "step": 14138 + }, + { + "epoch": 0.5953263157894737, + "grad_norm": 0.39453125, + "learning_rate": 0.00018246675346904868, + "loss": 3.2212, + "step": 14139 + }, + { + "epoch": 0.5953684210526315, + "grad_norm": 0.447265625, + "learning_rate": 0.00018243426431280437, + "loss": 3.192, + "step": 14140 + }, + { + "epoch": 0.5954105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.00018240177638752852, + "loss": 3.4714, + "step": 14141 + }, + { + "epoch": 0.5954526315789473, + "grad_norm": 0.42578125, + "learning_rate": 0.00018236928969381274, + "loss": 3.3307, + "step": 14142 + }, + { + "epoch": 0.5954947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00018233680423224906, + "loss": 3.0748, + "step": 14143 + }, + { + "epoch": 0.5955368421052631, + "grad_norm": 0.423828125, + "learning_rate": 0.0001823043200034294, + "loss": 3.4481, + "step": 14144 + }, + { + "epoch": 0.5955789473684211, + "grad_norm": 0.43359375, + "learning_rate": 0.0001822718370079454, + "loss": 2.7483, + "step": 14145 + }, + { + "epoch": 0.5956210526315789, + "grad_norm": 0.5, + "learning_rate": 0.00018223935524638897, + "loss": 2.9262, + "step": 14146 + }, + { + "epoch": 0.5956631578947369, + "grad_norm": 0.427734375, + "learning_rate": 0.00018220687471935185, + "loss": 2.971, + "step": 14147 + }, + { + "epoch": 0.5957052631578947, + "grad_norm": 0.41015625, + "learning_rate": 0.0001821743954274258, + "loss": 3.0555, + "step": 14148 + }, + { + "epoch": 0.5957473684210526, + "grad_norm": 0.44140625, + "learning_rate": 0.00018214191737120271, + "loss": 2.9495, + "step": 14149 + }, + { + "epoch": 0.5957894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.00018210944055127405, + "loss": 2.8069, + "step": 14150 + }, + { + "epoch": 0.5958315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.0001820769649682317, + "loss": 3.209, + "step": 14151 + }, + { + "epoch": 0.5958736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.0001820444906226671, + "loss": 3.172, + "step": 14152 + }, + { + "epoch": 0.5959157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00018201201751517215, + "loss": 3.3699, + "step": 14153 + }, + { + "epoch": 0.5959578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00018197954564633828, + "loss": 2.7971, + "step": 14154 + }, + { + "epoch": 0.596, + "grad_norm": 0.447265625, + "learning_rate": 0.00018194707501675722, + "loss": 3.0257, + "step": 14155 + }, + { + "epoch": 0.5960421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0001819146056270205, + "loss": 3.318, + "step": 14156 + }, + { + "epoch": 0.5960842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.00018188213747771963, + "loss": 2.9438, + "step": 14157 + }, + { + "epoch": 0.5961263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00018184967056944623, + "loss": 3.0333, + "step": 14158 + }, + { + "epoch": 0.5961684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.00018181720490279173, + "loss": 3.274, + "step": 14159 + }, + { + "epoch": 0.5962105263157895, + "grad_norm": 0.46875, + "learning_rate": 0.0001817847404783477, + "loss": 3.0457, + "step": 14160 + }, + { + "epoch": 0.5962526315789474, + "grad_norm": 0.39453125, + "learning_rate": 0.00018175227729670546, + "loss": 2.7163, + "step": 14161 + }, + { + "epoch": 0.5962947368421052, + "grad_norm": 0.4375, + "learning_rate": 0.00018171981535845656, + "loss": 3.4618, + "step": 14162 + }, + { + "epoch": 0.5963368421052632, + "grad_norm": 0.44921875, + "learning_rate": 0.0001816873546641924, + "loss": 3.4997, + "step": 14163 + }, + { + "epoch": 0.596378947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00018165489521450434, + "loss": 3.3044, + "step": 14164 + }, + { + "epoch": 0.596421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0001816224370099839, + "loss": 3.0795, + "step": 14165 + }, + { + "epoch": 0.5964631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.00018158998005122224, + "loss": 3.0373, + "step": 14166 + }, + { + "epoch": 0.5965052631578948, + "grad_norm": 0.431640625, + "learning_rate": 0.00018155752433881084, + "loss": 3.0506, + "step": 14167 + }, + { + "epoch": 0.5965473684210526, + "grad_norm": 0.43359375, + "learning_rate": 0.00018152506987334082, + "loss": 3.4617, + "step": 14168 + }, + { + "epoch": 0.5965894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.00018149261665540356, + "loss": 2.9665, + "step": 14169 + }, + { + "epoch": 0.5966315789473684, + "grad_norm": 0.404296875, + "learning_rate": 0.0001814601646855904, + "loss": 3.1314, + "step": 14170 + }, + { + "epoch": 0.5966736842105264, + "grad_norm": 0.439453125, + "learning_rate": 0.00018142771396449254, + "loss": 3.0709, + "step": 14171 + }, + { + "epoch": 0.5967157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.00018139526449270112, + "loss": 3.292, + "step": 14172 + }, + { + "epoch": 0.5967578947368422, + "grad_norm": 0.439453125, + "learning_rate": 0.00018136281627080734, + "loss": 3.0269, + "step": 14173 + }, + { + "epoch": 0.5968, + "grad_norm": 0.443359375, + "learning_rate": 0.0001813303692994025, + "loss": 3.3575, + "step": 14174 + }, + { + "epoch": 0.5968421052631578, + "grad_norm": 0.42578125, + "learning_rate": 0.00018129792357907755, + "loss": 3.191, + "step": 14175 + }, + { + "epoch": 0.5968842105263158, + "grad_norm": 0.466796875, + "learning_rate": 0.0001812654791104237, + "loss": 3.1151, + "step": 14176 + }, + { + "epoch": 0.5969263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.00018123303589403218, + "loss": 2.8759, + "step": 14177 + }, + { + "epoch": 0.5969684210526316, + "grad_norm": 0.455078125, + "learning_rate": 0.00018120059393049383, + "loss": 2.8783, + "step": 14178 + }, + { + "epoch": 0.5970105263157894, + "grad_norm": 0.431640625, + "learning_rate": 0.0001811681532203999, + "loss": 3.4004, + "step": 14179 + }, + { + "epoch": 0.5970526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.00018113571376434127, + "loss": 3.1573, + "step": 14180 + }, + { + "epoch": 0.5970947368421052, + "grad_norm": 0.404296875, + "learning_rate": 0.0001811032755629091, + "loss": 2.8983, + "step": 14181 + }, + { + "epoch": 0.5971368421052632, + "grad_norm": 0.46484375, + "learning_rate": 0.00018107083861669426, + "loss": 3.4293, + "step": 14182 + }, + { + "epoch": 0.597178947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.0001810384029262877, + "loss": 3.0085, + "step": 14183 + }, + { + "epoch": 0.597221052631579, + "grad_norm": 0.470703125, + "learning_rate": 0.00018100596849228055, + "loss": 3.0895, + "step": 14184 + }, + { + "epoch": 0.5972631578947368, + "grad_norm": 0.427734375, + "learning_rate": 0.00018097353531526346, + "loss": 3.0738, + "step": 14185 + }, + { + "epoch": 0.5973052631578948, + "grad_norm": 0.458984375, + "learning_rate": 0.00018094110339582757, + "loss": 3.2546, + "step": 14186 + }, + { + "epoch": 0.5973473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.00018090867273456353, + "loss": 3.2452, + "step": 14187 + }, + { + "epoch": 0.5973894736842106, + "grad_norm": 0.59765625, + "learning_rate": 0.00018087624333206233, + "loss": 3.3662, + "step": 14188 + }, + { + "epoch": 0.5974315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.00018084381518891475, + "loss": 2.9624, + "step": 14189 + }, + { + "epoch": 0.5974736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00018081138830571152, + "loss": 3.3758, + "step": 14190 + }, + { + "epoch": 0.5975157894736842, + "grad_norm": 0.40625, + "learning_rate": 0.0001807789626830437, + "loss": 3.2225, + "step": 14191 + }, + { + "epoch": 0.5975578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00018074653832150168, + "loss": 3.0898, + "step": 14192 + }, + { + "epoch": 0.5976, + "grad_norm": 0.41796875, + "learning_rate": 0.00018071411522167652, + "loss": 3.403, + "step": 14193 + }, + { + "epoch": 0.5976421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.00018068169338415863, + "loss": 2.8114, + "step": 14194 + }, + { + "epoch": 0.5976842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.00018064927280953891, + "loss": 3.4682, + "step": 14195 + }, + { + "epoch": 0.5977263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.0001806168534984079, + "loss": 3.3691, + "step": 14196 + }, + { + "epoch": 0.5977684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.00018058443545135628, + "loss": 2.8175, + "step": 14197 + }, + { + "epoch": 0.5978105263157895, + "grad_norm": 0.4453125, + "learning_rate": 0.00018055201866897482, + "loss": 3.1892, + "step": 14198 + }, + { + "epoch": 0.5978526315789474, + "grad_norm": 0.4375, + "learning_rate": 0.00018051960315185384, + "loss": 3.3063, + "step": 14199 + }, + { + "epoch": 0.5978947368421053, + "grad_norm": 0.447265625, + "learning_rate": 0.00018048718890058418, + "loss": 3.1537, + "step": 14200 + }, + { + "epoch": 0.5979368421052632, + "grad_norm": 0.4375, + "learning_rate": 0.00018045477591575616, + "loss": 3.1764, + "step": 14201 + }, + { + "epoch": 0.597978947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0001804223641979604, + "loss": 3.09, + "step": 14202 + }, + { + "epoch": 0.5980210526315789, + "grad_norm": 0.4453125, + "learning_rate": 0.00018038995374778754, + "loss": 3.3642, + "step": 14203 + }, + { + "epoch": 0.5980631578947369, + "grad_norm": 0.404296875, + "learning_rate": 0.0001803575445658278, + "loss": 3.1946, + "step": 14204 + }, + { + "epoch": 0.5981052631578947, + "grad_norm": 0.404296875, + "learning_rate": 0.00018032513665267185, + "loss": 2.9327, + "step": 14205 + }, + { + "epoch": 0.5981473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.00018029273000891, + "loss": 3.1497, + "step": 14206 + }, + { + "epoch": 0.5981894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00018026032463513278, + "loss": 2.7971, + "step": 14207 + }, + { + "epoch": 0.5982315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.00018022792053193042, + "loss": 2.958, + "step": 14208 + }, + { + "epoch": 0.5982736842105263, + "grad_norm": 0.44140625, + "learning_rate": 0.00018019551769989336, + "loss": 3.3272, + "step": 14209 + }, + { + "epoch": 0.5983157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.00018016311613961206, + "loss": 2.4949, + "step": 14210 + }, + { + "epoch": 0.5983578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00018013071585167663, + "loss": 2.9405, + "step": 14211 + }, + { + "epoch": 0.5984, + "grad_norm": 0.45703125, + "learning_rate": 0.0001800983168366775, + "loss": 3.173, + "step": 14212 + }, + { + "epoch": 0.5984421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.00018006591909520487, + "loss": 3.0187, + "step": 14213 + }, + { + "epoch": 0.5984842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.00018003352262784914, + "loss": 3.0295, + "step": 14214 + }, + { + "epoch": 0.5985263157894737, + "grad_norm": 0.451171875, + "learning_rate": 0.0001800011274352003, + "loss": 3.4337, + "step": 14215 + }, + { + "epoch": 0.5985684210526315, + "grad_norm": 0.4140625, + "learning_rate": 0.00017996873351784864, + "loss": 2.989, + "step": 14216 + }, + { + "epoch": 0.5986105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.00017993634087638455, + "loss": 3.1541, + "step": 14217 + }, + { + "epoch": 0.5986526315789473, + "grad_norm": 0.453125, + "learning_rate": 0.00017990394951139784, + "loss": 3.0405, + "step": 14218 + }, + { + "epoch": 0.5986947368421053, + "grad_norm": 0.41796875, + "learning_rate": 0.00017987155942347894, + "loss": 3.3537, + "step": 14219 + }, + { + "epoch": 0.5987368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.00017983917061321775, + "loss": 3.0125, + "step": 14220 + }, + { + "epoch": 0.5987789473684211, + "grad_norm": 0.49609375, + "learning_rate": 0.00017980678308120447, + "loss": 3.3385, + "step": 14221 + }, + { + "epoch": 0.5988210526315789, + "grad_norm": 0.453125, + "learning_rate": 0.00017977439682802905, + "loss": 3.2128, + "step": 14222 + }, + { + "epoch": 0.5988631578947369, + "grad_norm": 0.427734375, + "learning_rate": 0.00017974201185428162, + "loss": 3.344, + "step": 14223 + }, + { + "epoch": 0.5989052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.0001797096281605523, + "loss": 3.4571, + "step": 14224 + }, + { + "epoch": 0.5989473684210527, + "grad_norm": 0.53515625, + "learning_rate": 0.00017967724574743084, + "loss": 3.192, + "step": 14225 + }, + { + "epoch": 0.5989894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00017964486461550746, + "loss": 3.0205, + "step": 14226 + }, + { + "epoch": 0.5990315789473685, + "grad_norm": 0.490234375, + "learning_rate": 0.0001796124847653719, + "loss": 3.0975, + "step": 14227 + }, + { + "epoch": 0.5990736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00017958010619761416, + "loss": 2.7796, + "step": 14228 + }, + { + "epoch": 0.5991157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.0001795477289128242, + "loss": 2.9655, + "step": 14229 + }, + { + "epoch": 0.5991578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00017951535291159176, + "loss": 2.9468, + "step": 14230 + }, + { + "epoch": 0.5992, + "grad_norm": 0.4296875, + "learning_rate": 0.00017948297819450685, + "loss": 2.9989, + "step": 14231 + }, + { + "epoch": 0.5992421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00017945060476215913, + "loss": 3.1823, + "step": 14232 + }, + { + "epoch": 0.5992842105263158, + "grad_norm": 0.4609375, + "learning_rate": 0.00017941823261513864, + "loss": 3.3522, + "step": 14233 + }, + { + "epoch": 0.5993263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.00017938586175403492, + "loss": 3.2526, + "step": 14234 + }, + { + "epoch": 0.5993684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.00017935349217943793, + "loss": 3.1417, + "step": 14235 + }, + { + "epoch": 0.5994105263157895, + "grad_norm": 0.40234375, + "learning_rate": 0.00017932112389193716, + "loss": 2.8684, + "step": 14236 + }, + { + "epoch": 0.5994526315789473, + "grad_norm": 0.453125, + "learning_rate": 0.00017928875689212254, + "loss": 3.1602, + "step": 14237 + }, + { + "epoch": 0.5994947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.00017925639118058367, + "loss": 3.1476, + "step": 14238 + }, + { + "epoch": 0.5995368421052631, + "grad_norm": 0.458984375, + "learning_rate": 0.00017922402675791022, + "loss": 2.9712, + "step": 14239 + }, + { + "epoch": 0.5995789473684211, + "grad_norm": 0.416015625, + "learning_rate": 0.00017919166362469194, + "loss": 3.0147, + "step": 14240 + }, + { + "epoch": 0.5996210526315789, + "grad_norm": 0.427734375, + "learning_rate": 0.00017915930178151823, + "loss": 3.1166, + "step": 14241 + }, + { + "epoch": 0.5996631578947368, + "grad_norm": 0.41015625, + "learning_rate": 0.00017912694122897894, + "loss": 2.8975, + "step": 14242 + }, + { + "epoch": 0.5997052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.0001790945819676634, + "loss": 3.0567, + "step": 14243 + }, + { + "epoch": 0.5997473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.00017906222399816123, + "loss": 3.4858, + "step": 14244 + }, + { + "epoch": 0.5997894736842105, + "grad_norm": 0.439453125, + "learning_rate": 0.00017902986732106218, + "loss": 2.8606, + "step": 14245 + }, + { + "epoch": 0.5998315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.0001789975119369554, + "loss": 3.4899, + "step": 14246 + }, + { + "epoch": 0.5998736842105263, + "grad_norm": 0.396484375, + "learning_rate": 0.0001789651578464306, + "loss": 3.1858, + "step": 14247 + }, + { + "epoch": 0.5999157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00017893280505007706, + "loss": 2.8699, + "step": 14248 + }, + { + "epoch": 0.5999578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.00017890045354848434, + "loss": 3.1221, + "step": 14249 + }, + { + "epoch": 0.6, + "grad_norm": 0.458984375, + "learning_rate": 0.0001788681033422419, + "loss": 2.7105, + "step": 14250 + }, + { + "epoch": 0.6000421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00017883575443193898, + "loss": 3.1565, + "step": 14251 + }, + { + "epoch": 0.6000842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.0001788034068181651, + "loss": 3.221, + "step": 14252 + }, + { + "epoch": 0.6001263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.0001787710605015094, + "loss": 3.2445, + "step": 14253 + }, + { + "epoch": 0.6001684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.00017873871548256125, + "loss": 3.1422, + "step": 14254 + }, + { + "epoch": 0.6002105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.00017870637176191, + "loss": 3.4716, + "step": 14255 + }, + { + "epoch": 0.6002526315789474, + "grad_norm": 0.40625, + "learning_rate": 0.00017867402934014493, + "loss": 2.8835, + "step": 14256 + }, + { + "epoch": 0.6002947368421052, + "grad_norm": 0.42578125, + "learning_rate": 0.00017864168821785526, + "loss": 3.517, + "step": 14257 + }, + { + "epoch": 0.6003368421052632, + "grad_norm": 0.42578125, + "learning_rate": 0.0001786093483956301, + "loss": 2.9048, + "step": 14258 + }, + { + "epoch": 0.600378947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.00017857700987405887, + "loss": 2.8359, + "step": 14259 + }, + { + "epoch": 0.600421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.00017854467265373048, + "loss": 3.2564, + "step": 14260 + }, + { + "epoch": 0.6004631578947368, + "grad_norm": 0.416015625, + "learning_rate": 0.0001785123367352343, + "loss": 3.2064, + "step": 14261 + }, + { + "epoch": 0.6005052631578948, + "grad_norm": 0.421875, + "learning_rate": 0.00017848000211915924, + "loss": 2.9936, + "step": 14262 + }, + { + "epoch": 0.6005473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.00017844766880609453, + "loss": 3.3684, + "step": 14263 + }, + { + "epoch": 0.6005894736842106, + "grad_norm": 0.404296875, + "learning_rate": 0.00017841533679662923, + "loss": 3.0844, + "step": 14264 + }, + { + "epoch": 0.6006315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.0001783830060913524, + "loss": 3.1581, + "step": 14265 + }, + { + "epoch": 0.6006736842105264, + "grad_norm": 0.40625, + "learning_rate": 0.0001783506766908531, + "loss": 2.9269, + "step": 14266 + }, + { + "epoch": 0.6007157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00017831834859572022, + "loss": 3.4592, + "step": 14267 + }, + { + "epoch": 0.6007578947368422, + "grad_norm": 0.4296875, + "learning_rate": 0.0001782860218065429, + "loss": 3.322, + "step": 14268 + }, + { + "epoch": 0.6008, + "grad_norm": 0.466796875, + "learning_rate": 0.00017825369632390987, + "loss": 2.8601, + "step": 14269 + }, + { + "epoch": 0.6008421052631578, + "grad_norm": 0.58203125, + "learning_rate": 0.00017822137214841023, + "loss": 2.9781, + "step": 14270 + }, + { + "epoch": 0.6008842105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.0001781890492806329, + "loss": 2.7142, + "step": 14271 + }, + { + "epoch": 0.6009263157894736, + "grad_norm": 0.435546875, + "learning_rate": 0.0001781567277211667, + "loss": 3.2353, + "step": 14272 + }, + { + "epoch": 0.6009684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.00017812440747060052, + "loss": 2.9058, + "step": 14273 + }, + { + "epoch": 0.6010105263157894, + "grad_norm": 0.44921875, + "learning_rate": 0.00017809208852952312, + "loss": 3.1842, + "step": 14274 + }, + { + "epoch": 0.6010526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.00017805977089852348, + "loss": 3.3471, + "step": 14275 + }, + { + "epoch": 0.6010947368421052, + "grad_norm": 0.447265625, + "learning_rate": 0.00017802745457819023, + "loss": 3.102, + "step": 14276 + }, + { + "epoch": 0.6011368421052632, + "grad_norm": 0.404296875, + "learning_rate": 0.00017799513956911215, + "loss": 2.7908, + "step": 14277 + }, + { + "epoch": 0.601178947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.00017796282587187814, + "loss": 3.1129, + "step": 14278 + }, + { + "epoch": 0.601221052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.0001779305134870767, + "loss": 3.0738, + "step": 14279 + }, + { + "epoch": 0.6012631578947368, + "grad_norm": 0.478515625, + "learning_rate": 0.0001778982024152967, + "loss": 2.9089, + "step": 14280 + }, + { + "epoch": 0.6013052631578948, + "grad_norm": 0.416015625, + "learning_rate": 0.00017786589265712666, + "loss": 3.0549, + "step": 14281 + }, + { + "epoch": 0.6013473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.00017783358421315538, + "loss": 2.6364, + "step": 14282 + }, + { + "epoch": 0.6013894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.0001778012770839713, + "loss": 2.8628, + "step": 14283 + }, + { + "epoch": 0.6014315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00017776897127016317, + "loss": 3.1314, + "step": 14284 + }, + { + "epoch": 0.6014736842105263, + "grad_norm": 0.45703125, + "learning_rate": 0.00017773666677231963, + "loss": 3.1663, + "step": 14285 + }, + { + "epoch": 0.6015157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.00017770436359102897, + "loss": 3.3835, + "step": 14286 + }, + { + "epoch": 0.6015578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0001776720617268799, + "loss": 3.1776, + "step": 14287 + }, + { + "epoch": 0.6016, + "grad_norm": 0.431640625, + "learning_rate": 0.00017763976118046094, + "loss": 3.2507, + "step": 14288 + }, + { + "epoch": 0.6016421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.00017760746195236043, + "loss": 3.4731, + "step": 14289 + }, + { + "epoch": 0.6016842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.00017757516404316693, + "loss": 2.7301, + "step": 14290 + }, + { + "epoch": 0.6017263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00017754286745346883, + "loss": 3.5825, + "step": 14291 + }, + { + "epoch": 0.6017684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.00017751057218385463, + "loss": 3.1869, + "step": 14292 + }, + { + "epoch": 0.6018105263157895, + "grad_norm": 0.455078125, + "learning_rate": 0.00017747827823491252, + "loss": 2.8144, + "step": 14293 + }, + { + "epoch": 0.6018526315789474, + "grad_norm": 0.416015625, + "learning_rate": 0.00017744598560723114, + "loss": 3.1766, + "step": 14294 + }, + { + "epoch": 0.6018947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.00017741369430139848, + "loss": 3.2164, + "step": 14295 + }, + { + "epoch": 0.6019368421052631, + "grad_norm": 0.40234375, + "learning_rate": 0.0001773814043180031, + "loss": 3.6473, + "step": 14296 + }, + { + "epoch": 0.6019789473684211, + "grad_norm": 0.4296875, + "learning_rate": 0.00017734911565763317, + "loss": 3.3819, + "step": 14297 + }, + { + "epoch": 0.6020210526315789, + "grad_norm": 0.41015625, + "learning_rate": 0.00017731682832087698, + "loss": 3.141, + "step": 14298 + }, + { + "epoch": 0.6020631578947369, + "grad_norm": 0.470703125, + "learning_rate": 0.00017728454230832286, + "loss": 2.921, + "step": 14299 + }, + { + "epoch": 0.6021052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.0001772522576205589, + "loss": 2.9058, + "step": 14300 + }, + { + "epoch": 0.6021473684210527, + "grad_norm": 0.453125, + "learning_rate": 0.00017721997425817338, + "loss": 3.1806, + "step": 14301 + }, + { + "epoch": 0.6021894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.00017718769222175435, + "loss": 3.222, + "step": 14302 + }, + { + "epoch": 0.6022315789473685, + "grad_norm": 0.4609375, + "learning_rate": 0.00017715541151188996, + "loss": 3.1514, + "step": 14303 + }, + { + "epoch": 0.6022736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.00017712313212916853, + "loss": 2.8569, + "step": 14304 + }, + { + "epoch": 0.6023157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00017709085407417792, + "loss": 3.4402, + "step": 14305 + }, + { + "epoch": 0.6023578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.0001770585773475063, + "loss": 3.3504, + "step": 14306 + }, + { + "epoch": 0.6024, + "grad_norm": 0.412109375, + "learning_rate": 0.00017702630194974168, + "loss": 3.1977, + "step": 14307 + }, + { + "epoch": 0.6024421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00017699402788147218, + "loss": 3.2827, + "step": 14308 + }, + { + "epoch": 0.6024842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.0001769617551432856, + "loss": 3.2181, + "step": 14309 + }, + { + "epoch": 0.6025263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.00017692948373577004, + "loss": 3.0767, + "step": 14310 + }, + { + "epoch": 0.6025684210526315, + "grad_norm": 0.427734375, + "learning_rate": 0.0001768972136595135, + "loss": 2.9875, + "step": 14311 + }, + { + "epoch": 0.6026105263157895, + "grad_norm": 0.4609375, + "learning_rate": 0.0001768649449151038, + "loss": 3.1384, + "step": 14312 + }, + { + "epoch": 0.6026526315789473, + "grad_norm": 0.466796875, + "learning_rate": 0.00017683267750312888, + "loss": 3.0977, + "step": 14313 + }, + { + "epoch": 0.6026947368421053, + "grad_norm": 0.44140625, + "learning_rate": 0.00017680041142417656, + "loss": 3.4675, + "step": 14314 + }, + { + "epoch": 0.6027368421052631, + "grad_norm": 0.46484375, + "learning_rate": 0.00017676814667883485, + "loss": 3.0921, + "step": 14315 + }, + { + "epoch": 0.6027789473684211, + "grad_norm": 0.421875, + "learning_rate": 0.00017673588326769135, + "loss": 3.4372, + "step": 14316 + }, + { + "epoch": 0.6028210526315789, + "grad_norm": 0.51953125, + "learning_rate": 0.00017670362119133397, + "loss": 3.1493, + "step": 14317 + }, + { + "epoch": 0.6028631578947369, + "grad_norm": 0.443359375, + "learning_rate": 0.0001766713604503506, + "loss": 3.4734, + "step": 14318 + }, + { + "epoch": 0.6029052631578947, + "grad_norm": 0.44140625, + "learning_rate": 0.0001766391010453288, + "loss": 3.4723, + "step": 14319 + }, + { + "epoch": 0.6029473684210527, + "grad_norm": 0.443359375, + "learning_rate": 0.00017660684297685647, + "loss": 3.2245, + "step": 14320 + }, + { + "epoch": 0.6029894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.00017657458624552113, + "loss": 3.166, + "step": 14321 + }, + { + "epoch": 0.6030315789473685, + "grad_norm": 0.4375, + "learning_rate": 0.0001765423308519106, + "loss": 3.1929, + "step": 14322 + }, + { + "epoch": 0.6030736842105263, + "grad_norm": 0.44140625, + "learning_rate": 0.00017651007679661247, + "loss": 3.1554, + "step": 14323 + }, + { + "epoch": 0.6031157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00017647782408021436, + "loss": 3.4395, + "step": 14324 + }, + { + "epoch": 0.6031578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00017644557270330403, + "loss": 3.1518, + "step": 14325 + }, + { + "epoch": 0.6032, + "grad_norm": 0.44140625, + "learning_rate": 0.0001764133226664688, + "loss": 3.0421, + "step": 14326 + }, + { + "epoch": 0.6032421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.00017638107397029655, + "loss": 3.3119, + "step": 14327 + }, + { + "epoch": 0.6032842105263158, + "grad_norm": 0.462890625, + "learning_rate": 0.00017634882661537448, + "loss": 3.4664, + "step": 14328 + }, + { + "epoch": 0.6033263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0001763165806022903, + "loss": 3.1714, + "step": 14329 + }, + { + "epoch": 0.6033684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.0001762843359316314, + "loss": 3.3592, + "step": 14330 + }, + { + "epoch": 0.6034105263157895, + "grad_norm": 0.4453125, + "learning_rate": 0.00017625209260398536, + "loss": 2.9916, + "step": 14331 + }, + { + "epoch": 0.6034526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.00017621985061993953, + "loss": 3.6419, + "step": 14332 + }, + { + "epoch": 0.6034947368421053, + "grad_norm": 0.42578125, + "learning_rate": 0.00017618760998008127, + "loss": 3.3885, + "step": 14333 + }, + { + "epoch": 0.6035368421052631, + "grad_norm": 0.423828125, + "learning_rate": 0.00017615537068499816, + "loss": 3.0778, + "step": 14334 + }, + { + "epoch": 0.6035789473684211, + "grad_norm": 0.39453125, + "learning_rate": 0.0001761231327352773, + "loss": 2.5775, + "step": 14335 + }, + { + "epoch": 0.603621052631579, + "grad_norm": 0.46484375, + "learning_rate": 0.00017609089613150627, + "loss": 3.2107, + "step": 14336 + }, + { + "epoch": 0.6036631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.0001760586608742722, + "loss": 3.0254, + "step": 14337 + }, + { + "epoch": 0.6037052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.00017602642696416243, + "loss": 3.3475, + "step": 14338 + }, + { + "epoch": 0.6037473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.00017599419440176428, + "loss": 3.2766, + "step": 14339 + }, + { + "epoch": 0.6037894736842105, + "grad_norm": 0.5, + "learning_rate": 0.0001759619631876649, + "loss": 3.3514, + "step": 14340 + }, + { + "epoch": 0.6038315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.00017592973332245172, + "loss": 3.0353, + "step": 14341 + }, + { + "epoch": 0.6038736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.0001758975048067116, + "loss": 3.0156, + "step": 14342 + }, + { + "epoch": 0.6039157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.000175865277641032, + "loss": 2.9943, + "step": 14343 + }, + { + "epoch": 0.6039578947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.00017583305182599984, + "loss": 3.2597, + "step": 14344 + }, + { + "epoch": 0.604, + "grad_norm": 0.431640625, + "learning_rate": 0.00017580082736220234, + "loss": 3.1668, + "step": 14345 + }, + { + "epoch": 0.6040421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00017576860425022662, + "loss": 2.7546, + "step": 14346 + }, + { + "epoch": 0.6040842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00017573638249065974, + "loss": 3.0973, + "step": 14347 + }, + { + "epoch": 0.6041263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.0001757041620840887, + "loss": 2.8762, + "step": 14348 + }, + { + "epoch": 0.6041684210526316, + "grad_norm": 0.44140625, + "learning_rate": 0.00017567194303110047, + "loss": 3.2078, + "step": 14349 + }, + { + "epoch": 0.6042105263157894, + "grad_norm": 0.447265625, + "learning_rate": 0.00017563972533228211, + "loss": 3.1475, + "step": 14350 + }, + { + "epoch": 0.6042526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.0001756075089882207, + "loss": 3.1275, + "step": 14351 + }, + { + "epoch": 0.6042947368421052, + "grad_norm": 0.431640625, + "learning_rate": 0.00017557529399950294, + "loss": 3.2045, + "step": 14352 + }, + { + "epoch": 0.6043368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.000175543080366716, + "loss": 3.0218, + "step": 14353 + }, + { + "epoch": 0.604378947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.00017551086809044656, + "loss": 2.8475, + "step": 14354 + }, + { + "epoch": 0.604421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.0001754786571712816, + "loss": 3.1377, + "step": 14355 + }, + { + "epoch": 0.6044631578947368, + "grad_norm": 0.431640625, + "learning_rate": 0.00017544644760980794, + "loss": 3.4219, + "step": 14356 + }, + { + "epoch": 0.6045052631578948, + "grad_norm": 0.4140625, + "learning_rate": 0.00017541423940661238, + "loss": 3.1462, + "step": 14357 + }, + { + "epoch": 0.6045473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00017538203256228186, + "loss": 3.2937, + "step": 14358 + }, + { + "epoch": 0.6045894736842106, + "grad_norm": 0.421875, + "learning_rate": 0.0001753498270774029, + "loss": 2.7347, + "step": 14359 + }, + { + "epoch": 0.6046315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.00017531762295256253, + "loss": 2.7794, + "step": 14360 + }, + { + "epoch": 0.6046736842105264, + "grad_norm": 0.4375, + "learning_rate": 0.0001752854201883472, + "loss": 3.3034, + "step": 14361 + }, + { + "epoch": 0.6047157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00017525321878534382, + "loss": 2.7878, + "step": 14362 + }, + { + "epoch": 0.604757894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.00017522101874413887, + "loss": 3.1952, + "step": 14363 + }, + { + "epoch": 0.6048, + "grad_norm": 0.42578125, + "learning_rate": 0.00017518882006531912, + "loss": 3.2335, + "step": 14364 + }, + { + "epoch": 0.6048421052631578, + "grad_norm": 0.46484375, + "learning_rate": 0.00017515662274947117, + "loss": 2.9054, + "step": 14365 + }, + { + "epoch": 0.6048842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0001751244267971816, + "loss": 2.7199, + "step": 14366 + }, + { + "epoch": 0.6049263157894736, + "grad_norm": 0.47265625, + "learning_rate": 0.00017509223220903708, + "loss": 2.8467, + "step": 14367 + }, + { + "epoch": 0.6049684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.000175060038985624, + "loss": 3.2566, + "step": 14368 + }, + { + "epoch": 0.6050105263157894, + "grad_norm": 0.443359375, + "learning_rate": 0.00017502784712752906, + "loss": 3.4888, + "step": 14369 + }, + { + "epoch": 0.6050526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00017499565663533852, + "loss": 2.6347, + "step": 14370 + }, + { + "epoch": 0.6050947368421052, + "grad_norm": 0.41796875, + "learning_rate": 0.00017496346750963903, + "loss": 3.0466, + "step": 14371 + }, + { + "epoch": 0.6051368421052632, + "grad_norm": 0.453125, + "learning_rate": 0.00017493127975101703, + "loss": 3.6157, + "step": 14372 + }, + { + "epoch": 0.605178947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00017489909336005887, + "loss": 3.4147, + "step": 14373 + }, + { + "epoch": 0.605221052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00017486690833735108, + "loss": 3.1085, + "step": 14374 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.447265625, + "learning_rate": 0.00017483472468347982, + "loss": 2.8484, + "step": 14375 + }, + { + "epoch": 0.6053052631578948, + "grad_norm": 0.40234375, + "learning_rate": 0.0001748025423990317, + "loss": 3.1112, + "step": 14376 + }, + { + "epoch": 0.6053473684210526, + "grad_norm": 0.453125, + "learning_rate": 0.0001747703614845928, + "loss": 3.2114, + "step": 14377 + }, + { + "epoch": 0.6053894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00017473818194074951, + "loss": 2.9145, + "step": 14378 + }, + { + "epoch": 0.6054315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.0001747060037680882, + "loss": 3.224, + "step": 14379 + }, + { + "epoch": 0.6054736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.00017467382696719497, + "loss": 3.0835, + "step": 14380 + }, + { + "epoch": 0.6055157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00017464165153865613, + "loss": 3.1289, + "step": 14381 + }, + { + "epoch": 0.6055578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00017460947748305783, + "loss": 3.0134, + "step": 14382 + }, + { + "epoch": 0.6056, + "grad_norm": 0.435546875, + "learning_rate": 0.00017457730480098638, + "loss": 3.1964, + "step": 14383 + }, + { + "epoch": 0.6056421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00017454513349302766, + "loss": 3.1444, + "step": 14384 + }, + { + "epoch": 0.6056842105263158, + "grad_norm": 0.439453125, + "learning_rate": 0.000174512963559768, + "loss": 3.1645, + "step": 14385 + }, + { + "epoch": 0.6057263157894737, + "grad_norm": 0.4921875, + "learning_rate": 0.00017448079500179353, + "loss": 2.9484, + "step": 14386 + }, + { + "epoch": 0.6057684210526316, + "grad_norm": 0.435546875, + "learning_rate": 0.00017444862781969018, + "loss": 2.868, + "step": 14387 + }, + { + "epoch": 0.6058105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.00017441646201404407, + "loss": 3.1535, + "step": 14388 + }, + { + "epoch": 0.6058526315789474, + "grad_norm": 0.45703125, + "learning_rate": 0.0001743842975854412, + "loss": 2.8054, + "step": 14389 + }, + { + "epoch": 0.6058947368421053, + "grad_norm": 0.4375, + "learning_rate": 0.00017435213453446765, + "loss": 3.1087, + "step": 14390 + }, + { + "epoch": 0.6059368421052631, + "grad_norm": 0.431640625, + "learning_rate": 0.00017431997286170922, + "loss": 3.1549, + "step": 14391 + }, + { + "epoch": 0.6059789473684211, + "grad_norm": 0.546875, + "learning_rate": 0.000174287812567752, + "loss": 3.1221, + "step": 14392 + }, + { + "epoch": 0.6060210526315789, + "grad_norm": 0.3984375, + "learning_rate": 0.00017425565365318196, + "loss": 2.8602, + "step": 14393 + }, + { + "epoch": 0.6060631578947369, + "grad_norm": 0.421875, + "learning_rate": 0.0001742234961185848, + "loss": 2.8558, + "step": 14394 + }, + { + "epoch": 0.6061052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.0001741913399645466, + "loss": 2.8347, + "step": 14395 + }, + { + "epoch": 0.6061473684210527, + "grad_norm": 0.41796875, + "learning_rate": 0.0001741591851916531, + "loss": 3.4093, + "step": 14396 + }, + { + "epoch": 0.6061894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.00017412703180049002, + "loss": 3.1927, + "step": 14397 + }, + { + "epoch": 0.6062315789473685, + "grad_norm": 0.412109375, + "learning_rate": 0.0001740948797916434, + "loss": 3.0297, + "step": 14398 + }, + { + "epoch": 0.6062736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.0001740627291656988, + "loss": 3.3566, + "step": 14399 + }, + { + "epoch": 0.6063157894736843, + "grad_norm": 0.421875, + "learning_rate": 0.00017403057992324217, + "loss": 3.1013, + "step": 14400 + }, + { + "epoch": 0.6063578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00017399843206485898, + "loss": 3.3346, + "step": 14401 + }, + { + "epoch": 0.6064, + "grad_norm": 0.4453125, + "learning_rate": 0.0001739662855911352, + "loss": 3.3243, + "step": 14402 + }, + { + "epoch": 0.6064421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00017393414050265627, + "loss": 3.4896, + "step": 14403 + }, + { + "epoch": 0.6064842105263157, + "grad_norm": 0.41796875, + "learning_rate": 0.00017390199680000787, + "loss": 3.4679, + "step": 14404 + }, + { + "epoch": 0.6065263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.0001738698544837758, + "loss": 2.815, + "step": 14405 + }, + { + "epoch": 0.6065684210526315, + "grad_norm": 0.4140625, + "learning_rate": 0.00017383771355454548, + "loss": 3.3122, + "step": 14406 + }, + { + "epoch": 0.6066105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.00017380557401290252, + "loss": 2.6552, + "step": 14407 + }, + { + "epoch": 0.6066526315789473, + "grad_norm": 0.447265625, + "learning_rate": 0.00017377343585943245, + "loss": 3.1834, + "step": 14408 + }, + { + "epoch": 0.6066947368421053, + "grad_norm": 0.427734375, + "learning_rate": 0.00017374129909472093, + "loss": 3.0601, + "step": 14409 + }, + { + "epoch": 0.6067368421052631, + "grad_norm": 0.416015625, + "learning_rate": 0.00017370916371935323, + "loss": 2.8227, + "step": 14410 + }, + { + "epoch": 0.6067789473684211, + "grad_norm": 0.51171875, + "learning_rate": 0.0001736770297339149, + "loss": 3.3021, + "step": 14411 + }, + { + "epoch": 0.6068210526315789, + "grad_norm": 0.462890625, + "learning_rate": 0.00017364489713899157, + "loss": 3.0529, + "step": 14412 + }, + { + "epoch": 0.6068631578947369, + "grad_norm": 0.435546875, + "learning_rate": 0.00017361276593516833, + "loss": 2.9552, + "step": 14413 + }, + { + "epoch": 0.6069052631578947, + "grad_norm": 0.46875, + "learning_rate": 0.00017358063612303085, + "loss": 2.8452, + "step": 14414 + }, + { + "epoch": 0.6069473684210527, + "grad_norm": 0.41015625, + "learning_rate": 0.00017354850770316433, + "loss": 3.105, + "step": 14415 + }, + { + "epoch": 0.6069894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.00017351638067615428, + "loss": 3.0665, + "step": 14416 + }, + { + "epoch": 0.6070315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.00017348425504258577, + "loss": 2.6447, + "step": 14417 + }, + { + "epoch": 0.6070736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00017345213080304425, + "loss": 3.4866, + "step": 14418 + }, + { + "epoch": 0.6071157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.00017342000795811507, + "loss": 3.4621, + "step": 14419 + }, + { + "epoch": 0.6071578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00017338788650838328, + "loss": 3.1926, + "step": 14420 + }, + { + "epoch": 0.6072, + "grad_norm": 0.43359375, + "learning_rate": 0.0001733557664544343, + "loss": 2.6116, + "step": 14421 + }, + { + "epoch": 0.6072421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00017332364779685305, + "loss": 2.8597, + "step": 14422 + }, + { + "epoch": 0.6072842105263158, + "grad_norm": 0.390625, + "learning_rate": 0.00017329153053622493, + "loss": 2.5189, + "step": 14423 + }, + { + "epoch": 0.6073263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00017325941467313493, + "loss": 3.4193, + "step": 14424 + }, + { + "epoch": 0.6073684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.00017322730020816825, + "loss": 2.8177, + "step": 14425 + }, + { + "epoch": 0.6074105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.00017319518714191003, + "loss": 3.1547, + "step": 14426 + }, + { + "epoch": 0.6074526315789474, + "grad_norm": 0.408203125, + "learning_rate": 0.00017316307547494514, + "loss": 2.8267, + "step": 14427 + }, + { + "epoch": 0.6074947368421053, + "grad_norm": 0.408203125, + "learning_rate": 0.00017313096520785886, + "loss": 2.7881, + "step": 14428 + }, + { + "epoch": 0.6075368421052632, + "grad_norm": 0.423828125, + "learning_rate": 0.00017309885634123602, + "loss": 3.4356, + "step": 14429 + }, + { + "epoch": 0.6075789473684211, + "grad_norm": 0.50390625, + "learning_rate": 0.00017306674887566166, + "loss": 3.4469, + "step": 14430 + }, + { + "epoch": 0.607621052631579, + "grad_norm": 0.4375, + "learning_rate": 0.0001730346428117207, + "loss": 3.0541, + "step": 14431 + }, + { + "epoch": 0.6076631578947368, + "grad_norm": 0.404296875, + "learning_rate": 0.00017300253814999818, + "loss": 2.9103, + "step": 14432 + }, + { + "epoch": 0.6077052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.00017297043489107895, + "loss": 3.3339, + "step": 14433 + }, + { + "epoch": 0.6077473684210526, + "grad_norm": 0.482421875, + "learning_rate": 0.0001729383330355478, + "loss": 3.1711, + "step": 14434 + }, + { + "epoch": 0.6077894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.0001729062325839898, + "loss": 3.0314, + "step": 14435 + }, + { + "epoch": 0.6078315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.00017287413353698957, + "loss": 3.287, + "step": 14436 + }, + { + "epoch": 0.6078736842105263, + "grad_norm": 0.4453125, + "learning_rate": 0.00017284203589513212, + "loss": 3.3059, + "step": 14437 + }, + { + "epoch": 0.6079157894736842, + "grad_norm": 0.453125, + "learning_rate": 0.00017280993965900198, + "loss": 2.8258, + "step": 14438 + }, + { + "epoch": 0.6079578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00017277784482918408, + "loss": 3.0593, + "step": 14439 + }, + { + "epoch": 0.608, + "grad_norm": 0.41796875, + "learning_rate": 0.00017274575140626317, + "loss": 2.933, + "step": 14440 + }, + { + "epoch": 0.6080421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.00017271365939082383, + "loss": 3.0344, + "step": 14441 + }, + { + "epoch": 0.6080842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00017268156878345092, + "loss": 2.9966, + "step": 14442 + }, + { + "epoch": 0.6081263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00017264947958472887, + "loss": 3.2886, + "step": 14443 + }, + { + "epoch": 0.6081684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.00017261739179524253, + "loss": 3.0775, + "step": 14444 + }, + { + "epoch": 0.6082105263157894, + "grad_norm": 0.435546875, + "learning_rate": 0.00017258530541557627, + "loss": 3.5416, + "step": 14445 + }, + { + "epoch": 0.6082526315789474, + "grad_norm": 0.4140625, + "learning_rate": 0.0001725532204463148, + "loss": 2.4764, + "step": 14446 + }, + { + "epoch": 0.6082947368421052, + "grad_norm": 0.416015625, + "learning_rate": 0.00017252113688804273, + "loss": 2.9827, + "step": 14447 + }, + { + "epoch": 0.6083368421052632, + "grad_norm": 0.466796875, + "learning_rate": 0.00017248905474134447, + "loss": 3.0291, + "step": 14448 + }, + { + "epoch": 0.608378947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00017245697400680458, + "loss": 3.4183, + "step": 14449 + }, + { + "epoch": 0.608421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.00017242489468500746, + "loss": 3.1488, + "step": 14450 + }, + { + "epoch": 0.6084631578947368, + "grad_norm": 0.4140625, + "learning_rate": 0.00017239281677653762, + "loss": 2.9957, + "step": 14451 + }, + { + "epoch": 0.6085052631578948, + "grad_norm": 0.4296875, + "learning_rate": 0.00017236074028197958, + "loss": 3.5276, + "step": 14452 + }, + { + "epoch": 0.6085473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.0001723286652019175, + "loss": 3.3975, + "step": 14453 + }, + { + "epoch": 0.6085894736842106, + "grad_norm": 0.439453125, + "learning_rate": 0.000172296591536936, + "loss": 3.3564, + "step": 14454 + }, + { + "epoch": 0.6086315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00017226451928761921, + "loss": 3.3337, + "step": 14455 + }, + { + "epoch": 0.6086736842105264, + "grad_norm": 0.427734375, + "learning_rate": 0.0001722324484545516, + "loss": 3.3109, + "step": 14456 + }, + { + "epoch": 0.6087157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.00017220037903831732, + "loss": 3.1215, + "step": 14457 + }, + { + "epoch": 0.608757894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00017216831103950076, + "loss": 2.4311, + "step": 14458 + }, + { + "epoch": 0.6088, + "grad_norm": 0.421875, + "learning_rate": 0.0001721362444586862, + "loss": 3.2418, + "step": 14459 + }, + { + "epoch": 0.6088421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0001721041792964577, + "loss": 3.4157, + "step": 14460 + }, + { + "epoch": 0.6088842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00017207211555339964, + "loss": 2.9562, + "step": 14461 + }, + { + "epoch": 0.6089263157894736, + "grad_norm": 0.400390625, + "learning_rate": 0.0001720400532300959, + "loss": 2.8664, + "step": 14462 + }, + { + "epoch": 0.6089684210526316, + "grad_norm": 0.45703125, + "learning_rate": 0.00017200799232713093, + "loss": 3.0115, + "step": 14463 + }, + { + "epoch": 0.6090105263157894, + "grad_norm": 0.4296875, + "learning_rate": 0.00017197593284508866, + "loss": 3.2859, + "step": 14464 + }, + { + "epoch": 0.6090526315789474, + "grad_norm": 0.439453125, + "learning_rate": 0.00017194387478455318, + "loss": 2.8892, + "step": 14465 + }, + { + "epoch": 0.6090947368421052, + "grad_norm": 0.447265625, + "learning_rate": 0.00017191181814610863, + "loss": 3.1776, + "step": 14466 + }, + { + "epoch": 0.6091368421052632, + "grad_norm": 0.453125, + "learning_rate": 0.00017187976293033897, + "loss": 2.6483, + "step": 14467 + }, + { + "epoch": 0.609178947368421, + "grad_norm": 0.46875, + "learning_rate": 0.00017184770913782833, + "loss": 3.0819, + "step": 14468 + }, + { + "epoch": 0.609221052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.0001718156567691605, + "loss": 3.0569, + "step": 14469 + }, + { + "epoch": 0.6092631578947368, + "grad_norm": 0.412109375, + "learning_rate": 0.00017178360582491964, + "loss": 2.9479, + "step": 14470 + }, + { + "epoch": 0.6093052631578947, + "grad_norm": 0.4609375, + "learning_rate": 0.0001717515563056895, + "loss": 3.2937, + "step": 14471 + }, + { + "epoch": 0.6093473684210526, + "grad_norm": 0.462890625, + "learning_rate": 0.00017171950821205404, + "loss": 3.1747, + "step": 14472 + }, + { + "epoch": 0.6093894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00017168746154459726, + "loss": 3.1403, + "step": 14473 + }, + { + "epoch": 0.6094315789473684, + "grad_norm": 0.390625, + "learning_rate": 0.0001716554163039028, + "loss": 2.6385, + "step": 14474 + }, + { + "epoch": 0.6094736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.00017162337249055476, + "loss": 3.3172, + "step": 14475 + }, + { + "epoch": 0.6095157894736842, + "grad_norm": 0.455078125, + "learning_rate": 0.00017159133010513666, + "loss": 3.3779, + "step": 14476 + }, + { + "epoch": 0.6095578947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.00017155928914823249, + "loss": 3.1768, + "step": 14477 + }, + { + "epoch": 0.6096, + "grad_norm": 0.4375, + "learning_rate": 0.00017152724962042582, + "loss": 2.9544, + "step": 14478 + }, + { + "epoch": 0.6096421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00017149521152230046, + "loss": 3.2507, + "step": 14479 + }, + { + "epoch": 0.6096842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.0001714631748544402, + "loss": 3.1608, + "step": 14480 + }, + { + "epoch": 0.6097263157894737, + "grad_norm": 0.455078125, + "learning_rate": 0.00017143113961742853, + "loss": 2.7929, + "step": 14481 + }, + { + "epoch": 0.6097684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.00017139910581184922, + "loss": 2.8111, + "step": 14482 + }, + { + "epoch": 0.6098105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.00017136707343828582, + "loss": 3.2001, + "step": 14483 + }, + { + "epoch": 0.6098526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00017133504249732204, + "loss": 2.8702, + "step": 14484 + }, + { + "epoch": 0.6098947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00017130301298954126, + "loss": 2.9646, + "step": 14485 + }, + { + "epoch": 0.6099368421052631, + "grad_norm": 0.435546875, + "learning_rate": 0.00017127098491552712, + "loss": 3.202, + "step": 14486 + }, + { + "epoch": 0.6099789473684211, + "grad_norm": 0.412109375, + "learning_rate": 0.00017123895827586322, + "loss": 2.5163, + "step": 14487 + }, + { + "epoch": 0.6100210526315789, + "grad_norm": 0.412109375, + "learning_rate": 0.00017120693307113286, + "loss": 2.7098, + "step": 14488 + }, + { + "epoch": 0.6100631578947369, + "grad_norm": 0.455078125, + "learning_rate": 0.00017117490930191963, + "loss": 2.8221, + "step": 14489 + }, + { + "epoch": 0.6101052631578947, + "grad_norm": 0.44921875, + "learning_rate": 0.0001711428869688069, + "loss": 3.1792, + "step": 14490 + }, + { + "epoch": 0.6101473684210527, + "grad_norm": 0.62109375, + "learning_rate": 0.00017111086607237826, + "loss": 2.894, + "step": 14491 + }, + { + "epoch": 0.6101894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.00017107884661321683, + "loss": 3.3188, + "step": 14492 + }, + { + "epoch": 0.6102315789473685, + "grad_norm": 0.40625, + "learning_rate": 0.00017104682859190605, + "loss": 2.8844, + "step": 14493 + }, + { + "epoch": 0.6102736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00017101481200902945, + "loss": 3.2214, + "step": 14494 + }, + { + "epoch": 0.6103157894736843, + "grad_norm": 0.427734375, + "learning_rate": 0.00017098279686517, + "loss": 2.9975, + "step": 14495 + }, + { + "epoch": 0.6103578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00017095078316091132, + "loss": 2.7352, + "step": 14496 + }, + { + "epoch": 0.6104, + "grad_norm": 0.4375, + "learning_rate": 0.00017091877089683632, + "loss": 3.2231, + "step": 14497 + }, + { + "epoch": 0.6104421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00017088676007352843, + "loss": 3.0825, + "step": 14498 + }, + { + "epoch": 0.6104842105263157, + "grad_norm": 0.427734375, + "learning_rate": 0.00017085475069157088, + "loss": 3.0315, + "step": 14499 + }, + { + "epoch": 0.6105263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00017082274275154674, + "loss": 3.0517, + "step": 14500 + }, + { + "epoch": 0.6105684210526315, + "grad_norm": 0.43359375, + "learning_rate": 0.0001707907362540393, + "loss": 3.0769, + "step": 14501 + }, + { + "epoch": 0.6106105263157895, + "grad_norm": 0.408203125, + "learning_rate": 0.00017075873119963144, + "loss": 2.7907, + "step": 14502 + }, + { + "epoch": 0.6106526315789473, + "grad_norm": 0.43359375, + "learning_rate": 0.0001707267275889065, + "loss": 3.003, + "step": 14503 + }, + { + "epoch": 0.6106947368421053, + "grad_norm": 0.458984375, + "learning_rate": 0.00017069472542244736, + "loss": 3.1957, + "step": 14504 + }, + { + "epoch": 0.6107368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.00017066272470083713, + "loss": 3.1265, + "step": 14505 + }, + { + "epoch": 0.6107789473684211, + "grad_norm": 0.439453125, + "learning_rate": 0.0001706307254246589, + "loss": 3.106, + "step": 14506 + }, + { + "epoch": 0.6108210526315789, + "grad_norm": 0.421875, + "learning_rate": 0.00017059872759449558, + "loss": 3.1876, + "step": 14507 + }, + { + "epoch": 0.6108631578947369, + "grad_norm": 0.439453125, + "learning_rate": 0.00017056673121093016, + "loss": 3.278, + "step": 14508 + }, + { + "epoch": 0.6109052631578947, + "grad_norm": 0.4453125, + "learning_rate": 0.00017053473627454553, + "loss": 3.056, + "step": 14509 + }, + { + "epoch": 0.6109473684210527, + "grad_norm": 0.431640625, + "learning_rate": 0.0001705027427859247, + "loss": 3.3199, + "step": 14510 + }, + { + "epoch": 0.6109894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00017047075074565045, + "loss": 3.2345, + "step": 14511 + }, + { + "epoch": 0.6110315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.00017043876015430563, + "loss": 2.9218, + "step": 14512 + }, + { + "epoch": 0.6110736842105263, + "grad_norm": 0.412109375, + "learning_rate": 0.00017040677101247322, + "loss": 2.8957, + "step": 14513 + }, + { + "epoch": 0.6111157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.00017037478332073585, + "loss": 3.4303, + "step": 14514 + }, + { + "epoch": 0.6111578947368421, + "grad_norm": 0.458984375, + "learning_rate": 0.0001703427970796764, + "loss": 3.5862, + "step": 14515 + }, + { + "epoch": 0.6112, + "grad_norm": 0.421875, + "learning_rate": 0.00017031081228987755, + "loss": 2.6488, + "step": 14516 + }, + { + "epoch": 0.6112421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00017027882895192219, + "loss": 2.9731, + "step": 14517 + }, + { + "epoch": 0.6112842105263158, + "grad_norm": 0.45703125, + "learning_rate": 0.00017024684706639276, + "loss": 3.0858, + "step": 14518 + }, + { + "epoch": 0.6113263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00017021486663387208, + "loss": 3.0784, + "step": 14519 + }, + { + "epoch": 0.6113684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.00017018288765494294, + "loss": 2.9581, + "step": 14520 + }, + { + "epoch": 0.6114105263157895, + "grad_norm": 0.408203125, + "learning_rate": 0.00017015091013018762, + "loss": 2.9755, + "step": 14521 + }, + { + "epoch": 0.6114526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00017011893406018909, + "loss": 3.0269, + "step": 14522 + }, + { + "epoch": 0.6114947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.00017008695944552955, + "loss": 2.6009, + "step": 14523 + }, + { + "epoch": 0.6115368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.0001700549862867918, + "loss": 2.8842, + "step": 14524 + }, + { + "epoch": 0.611578947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00017002301458455822, + "loss": 3.007, + "step": 14525 + }, + { + "epoch": 0.611621052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00016999104433941133, + "loss": 3.1766, + "step": 14526 + }, + { + "epoch": 0.6116631578947368, + "grad_norm": 0.43359375, + "learning_rate": 0.00016995907555193368, + "loss": 3.0891, + "step": 14527 + }, + { + "epoch": 0.6117052631578948, + "grad_norm": 0.439453125, + "learning_rate": 0.00016992710822270757, + "loss": 3.0675, + "step": 14528 + }, + { + "epoch": 0.6117473684210526, + "grad_norm": 0.439453125, + "learning_rate": 0.00016989514235231558, + "loss": 3.0202, + "step": 14529 + }, + { + "epoch": 0.6117894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.00016986317794133985, + "loss": 2.9388, + "step": 14530 + }, + { + "epoch": 0.6118315789473684, + "grad_norm": 0.453125, + "learning_rate": 0.00016983121499036292, + "loss": 2.9369, + "step": 14531 + }, + { + "epoch": 0.6118736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.000169799253499967, + "loss": 2.9923, + "step": 14532 + }, + { + "epoch": 0.6119157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00016976729347073445, + "loss": 3.2774, + "step": 14533 + }, + { + "epoch": 0.6119578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00016973533490324764, + "loss": 3.3321, + "step": 14534 + }, + { + "epoch": 0.612, + "grad_norm": 0.431640625, + "learning_rate": 0.00016970337779808863, + "loss": 3.2423, + "step": 14535 + }, + { + "epoch": 0.6120421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.0001696714221558398, + "loss": 2.9124, + "step": 14536 + }, + { + "epoch": 0.6120842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.00016963946797708324, + "loss": 3.2309, + "step": 14537 + }, + { + "epoch": 0.6121263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.0001696075152624012, + "loss": 2.9732, + "step": 14538 + }, + { + "epoch": 0.6121684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.0001695755640123757, + "loss": 2.654, + "step": 14539 + }, + { + "epoch": 0.6122105263157894, + "grad_norm": 0.435546875, + "learning_rate": 0.0001695436142275889, + "loss": 2.905, + "step": 14540 + }, + { + "epoch": 0.6122526315789474, + "grad_norm": 0.412109375, + "learning_rate": 0.00016951166590862299, + "loss": 2.971, + "step": 14541 + }, + { + "epoch": 0.6122947368421052, + "grad_norm": 0.392578125, + "learning_rate": 0.00016947971905605991, + "loss": 2.767, + "step": 14542 + }, + { + "epoch": 0.6123368421052632, + "grad_norm": 0.40234375, + "learning_rate": 0.00016944777367048185, + "loss": 3.1406, + "step": 14543 + }, + { + "epoch": 0.612378947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.0001694158297524706, + "loss": 3.0883, + "step": 14544 + }, + { + "epoch": 0.612421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00016938388730260823, + "loss": 3.1209, + "step": 14545 + }, + { + "epoch": 0.6124631578947368, + "grad_norm": 0.4375, + "learning_rate": 0.0001693519463214768, + "loss": 3.3674, + "step": 14546 + }, + { + "epoch": 0.6125052631578948, + "grad_norm": 0.42578125, + "learning_rate": 0.00016932000680965813, + "loss": 3.3928, + "step": 14547 + }, + { + "epoch": 0.6125473684210526, + "grad_norm": 0.40234375, + "learning_rate": 0.00016928806876773412, + "loss": 2.9182, + "step": 14548 + }, + { + "epoch": 0.6125894736842106, + "grad_norm": 0.41796875, + "learning_rate": 0.0001692561321962867, + "loss": 3.1807, + "step": 14549 + }, + { + "epoch": 0.6126315789473684, + "grad_norm": 0.419921875, + "learning_rate": 0.00016922419709589765, + "loss": 3.2928, + "step": 14550 + }, + { + "epoch": 0.6126736842105264, + "grad_norm": 0.451171875, + "learning_rate": 0.0001691922634671488, + "loss": 3.2731, + "step": 14551 + }, + { + "epoch": 0.6127157894736842, + "grad_norm": 0.404296875, + "learning_rate": 0.00016916033131062194, + "loss": 2.9972, + "step": 14552 + }, + { + "epoch": 0.6127578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.000169128400626899, + "loss": 2.6307, + "step": 14553 + }, + { + "epoch": 0.6128, + "grad_norm": 0.419921875, + "learning_rate": 0.00016909647141656142, + "loss": 3.1296, + "step": 14554 + }, + { + "epoch": 0.6128421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.0001690645436801912, + "loss": 2.9623, + "step": 14555 + }, + { + "epoch": 0.6128842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00016903261741836983, + "loss": 3.3081, + "step": 14556 + }, + { + "epoch": 0.6129263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.00016900069263167905, + "loss": 2.6553, + "step": 14557 + }, + { + "epoch": 0.6129684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00016896876932070043, + "loss": 3.2837, + "step": 14558 + }, + { + "epoch": 0.6130105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.00016893684748601565, + "loss": 3.2106, + "step": 14559 + }, + { + "epoch": 0.6130526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00016890492712820634, + "loss": 2.9639, + "step": 14560 + }, + { + "epoch": 0.6130947368421052, + "grad_norm": 0.41015625, + "learning_rate": 0.0001688730082478539, + "loss": 3.3748, + "step": 14561 + }, + { + "epoch": 0.6131368421052632, + "grad_norm": 0.494140625, + "learning_rate": 0.00016884109084554, + "loss": 2.7156, + "step": 14562 + }, + { + "epoch": 0.613178947368421, + "grad_norm": 0.46484375, + "learning_rate": 0.00016880917492184599, + "loss": 2.9551, + "step": 14563 + }, + { + "epoch": 0.613221052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00016877726047735347, + "loss": 2.8629, + "step": 14564 + }, + { + "epoch": 0.6132631578947368, + "grad_norm": 0.443359375, + "learning_rate": 0.0001687453475126438, + "loss": 3.2514, + "step": 14565 + }, + { + "epoch": 0.6133052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.00016871343602829842, + "loss": 3.2552, + "step": 14566 + }, + { + "epoch": 0.6133473684210526, + "grad_norm": 0.50390625, + "learning_rate": 0.00016868152602489875, + "loss": 3.5586, + "step": 14567 + }, + { + "epoch": 0.6133894736842105, + "grad_norm": 0.466796875, + "learning_rate": 0.0001686496175030261, + "loss": 3.4255, + "step": 14568 + }, + { + "epoch": 0.6134315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.0001686177104632619, + "loss": 2.9049, + "step": 14569 + }, + { + "epoch": 0.6134736842105263, + "grad_norm": 0.46875, + "learning_rate": 0.00016858580490618734, + "loss": 3.1637, + "step": 14570 + }, + { + "epoch": 0.6135157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.0001685539008323838, + "loss": 2.8351, + "step": 14571 + }, + { + "epoch": 0.6135578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.00016852199824243246, + "loss": 3.4535, + "step": 14572 + }, + { + "epoch": 0.6136, + "grad_norm": 0.65625, + "learning_rate": 0.00016849009713691454, + "loss": 3.362, + "step": 14573 + }, + { + "epoch": 0.6136421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00016845819751641134, + "loss": 3.1529, + "step": 14574 + }, + { + "epoch": 0.6136842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.0001684262993815039, + "loss": 3.1031, + "step": 14575 + }, + { + "epoch": 0.6137263157894737, + "grad_norm": 0.453125, + "learning_rate": 0.00016839440273277358, + "loss": 3.0688, + "step": 14576 + }, + { + "epoch": 0.6137684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.00016836250757080122, + "loss": 2.9009, + "step": 14577 + }, + { + "epoch": 0.6138105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.00016833061389616818, + "loss": 3.3902, + "step": 14578 + }, + { + "epoch": 0.6138526315789473, + "grad_norm": 0.447265625, + "learning_rate": 0.00016829872170945532, + "loss": 3.1091, + "step": 14579 + }, + { + "epoch": 0.6138947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00016826683101124368, + "loss": 2.526, + "step": 14580 + }, + { + "epoch": 0.6139368421052631, + "grad_norm": 0.41796875, + "learning_rate": 0.00016823494180211453, + "loss": 2.7999, + "step": 14581 + }, + { + "epoch": 0.6139789473684211, + "grad_norm": 0.5625, + "learning_rate": 0.00016820305408264853, + "loss": 2.6407, + "step": 14582 + }, + { + "epoch": 0.6140210526315789, + "grad_norm": 0.4375, + "learning_rate": 0.00016817116785342682, + "loss": 3.1273, + "step": 14583 + }, + { + "epoch": 0.6140631578947369, + "grad_norm": 0.427734375, + "learning_rate": 0.00016813928311503024, + "loss": 3.4547, + "step": 14584 + }, + { + "epoch": 0.6141052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.00016810739986803985, + "loss": 2.978, + "step": 14585 + }, + { + "epoch": 0.6141473684210527, + "grad_norm": 0.458984375, + "learning_rate": 0.00016807551811303629, + "loss": 2.7872, + "step": 14586 + }, + { + "epoch": 0.6141894736842105, + "grad_norm": 0.45703125, + "learning_rate": 0.00016804363785060056, + "loss": 3.059, + "step": 14587 + }, + { + "epoch": 0.6142315789473685, + "grad_norm": 0.443359375, + "learning_rate": 0.00016801175908131354, + "loss": 3.4983, + "step": 14588 + }, + { + "epoch": 0.6142736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00016797988180575584, + "loss": 2.5859, + "step": 14589 + }, + { + "epoch": 0.6143157894736843, + "grad_norm": 0.443359375, + "learning_rate": 0.00016794800602450838, + "loss": 3.3268, + "step": 14590 + }, + { + "epoch": 0.6143578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0001679161317381518, + "loss": 3.0141, + "step": 14591 + }, + { + "epoch": 0.6144, + "grad_norm": 0.427734375, + "learning_rate": 0.00016788425894726696, + "loss": 3.1844, + "step": 14592 + }, + { + "epoch": 0.6144421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00016785238765243431, + "loss": 3.1795, + "step": 14593 + }, + { + "epoch": 0.6144842105263157, + "grad_norm": 0.412109375, + "learning_rate": 0.00016782051785423462, + "loss": 2.6298, + "step": 14594 + }, + { + "epoch": 0.6145263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.00016778864955324869, + "loss": 3.3081, + "step": 14595 + }, + { + "epoch": 0.6145684210526315, + "grad_norm": 0.435546875, + "learning_rate": 0.00016775678275005685, + "loss": 3.1323, + "step": 14596 + }, + { + "epoch": 0.6146105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.0001677249174452399, + "loss": 2.9106, + "step": 14597 + }, + { + "epoch": 0.6146526315789473, + "grad_norm": 0.4140625, + "learning_rate": 0.0001676930536393782, + "loss": 3.4599, + "step": 14598 + }, + { + "epoch": 0.6146947368421053, + "grad_norm": 0.47265625, + "learning_rate": 0.0001676611913330523, + "loss": 2.8165, + "step": 14599 + }, + { + "epoch": 0.6147368421052631, + "grad_norm": 0.45703125, + "learning_rate": 0.00016762933052684288, + "loss": 3.2782, + "step": 14600 + }, + { + "epoch": 0.6147789473684211, + "grad_norm": 0.421875, + "learning_rate": 0.00016759747122133017, + "loss": 3.232, + "step": 14601 + }, + { + "epoch": 0.6148210526315789, + "grad_norm": 0.435546875, + "learning_rate": 0.00016756561341709484, + "loss": 3.3507, + "step": 14602 + }, + { + "epoch": 0.6148631578947369, + "grad_norm": 0.455078125, + "learning_rate": 0.00016753375711471707, + "loss": 3.1859, + "step": 14603 + }, + { + "epoch": 0.6149052631578947, + "grad_norm": 0.439453125, + "learning_rate": 0.00016750190231477744, + "loss": 3.6782, + "step": 14604 + }, + { + "epoch": 0.6149473684210527, + "grad_norm": 0.47265625, + "learning_rate": 0.00016747004901785612, + "loss": 3.0056, + "step": 14605 + }, + { + "epoch": 0.6149894736842105, + "grad_norm": 0.439453125, + "learning_rate": 0.00016743819722453356, + "loss": 3.2818, + "step": 14606 + }, + { + "epoch": 0.6150315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00016740634693539005, + "loss": 3.1684, + "step": 14607 + }, + { + "epoch": 0.6150736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.0001673744981510059, + "loss": 2.8754, + "step": 14608 + }, + { + "epoch": 0.6151157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.0001673426508719613, + "loss": 2.8765, + "step": 14609 + }, + { + "epoch": 0.6151578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.0001673108050988364, + "loss": 2.8824, + "step": 14610 + }, + { + "epoch": 0.6152, + "grad_norm": 0.423828125, + "learning_rate": 0.00016727896083221163, + "loss": 3.6416, + "step": 14611 + }, + { + "epoch": 0.6152421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00016724711807266686, + "loss": 3.4658, + "step": 14612 + }, + { + "epoch": 0.6152842105263158, + "grad_norm": 0.474609375, + "learning_rate": 0.00016721527682078235, + "loss": 2.8821, + "step": 14613 + }, + { + "epoch": 0.6153263157894737, + "grad_norm": 0.421875, + "learning_rate": 0.0001671834370771384, + "loss": 2.9586, + "step": 14614 + }, + { + "epoch": 0.6153684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.00016715159884231475, + "loss": 2.7713, + "step": 14615 + }, + { + "epoch": 0.6154105263157895, + "grad_norm": 0.45703125, + "learning_rate": 0.00016711976211689173, + "loss": 2.9707, + "step": 14616 + }, + { + "epoch": 0.6154526315789474, + "grad_norm": 0.392578125, + "learning_rate": 0.00016708792690144918, + "loss": 2.7867, + "step": 14617 + }, + { + "epoch": 0.6154947368421053, + "grad_norm": 0.5546875, + "learning_rate": 0.00016705609319656728, + "loss": 2.6267, + "step": 14618 + }, + { + "epoch": 0.6155368421052632, + "grad_norm": 0.412109375, + "learning_rate": 0.00016702426100282584, + "loss": 2.7877, + "step": 14619 + }, + { + "epoch": 0.615578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00016699243032080484, + "loss": 2.967, + "step": 14620 + }, + { + "epoch": 0.615621052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00016696060115108436, + "loss": 2.7963, + "step": 14621 + }, + { + "epoch": 0.6156631578947368, + "grad_norm": 0.43359375, + "learning_rate": 0.00016692877349424402, + "loss": 3.3531, + "step": 14622 + }, + { + "epoch": 0.6157052631578948, + "grad_norm": 0.431640625, + "learning_rate": 0.00016689694735086392, + "loss": 3.166, + "step": 14623 + }, + { + "epoch": 0.6157473684210526, + "grad_norm": 0.46484375, + "learning_rate": 0.00016686512272152376, + "loss": 3.0746, + "step": 14624 + }, + { + "epoch": 0.6157894736842106, + "grad_norm": 0.44921875, + "learning_rate": 0.0001668332996068034, + "loss": 3.3734, + "step": 14625 + }, + { + "epoch": 0.6158315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.00016680147800728255, + "loss": 3.1566, + "step": 14626 + }, + { + "epoch": 0.6158736842105264, + "grad_norm": 0.416015625, + "learning_rate": 0.00016676965792354102, + "loss": 2.7957, + "step": 14627 + }, + { + "epoch": 0.6159157894736842, + "grad_norm": 0.451171875, + "learning_rate": 0.0001667378393561586, + "loss": 3.2673, + "step": 14628 + }, + { + "epoch": 0.6159578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00016670602230571486, + "loss": 2.9194, + "step": 14629 + }, + { + "epoch": 0.616, + "grad_norm": 0.4296875, + "learning_rate": 0.0001666742067727896, + "loss": 3.4422, + "step": 14630 + }, + { + "epoch": 0.616042105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00016664239275796232, + "loss": 2.9945, + "step": 14631 + }, + { + "epoch": 0.6160842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00016661058026181275, + "loss": 3.1402, + "step": 14632 + }, + { + "epoch": 0.6161263157894736, + "grad_norm": 0.44140625, + "learning_rate": 0.00016657876928492034, + "loss": 3.1204, + "step": 14633 + }, + { + "epoch": 0.6161684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.00016654695982786479, + "loss": 3.5275, + "step": 14634 + }, + { + "epoch": 0.6162105263157894, + "grad_norm": 0.416015625, + "learning_rate": 0.00016651515189122562, + "loss": 3.0454, + "step": 14635 + }, + { + "epoch": 0.6162526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.00016648334547558224, + "loss": 3.2069, + "step": 14636 + }, + { + "epoch": 0.6162947368421052, + "grad_norm": 0.416015625, + "learning_rate": 0.00016645154058151423, + "loss": 3.0012, + "step": 14637 + }, + { + "epoch": 0.6163368421052632, + "grad_norm": 0.423828125, + "learning_rate": 0.00016641973720960094, + "loss": 3.1157, + "step": 14638 + }, + { + "epoch": 0.616378947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00016638793536042184, + "loss": 3.1975, + "step": 14639 + }, + { + "epoch": 0.616421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00016635613503455636, + "loss": 3.2474, + "step": 14640 + }, + { + "epoch": 0.6164631578947368, + "grad_norm": 0.4140625, + "learning_rate": 0.00016632433623258374, + "loss": 3.3385, + "step": 14641 + }, + { + "epoch": 0.6165052631578948, + "grad_norm": 0.41015625, + "learning_rate": 0.00016629253895508346, + "loss": 2.9157, + "step": 14642 + }, + { + "epoch": 0.6165473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.00016626074320263474, + "loss": 3.3105, + "step": 14643 + }, + { + "epoch": 0.6165894736842106, + "grad_norm": 0.412109375, + "learning_rate": 0.00016622894897581694, + "loss": 3.0823, + "step": 14644 + }, + { + "epoch": 0.6166315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.0001661971562752092, + "loss": 2.8648, + "step": 14645 + }, + { + "epoch": 0.6166736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.0001661653651013908, + "loss": 3.2991, + "step": 14646 + }, + { + "epoch": 0.6167157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.00016613357545494103, + "loss": 3.2676, + "step": 14647 + }, + { + "epoch": 0.6167578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.00016610178733643887, + "loss": 3.0841, + "step": 14648 + }, + { + "epoch": 0.6168, + "grad_norm": 0.3984375, + "learning_rate": 0.00016607000074646367, + "loss": 3.2915, + "step": 14649 + }, + { + "epoch": 0.6168421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.00016603821568559436, + "loss": 3.2486, + "step": 14650 + }, + { + "epoch": 0.6168842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.0001660064321544102, + "loss": 3.4859, + "step": 14651 + }, + { + "epoch": 0.6169263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00016597465015349007, + "loss": 3.4322, + "step": 14652 + }, + { + "epoch": 0.6169684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.0001659428696834131, + "loss": 2.9866, + "step": 14653 + }, + { + "epoch": 0.6170105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.00016591109074475836, + "loss": 3.317, + "step": 14654 + }, + { + "epoch": 0.6170526315789474, + "grad_norm": 0.451171875, + "learning_rate": 0.00016587931333810464, + "loss": 3.0822, + "step": 14655 + }, + { + "epoch": 0.6170947368421053, + "grad_norm": 0.455078125, + "learning_rate": 0.00016584753746403113, + "loss": 2.8402, + "step": 14656 + }, + { + "epoch": 0.6171368421052632, + "grad_norm": 0.423828125, + "learning_rate": 0.00016581576312311647, + "loss": 2.8914, + "step": 14657 + }, + { + "epoch": 0.617178947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.00016578399031593976, + "loss": 2.9862, + "step": 14658 + }, + { + "epoch": 0.6172210526315789, + "grad_norm": 0.421875, + "learning_rate": 0.00016575221904307976, + "loss": 3.3821, + "step": 14659 + }, + { + "epoch": 0.6172631578947368, + "grad_norm": 0.431640625, + "learning_rate": 0.0001657204493051153, + "loss": 2.8875, + "step": 14660 + }, + { + "epoch": 0.6173052631578947, + "grad_norm": 0.4609375, + "learning_rate": 0.00016568868110262538, + "loss": 3.1469, + "step": 14661 + }, + { + "epoch": 0.6173473684210526, + "grad_norm": 0.396484375, + "learning_rate": 0.00016565691443618853, + "loss": 3.2223, + "step": 14662 + }, + { + "epoch": 0.6173894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00016562514930638366, + "loss": 3.313, + "step": 14663 + }, + { + "epoch": 0.6174315789473684, + "grad_norm": 0.4140625, + "learning_rate": 0.00016559338571378934, + "loss": 3.0802, + "step": 14664 + }, + { + "epoch": 0.6174736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.00016556162365898445, + "loss": 3.2864, + "step": 14665 + }, + { + "epoch": 0.6175157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.0001655298631425475, + "loss": 3.5397, + "step": 14666 + }, + { + "epoch": 0.6175578947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0001654981041650572, + "loss": 2.9549, + "step": 14667 + }, + { + "epoch": 0.6176, + "grad_norm": 0.427734375, + "learning_rate": 0.0001654663467270922, + "loss": 3.1089, + "step": 14668 + }, + { + "epoch": 0.6176421052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.00016543459082923095, + "loss": 3.3306, + "step": 14669 + }, + { + "epoch": 0.6176842105263158, + "grad_norm": 0.419921875, + "learning_rate": 0.00016540283647205223, + "loss": 2.5111, + "step": 14670 + }, + { + "epoch": 0.6177263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00016537108365613436, + "loss": 3.2455, + "step": 14671 + }, + { + "epoch": 0.6177684210526316, + "grad_norm": 0.40234375, + "learning_rate": 0.00016533933238205596, + "loss": 3.1339, + "step": 14672 + }, + { + "epoch": 0.6178105263157895, + "grad_norm": 0.59375, + "learning_rate": 0.00016530758265039536, + "loss": 3.1826, + "step": 14673 + }, + { + "epoch": 0.6178526315789473, + "grad_norm": 0.435546875, + "learning_rate": 0.00016527583446173112, + "loss": 3.2293, + "step": 14674 + }, + { + "epoch": 0.6178947368421053, + "grad_norm": 0.439453125, + "learning_rate": 0.00016524408781664162, + "loss": 2.9576, + "step": 14675 + }, + { + "epoch": 0.6179368421052631, + "grad_norm": 0.439453125, + "learning_rate": 0.0001652123427157053, + "loss": 3.1977, + "step": 14676 + }, + { + "epoch": 0.6179789473684211, + "grad_norm": 0.416015625, + "learning_rate": 0.0001651805991595005, + "loss": 2.9556, + "step": 14677 + }, + { + "epoch": 0.6180210526315789, + "grad_norm": 0.41796875, + "learning_rate": 0.00016514885714860546, + "loss": 2.7214, + "step": 14678 + }, + { + "epoch": 0.6180631578947369, + "grad_norm": 0.44140625, + "learning_rate": 0.00016511711668359863, + "loss": 3.0968, + "step": 14679 + }, + { + "epoch": 0.6181052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00016508537776505812, + "loss": 3.2297, + "step": 14680 + }, + { + "epoch": 0.6181473684210527, + "grad_norm": 0.44140625, + "learning_rate": 0.00016505364039356223, + "loss": 3.832, + "step": 14681 + }, + { + "epoch": 0.6181894736842105, + "grad_norm": 0.453125, + "learning_rate": 0.00016502190456968929, + "loss": 3.6666, + "step": 14682 + }, + { + "epoch": 0.6182315789473685, + "grad_norm": 0.466796875, + "learning_rate": 0.00016499017029401734, + "loss": 3.0906, + "step": 14683 + }, + { + "epoch": 0.6182736842105263, + "grad_norm": 0.458984375, + "learning_rate": 0.00016495843756712464, + "loss": 3.1767, + "step": 14684 + }, + { + "epoch": 0.6183157894736843, + "grad_norm": 0.419921875, + "learning_rate": 0.00016492670638958924, + "loss": 2.901, + "step": 14685 + }, + { + "epoch": 0.6183578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.0001648949767619894, + "loss": 2.8862, + "step": 14686 + }, + { + "epoch": 0.6184, + "grad_norm": 0.43359375, + "learning_rate": 0.00016486324868490298, + "loss": 3.1939, + "step": 14687 + }, + { + "epoch": 0.6184421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00016483152215890812, + "loss": 3.2774, + "step": 14688 + }, + { + "epoch": 0.6184842105263157, + "grad_norm": 0.416015625, + "learning_rate": 0.000164799797184583, + "loss": 2.9213, + "step": 14689 + }, + { + "epoch": 0.6185263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00016476807376250536, + "loss": 3.2491, + "step": 14690 + }, + { + "epoch": 0.6185684210526315, + "grad_norm": 0.4140625, + "learning_rate": 0.00016473635189325332, + "loss": 2.8205, + "step": 14691 + }, + { + "epoch": 0.6186105263157895, + "grad_norm": 0.466796875, + "learning_rate": 0.00016470463157740466, + "loss": 3.2021, + "step": 14692 + }, + { + "epoch": 0.6186526315789473, + "grad_norm": 0.4296875, + "learning_rate": 0.00016467291281553755, + "loss": 3.0599, + "step": 14693 + }, + { + "epoch": 0.6186947368421053, + "grad_norm": 0.4296875, + "learning_rate": 0.0001646411956082296, + "loss": 3.1228, + "step": 14694 + }, + { + "epoch": 0.6187368421052631, + "grad_norm": 0.41796875, + "learning_rate": 0.0001646094799560588, + "loss": 2.8678, + "step": 14695 + }, + { + "epoch": 0.6187789473684211, + "grad_norm": 0.412109375, + "learning_rate": 0.000164577765859603, + "loss": 3.1835, + "step": 14696 + }, + { + "epoch": 0.6188210526315789, + "grad_norm": 0.439453125, + "learning_rate": 0.00016454605331943988, + "loss": 3.0552, + "step": 14697 + }, + { + "epoch": 0.6188631578947369, + "grad_norm": 0.44140625, + "learning_rate": 0.00016451434233614738, + "loss": 3.1266, + "step": 14698 + }, + { + "epoch": 0.6189052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.00016448263291030297, + "loss": 3.0081, + "step": 14699 + }, + { + "epoch": 0.6189473684210526, + "grad_norm": 0.4140625, + "learning_rate": 0.00016445092504248452, + "loss": 3.1335, + "step": 14700 + }, + { + "epoch": 0.6189894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.00016441921873326976, + "loss": 3.0132, + "step": 14701 + }, + { + "epoch": 0.6190315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.0001643875139832362, + "loss": 3.0719, + "step": 14702 + }, + { + "epoch": 0.6190736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.0001643558107929617, + "loss": 3.2791, + "step": 14703 + }, + { + "epoch": 0.6191157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.00016432410916302357, + "loss": 2.4616, + "step": 14704 + }, + { + "epoch": 0.6191578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.0001642924090939996, + "loss": 3.0128, + "step": 14705 + }, + { + "epoch": 0.6192, + "grad_norm": 0.47265625, + "learning_rate": 0.00016426071058646717, + "loss": 2.8629, + "step": 14706 + }, + { + "epoch": 0.6192421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00016422901364100383, + "loss": 3.2047, + "step": 14707 + }, + { + "epoch": 0.6192842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.00016419731825818712, + "loss": 2.9342, + "step": 14708 + }, + { + "epoch": 0.6193263157894737, + "grad_norm": 0.447265625, + "learning_rate": 0.00016416562443859444, + "loss": 2.9637, + "step": 14709 + }, + { + "epoch": 0.6193684210526316, + "grad_norm": 0.435546875, + "learning_rate": 0.0001641339321828033, + "loss": 3.4904, + "step": 14710 + }, + { + "epoch": 0.6194105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.0001641022414913909, + "loss": 3.3665, + "step": 14711 + }, + { + "epoch": 0.6194526315789474, + "grad_norm": 0.462890625, + "learning_rate": 0.0001640705523649349, + "loss": 2.6715, + "step": 14712 + }, + { + "epoch": 0.6194947368421052, + "grad_norm": 0.42578125, + "learning_rate": 0.00016403886480401234, + "loss": 3.5207, + "step": 14713 + }, + { + "epoch": 0.6195368421052632, + "grad_norm": 0.4296875, + "learning_rate": 0.00016400717880920069, + "loss": 2.9443, + "step": 14714 + }, + { + "epoch": 0.619578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.00016397549438107727, + "loss": 3.294, + "step": 14715 + }, + { + "epoch": 0.619621052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00016394381152021916, + "loss": 2.8237, + "step": 14716 + }, + { + "epoch": 0.6196631578947368, + "grad_norm": 0.40234375, + "learning_rate": 0.00016391213022720378, + "loss": 2.4853, + "step": 14717 + }, + { + "epoch": 0.6197052631578948, + "grad_norm": 0.42578125, + "learning_rate": 0.00016388045050260816, + "loss": 3.3355, + "step": 14718 + }, + { + "epoch": 0.6197473684210526, + "grad_norm": 0.46875, + "learning_rate": 0.00016384877234700967, + "loss": 3.7743, + "step": 14719 + }, + { + "epoch": 0.6197894736842106, + "grad_norm": 0.40625, + "learning_rate": 0.00016381709576098518, + "loss": 2.9684, + "step": 14720 + }, + { + "epoch": 0.6198315789473684, + "grad_norm": 0.439453125, + "learning_rate": 0.000163785420745112, + "loss": 3.1094, + "step": 14721 + }, + { + "epoch": 0.6198736842105264, + "grad_norm": 0.4296875, + "learning_rate": 0.00016375374729996723, + "loss": 3.2501, + "step": 14722 + }, + { + "epoch": 0.6199157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.00016372207542612776, + "loss": 3.146, + "step": 14723 + }, + { + "epoch": 0.6199578947368422, + "grad_norm": 0.416015625, + "learning_rate": 0.00016369040512417073, + "loss": 3.0607, + "step": 14724 + }, + { + "epoch": 0.62, + "grad_norm": 0.4296875, + "learning_rate": 0.00016365873639467314, + "loss": 2.838, + "step": 14725 + }, + { + "epoch": 0.620042105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.0001636270692382119, + "loss": 3.2375, + "step": 14726 + }, + { + "epoch": 0.6200842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.00016359540365536393, + "loss": 3.2377, + "step": 14727 + }, + { + "epoch": 0.6201263157894736, + "grad_norm": 0.431640625, + "learning_rate": 0.00016356373964670617, + "loss": 3.3782, + "step": 14728 + }, + { + "epoch": 0.6201684210526316, + "grad_norm": 0.46484375, + "learning_rate": 0.00016353207721281566, + "loss": 3.1545, + "step": 14729 + }, + { + "epoch": 0.6202105263157894, + "grad_norm": 0.40625, + "learning_rate": 0.00016350041635426898, + "loss": 3.169, + "step": 14730 + }, + { + "epoch": 0.6202526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00016346875707164317, + "loss": 3.2718, + "step": 14731 + }, + { + "epoch": 0.6202947368421052, + "grad_norm": 0.412109375, + "learning_rate": 0.00016343709936551484, + "loss": 3.0835, + "step": 14732 + }, + { + "epoch": 0.6203368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.00016340544323646092, + "loss": 2.9436, + "step": 14733 + }, + { + "epoch": 0.620378947368421, + "grad_norm": 0.396484375, + "learning_rate": 0.00016337378868505804, + "loss": 2.7633, + "step": 14734 + }, + { + "epoch": 0.620421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.0001633421357118829, + "loss": 3.0181, + "step": 14735 + }, + { + "epoch": 0.6204631578947368, + "grad_norm": 0.435546875, + "learning_rate": 0.0001633104843175124, + "loss": 3.3796, + "step": 14736 + }, + { + "epoch": 0.6205052631578948, + "grad_norm": 0.4296875, + "learning_rate": 0.00016327883450252285, + "loss": 2.9884, + "step": 14737 + }, + { + "epoch": 0.6205473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.00016324718626749117, + "loss": 3.1744, + "step": 14738 + }, + { + "epoch": 0.6205894736842106, + "grad_norm": 0.51171875, + "learning_rate": 0.00016321553961299375, + "loss": 2.9381, + "step": 14739 + }, + { + "epoch": 0.6206315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.00016318389453960722, + "loss": 2.8592, + "step": 14740 + }, + { + "epoch": 0.6206736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.00016315225104790814, + "loss": 3.2543, + "step": 14741 + }, + { + "epoch": 0.6207157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00016312060913847298, + "loss": 2.9172, + "step": 14742 + }, + { + "epoch": 0.6207578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.0001630889688118783, + "loss": 3.3776, + "step": 14743 + }, + { + "epoch": 0.6208, + "grad_norm": 0.4296875, + "learning_rate": 0.00016305733006870043, + "loss": 3.0296, + "step": 14744 + }, + { + "epoch": 0.6208421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00016302569290951597, + "loss": 2.8502, + "step": 14745 + }, + { + "epoch": 0.6208842105263158, + "grad_norm": 0.462890625, + "learning_rate": 0.0001629940573349011, + "loss": 3.4774, + "step": 14746 + }, + { + "epoch": 0.6209263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.0001629624233454322, + "loss": 3.2138, + "step": 14747 + }, + { + "epoch": 0.6209684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.00016293079094168586, + "loss": 3.4904, + "step": 14748 + }, + { + "epoch": 0.6210105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.00016289916012423807, + "loss": 2.8742, + "step": 14749 + }, + { + "epoch": 0.6210526315789474, + "grad_norm": 0.4609375, + "learning_rate": 0.00016286753089366532, + "loss": 3.2267, + "step": 14750 + }, + { + "epoch": 0.6210947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.0001628359032505437, + "loss": 3.1515, + "step": 14751 + }, + { + "epoch": 0.6211368421052632, + "grad_norm": 0.400390625, + "learning_rate": 0.00016280427719544965, + "loss": 3.3967, + "step": 14752 + }, + { + "epoch": 0.621178947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.0001627726527289591, + "loss": 3.3758, + "step": 14753 + }, + { + "epoch": 0.6212210526315789, + "grad_norm": 0.443359375, + "learning_rate": 0.00016274102985164833, + "loss": 2.9003, + "step": 14754 + }, + { + "epoch": 0.6212631578947369, + "grad_norm": 0.421875, + "learning_rate": 0.00016270940856409356, + "loss": 2.6935, + "step": 14755 + }, + { + "epoch": 0.6213052631578947, + "grad_norm": 0.4609375, + "learning_rate": 0.0001626777888668707, + "loss": 3.0158, + "step": 14756 + }, + { + "epoch": 0.6213473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.00016264617076055605, + "loss": 3.2252, + "step": 14757 + }, + { + "epoch": 0.6213894736842105, + "grad_norm": 0.453125, + "learning_rate": 0.0001626145542457254, + "loss": 3.0112, + "step": 14758 + }, + { + "epoch": 0.6214315789473684, + "grad_norm": 0.4765625, + "learning_rate": 0.00016258293932295499, + "loss": 3.0777, + "step": 14759 + }, + { + "epoch": 0.6214736842105263, + "grad_norm": 0.42578125, + "learning_rate": 0.00016255132599282064, + "loss": 3.1352, + "step": 14760 + }, + { + "epoch": 0.6215157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.0001625197142558984, + "loss": 3.009, + "step": 14761 + }, + { + "epoch": 0.6215578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00016248810411276427, + "loss": 3.3425, + "step": 14762 + }, + { + "epoch": 0.6216, + "grad_norm": 0.427734375, + "learning_rate": 0.00016245649556399395, + "loss": 3.132, + "step": 14763 + }, + { + "epoch": 0.6216421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.00016242488861016352, + "loss": 2.775, + "step": 14764 + }, + { + "epoch": 0.6216842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.00016239328325184864, + "loss": 3.3143, + "step": 14765 + }, + { + "epoch": 0.6217263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00016236167948962524, + "loss": 2.9878, + "step": 14766 + }, + { + "epoch": 0.6217684210526315, + "grad_norm": 0.419921875, + "learning_rate": 0.00016233007732406902, + "loss": 2.799, + "step": 14767 + }, + { + "epoch": 0.6218105263157895, + "grad_norm": 0.462890625, + "learning_rate": 0.00016229847675575584, + "loss": 3.5372, + "step": 14768 + }, + { + "epoch": 0.6218526315789473, + "grad_norm": 0.408203125, + "learning_rate": 0.00016226687778526136, + "loss": 2.6357, + "step": 14769 + }, + { + "epoch": 0.6218947368421053, + "grad_norm": 0.453125, + "learning_rate": 0.00016223528041316127, + "loss": 3.2334, + "step": 14770 + }, + { + "epoch": 0.6219368421052631, + "grad_norm": 0.416015625, + "learning_rate": 0.00016220368464003133, + "loss": 2.6858, + "step": 14771 + }, + { + "epoch": 0.6219789473684211, + "grad_norm": 0.42578125, + "learning_rate": 0.00016217209046644697, + "loss": 3.527, + "step": 14772 + }, + { + "epoch": 0.6220210526315789, + "grad_norm": 0.439453125, + "learning_rate": 0.00016214049789298405, + "loss": 3.4784, + "step": 14773 + }, + { + "epoch": 0.6220631578947369, + "grad_norm": 0.4375, + "learning_rate": 0.00016210890692021793, + "loss": 3.4383, + "step": 14774 + }, + { + "epoch": 0.6221052631578947, + "grad_norm": 0.41015625, + "learning_rate": 0.00016207731754872427, + "loss": 2.8816, + "step": 14775 + }, + { + "epoch": 0.6221473684210527, + "grad_norm": 0.39453125, + "learning_rate": 0.00016204572977907862, + "loss": 3.0155, + "step": 14776 + }, + { + "epoch": 0.6221894736842105, + "grad_norm": 0.58203125, + "learning_rate": 0.00016201414361185636, + "loss": 3.5335, + "step": 14777 + }, + { + "epoch": 0.6222315789473685, + "grad_norm": 0.435546875, + "learning_rate": 0.00016198255904763316, + "loss": 3.0139, + "step": 14778 + }, + { + "epoch": 0.6222736842105263, + "grad_norm": 0.416015625, + "learning_rate": 0.00016195097608698417, + "loss": 3.1913, + "step": 14779 + }, + { + "epoch": 0.6223157894736842, + "grad_norm": 0.396484375, + "learning_rate": 0.0001619193947304851, + "loss": 3.5826, + "step": 14780 + }, + { + "epoch": 0.6223578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00016188781497871103, + "loss": 2.7906, + "step": 14781 + }, + { + "epoch": 0.6224, + "grad_norm": 0.44140625, + "learning_rate": 0.00016185623683223743, + "loss": 3.1118, + "step": 14782 + }, + { + "epoch": 0.6224421052631579, + "grad_norm": 0.455078125, + "learning_rate": 0.00016182466029163973, + "loss": 2.6, + "step": 14783 + }, + { + "epoch": 0.6224842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0001617930853574931, + "loss": 3.299, + "step": 14784 + }, + { + "epoch": 0.6225263157894737, + "grad_norm": 0.396484375, + "learning_rate": 0.00016176151203037278, + "loss": 2.865, + "step": 14785 + }, + { + "epoch": 0.6225684210526315, + "grad_norm": 0.423828125, + "learning_rate": 0.00016172994031085402, + "loss": 3.1171, + "step": 14786 + }, + { + "epoch": 0.6226105263157895, + "grad_norm": 0.439453125, + "learning_rate": 0.00016169837019951213, + "loss": 2.9711, + "step": 14787 + }, + { + "epoch": 0.6226526315789473, + "grad_norm": 0.427734375, + "learning_rate": 0.00016166680169692207, + "loss": 3.1872, + "step": 14788 + }, + { + "epoch": 0.6226947368421053, + "grad_norm": 0.4609375, + "learning_rate": 0.0001616352348036591, + "loss": 3.0893, + "step": 14789 + }, + { + "epoch": 0.6227368421052631, + "grad_norm": 0.419921875, + "learning_rate": 0.00016160366952029846, + "loss": 3.5962, + "step": 14790 + }, + { + "epoch": 0.6227789473684211, + "grad_norm": 0.482421875, + "learning_rate": 0.00016157210584741497, + "loss": 3.4068, + "step": 14791 + }, + { + "epoch": 0.6228210526315789, + "grad_norm": 0.44140625, + "learning_rate": 0.00016154054378558387, + "loss": 2.9685, + "step": 14792 + }, + { + "epoch": 0.6228631578947369, + "grad_norm": 0.400390625, + "learning_rate": 0.00016150898333538007, + "loss": 2.9176, + "step": 14793 + }, + { + "epoch": 0.6229052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.0001614774244973786, + "loss": 2.8883, + "step": 14794 + }, + { + "epoch": 0.6229473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.0001614458672721546, + "loss": 3.1156, + "step": 14795 + }, + { + "epoch": 0.6229894736842105, + "grad_norm": 0.453125, + "learning_rate": 0.00016141431166028273, + "loss": 2.5544, + "step": 14796 + }, + { + "epoch": 0.6230315789473684, + "grad_norm": 0.4921875, + "learning_rate": 0.00016138275766233812, + "loss": 3.2856, + "step": 14797 + }, + { + "epoch": 0.6230736842105263, + "grad_norm": 0.4453125, + "learning_rate": 0.0001613512052788954, + "loss": 3.0189, + "step": 14798 + }, + { + "epoch": 0.6231157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00016131965451052967, + "loss": 2.8155, + "step": 14799 + }, + { + "epoch": 0.6231578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00016128810535781562, + "loss": 3.1113, + "step": 14800 + }, + { + "epoch": 0.6232, + "grad_norm": 0.416015625, + "learning_rate": 0.000161256557821328, + "loss": 2.7182, + "step": 14801 + }, + { + "epoch": 0.6232421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00016122501190164168, + "loss": 2.7206, + "step": 14802 + }, + { + "epoch": 0.6232842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00016119346759933128, + "loss": 3.4374, + "step": 14803 + }, + { + "epoch": 0.6233263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00016116192491497168, + "loss": 3.5363, + "step": 14804 + }, + { + "epoch": 0.6233684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00016113038384913731, + "loss": 3.5053, + "step": 14805 + }, + { + "epoch": 0.6234105263157895, + "grad_norm": 0.88671875, + "learning_rate": 0.00016109884440240302, + "loss": 3.1919, + "step": 14806 + }, + { + "epoch": 0.6234526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.00016106730657534324, + "loss": 2.9201, + "step": 14807 + }, + { + "epoch": 0.6234947368421052, + "grad_norm": 0.435546875, + "learning_rate": 0.00016103577036853268, + "loss": 2.9444, + "step": 14808 + }, + { + "epoch": 0.6235368421052632, + "grad_norm": 0.478515625, + "learning_rate": 0.00016100423578254585, + "loss": 2.926, + "step": 14809 + }, + { + "epoch": 0.623578947368421, + "grad_norm": 0.466796875, + "learning_rate": 0.00016097270281795724, + "loss": 2.9178, + "step": 14810 + }, + { + "epoch": 0.623621052631579, + "grad_norm": 0.4765625, + "learning_rate": 0.0001609411714753415, + "loss": 2.6337, + "step": 14811 + }, + { + "epoch": 0.6236631578947368, + "grad_norm": 0.4140625, + "learning_rate": 0.00016090964175527288, + "loss": 3.3811, + "step": 14812 + }, + { + "epoch": 0.6237052631578948, + "grad_norm": 0.453125, + "learning_rate": 0.000160878113658326, + "loss": 2.9459, + "step": 14813 + }, + { + "epoch": 0.6237473684210526, + "grad_norm": 0.470703125, + "learning_rate": 0.00016084658718507506, + "loss": 3.1945, + "step": 14814 + }, + { + "epoch": 0.6237894736842106, + "grad_norm": 0.4296875, + "learning_rate": 0.00016081506233609456, + "loss": 3.469, + "step": 14815 + }, + { + "epoch": 0.6238315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00016078353911195891, + "loss": 2.9836, + "step": 14816 + }, + { + "epoch": 0.6238736842105264, + "grad_norm": 0.4453125, + "learning_rate": 0.0001607520175132423, + "loss": 2.9011, + "step": 14817 + }, + { + "epoch": 0.6239157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.00016072049754051909, + "loss": 3.0529, + "step": 14818 + }, + { + "epoch": 0.6239578947368422, + "grad_norm": 0.4453125, + "learning_rate": 0.00016068897919436348, + "loss": 2.9474, + "step": 14819 + }, + { + "epoch": 0.624, + "grad_norm": 0.41796875, + "learning_rate": 0.00016065746247534985, + "loss": 3.2194, + "step": 14820 + }, + { + "epoch": 0.6240421052631578, + "grad_norm": 0.474609375, + "learning_rate": 0.00016062594738405216, + "loss": 3.2669, + "step": 14821 + }, + { + "epoch": 0.6240842105263158, + "grad_norm": 0.439453125, + "learning_rate": 0.0001605944339210447, + "loss": 3.2098, + "step": 14822 + }, + { + "epoch": 0.6241263157894736, + "grad_norm": 0.4296875, + "learning_rate": 0.0001605629220869017, + "loss": 2.7981, + "step": 14823 + }, + { + "epoch": 0.6241684210526316, + "grad_norm": 0.400390625, + "learning_rate": 0.00016053141188219708, + "loss": 2.8417, + "step": 14824 + }, + { + "epoch": 0.6242105263157894, + "grad_norm": 0.4140625, + "learning_rate": 0.00016049990330750509, + "loss": 2.8781, + "step": 14825 + }, + { + "epoch": 0.6242526315789474, + "grad_norm": 0.4375, + "learning_rate": 0.0001604683963633996, + "loss": 3.4556, + "step": 14826 + }, + { + "epoch": 0.6242947368421052, + "grad_norm": 0.4296875, + "learning_rate": 0.00016043689105045488, + "loss": 2.9334, + "step": 14827 + }, + { + "epoch": 0.6243368421052632, + "grad_norm": 0.435546875, + "learning_rate": 0.00016040538736924464, + "loss": 3.1528, + "step": 14828 + }, + { + "epoch": 0.624378947368421, + "grad_norm": 0.453125, + "learning_rate": 0.00016037388532034297, + "loss": 2.6867, + "step": 14829 + }, + { + "epoch": 0.624421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00016034238490432394, + "loss": 2.971, + "step": 14830 + }, + { + "epoch": 0.6244631578947368, + "grad_norm": 0.40234375, + "learning_rate": 0.00016031088612176116, + "loss": 3.0505, + "step": 14831 + }, + { + "epoch": 0.6245052631578948, + "grad_norm": 0.44140625, + "learning_rate": 0.0001602793889732288, + "loss": 2.9823, + "step": 14832 + }, + { + "epoch": 0.6245473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.00016024789345930047, + "loss": 3.2585, + "step": 14833 + }, + { + "epoch": 0.6245894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.00016021639958055007, + "loss": 3.009, + "step": 14834 + }, + { + "epoch": 0.6246315789473684, + "grad_norm": 0.7265625, + "learning_rate": 0.00016018490733755138, + "loss": 2.7402, + "step": 14835 + }, + { + "epoch": 0.6246736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.0001601534167308781, + "loss": 3.5637, + "step": 14836 + }, + { + "epoch": 0.6247157894736842, + "grad_norm": 0.5, + "learning_rate": 0.00016012192776110413, + "loss": 3.4937, + "step": 14837 + }, + { + "epoch": 0.6247578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00016009044042880288, + "loss": 2.8601, + "step": 14838 + }, + { + "epoch": 0.6248, + "grad_norm": 0.45703125, + "learning_rate": 0.00016005895473454834, + "loss": 3.1615, + "step": 14839 + }, + { + "epoch": 0.6248421052631579, + "grad_norm": 0.3984375, + "learning_rate": 0.00016002747067891382, + "loss": 2.6641, + "step": 14840 + }, + { + "epoch": 0.6248842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00015999598826247315, + "loss": 3.3915, + "step": 14841 + }, + { + "epoch": 0.6249263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00015996450748579982, + "loss": 3.0779, + "step": 14842 + }, + { + "epoch": 0.6249684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.00015993302834946732, + "loss": 2.717, + "step": 14843 + }, + { + "epoch": 0.6250105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.00015990155085404925, + "loss": 3.4828, + "step": 14844 + }, + { + "epoch": 0.6250526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00015987007500011902, + "loss": 3.0511, + "step": 14845 + }, + { + "epoch": 0.6250947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.00015983860078825024, + "loss": 3.191, + "step": 14846 + }, + { + "epoch": 0.6251368421052632, + "grad_norm": 0.423828125, + "learning_rate": 0.00015980712821901612, + "loss": 2.8682, + "step": 14847 + }, + { + "epoch": 0.6251789473684211, + "grad_norm": 0.435546875, + "learning_rate": 0.00015977565729299016, + "loss": 3.1973, + "step": 14848 + }, + { + "epoch": 0.6252210526315789, + "grad_norm": 0.435546875, + "learning_rate": 0.0001597441880107458, + "loss": 3.1877, + "step": 14849 + }, + { + "epoch": 0.6252631578947369, + "grad_norm": 0.458984375, + "learning_rate": 0.00015971272037285618, + "loss": 2.9822, + "step": 14850 + }, + { + "epoch": 0.6253052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.00015968125437989478, + "loss": 2.9245, + "step": 14851 + }, + { + "epoch": 0.6253473684210527, + "grad_norm": 0.42578125, + "learning_rate": 0.00015964979003243475, + "loss": 3.0644, + "step": 14852 + }, + { + "epoch": 0.6253894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.00015961832733104946, + "loss": 3.1771, + "step": 14853 + }, + { + "epoch": 0.6254315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00015958686627631197, + "loss": 3.1407, + "step": 14854 + }, + { + "epoch": 0.6254736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00015955540686879554, + "loss": 3.1409, + "step": 14855 + }, + { + "epoch": 0.6255157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00015952394910907346, + "loss": 3.355, + "step": 14856 + }, + { + "epoch": 0.6255578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.00015949249299771861, + "loss": 3.1925, + "step": 14857 + }, + { + "epoch": 0.6256, + "grad_norm": 0.416015625, + "learning_rate": 0.00015946103853530427, + "loss": 3.0282, + "step": 14858 + }, + { + "epoch": 0.6256421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00015942958572240335, + "loss": 3.0676, + "step": 14859 + }, + { + "epoch": 0.6256842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00015939813455958902, + "loss": 3.397, + "step": 14860 + }, + { + "epoch": 0.6257263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00015936668504743415, + "loss": 3.5096, + "step": 14861 + }, + { + "epoch": 0.6257684210526315, + "grad_norm": 0.45703125, + "learning_rate": 0.00015933523718651184, + "loss": 3.1072, + "step": 14862 + }, + { + "epoch": 0.6258105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.000159303790977395, + "loss": 2.6977, + "step": 14863 + }, + { + "epoch": 0.6258526315789473, + "grad_norm": 0.41015625, + "learning_rate": 0.00015927234642065647, + "loss": 2.9145, + "step": 14864 + }, + { + "epoch": 0.6258947368421053, + "grad_norm": 0.447265625, + "learning_rate": 0.0001592409035168693, + "loss": 3.4849, + "step": 14865 + }, + { + "epoch": 0.6259368421052631, + "grad_norm": 0.44140625, + "learning_rate": 0.0001592094622666061, + "loss": 3.4892, + "step": 14866 + }, + { + "epoch": 0.6259789473684211, + "grad_norm": 0.4453125, + "learning_rate": 0.0001591780226704399, + "loss": 2.7073, + "step": 14867 + }, + { + "epoch": 0.6260210526315789, + "grad_norm": 0.412109375, + "learning_rate": 0.00015914658472894336, + "loss": 2.9459, + "step": 14868 + }, + { + "epoch": 0.6260631578947369, + "grad_norm": 0.416015625, + "learning_rate": 0.00015911514844268936, + "loss": 3.397, + "step": 14869 + }, + { + "epoch": 0.6261052631578947, + "grad_norm": 0.453125, + "learning_rate": 0.00015908371381225054, + "loss": 3.1791, + "step": 14870 + }, + { + "epoch": 0.6261473684210527, + "grad_norm": 0.431640625, + "learning_rate": 0.00015905228083819961, + "loss": 3.104, + "step": 14871 + }, + { + "epoch": 0.6261894736842105, + "grad_norm": 0.43359375, + "learning_rate": 0.00015902084952110933, + "loss": 3.1175, + "step": 14872 + }, + { + "epoch": 0.6262315789473685, + "grad_norm": 0.44921875, + "learning_rate": 0.0001589894198615522, + "loss": 2.6542, + "step": 14873 + }, + { + "epoch": 0.6262736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.000158957991860101, + "loss": 3.1141, + "step": 14874 + }, + { + "epoch": 0.6263157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00015892656551732813, + "loss": 3.1094, + "step": 14875 + }, + { + "epoch": 0.6263578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00015889514083380622, + "loss": 3.0647, + "step": 14876 + }, + { + "epoch": 0.6264, + "grad_norm": 0.419921875, + "learning_rate": 0.00015886371781010782, + "loss": 2.6812, + "step": 14877 + }, + { + "epoch": 0.6264421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00015883229644680542, + "loss": 3.1418, + "step": 14878 + }, + { + "epoch": 0.6264842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.00015880087674447152, + "loss": 3.1786, + "step": 14879 + }, + { + "epoch": 0.6265263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.00015876945870367838, + "loss": 3.2833, + "step": 14880 + }, + { + "epoch": 0.6265684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.0001587380423249986, + "loss": 3.2237, + "step": 14881 + }, + { + "epoch": 0.6266105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.0001587066276090044, + "loss": 3.1676, + "step": 14882 + }, + { + "epoch": 0.6266526315789474, + "grad_norm": 0.400390625, + "learning_rate": 0.00015867521455626815, + "loss": 2.8551, + "step": 14883 + }, + { + "epoch": 0.6266947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00015864380316736227, + "loss": 3.1448, + "step": 14884 + }, + { + "epoch": 0.6267368421052631, + "grad_norm": 0.41796875, + "learning_rate": 0.00015861239344285895, + "loss": 2.9195, + "step": 14885 + }, + { + "epoch": 0.6267789473684211, + "grad_norm": 0.4140625, + "learning_rate": 0.00015858098538333042, + "loss": 2.9354, + "step": 14886 + }, + { + "epoch": 0.626821052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0001585495789893489, + "loss": 3.7157, + "step": 14887 + }, + { + "epoch": 0.6268631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.00015851817426148669, + "loss": 3.2879, + "step": 14888 + }, + { + "epoch": 0.6269052631578947, + "grad_norm": 0.4140625, + "learning_rate": 0.00015848677120031577, + "loss": 3.3159, + "step": 14889 + }, + { + "epoch": 0.6269473684210526, + "grad_norm": 0.5234375, + "learning_rate": 0.00015845536980640834, + "loss": 2.9835, + "step": 14890 + }, + { + "epoch": 0.6269894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0001584239700803366, + "loss": 3.1589, + "step": 14891 + }, + { + "epoch": 0.6270315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.00015839257202267246, + "loss": 2.8193, + "step": 14892 + }, + { + "epoch": 0.6270736842105263, + "grad_norm": 0.44140625, + "learning_rate": 0.00015836117563398805, + "loss": 3.2201, + "step": 14893 + }, + { + "epoch": 0.6271157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.00015832978091485532, + "loss": 3.595, + "step": 14894 + }, + { + "epoch": 0.6271578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00015829838786584623, + "loss": 3.4672, + "step": 14895 + }, + { + "epoch": 0.6272, + "grad_norm": 0.435546875, + "learning_rate": 0.0001582669964875329, + "loss": 3.2728, + "step": 14896 + }, + { + "epoch": 0.6272421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.000158235606780487, + "loss": 2.7824, + "step": 14897 + }, + { + "epoch": 0.6272842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00015820421874528066, + "loss": 3.1381, + "step": 14898 + }, + { + "epoch": 0.6273263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.00015817283238248546, + "loss": 3.3476, + "step": 14899 + }, + { + "epoch": 0.6273684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.00015814144769267345, + "loss": 3.5594, + "step": 14900 + }, + { + "epoch": 0.6274105263157894, + "grad_norm": 0.4609375, + "learning_rate": 0.0001581100646764163, + "loss": 3.0274, + "step": 14901 + }, + { + "epoch": 0.6274526315789474, + "grad_norm": 0.45703125, + "learning_rate": 0.00015807868333428578, + "loss": 3.3255, + "step": 14902 + }, + { + "epoch": 0.6274947368421052, + "grad_norm": 0.427734375, + "learning_rate": 0.00015804730366685365, + "loss": 3.161, + "step": 14903 + }, + { + "epoch": 0.6275368421052632, + "grad_norm": 0.466796875, + "learning_rate": 0.00015801592567469159, + "loss": 3.0962, + "step": 14904 + }, + { + "epoch": 0.627578947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.00015798454935837139, + "loss": 3.0788, + "step": 14905 + }, + { + "epoch": 0.627621052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.00015795317471846448, + "loss": 2.9881, + "step": 14906 + }, + { + "epoch": 0.6276631578947368, + "grad_norm": 0.51953125, + "learning_rate": 0.00015792180175554267, + "loss": 3.3221, + "step": 14907 + }, + { + "epoch": 0.6277052631578948, + "grad_norm": 0.4296875, + "learning_rate": 0.00015789043047017737, + "loss": 3.2497, + "step": 14908 + }, + { + "epoch": 0.6277473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00015785906086294017, + "loss": 3.2467, + "step": 14909 + }, + { + "epoch": 0.6277894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.00015782769293440268, + "loss": 2.998, + "step": 14910 + }, + { + "epoch": 0.6278315789473684, + "grad_norm": 0.47265625, + "learning_rate": 0.0001577963266851363, + "loss": 3.075, + "step": 14911 + }, + { + "epoch": 0.6278736842105264, + "grad_norm": 0.4296875, + "learning_rate": 0.00015776496211571258, + "loss": 2.7698, + "step": 14912 + }, + { + "epoch": 0.6279157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.0001577335992267028, + "loss": 3.0398, + "step": 14913 + }, + { + "epoch": 0.6279578947368422, + "grad_norm": 0.4296875, + "learning_rate": 0.00015770223801867854, + "loss": 3.1419, + "step": 14914 + }, + { + "epoch": 0.628, + "grad_norm": 0.423828125, + "learning_rate": 0.00015767087849221097, + "loss": 2.5886, + "step": 14915 + }, + { + "epoch": 0.6280421052631578, + "grad_norm": 0.419921875, + "learning_rate": 0.00015763952064787147, + "loss": 3.1272, + "step": 14916 + }, + { + "epoch": 0.6280842105263158, + "grad_norm": 0.462890625, + "learning_rate": 0.0001576081644862315, + "loss": 3.2196, + "step": 14917 + }, + { + "epoch": 0.6281263157894736, + "grad_norm": 0.41796875, + "learning_rate": 0.00015757681000786212, + "loss": 3.3713, + "step": 14918 + }, + { + "epoch": 0.6281684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.0001575454572133347, + "loss": 3.2238, + "step": 14919 + }, + { + "epoch": 0.6282105263157894, + "grad_norm": 0.41015625, + "learning_rate": 0.0001575141061032204, + "loss": 3.0077, + "step": 14920 + }, + { + "epoch": 0.6282526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.00015748275667809051, + "loss": 2.8974, + "step": 14921 + }, + { + "epoch": 0.6282947368421052, + "grad_norm": 0.453125, + "learning_rate": 0.00015745140893851596, + "loss": 3.0965, + "step": 14922 + }, + { + "epoch": 0.6283368421052632, + "grad_norm": 0.458984375, + "learning_rate": 0.00015742006288506805, + "loss": 2.7788, + "step": 14923 + }, + { + "epoch": 0.628378947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.00015738871851831787, + "loss": 2.8099, + "step": 14924 + }, + { + "epoch": 0.628421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00015735737583883635, + "loss": 3.4885, + "step": 14925 + }, + { + "epoch": 0.6284631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.00015732603484719466, + "loss": 2.9779, + "step": 14926 + }, + { + "epoch": 0.6285052631578948, + "grad_norm": 0.421875, + "learning_rate": 0.00015729469554396364, + "loss": 3.539, + "step": 14927 + }, + { + "epoch": 0.6285473684210526, + "grad_norm": 0.43359375, + "learning_rate": 0.00015726335792971447, + "loss": 3.0421, + "step": 14928 + }, + { + "epoch": 0.6285894736842105, + "grad_norm": 0.40234375, + "learning_rate": 0.00015723202200501786, + "loss": 2.9923, + "step": 14929 + }, + { + "epoch": 0.6286315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00015720068777044476, + "loss": 3.1699, + "step": 14930 + }, + { + "epoch": 0.6286736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.0001571693552265663, + "loss": 3.3098, + "step": 14931 + }, + { + "epoch": 0.6287157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.00015713802437395291, + "loss": 3.3099, + "step": 14932 + }, + { + "epoch": 0.6287578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00015710669521317577, + "loss": 2.8568, + "step": 14933 + }, + { + "epoch": 0.6288, + "grad_norm": 0.42578125, + "learning_rate": 0.00015707536774480535, + "loss": 3.2563, + "step": 14934 + }, + { + "epoch": 0.6288421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.00015704404196941263, + "loss": 3.1794, + "step": 14935 + }, + { + "epoch": 0.6288842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.0001570127178875682, + "loss": 3.225, + "step": 14936 + }, + { + "epoch": 0.6289263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.00015698139549984278, + "loss": 3.476, + "step": 14937 + }, + { + "epoch": 0.6289684210526316, + "grad_norm": 0.421875, + "learning_rate": 0.00015695007480680714, + "loss": 2.9088, + "step": 14938 + }, + { + "epoch": 0.6290105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.0001569187558090317, + "loss": 3.3564, + "step": 14939 + }, + { + "epoch": 0.6290526315789474, + "grad_norm": 0.45703125, + "learning_rate": 0.0001568874385070873, + "loss": 3.0855, + "step": 14940 + }, + { + "epoch": 0.6290947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.00015685612290154425, + "loss": 3.1391, + "step": 14941 + }, + { + "epoch": 0.6291368421052631, + "grad_norm": 0.43359375, + "learning_rate": 0.00015682480899297319, + "loss": 3.0318, + "step": 14942 + }, + { + "epoch": 0.6291789473684211, + "grad_norm": 0.439453125, + "learning_rate": 0.00015679349678194467, + "loss": 3.0833, + "step": 14943 + }, + { + "epoch": 0.6292210526315789, + "grad_norm": 0.396484375, + "learning_rate": 0.00015676218626902916, + "loss": 2.6311, + "step": 14944 + }, + { + "epoch": 0.6292631578947369, + "grad_norm": 0.404296875, + "learning_rate": 0.00015673087745479705, + "loss": 3.2621, + "step": 14945 + }, + { + "epoch": 0.6293052631578947, + "grad_norm": 0.484375, + "learning_rate": 0.00015669957033981875, + "loss": 3.1662, + "step": 14946 + }, + { + "epoch": 0.6293473684210527, + "grad_norm": 0.427734375, + "learning_rate": 0.0001566682649246647, + "loss": 3.2021, + "step": 14947 + }, + { + "epoch": 0.6293894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.0001566369612099051, + "loss": 3.1501, + "step": 14948 + }, + { + "epoch": 0.6294315789473685, + "grad_norm": 0.412109375, + "learning_rate": 0.0001566056591961104, + "loss": 3.3739, + "step": 14949 + }, + { + "epoch": 0.6294736842105263, + "grad_norm": 0.458984375, + "learning_rate": 0.00015657435888385098, + "loss": 3.1056, + "step": 14950 + }, + { + "epoch": 0.6295157894736843, + "grad_norm": 0.400390625, + "learning_rate": 0.00015654306027369682, + "loss": 2.8816, + "step": 14951 + }, + { + "epoch": 0.6295578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00015651176336621837, + "loss": 2.7512, + "step": 14952 + }, + { + "epoch": 0.6296, + "grad_norm": 0.416015625, + "learning_rate": 0.00015648046816198568, + "loss": 3.5269, + "step": 14953 + }, + { + "epoch": 0.6296421052631579, + "grad_norm": 0.4609375, + "learning_rate": 0.0001564491746615691, + "loss": 3.2092, + "step": 14954 + }, + { + "epoch": 0.6296842105263157, + "grad_norm": 0.427734375, + "learning_rate": 0.00015641788286553855, + "loss": 2.9902, + "step": 14955 + }, + { + "epoch": 0.6297263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.0001563865927744642, + "loss": 2.9846, + "step": 14956 + }, + { + "epoch": 0.6297684210526315, + "grad_norm": 0.427734375, + "learning_rate": 0.00015635530438891623, + "loss": 3.8157, + "step": 14957 + }, + { + "epoch": 0.6298105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.0001563240177094645, + "loss": 2.7765, + "step": 14958 + }, + { + "epoch": 0.6298526315789473, + "grad_norm": 0.423828125, + "learning_rate": 0.00015629273273667915, + "loss": 3.4377, + "step": 14959 + }, + { + "epoch": 0.6298947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.00015626144947113012, + "loss": 3.4104, + "step": 14960 + }, + { + "epoch": 0.6299368421052631, + "grad_norm": 0.453125, + "learning_rate": 0.0001562301679133873, + "loss": 3.1417, + "step": 14961 + }, + { + "epoch": 0.6299789473684211, + "grad_norm": 0.412109375, + "learning_rate": 0.00015619888806402065, + "loss": 2.893, + "step": 14962 + }, + { + "epoch": 0.6300210526315789, + "grad_norm": 0.431640625, + "learning_rate": 0.0001561676099236, + "loss": 3.422, + "step": 14963 + }, + { + "epoch": 0.6300631578947369, + "grad_norm": 0.4453125, + "learning_rate": 0.0001561363334926954, + "loss": 3.1399, + "step": 14964 + }, + { + "epoch": 0.6301052631578947, + "grad_norm": 0.431640625, + "learning_rate": 0.00015610505877187636, + "loss": 3.207, + "step": 14965 + }, + { + "epoch": 0.6301473684210527, + "grad_norm": 0.431640625, + "learning_rate": 0.000156073785761713, + "loss": 3.2552, + "step": 14966 + }, + { + "epoch": 0.6301894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00015604251446277474, + "loss": 2.9491, + "step": 14967 + }, + { + "epoch": 0.6302315789473685, + "grad_norm": 0.427734375, + "learning_rate": 0.00015601124487563153, + "loss": 3.4595, + "step": 14968 + }, + { + "epoch": 0.6302736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.000155979977000853, + "loss": 3.6816, + "step": 14969 + }, + { + "epoch": 0.6303157894736842, + "grad_norm": 0.455078125, + "learning_rate": 0.00015594871083900874, + "loss": 3.2167, + "step": 14970 + }, + { + "epoch": 0.6303578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00015591744639066862, + "loss": 2.7873, + "step": 14971 + }, + { + "epoch": 0.6304, + "grad_norm": 0.423828125, + "learning_rate": 0.000155886183656402, + "loss": 2.6726, + "step": 14972 + }, + { + "epoch": 0.6304421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.00015585492263677858, + "loss": 3.301, + "step": 14973 + }, + { + "epoch": 0.6304842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0001558236633323678, + "loss": 3.1195, + "step": 14974 + }, + { + "epoch": 0.6305263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.0001557924057437393, + "loss": 3.2124, + "step": 14975 + }, + { + "epoch": 0.6305684210526316, + "grad_norm": 0.486328125, + "learning_rate": 0.00015576114987146238, + "loss": 2.5183, + "step": 14976 + }, + { + "epoch": 0.6306105263157895, + "grad_norm": 0.40625, + "learning_rate": 0.0001557298957161066, + "loss": 2.826, + "step": 14977 + }, + { + "epoch": 0.6306526315789474, + "grad_norm": 0.462890625, + "learning_rate": 0.0001556986432782414, + "loss": 2.817, + "step": 14978 + }, + { + "epoch": 0.6306947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.00015566739255843607, + "loss": 3.1216, + "step": 14979 + }, + { + "epoch": 0.6307368421052632, + "grad_norm": 0.44140625, + "learning_rate": 0.00015563614355726009, + "loss": 3.059, + "step": 14980 + }, + { + "epoch": 0.6307789473684211, + "grad_norm": 0.447265625, + "learning_rate": 0.00015560489627528264, + "loss": 3.1275, + "step": 14981 + }, + { + "epoch": 0.630821052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.00015557365071307313, + "loss": 3.0089, + "step": 14982 + }, + { + "epoch": 0.6308631578947368, + "grad_norm": 0.4140625, + "learning_rate": 0.00015554240687120068, + "loss": 2.8831, + "step": 14983 + }, + { + "epoch": 0.6309052631578947, + "grad_norm": 0.51953125, + "learning_rate": 0.00015551116475023458, + "loss": 2.8487, + "step": 14984 + }, + { + "epoch": 0.6309473684210526, + "grad_norm": 0.44921875, + "learning_rate": 0.0001554799243507441, + "loss": 3.6548, + "step": 14985 + }, + { + "epoch": 0.6309894736842105, + "grad_norm": 0.44921875, + "learning_rate": 0.00015544868567329828, + "loss": 2.9701, + "step": 14986 + }, + { + "epoch": 0.6310315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00015541744871846642, + "loss": 2.5822, + "step": 14987 + }, + { + "epoch": 0.6310736842105263, + "grad_norm": 0.453125, + "learning_rate": 0.00015538621348681742, + "loss": 3.1166, + "step": 14988 + }, + { + "epoch": 0.6311157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00015535497997892057, + "loss": 3.0705, + "step": 14989 + }, + { + "epoch": 0.6311578947368421, + "grad_norm": 0.458984375, + "learning_rate": 0.00015532374819534462, + "loss": 3.0401, + "step": 14990 + }, + { + "epoch": 0.6312, + "grad_norm": 0.431640625, + "learning_rate": 0.00015529251813665878, + "loss": 3.2649, + "step": 14991 + }, + { + "epoch": 0.6312421052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.00015526128980343208, + "loss": 3.2711, + "step": 14992 + }, + { + "epoch": 0.6312842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00015523006319623326, + "loss": 3.0064, + "step": 14993 + }, + { + "epoch": 0.6313263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00015519883831563135, + "loss": 2.876, + "step": 14994 + }, + { + "epoch": 0.6313684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.00015516761516219524, + "loss": 3.1223, + "step": 14995 + }, + { + "epoch": 0.6314105263157894, + "grad_norm": 0.4296875, + "learning_rate": 0.00015513639373649374, + "loss": 3.1781, + "step": 14996 + }, + { + "epoch": 0.6314526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.00015510517403909574, + "loss": 3.3544, + "step": 14997 + }, + { + "epoch": 0.6314947368421052, + "grad_norm": 0.451171875, + "learning_rate": 0.0001550739560705699, + "loss": 3.7107, + "step": 14998 + }, + { + "epoch": 0.6315368421052632, + "grad_norm": 0.443359375, + "learning_rate": 0.00015504273983148518, + "loss": 3.4328, + "step": 14999 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00015501152532241004, + "loss": 3.466, + "step": 15000 + }, + { + "epoch": 0.631578947368421, + "eval_loss": 3.0952181816101074, + "eval_runtime": 335.5119, + "eval_samples_per_second": 44.708, + "eval_steps_per_second": 5.588, + "step": 15000 + }, + { + "epoch": 0.631621052631579, + "grad_norm": 0.4609375, + "learning_rate": 0.00015498031254391334, + "loss": 3.3284, + "step": 15001 + }, + { + "epoch": 0.6316631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.00015494910149656371, + "loss": 2.9961, + "step": 15002 + }, + { + "epoch": 0.6317052631578948, + "grad_norm": 0.40625, + "learning_rate": 0.00015491789218092971, + "loss": 2.9405, + "step": 15003 + }, + { + "epoch": 0.6317473684210526, + "grad_norm": 0.43359375, + "learning_rate": 0.0001548866845975801, + "loss": 3.5555, + "step": 15004 + }, + { + "epoch": 0.6317894736842106, + "grad_norm": 0.40625, + "learning_rate": 0.00015485547874708323, + "loss": 3.0339, + "step": 15005 + }, + { + "epoch": 0.6318315789473684, + "grad_norm": 0.421875, + "learning_rate": 0.00015482427463000786, + "loss": 2.9503, + "step": 15006 + }, + { + "epoch": 0.6318736842105264, + "grad_norm": 0.439453125, + "learning_rate": 0.0001547930722469223, + "loss": 3.3502, + "step": 15007 + }, + { + "epoch": 0.6319157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00015476187159839521, + "loss": 3.1998, + "step": 15008 + }, + { + "epoch": 0.631957894736842, + "grad_norm": 0.455078125, + "learning_rate": 0.0001547306726849948, + "loss": 3.1636, + "step": 15009 + }, + { + "epoch": 0.632, + "grad_norm": 0.447265625, + "learning_rate": 0.00015469947550728958, + "loss": 2.9483, + "step": 15010 + }, + { + "epoch": 0.6320421052631578, + "grad_norm": 0.40625, + "learning_rate": 0.000154668280065848, + "loss": 3.0869, + "step": 15011 + }, + { + "epoch": 0.6320842105263158, + "grad_norm": 0.45703125, + "learning_rate": 0.00015463708636123831, + "loss": 3.1895, + "step": 15012 + }, + { + "epoch": 0.6321263157894736, + "grad_norm": 0.451171875, + "learning_rate": 0.00015460589439402896, + "loss": 2.8422, + "step": 15013 + }, + { + "epoch": 0.6321684210526316, + "grad_norm": 0.4609375, + "learning_rate": 0.00015457470416478802, + "loss": 3.1311, + "step": 15014 + }, + { + "epoch": 0.6322105263157894, + "grad_norm": 0.42578125, + "learning_rate": 0.000154543515674084, + "loss": 3.2495, + "step": 15015 + }, + { + "epoch": 0.6322526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.0001545123289224848, + "loss": 3.2504, + "step": 15016 + }, + { + "epoch": 0.6322947368421052, + "grad_norm": 0.427734375, + "learning_rate": 0.00015448114391055878, + "loss": 3.506, + "step": 15017 + }, + { + "epoch": 0.6323368421052632, + "grad_norm": 0.408203125, + "learning_rate": 0.00015444996063887422, + "loss": 2.7659, + "step": 15018 + }, + { + "epoch": 0.632378947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.000154418779107999, + "loss": 3.3293, + "step": 15019 + }, + { + "epoch": 0.632421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00015438759931850137, + "loss": 3.3957, + "step": 15020 + }, + { + "epoch": 0.6324631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.00015435642127094927, + "loss": 2.8634, + "step": 15021 + }, + { + "epoch": 0.6325052631578947, + "grad_norm": 0.466796875, + "learning_rate": 0.00015432524496591094, + "loss": 2.6752, + "step": 15022 + }, + { + "epoch": 0.6325473684210526, + "grad_norm": 0.44140625, + "learning_rate": 0.0001542940704039541, + "loss": 3.2996, + "step": 15023 + }, + { + "epoch": 0.6325894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.00015426289758564688, + "loss": 3.3541, + "step": 15024 + }, + { + "epoch": 0.6326315789473684, + "grad_norm": 0.44140625, + "learning_rate": 0.00015423172651155725, + "loss": 3.0191, + "step": 15025 + }, + { + "epoch": 0.6326736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00015420055718225295, + "loss": 3.0934, + "step": 15026 + }, + { + "epoch": 0.6327157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00015416938959830198, + "loss": 3.5385, + "step": 15027 + }, + { + "epoch": 0.6327578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.0001541382237602721, + "loss": 2.8548, + "step": 15028 + }, + { + "epoch": 0.6328, + "grad_norm": 0.421875, + "learning_rate": 0.00015410705966873123, + "loss": 3.2134, + "step": 15029 + }, + { + "epoch": 0.6328421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.000154075897324247, + "loss": 3.0927, + "step": 15030 + }, + { + "epoch": 0.6328842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.00015404473672738718, + "loss": 3.1764, + "step": 15031 + }, + { + "epoch": 0.6329263157894737, + "grad_norm": 0.466796875, + "learning_rate": 0.0001540135778787196, + "loss": 3.2262, + "step": 15032 + }, + { + "epoch": 0.6329684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.0001539824207788118, + "loss": 3.4183, + "step": 15033 + }, + { + "epoch": 0.6330105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.00015395126542823155, + "loss": 3.7791, + "step": 15034 + }, + { + "epoch": 0.6330526315789474, + "grad_norm": 0.45703125, + "learning_rate": 0.0001539201118275463, + "loss": 2.9635, + "step": 15035 + }, + { + "epoch": 0.6330947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.00015388895997732374, + "loss": 2.9243, + "step": 15036 + }, + { + "epoch": 0.6331368421052631, + "grad_norm": 0.453125, + "learning_rate": 0.00015385780987813138, + "loss": 2.8828, + "step": 15037 + }, + { + "epoch": 0.6331789473684211, + "grad_norm": 0.4375, + "learning_rate": 0.00015382666153053676, + "loss": 3.8644, + "step": 15038 + }, + { + "epoch": 0.6332210526315789, + "grad_norm": 0.59765625, + "learning_rate": 0.00015379551493510747, + "loss": 3.1287, + "step": 15039 + }, + { + "epoch": 0.6332631578947369, + "grad_norm": 0.4375, + "learning_rate": 0.0001537643700924108, + "loss": 2.9927, + "step": 15040 + }, + { + "epoch": 0.6333052631578947, + "grad_norm": 0.427734375, + "learning_rate": 0.00015373322700301429, + "loss": 2.4085, + "step": 15041 + }, + { + "epoch": 0.6333473684210527, + "grad_norm": 0.40234375, + "learning_rate": 0.00015370208566748518, + "loss": 2.7481, + "step": 15042 + }, + { + "epoch": 0.6333894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0001536709460863909, + "loss": 2.9193, + "step": 15043 + }, + { + "epoch": 0.6334315789473685, + "grad_norm": 0.423828125, + "learning_rate": 0.0001536398082602989, + "loss": 3.3618, + "step": 15044 + }, + { + "epoch": 0.6334736842105263, + "grad_norm": 0.3984375, + "learning_rate": 0.00015360867218977634, + "loss": 2.6084, + "step": 15045 + }, + { + "epoch": 0.6335157894736843, + "grad_norm": 0.44921875, + "learning_rate": 0.00015357753787539053, + "loss": 3.1218, + "step": 15046 + }, + { + "epoch": 0.6335578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00015354640531770863, + "loss": 2.6928, + "step": 15047 + }, + { + "epoch": 0.6336, + "grad_norm": 0.421875, + "learning_rate": 0.000153515274517298, + "loss": 3.1263, + "step": 15048 + }, + { + "epoch": 0.6336421052631579, + "grad_norm": 0.388671875, + "learning_rate": 0.00015348414547472562, + "loss": 2.7719, + "step": 15049 + }, + { + "epoch": 0.6336842105263157, + "grad_norm": 0.43359375, + "learning_rate": 0.0001534530181905587, + "loss": 3.3164, + "step": 15050 + }, + { + "epoch": 0.6337263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.00015342189266536444, + "loss": 3.4272, + "step": 15051 + }, + { + "epoch": 0.6337684210526315, + "grad_norm": 0.435546875, + "learning_rate": 0.0001533907688997097, + "loss": 3.1096, + "step": 15052 + }, + { + "epoch": 0.6338105263157895, + "grad_norm": 0.453125, + "learning_rate": 0.00015335964689416174, + "loss": 3.0531, + "step": 15053 + }, + { + "epoch": 0.6338526315789473, + "grad_norm": 0.408203125, + "learning_rate": 0.00015332852664928738, + "loss": 3.2097, + "step": 15054 + }, + { + "epoch": 0.6338947368421053, + "grad_norm": 0.455078125, + "learning_rate": 0.0001532974081656538, + "loss": 3.1711, + "step": 15055 + }, + { + "epoch": 0.6339368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.0001532662914438277, + "loss": 2.9236, + "step": 15056 + }, + { + "epoch": 0.6339789473684211, + "grad_norm": 0.451171875, + "learning_rate": 0.00015323517648437612, + "loss": 3.537, + "step": 15057 + }, + { + "epoch": 0.6340210526315789, + "grad_norm": 0.404296875, + "learning_rate": 0.00015320406328786607, + "loss": 2.6572, + "step": 15058 + }, + { + "epoch": 0.6340631578947369, + "grad_norm": 0.46484375, + "learning_rate": 0.0001531729518548641, + "loss": 3.1046, + "step": 15059 + }, + { + "epoch": 0.6341052631578947, + "grad_norm": 0.453125, + "learning_rate": 0.00015314184218593723, + "loss": 2.8916, + "step": 15060 + }, + { + "epoch": 0.6341473684210527, + "grad_norm": 0.42578125, + "learning_rate": 0.00015311073428165222, + "loss": 3.1758, + "step": 15061 + }, + { + "epoch": 0.6341894736842105, + "grad_norm": 0.458984375, + "learning_rate": 0.0001530796281425758, + "loss": 3.246, + "step": 15062 + }, + { + "epoch": 0.6342315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.00015304852376927464, + "loss": 2.6281, + "step": 15063 + }, + { + "epoch": 0.6342736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.00015301742116231542, + "loss": 2.6975, + "step": 15064 + }, + { + "epoch": 0.6343157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.00015298632032226494, + "loss": 3.1861, + "step": 15065 + }, + { + "epoch": 0.6343578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00015295522124968963, + "loss": 2.699, + "step": 15066 + }, + { + "epoch": 0.6344, + "grad_norm": 0.431640625, + "learning_rate": 0.0001529241239451563, + "loss": 2.9479, + "step": 15067 + }, + { + "epoch": 0.6344421052631579, + "grad_norm": 0.498046875, + "learning_rate": 0.00015289302840923124, + "loss": 3.2398, + "step": 15068 + }, + { + "epoch": 0.6344842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.00015286193464248115, + "loss": 3.2375, + "step": 15069 + }, + { + "epoch": 0.6345263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00015283084264547243, + "loss": 3.3047, + "step": 15070 + }, + { + "epoch": 0.6345684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.0001527997524187716, + "loss": 3.2854, + "step": 15071 + }, + { + "epoch": 0.6346105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.00015276866396294518, + "loss": 3.3843, + "step": 15072 + }, + { + "epoch": 0.6346526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.00015273757727855933, + "loss": 3.0405, + "step": 15073 + }, + { + "epoch": 0.6346947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00015270649236618066, + "loss": 3.012, + "step": 15074 + }, + { + "epoch": 0.6347368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.0001526754092263753, + "loss": 3.3896, + "step": 15075 + }, + { + "epoch": 0.634778947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00015264432785970966, + "loss": 3.2394, + "step": 15076 + }, + { + "epoch": 0.634821052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00015261324826675, + "loss": 2.9098, + "step": 15077 + }, + { + "epoch": 0.6348631578947368, + "grad_norm": 0.392578125, + "learning_rate": 0.00015258217044806244, + "loss": 2.7584, + "step": 15078 + }, + { + "epoch": 0.6349052631578948, + "grad_norm": 0.404296875, + "learning_rate": 0.00015255109440421338, + "loss": 2.8926, + "step": 15079 + }, + { + "epoch": 0.6349473684210526, + "grad_norm": 0.41796875, + "learning_rate": 0.0001525200201357688, + "loss": 2.9187, + "step": 15080 + }, + { + "epoch": 0.6349894736842105, + "grad_norm": 0.4140625, + "learning_rate": 0.00015248894764329502, + "loss": 3.4695, + "step": 15081 + }, + { + "epoch": 0.6350315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.00015245787692735795, + "loss": 3.0178, + "step": 15082 + }, + { + "epoch": 0.6350736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00015242680798852385, + "loss": 3.2409, + "step": 15083 + }, + { + "epoch": 0.6351157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00015239574082735857, + "loss": 3.1375, + "step": 15084 + }, + { + "epoch": 0.6351578947368421, + "grad_norm": 0.46484375, + "learning_rate": 0.0001523646754444282, + "loss": 2.7571, + "step": 15085 + }, + { + "epoch": 0.6352, + "grad_norm": 0.466796875, + "learning_rate": 0.0001523336118402988, + "loss": 3.3967, + "step": 15086 + }, + { + "epoch": 0.6352421052631579, + "grad_norm": 0.404296875, + "learning_rate": 0.00015230255001553617, + "loss": 3.0641, + "step": 15087 + }, + { + "epoch": 0.6352842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.0001522714899707064, + "loss": 3.3358, + "step": 15088 + }, + { + "epoch": 0.6353263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.00015224043170637515, + "loss": 3.2893, + "step": 15089 + }, + { + "epoch": 0.6353684210526316, + "grad_norm": 0.45703125, + "learning_rate": 0.00015220937522310846, + "loss": 3.352, + "step": 15090 + }, + { + "epoch": 0.6354105263157894, + "grad_norm": 0.435546875, + "learning_rate": 0.00015217832052147196, + "loss": 3.5444, + "step": 15091 + }, + { + "epoch": 0.6354526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.0001521472676020315, + "loss": 3.227, + "step": 15092 + }, + { + "epoch": 0.6354947368421052, + "grad_norm": 0.41015625, + "learning_rate": 0.000152116216465353, + "loss": 2.5507, + "step": 15093 + }, + { + "epoch": 0.6355368421052632, + "grad_norm": 0.423828125, + "learning_rate": 0.00015208516711200194, + "loss": 3.1603, + "step": 15094 + }, + { + "epoch": 0.635578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0001520541195425441, + "loss": 2.8496, + "step": 15095 + }, + { + "epoch": 0.635621052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.00015202307375754504, + "loss": 3.0032, + "step": 15096 + }, + { + "epoch": 0.6356631578947368, + "grad_norm": 0.41015625, + "learning_rate": 0.0001519920297575705, + "loss": 3.1384, + "step": 15097 + }, + { + "epoch": 0.6357052631578948, + "grad_norm": 0.400390625, + "learning_rate": 0.00015196098754318612, + "loss": 2.8778, + "step": 15098 + }, + { + "epoch": 0.6357473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.0001519299471149573, + "loss": 3.2905, + "step": 15099 + }, + { + "epoch": 0.6357894736842106, + "grad_norm": 0.419921875, + "learning_rate": 0.00015189890847344965, + "loss": 2.9913, + "step": 15100 + }, + { + "epoch": 0.6358315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00015186787161922854, + "loss": 2.9145, + "step": 15101 + }, + { + "epoch": 0.6358736842105264, + "grad_norm": 0.466796875, + "learning_rate": 0.00015183683655285956, + "loss": 2.7434, + "step": 15102 + }, + { + "epoch": 0.6359157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00015180580327490804, + "loss": 2.5147, + "step": 15103 + }, + { + "epoch": 0.635957894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00015177477178593947, + "loss": 3.2603, + "step": 15104 + }, + { + "epoch": 0.636, + "grad_norm": 0.408203125, + "learning_rate": 0.0001517437420865191, + "loss": 2.9065, + "step": 15105 + }, + { + "epoch": 0.6360421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.0001517127141772123, + "loss": 2.8908, + "step": 15106 + }, + { + "epoch": 0.6360842105263158, + "grad_norm": 0.55078125, + "learning_rate": 0.00015168168805858445, + "loss": 3.0568, + "step": 15107 + }, + { + "epoch": 0.6361263157894737, + "grad_norm": 0.455078125, + "learning_rate": 0.00015165066373120062, + "loss": 2.9242, + "step": 15108 + }, + { + "epoch": 0.6361684210526316, + "grad_norm": 0.435546875, + "learning_rate": 0.00015161964119562627, + "loss": 3.2599, + "step": 15109 + }, + { + "epoch": 0.6362105263157894, + "grad_norm": 0.41796875, + "learning_rate": 0.00015158862045242633, + "loss": 2.648, + "step": 15110 + }, + { + "epoch": 0.6362526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.0001515576015021661, + "loss": 3.2012, + "step": 15111 + }, + { + "epoch": 0.6362947368421052, + "grad_norm": 0.41796875, + "learning_rate": 0.0001515265843454108, + "loss": 3.125, + "step": 15112 + }, + { + "epoch": 0.6363368421052632, + "grad_norm": 0.443359375, + "learning_rate": 0.00015149556898272533, + "loss": 3.186, + "step": 15113 + }, + { + "epoch": 0.636378947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00015146455541467497, + "loss": 3.1077, + "step": 15114 + }, + { + "epoch": 0.636421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00015143354364182454, + "loss": 3.2735, + "step": 15115 + }, + { + "epoch": 0.6364631578947368, + "grad_norm": 0.431640625, + "learning_rate": 0.00015140253366473922, + "loss": 3.1799, + "step": 15116 + }, + { + "epoch": 0.6365052631578947, + "grad_norm": 0.4453125, + "learning_rate": 0.0001513715254839838, + "loss": 3.1229, + "step": 15117 + }, + { + "epoch": 0.6365473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.00015134051910012332, + "loss": 3.0195, + "step": 15118 + }, + { + "epoch": 0.6365894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.0001513095145137227, + "loss": 2.8513, + "step": 15119 + }, + { + "epoch": 0.6366315789473684, + "grad_norm": 0.470703125, + "learning_rate": 0.00015127851172534675, + "loss": 3.0837, + "step": 15120 + }, + { + "epoch": 0.6366736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.0001512475107355603, + "loss": 3.4265, + "step": 15121 + }, + { + "epoch": 0.6367157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00015121651154492819, + "loss": 3.086, + "step": 15122 + }, + { + "epoch": 0.6367578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00015118551415401526, + "loss": 3.1515, + "step": 15123 + }, + { + "epoch": 0.6368, + "grad_norm": 0.44921875, + "learning_rate": 0.00015115451856338602, + "loss": 3.2297, + "step": 15124 + }, + { + "epoch": 0.6368421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00015112352477360537, + "loss": 2.9911, + "step": 15125 + }, + { + "epoch": 0.6368842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.000151092532785238, + "loss": 3.0715, + "step": 15126 + }, + { + "epoch": 0.6369263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00015106154259884836, + "loss": 3.424, + "step": 15127 + }, + { + "epoch": 0.6369684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.0001510305542150012, + "loss": 3.0849, + "step": 15128 + }, + { + "epoch": 0.6370105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.00015099956763426104, + "loss": 3.1155, + "step": 15129 + }, + { + "epoch": 0.6370526315789473, + "grad_norm": 0.482421875, + "learning_rate": 0.0001509685828571925, + "loss": 3.1721, + "step": 15130 + }, + { + "epoch": 0.6370947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.00015093759988435995, + "loss": 3.3245, + "step": 15131 + }, + { + "epoch": 0.6371368421052631, + "grad_norm": 0.404296875, + "learning_rate": 0.00015090661871632795, + "loss": 3.0829, + "step": 15132 + }, + { + "epoch": 0.6371789473684211, + "grad_norm": 0.4453125, + "learning_rate": 0.000150875639353661, + "loss": 3.1264, + "step": 15133 + }, + { + "epoch": 0.6372210526315789, + "grad_norm": 0.412109375, + "learning_rate": 0.00015084466179692335, + "loss": 3.521, + "step": 15134 + }, + { + "epoch": 0.6372631578947369, + "grad_norm": 0.4375, + "learning_rate": 0.00015081368604667956, + "loss": 2.6497, + "step": 15135 + }, + { + "epoch": 0.6373052631578947, + "grad_norm": 0.46484375, + "learning_rate": 0.00015078271210349376, + "loss": 3.2965, + "step": 15136 + }, + { + "epoch": 0.6373473684210527, + "grad_norm": 0.416015625, + "learning_rate": 0.00015075173996793043, + "loss": 3.0335, + "step": 15137 + }, + { + "epoch": 0.6373894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.00015072076964055373, + "loss": 3.0598, + "step": 15138 + }, + { + "epoch": 0.6374315789473685, + "grad_norm": 0.482421875, + "learning_rate": 0.00015068980112192796, + "loss": 2.8235, + "step": 15139 + }, + { + "epoch": 0.6374736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00015065883441261744, + "loss": 3.0912, + "step": 15140 + }, + { + "epoch": 0.6375157894736843, + "grad_norm": 0.43359375, + "learning_rate": 0.00015062786951318614, + "loss": 3.219, + "step": 15141 + }, + { + "epoch": 0.6375578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.0001505969064241984, + "loss": 3.3337, + "step": 15142 + }, + { + "epoch": 0.6376, + "grad_norm": 0.439453125, + "learning_rate": 0.00015056594514621806, + "loss": 3.2448, + "step": 15143 + }, + { + "epoch": 0.6376421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00015053498567980944, + "loss": 3.1603, + "step": 15144 + }, + { + "epoch": 0.6376842105263157, + "grad_norm": 0.431640625, + "learning_rate": 0.00015050402802553657, + "loss": 3.5756, + "step": 15145 + }, + { + "epoch": 0.6377263157894737, + "grad_norm": 0.4140625, + "learning_rate": 0.00015047307218396332, + "loss": 3.5932, + "step": 15146 + }, + { + "epoch": 0.6377684210526315, + "grad_norm": 0.443359375, + "learning_rate": 0.0001504421181556539, + "loss": 3.2706, + "step": 15147 + }, + { + "epoch": 0.6378105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.00015041116594117197, + "loss": 2.8318, + "step": 15148 + }, + { + "epoch": 0.6378526315789473, + "grad_norm": 0.439453125, + "learning_rate": 0.0001503802155410817, + "loss": 3.4171, + "step": 15149 + }, + { + "epoch": 0.6378947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00015034926695594675, + "loss": 3.0776, + "step": 15150 + }, + { + "epoch": 0.6379368421052631, + "grad_norm": 0.423828125, + "learning_rate": 0.00015031832018633107, + "loss": 2.8747, + "step": 15151 + }, + { + "epoch": 0.6379789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.00015028737523279856, + "loss": 3.0641, + "step": 15152 + }, + { + "epoch": 0.6380210526315789, + "grad_norm": 0.45703125, + "learning_rate": 0.00015025643209591285, + "loss": 3.1129, + "step": 15153 + }, + { + "epoch": 0.6380631578947369, + "grad_norm": 0.443359375, + "learning_rate": 0.0001502254907762378, + "loss": 3.2763, + "step": 15154 + }, + { + "epoch": 0.6381052631578947, + "grad_norm": 0.478515625, + "learning_rate": 0.000150194551274337, + "loss": 3.1304, + "step": 15155 + }, + { + "epoch": 0.6381473684210527, + "grad_norm": 0.4453125, + "learning_rate": 0.00015016361359077436, + "loss": 3.127, + "step": 15156 + }, + { + "epoch": 0.6381894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.00015013267772611322, + "loss": 2.8736, + "step": 15157 + }, + { + "epoch": 0.6382315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.0001501017436809174, + "loss": 3.3996, + "step": 15158 + }, + { + "epoch": 0.6382736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00015007081145575047, + "loss": 3.4592, + "step": 15159 + }, + { + "epoch": 0.6383157894736842, + "grad_norm": 0.404296875, + "learning_rate": 0.0001500398810511759, + "loss": 2.9642, + "step": 15160 + }, + { + "epoch": 0.6383578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00015000895246775728, + "loss": 3.1632, + "step": 15161 + }, + { + "epoch": 0.6384, + "grad_norm": 0.404296875, + "learning_rate": 0.00014997802570605807, + "loss": 2.8832, + "step": 15162 + }, + { + "epoch": 0.6384421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0001499471007666417, + "loss": 3.3844, + "step": 15163 + }, + { + "epoch": 0.6384842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00014991617765007154, + "loss": 2.8697, + "step": 15164 + }, + { + "epoch": 0.6385263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00014988525635691106, + "loss": 2.8038, + "step": 15165 + }, + { + "epoch": 0.6385684210526316, + "grad_norm": 0.4765625, + "learning_rate": 0.00014985433688772366, + "loss": 3.1418, + "step": 15166 + }, + { + "epoch": 0.6386105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.00014982341924307246, + "loss": 3.5013, + "step": 15167 + }, + { + "epoch": 0.6386526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.00014979250342352096, + "loss": 3.1737, + "step": 15168 + }, + { + "epoch": 0.6386947368421053, + "grad_norm": 0.40625, + "learning_rate": 0.00014976158942963224, + "loss": 3.0077, + "step": 15169 + }, + { + "epoch": 0.6387368421052632, + "grad_norm": 0.46484375, + "learning_rate": 0.00014973067726196961, + "loss": 3.2396, + "step": 15170 + }, + { + "epoch": 0.638778947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00014969976692109623, + "loss": 2.9719, + "step": 15171 + }, + { + "epoch": 0.638821052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.00014966885840757516, + "loss": 3.2424, + "step": 15172 + }, + { + "epoch": 0.6388631578947368, + "grad_norm": 0.431640625, + "learning_rate": 0.00014963795172196975, + "loss": 3.2607, + "step": 15173 + }, + { + "epoch": 0.6389052631578948, + "grad_norm": 0.46484375, + "learning_rate": 0.00014960704686484282, + "loss": 2.8867, + "step": 15174 + }, + { + "epoch": 0.6389473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.0001495761438367577, + "loss": 3.5469, + "step": 15175 + }, + { + "epoch": 0.6389894736842106, + "grad_norm": 0.404296875, + "learning_rate": 0.0001495452426382771, + "loss": 3.2575, + "step": 15176 + }, + { + "epoch": 0.6390315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.00014951434326996422, + "loss": 3.4603, + "step": 15177 + }, + { + "epoch": 0.6390736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.00014948344573238192, + "loss": 3.3069, + "step": 15178 + }, + { + "epoch": 0.6391157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.0001494525500260931, + "loss": 2.7507, + "step": 15179 + }, + { + "epoch": 0.6391578947368421, + "grad_norm": 0.404296875, + "learning_rate": 0.0001494216561516607, + "loss": 3.0808, + "step": 15180 + }, + { + "epoch": 0.6392, + "grad_norm": 0.44140625, + "learning_rate": 0.00014939076410964752, + "loss": 3.2244, + "step": 15181 + }, + { + "epoch": 0.639242105263158, + "grad_norm": 0.45703125, + "learning_rate": 0.0001493598739006165, + "loss": 3.05, + "step": 15182 + }, + { + "epoch": 0.6392842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.00014932898552513026, + "loss": 3.4507, + "step": 15183 + }, + { + "epoch": 0.6393263157894736, + "grad_norm": 0.419921875, + "learning_rate": 0.0001492980989837517, + "loss": 3.3, + "step": 15184 + }, + { + "epoch": 0.6393684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.00014926721427704334, + "loss": 2.7179, + "step": 15185 + }, + { + "epoch": 0.6394105263157894, + "grad_norm": 0.4296875, + "learning_rate": 0.000149236331405568, + "loss": 2.6776, + "step": 15186 + }, + { + "epoch": 0.6394526315789474, + "grad_norm": 0.41015625, + "learning_rate": 0.00014920545036988837, + "loss": 2.9292, + "step": 15187 + }, + { + "epoch": 0.6394947368421052, + "grad_norm": 0.412109375, + "learning_rate": 0.00014917457117056694, + "loss": 3.1369, + "step": 15188 + }, + { + "epoch": 0.6395368421052632, + "grad_norm": 0.42578125, + "learning_rate": 0.00014914369380816645, + "loss": 3.387, + "step": 15189 + }, + { + "epoch": 0.639578947368421, + "grad_norm": 0.40625, + "learning_rate": 0.00014911281828324926, + "loss": 3.0294, + "step": 15190 + }, + { + "epoch": 0.639621052631579, + "grad_norm": 0.40234375, + "learning_rate": 0.000149081944596378, + "loss": 3.108, + "step": 15191 + }, + { + "epoch": 0.6396631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.0001490510727481152, + "loss": 2.8377, + "step": 15192 + }, + { + "epoch": 0.6397052631578948, + "grad_norm": 0.439453125, + "learning_rate": 0.0001490202027390232, + "loss": 3.5374, + "step": 15193 + }, + { + "epoch": 0.6397473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.00014898933456966451, + "loss": 3.141, + "step": 15194 + }, + { + "epoch": 0.6397894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.00014895846824060138, + "loss": 2.8468, + "step": 15195 + }, + { + "epoch": 0.6398315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.0001489276037523963, + "loss": 3.2337, + "step": 15196 + }, + { + "epoch": 0.6398736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.00014889674110561146, + "loss": 3.1992, + "step": 15197 + }, + { + "epoch": 0.6399157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.00014886588030080922, + "loss": 3.1638, + "step": 15198 + }, + { + "epoch": 0.6399578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00014883502133855193, + "loss": 2.7004, + "step": 15199 + }, + { + "epoch": 0.64, + "grad_norm": 0.45703125, + "learning_rate": 0.00014880416421940155, + "loss": 3.3718, + "step": 15200 + }, + { + "epoch": 0.6400421052631579, + "grad_norm": 0.50390625, + "learning_rate": 0.0001487733089439205, + "loss": 3.0877, + "step": 15201 + }, + { + "epoch": 0.6400842105263158, + "grad_norm": 0.458984375, + "learning_rate": 0.00014874245551267073, + "loss": 3.2789, + "step": 15202 + }, + { + "epoch": 0.6401263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00014871160392621453, + "loss": 3.0484, + "step": 15203 + }, + { + "epoch": 0.6401684210526316, + "grad_norm": 0.451171875, + "learning_rate": 0.0001486807541851138, + "loss": 3.4268, + "step": 15204 + }, + { + "epoch": 0.6402105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.00014864990628993074, + "loss": 3.0317, + "step": 15205 + }, + { + "epoch": 0.6402526315789474, + "grad_norm": 0.45703125, + "learning_rate": 0.00014861906024122735, + "loss": 3.4472, + "step": 15206 + }, + { + "epoch": 0.6402947368421052, + "grad_norm": 0.443359375, + "learning_rate": 0.00014858821603956545, + "loss": 3.11, + "step": 15207 + }, + { + "epoch": 0.6403368421052632, + "grad_norm": 0.494140625, + "learning_rate": 0.00014855737368550727, + "loss": 3.1971, + "step": 15208 + }, + { + "epoch": 0.640378947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.0001485265331796144, + "loss": 3.0998, + "step": 15209 + }, + { + "epoch": 0.640421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.000148495694522449, + "loss": 3.3118, + "step": 15210 + }, + { + "epoch": 0.6404631578947368, + "grad_norm": 0.40625, + "learning_rate": 0.00014846485771457263, + "loss": 2.8374, + "step": 15211 + }, + { + "epoch": 0.6405052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.0001484340227565473, + "loss": 3.1539, + "step": 15212 + }, + { + "epoch": 0.6405473684210526, + "grad_norm": 0.458984375, + "learning_rate": 0.0001484031896489348, + "loss": 3.1276, + "step": 15213 + }, + { + "epoch": 0.6405894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.00014837235839229674, + "loss": 3.1338, + "step": 15214 + }, + { + "epoch": 0.6406315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.000148341528987195, + "loss": 2.9809, + "step": 15215 + }, + { + "epoch": 0.6406736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00014831070143419105, + "loss": 3.1864, + "step": 15216 + }, + { + "epoch": 0.6407157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00014827987573384676, + "loss": 3.3738, + "step": 15217 + }, + { + "epoch": 0.6407578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.0001482490518867235, + "loss": 3.0045, + "step": 15218 + }, + { + "epoch": 0.6408, + "grad_norm": 0.44140625, + "learning_rate": 0.00014821822989338296, + "loss": 2.6944, + "step": 15219 + }, + { + "epoch": 0.6408421052631579, + "grad_norm": 0.4921875, + "learning_rate": 0.00014818740975438676, + "loss": 3.13, + "step": 15220 + }, + { + "epoch": 0.6408842105263158, + "grad_norm": 0.421875, + "learning_rate": 0.0001481565914702963, + "loss": 3.1615, + "step": 15221 + }, + { + "epoch": 0.6409263157894737, + "grad_norm": 0.455078125, + "learning_rate": 0.00014812577504167313, + "loss": 3.1618, + "step": 15222 + }, + { + "epoch": 0.6409684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.00014809496046907855, + "loss": 2.9308, + "step": 15223 + }, + { + "epoch": 0.6410105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.00014806414775307418, + "loss": 2.7885, + "step": 15224 + }, + { + "epoch": 0.6410526315789473, + "grad_norm": 0.439453125, + "learning_rate": 0.00014803333689422115, + "loss": 3.4629, + "step": 15225 + }, + { + "epoch": 0.6410947368421053, + "grad_norm": 0.4375, + "learning_rate": 0.00014800252789308092, + "loss": 3.203, + "step": 15226 + }, + { + "epoch": 0.6411368421052631, + "grad_norm": 0.419921875, + "learning_rate": 0.00014797172075021488, + "loss": 2.954, + "step": 15227 + }, + { + "epoch": 0.6411789473684211, + "grad_norm": 0.427734375, + "learning_rate": 0.00014794091546618414, + "loss": 3.2884, + "step": 15228 + }, + { + "epoch": 0.6412210526315789, + "grad_norm": 0.419921875, + "learning_rate": 0.00014791011204155, + "loss": 3.1616, + "step": 15229 + }, + { + "epoch": 0.6412631578947369, + "grad_norm": 0.40625, + "learning_rate": 0.0001478793104768737, + "loss": 3.2868, + "step": 15230 + }, + { + "epoch": 0.6413052631578947, + "grad_norm": 0.455078125, + "learning_rate": 0.0001478485107727164, + "loss": 3.31, + "step": 15231 + }, + { + "epoch": 0.6413473684210527, + "grad_norm": 0.41796875, + "learning_rate": 0.00014781771292963914, + "loss": 3.2528, + "step": 15232 + }, + { + "epoch": 0.6413894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00014778691694820311, + "loss": 3.1324, + "step": 15233 + }, + { + "epoch": 0.6414315789473685, + "grad_norm": 0.5, + "learning_rate": 0.00014775612282896943, + "loss": 3.1566, + "step": 15234 + }, + { + "epoch": 0.6414736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00014772533057249898, + "loss": 3.5487, + "step": 15235 + }, + { + "epoch": 0.6415157894736843, + "grad_norm": 0.431640625, + "learning_rate": 0.00014769454017935285, + "loss": 3.2191, + "step": 15236 + }, + { + "epoch": 0.6415578947368421, + "grad_norm": 0.462890625, + "learning_rate": 0.00014766375165009205, + "loss": 2.8905, + "step": 15237 + }, + { + "epoch": 0.6416, + "grad_norm": 0.5234375, + "learning_rate": 0.00014763296498527742, + "loss": 3.0981, + "step": 15238 + }, + { + "epoch": 0.6416421052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.00014760218018546983, + "loss": 3.1496, + "step": 15239 + }, + { + "epoch": 0.6416842105263157, + "grad_norm": 0.43359375, + "learning_rate": 0.00014757139725123024, + "loss": 3.0404, + "step": 15240 + }, + { + "epoch": 0.6417263157894737, + "grad_norm": 0.447265625, + "learning_rate": 0.00014754061618311954, + "loss": 3.2482, + "step": 15241 + }, + { + "epoch": 0.6417684210526315, + "grad_norm": 0.4765625, + "learning_rate": 0.00014750983698169832, + "loss": 2.8224, + "step": 15242 + }, + { + "epoch": 0.6418105263157895, + "grad_norm": 0.439453125, + "learning_rate": 0.00014747905964752756, + "loss": 3.2683, + "step": 15243 + }, + { + "epoch": 0.6418526315789473, + "grad_norm": 0.48046875, + "learning_rate": 0.00014744828418116775, + "loss": 2.7334, + "step": 15244 + }, + { + "epoch": 0.6418947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00014741751058317971, + "loss": 3.0247, + "step": 15245 + }, + { + "epoch": 0.6419368421052631, + "grad_norm": 0.435546875, + "learning_rate": 0.00014738673885412415, + "loss": 3.3357, + "step": 15246 + }, + { + "epoch": 0.6419789473684211, + "grad_norm": 0.4375, + "learning_rate": 0.00014735596899456162, + "loss": 3.3715, + "step": 15247 + }, + { + "epoch": 0.6420210526315789, + "grad_norm": 0.4375, + "learning_rate": 0.00014732520100505282, + "loss": 2.9668, + "step": 15248 + }, + { + "epoch": 0.6420631578947369, + "grad_norm": 0.423828125, + "learning_rate": 0.0001472944348861581, + "loss": 3.1655, + "step": 15249 + }, + { + "epoch": 0.6421052631578947, + "grad_norm": 0.439453125, + "learning_rate": 0.00014726367063843822, + "loss": 3.1824, + "step": 15250 + }, + { + "epoch": 0.6421473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.00014723290826245344, + "loss": 3.167, + "step": 15251 + }, + { + "epoch": 0.6421894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00014720214775876435, + "loss": 2.9136, + "step": 15252 + }, + { + "epoch": 0.6422315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.0001471713891279314, + "loss": 3.1543, + "step": 15253 + }, + { + "epoch": 0.6422736842105263, + "grad_norm": 0.412109375, + "learning_rate": 0.00014714063237051484, + "loss": 3.1397, + "step": 15254 + }, + { + "epoch": 0.6423157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.00014710987748707517, + "loss": 2.9942, + "step": 15255 + }, + { + "epoch": 0.6423578947368421, + "grad_norm": 0.470703125, + "learning_rate": 0.00014707912447817258, + "loss": 3.2469, + "step": 15256 + }, + { + "epoch": 0.6424, + "grad_norm": 0.52734375, + "learning_rate": 0.00014704837334436754, + "loss": 3.1904, + "step": 15257 + }, + { + "epoch": 0.6424421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00014701762408622006, + "loss": 2.6426, + "step": 15258 + }, + { + "epoch": 0.6424842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.00014698687670429045, + "loss": 2.8527, + "step": 15259 + }, + { + "epoch": 0.6425263157894737, + "grad_norm": 0.447265625, + "learning_rate": 0.00014695613119913904, + "loss": 3.5967, + "step": 15260 + }, + { + "epoch": 0.6425684210526316, + "grad_norm": 0.4765625, + "learning_rate": 0.0001469253875713257, + "loss": 3.0684, + "step": 15261 + }, + { + "epoch": 0.6426105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.00014689464582141076, + "loss": 3.228, + "step": 15262 + }, + { + "epoch": 0.6426526315789474, + "grad_norm": 0.400390625, + "learning_rate": 0.00014686390594995422, + "loss": 2.8606, + "step": 15263 + }, + { + "epoch": 0.6426947368421053, + "grad_norm": 0.4765625, + "learning_rate": 0.0001468331679575162, + "loss": 3.2451, + "step": 15264 + }, + { + "epoch": 0.6427368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.00014680243184465655, + "loss": 3.2359, + "step": 15265 + }, + { + "epoch": 0.642778947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.0001467716976119353, + "loss": 3.17, + "step": 15266 + }, + { + "epoch": 0.642821052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.0001467409652599126, + "loss": 3.0633, + "step": 15267 + }, + { + "epoch": 0.6428631578947368, + "grad_norm": 0.44140625, + "learning_rate": 0.00014671023478914807, + "loss": 3.0597, + "step": 15268 + }, + { + "epoch": 0.6429052631578948, + "grad_norm": 0.416015625, + "learning_rate": 0.0001466795062002018, + "loss": 3.3747, + "step": 15269 + }, + { + "epoch": 0.6429473684210526, + "grad_norm": 0.478515625, + "learning_rate": 0.00014664877949363341, + "loss": 2.9471, + "step": 15270 + }, + { + "epoch": 0.6429894736842106, + "grad_norm": 0.419921875, + "learning_rate": 0.0001466180546700029, + "loss": 3.156, + "step": 15271 + }, + { + "epoch": 0.6430315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.0001465873317298699, + "loss": 3.1558, + "step": 15272 + }, + { + "epoch": 0.6430736842105264, + "grad_norm": 0.431640625, + "learning_rate": 0.0001465566106737942, + "loss": 3.3786, + "step": 15273 + }, + { + "epoch": 0.6431157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.0001465258915023356, + "loss": 2.8449, + "step": 15274 + }, + { + "epoch": 0.6431578947368422, + "grad_norm": 0.4296875, + "learning_rate": 0.00014649517421605362, + "loss": 2.5784, + "step": 15275 + }, + { + "epoch": 0.6432, + "grad_norm": 0.447265625, + "learning_rate": 0.00014646445881550803, + "loss": 3.403, + "step": 15276 + }, + { + "epoch": 0.643242105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.00014643374530125825, + "loss": 3.2902, + "step": 15277 + }, + { + "epoch": 0.6432842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00014640303367386405, + "loss": 2.8078, + "step": 15278 + }, + { + "epoch": 0.6433263157894736, + "grad_norm": 0.439453125, + "learning_rate": 0.0001463723239338847, + "loss": 2.9995, + "step": 15279 + }, + { + "epoch": 0.6433684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.00014634161608187998, + "loss": 3.4704, + "step": 15280 + }, + { + "epoch": 0.6434105263157894, + "grad_norm": 0.421875, + "learning_rate": 0.00014631091011840918, + "loss": 3.1849, + "step": 15281 + }, + { + "epoch": 0.6434526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00014628020604403176, + "loss": 2.7016, + "step": 15282 + }, + { + "epoch": 0.6434947368421052, + "grad_norm": 0.4296875, + "learning_rate": 0.00014624950385930718, + "loss": 2.9955, + "step": 15283 + }, + { + "epoch": 0.6435368421052632, + "grad_norm": 0.482421875, + "learning_rate": 0.00014621880356479466, + "loss": 3.0802, + "step": 15284 + }, + { + "epoch": 0.643578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00014618810516105372, + "loss": 3.0921, + "step": 15285 + }, + { + "epoch": 0.643621052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.0001461574086486434, + "loss": 3.0386, + "step": 15286 + }, + { + "epoch": 0.6436631578947368, + "grad_norm": 0.44140625, + "learning_rate": 0.00014612671402812312, + "loss": 3.3815, + "step": 15287 + }, + { + "epoch": 0.6437052631578948, + "grad_norm": 0.4375, + "learning_rate": 0.00014609602130005207, + "loss": 2.9637, + "step": 15288 + }, + { + "epoch": 0.6437473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.00014606533046498938, + "loss": 2.6867, + "step": 15289 + }, + { + "epoch": 0.6437894736842106, + "grad_norm": 0.4296875, + "learning_rate": 0.0001460346415234944, + "loss": 2.9471, + "step": 15290 + }, + { + "epoch": 0.6438315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.00014600395447612595, + "loss": 3.2734, + "step": 15291 + }, + { + "epoch": 0.6438736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.00014597326932344325, + "loss": 3.2952, + "step": 15292 + }, + { + "epoch": 0.6439157894736842, + "grad_norm": 0.46484375, + "learning_rate": 0.00014594258606600547, + "loss": 3.4915, + "step": 15293 + }, + { + "epoch": 0.6439578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00014591190470437137, + "loss": 3.253, + "step": 15294 + }, + { + "epoch": 0.644, + "grad_norm": 0.41796875, + "learning_rate": 0.00014588122523910031, + "loss": 2.4031, + "step": 15295 + }, + { + "epoch": 0.6440421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00014585054767075073, + "loss": 2.8718, + "step": 15296 + }, + { + "epoch": 0.6440842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.000145819871999882, + "loss": 2.9707, + "step": 15297 + }, + { + "epoch": 0.6441263157894737, + "grad_norm": 0.447265625, + "learning_rate": 0.00014578919822705262, + "loss": 3.4652, + "step": 15298 + }, + { + "epoch": 0.6441684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.00014575852635282172, + "loss": 3.3701, + "step": 15299 + }, + { + "epoch": 0.6442105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.00014572785637774794, + "loss": 3.4632, + "step": 15300 + }, + { + "epoch": 0.6442526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.00014569718830239014, + "loss": 3.4888, + "step": 15301 + }, + { + "epoch": 0.6442947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.00014566652212730702, + "loss": 2.5655, + "step": 15302 + }, + { + "epoch": 0.6443368421052632, + "grad_norm": 0.412109375, + "learning_rate": 0.00014563585785305717, + "loss": 3.2854, + "step": 15303 + }, + { + "epoch": 0.644378947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.00014560519548019956, + "loss": 2.5697, + "step": 15304 + }, + { + "epoch": 0.6444210526315789, + "grad_norm": 0.423828125, + "learning_rate": 0.00014557453500929247, + "loss": 3.1477, + "step": 15305 + }, + { + "epoch": 0.6444631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.0001455438764408947, + "loss": 2.4591, + "step": 15306 + }, + { + "epoch": 0.6445052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.0001455132197755648, + "loss": 2.9453, + "step": 15307 + }, + { + "epoch": 0.6445473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.00014548256501386125, + "loss": 3.0365, + "step": 15308 + }, + { + "epoch": 0.6445894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.0001454519121563426, + "loss": 2.9882, + "step": 15309 + }, + { + "epoch": 0.6446315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00014542126120356715, + "loss": 2.8996, + "step": 15310 + }, + { + "epoch": 0.6446736842105263, + "grad_norm": 0.494140625, + "learning_rate": 0.00014539061215609365, + "loss": 2.7953, + "step": 15311 + }, + { + "epoch": 0.6447157894736842, + "grad_norm": 0.3984375, + "learning_rate": 0.00014535996501448007, + "loss": 3.2627, + "step": 15312 + }, + { + "epoch": 0.6447578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0001453293197792851, + "loss": 3.2114, + "step": 15313 + }, + { + "epoch": 0.6448, + "grad_norm": 0.412109375, + "learning_rate": 0.00014529867645106693, + "loss": 2.8209, + "step": 15314 + }, + { + "epoch": 0.6448421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.00014526803503038383, + "loss": 3.4961, + "step": 15315 + }, + { + "epoch": 0.6448842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00014523739551779413, + "loss": 2.7784, + "step": 15316 + }, + { + "epoch": 0.6449263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.00014520675791385596, + "loss": 2.8801, + "step": 15317 + }, + { + "epoch": 0.6449684210526315, + "grad_norm": 0.427734375, + "learning_rate": 0.00014517612221912754, + "loss": 3.4941, + "step": 15318 + }, + { + "epoch": 0.6450105263157895, + "grad_norm": 0.427734375, + "learning_rate": 0.00014514548843416693, + "loss": 3.218, + "step": 15319 + }, + { + "epoch": 0.6450526315789473, + "grad_norm": 0.44140625, + "learning_rate": 0.00014511485655953238, + "loss": 3.5963, + "step": 15320 + }, + { + "epoch": 0.6450947368421053, + "grad_norm": 0.41015625, + "learning_rate": 0.00014508422659578193, + "loss": 3.0655, + "step": 15321 + }, + { + "epoch": 0.6451368421052631, + "grad_norm": 0.412109375, + "learning_rate": 0.0001450535985434736, + "loss": 2.7694, + "step": 15322 + }, + { + "epoch": 0.6451789473684211, + "grad_norm": 0.42578125, + "learning_rate": 0.00014502297240316542, + "loss": 2.6437, + "step": 15323 + }, + { + "epoch": 0.6452210526315789, + "grad_norm": 0.419921875, + "learning_rate": 0.0001449923481754153, + "loss": 3.6341, + "step": 15324 + }, + { + "epoch": 0.6452631578947369, + "grad_norm": 0.44921875, + "learning_rate": 0.00014496172586078124, + "loss": 2.7572, + "step": 15325 + }, + { + "epoch": 0.6453052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.00014493110545982102, + "loss": 3.0701, + "step": 15326 + }, + { + "epoch": 0.6453473684210527, + "grad_norm": 0.41796875, + "learning_rate": 0.0001449004869730927, + "loss": 3.0578, + "step": 15327 + }, + { + "epoch": 0.6453894736842105, + "grad_norm": 0.447265625, + "learning_rate": 0.000144869870401154, + "loss": 2.7235, + "step": 15328 + }, + { + "epoch": 0.6454315789473685, + "grad_norm": 0.4609375, + "learning_rate": 0.0001448392557445628, + "loss": 3.0111, + "step": 15329 + }, + { + "epoch": 0.6454736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.00014480864300387675, + "loss": 3.0969, + "step": 15330 + }, + { + "epoch": 0.6455157894736843, + "grad_norm": 0.4296875, + "learning_rate": 0.00014477803217965364, + "loss": 3.1876, + "step": 15331 + }, + { + "epoch": 0.6455578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00014474742327245116, + "loss": 3.6041, + "step": 15332 + }, + { + "epoch": 0.6456, + "grad_norm": 0.419921875, + "learning_rate": 0.00014471681628282695, + "loss": 2.6802, + "step": 15333 + }, + { + "epoch": 0.6456421052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00014468621121133856, + "loss": 3.4278, + "step": 15334 + }, + { + "epoch": 0.6456842105263157, + "grad_norm": 0.451171875, + "learning_rate": 0.00014465560805854377, + "loss": 2.8994, + "step": 15335 + }, + { + "epoch": 0.6457263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00014462500682500002, + "loss": 3.6074, + "step": 15336 + }, + { + "epoch": 0.6457684210526315, + "grad_norm": 0.4296875, + "learning_rate": 0.0001445944075112648, + "loss": 3.3545, + "step": 15337 + }, + { + "epoch": 0.6458105263157895, + "grad_norm": 0.404296875, + "learning_rate": 0.00014456381011789565, + "loss": 2.6238, + "step": 15338 + }, + { + "epoch": 0.6458526315789473, + "grad_norm": 0.439453125, + "learning_rate": 0.00014453321464544993, + "loss": 3.2071, + "step": 15339 + }, + { + "epoch": 0.6458947368421053, + "grad_norm": 0.4375, + "learning_rate": 0.0001445026210944853, + "loss": 3.1965, + "step": 15340 + }, + { + "epoch": 0.6459368421052631, + "grad_norm": 0.43359375, + "learning_rate": 0.00014447202946555877, + "loss": 3.0988, + "step": 15341 + }, + { + "epoch": 0.6459789473684211, + "grad_norm": 0.470703125, + "learning_rate": 0.0001444414397592279, + "loss": 2.9545, + "step": 15342 + }, + { + "epoch": 0.6460210526315789, + "grad_norm": 0.427734375, + "learning_rate": 0.00014441085197605005, + "loss": 3.3785, + "step": 15343 + }, + { + "epoch": 0.6460631578947369, + "grad_norm": 0.427734375, + "learning_rate": 0.00014438026611658238, + "loss": 3.1818, + "step": 15344 + }, + { + "epoch": 0.6461052631578947, + "grad_norm": 0.451171875, + "learning_rate": 0.0001443496821813822, + "loss": 3.1631, + "step": 15345 + }, + { + "epoch": 0.6461473684210526, + "grad_norm": 0.435546875, + "learning_rate": 0.00014431910017100654, + "loss": 2.6291, + "step": 15346 + }, + { + "epoch": 0.6461894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.00014428852008601292, + "loss": 3.2387, + "step": 15347 + }, + { + "epoch": 0.6462315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00014425794192695807, + "loss": 3.0379, + "step": 15348 + }, + { + "epoch": 0.6462736842105263, + "grad_norm": 0.44921875, + "learning_rate": 0.00014422736569439946, + "loss": 2.9165, + "step": 15349 + }, + { + "epoch": 0.6463157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00014419679138889376, + "loss": 3.3941, + "step": 15350 + }, + { + "epoch": 0.6463578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.00014416621901099832, + "loss": 2.898, + "step": 15351 + }, + { + "epoch": 0.6464, + "grad_norm": 0.44140625, + "learning_rate": 0.00014413564856127003, + "loss": 3.0999, + "step": 15352 + }, + { + "epoch": 0.6464421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00014410508004026573, + "loss": 2.6489, + "step": 15353 + }, + { + "epoch": 0.6464842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.00014407451344854264, + "loss": 3.4549, + "step": 15354 + }, + { + "epoch": 0.6465263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00014404394878665726, + "loss": 3.2674, + "step": 15355 + }, + { + "epoch": 0.6465684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.00014401338605516683, + "loss": 3.0625, + "step": 15356 + }, + { + "epoch": 0.6466105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.0001439828252546278, + "loss": 3.2687, + "step": 15357 + }, + { + "epoch": 0.6466526315789474, + "grad_norm": 0.4375, + "learning_rate": 0.0001439522663855972, + "loss": 2.9323, + "step": 15358 + }, + { + "epoch": 0.6466947368421052, + "grad_norm": 0.44921875, + "learning_rate": 0.00014392170944863177, + "loss": 3.2047, + "step": 15359 + }, + { + "epoch": 0.6467368421052632, + "grad_norm": 0.41796875, + "learning_rate": 0.0001438911544442881, + "loss": 3.2014, + "step": 15360 + }, + { + "epoch": 0.646778947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.00014386060137312295, + "loss": 3.1416, + "step": 15361 + }, + { + "epoch": 0.646821052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0001438300502356928, + "loss": 3.29, + "step": 15362 + }, + { + "epoch": 0.6468631578947368, + "grad_norm": 0.451171875, + "learning_rate": 0.00014379950103255462, + "loss": 3.3051, + "step": 15363 + }, + { + "epoch": 0.6469052631578948, + "grad_norm": 0.4296875, + "learning_rate": 0.00014376895376426454, + "loss": 3.3226, + "step": 15364 + }, + { + "epoch": 0.6469473684210526, + "grad_norm": 0.45703125, + "learning_rate": 0.00014373840843137954, + "loss": 3.1592, + "step": 15365 + }, + { + "epoch": 0.6469894736842106, + "grad_norm": 0.404296875, + "learning_rate": 0.00014370786503445563, + "loss": 3.341, + "step": 15366 + }, + { + "epoch": 0.6470315789473684, + "grad_norm": 0.419921875, + "learning_rate": 0.00014367732357404968, + "loss": 3.427, + "step": 15367 + }, + { + "epoch": 0.6470736842105264, + "grad_norm": 0.40234375, + "learning_rate": 0.0001436467840507179, + "loss": 2.9153, + "step": 15368 + }, + { + "epoch": 0.6471157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0001436162464650167, + "loss": 3.094, + "step": 15369 + }, + { + "epoch": 0.6471578947368422, + "grad_norm": 0.4375, + "learning_rate": 0.0001435857108175027, + "loss": 3.1258, + "step": 15370 + }, + { + "epoch": 0.6472, + "grad_norm": 0.435546875, + "learning_rate": 0.00014355517710873183, + "loss": 3.1314, + "step": 15371 + }, + { + "epoch": 0.6472421052631578, + "grad_norm": 0.44140625, + "learning_rate": 0.00014352464533926075, + "loss": 3.3219, + "step": 15372 + }, + { + "epoch": 0.6472842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.0001434941155096453, + "loss": 3.1367, + "step": 15373 + }, + { + "epoch": 0.6473263157894736, + "grad_norm": 0.435546875, + "learning_rate": 0.00014346358762044206, + "loss": 3.3943, + "step": 15374 + }, + { + "epoch": 0.6473684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00014343306167220708, + "loss": 2.918, + "step": 15375 + }, + { + "epoch": 0.6474105263157894, + "grad_norm": 0.40234375, + "learning_rate": 0.00014340253766549648, + "loss": 2.3796, + "step": 15376 + }, + { + "epoch": 0.6474526315789474, + "grad_norm": 0.4375, + "learning_rate": 0.00014337201560086645, + "loss": 2.9856, + "step": 15377 + }, + { + "epoch": 0.6474947368421052, + "grad_norm": 0.41796875, + "learning_rate": 0.00014334149547887284, + "loss": 3.1874, + "step": 15378 + }, + { + "epoch": 0.6475368421052632, + "grad_norm": 0.439453125, + "learning_rate": 0.0001433109773000721, + "loss": 3.1252, + "step": 15379 + }, + { + "epoch": 0.647578947368421, + "grad_norm": 0.46875, + "learning_rate": 0.0001432804610650198, + "loss": 3.4591, + "step": 15380 + }, + { + "epoch": 0.647621052631579, + "grad_norm": 0.53515625, + "learning_rate": 0.00014324994677427223, + "loss": 3.1386, + "step": 15381 + }, + { + "epoch": 0.6476631578947368, + "grad_norm": 0.44140625, + "learning_rate": 0.00014321943442838515, + "loss": 2.9095, + "step": 15382 + }, + { + "epoch": 0.6477052631578948, + "grad_norm": 0.451171875, + "learning_rate": 0.00014318892402791455, + "loss": 2.8722, + "step": 15383 + }, + { + "epoch": 0.6477473684210526, + "grad_norm": 0.42578125, + "learning_rate": 0.00014315841557341626, + "loss": 2.9733, + "step": 15384 + }, + { + "epoch": 0.6477894736842106, + "grad_norm": 0.431640625, + "learning_rate": 0.000143127909065446, + "loss": 3.1064, + "step": 15385 + }, + { + "epoch": 0.6478315789473684, + "grad_norm": 0.40625, + "learning_rate": 0.00014309740450455983, + "loss": 2.7143, + "step": 15386 + }, + { + "epoch": 0.6478736842105263, + "grad_norm": 0.59375, + "learning_rate": 0.0001430669018913132, + "loss": 3.0695, + "step": 15387 + }, + { + "epoch": 0.6479157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00014303640122626204, + "loss": 3.6446, + "step": 15388 + }, + { + "epoch": 0.6479578947368421, + "grad_norm": 0.400390625, + "learning_rate": 0.00014300590250996198, + "loss": 3.2076, + "step": 15389 + }, + { + "epoch": 0.648, + "grad_norm": 0.4453125, + "learning_rate": 0.0001429754057429687, + "loss": 2.8745, + "step": 15390 + }, + { + "epoch": 0.6480421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00014294491092583774, + "loss": 3.1567, + "step": 15391 + }, + { + "epoch": 0.6480842105263158, + "grad_norm": 0.408203125, + "learning_rate": 0.0001429144180591247, + "loss": 3.006, + "step": 15392 + }, + { + "epoch": 0.6481263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.00014288392714338508, + "loss": 3.1309, + "step": 15393 + }, + { + "epoch": 0.6481684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.0001428534381791745, + "loss": 2.9069, + "step": 15394 + }, + { + "epoch": 0.6482105263157895, + "grad_norm": 0.439453125, + "learning_rate": 0.00014282295116704847, + "loss": 3.1705, + "step": 15395 + }, + { + "epoch": 0.6482526315789474, + "grad_norm": 0.455078125, + "learning_rate": 0.00014279246610756228, + "loss": 3.3319, + "step": 15396 + }, + { + "epoch": 0.6482947368421053, + "grad_norm": 0.4296875, + "learning_rate": 0.0001427619830012714, + "loss": 3.3487, + "step": 15397 + }, + { + "epoch": 0.6483368421052632, + "grad_norm": 0.4296875, + "learning_rate": 0.00014273150184873122, + "loss": 3.3209, + "step": 15398 + }, + { + "epoch": 0.648378947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.000142701022650497, + "loss": 3.0337, + "step": 15399 + }, + { + "epoch": 0.6484210526315789, + "grad_norm": 0.419921875, + "learning_rate": 0.000142670545407124, + "loss": 3.2667, + "step": 15400 + }, + { + "epoch": 0.6484631578947369, + "grad_norm": 0.431640625, + "learning_rate": 0.00014264007011916766, + "loss": 3.547, + "step": 15401 + }, + { + "epoch": 0.6485052631578947, + "grad_norm": 0.44140625, + "learning_rate": 0.0001426095967871831, + "loss": 2.7986, + "step": 15402 + }, + { + "epoch": 0.6485473684210527, + "grad_norm": 0.4375, + "learning_rate": 0.00014257912541172548, + "loss": 3.2803, + "step": 15403 + }, + { + "epoch": 0.6485894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.00014254865599335002, + "loss": 3.2019, + "step": 15404 + }, + { + "epoch": 0.6486315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.00014251818853261178, + "loss": 2.8432, + "step": 15405 + }, + { + "epoch": 0.6486736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00014248772303006587, + "loss": 3.1761, + "step": 15406 + }, + { + "epoch": 0.6487157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0001424572594862672, + "loss": 3.0022, + "step": 15407 + }, + { + "epoch": 0.6487578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.0001424267979017711, + "loss": 3.055, + "step": 15408 + }, + { + "epoch": 0.6488, + "grad_norm": 0.423828125, + "learning_rate": 0.00014239633827713217, + "loss": 2.9692, + "step": 15409 + }, + { + "epoch": 0.6488421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00014236588061290563, + "loss": 3.5153, + "step": 15410 + }, + { + "epoch": 0.6488842105263158, + "grad_norm": 0.455078125, + "learning_rate": 0.00014233542490964626, + "loss": 3.0271, + "step": 15411 + }, + { + "epoch": 0.6489263157894737, + "grad_norm": 0.5859375, + "learning_rate": 0.00014230497116790898, + "loss": 3.244, + "step": 15412 + }, + { + "epoch": 0.6489684210526315, + "grad_norm": 0.41015625, + "learning_rate": 0.00014227451938824857, + "loss": 3.2601, + "step": 15413 + }, + { + "epoch": 0.6490105263157895, + "grad_norm": 0.4140625, + "learning_rate": 0.00014224406957121972, + "loss": 3.1056, + "step": 15414 + }, + { + "epoch": 0.6490526315789473, + "grad_norm": 0.416015625, + "learning_rate": 0.00014221362171737756, + "loss": 2.8352, + "step": 15415 + }, + { + "epoch": 0.6490947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.00014218317582727637, + "loss": 2.794, + "step": 15416 + }, + { + "epoch": 0.6491368421052631, + "grad_norm": 0.50390625, + "learning_rate": 0.0001421527319014711, + "loss": 2.7106, + "step": 15417 + }, + { + "epoch": 0.6491789473684211, + "grad_norm": 0.451171875, + "learning_rate": 0.00014212228994051634, + "loss": 3.1676, + "step": 15418 + }, + { + "epoch": 0.6492210526315789, + "grad_norm": 0.4375, + "learning_rate": 0.0001420918499449667, + "loss": 3.0585, + "step": 15419 + }, + { + "epoch": 0.6492631578947369, + "grad_norm": 0.486328125, + "learning_rate": 0.0001420614119153768, + "loss": 3.1459, + "step": 15420 + }, + { + "epoch": 0.6493052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.00014203097585230103, + "loss": 2.9756, + "step": 15421 + }, + { + "epoch": 0.6493473684210527, + "grad_norm": 0.4375, + "learning_rate": 0.00014200054175629424, + "loss": 3.3647, + "step": 15422 + }, + { + "epoch": 0.6493894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00014197010962791045, + "loss": 2.9855, + "step": 15423 + }, + { + "epoch": 0.6494315789473685, + "grad_norm": 0.451171875, + "learning_rate": 0.0001419396794677046, + "loss": 3.2659, + "step": 15424 + }, + { + "epoch": 0.6494736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.00014190925127623055, + "loss": 3.0454, + "step": 15425 + }, + { + "epoch": 0.6495157894736842, + "grad_norm": 0.46484375, + "learning_rate": 0.00014187882505404308, + "loss": 3.428, + "step": 15426 + }, + { + "epoch": 0.6495578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00014184840080169638, + "loss": 2.9105, + "step": 15427 + }, + { + "epoch": 0.6496, + "grad_norm": 0.466796875, + "learning_rate": 0.00014181797851974464, + "loss": 3.5866, + "step": 15428 + }, + { + "epoch": 0.6496421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00014178755820874243, + "loss": 3.1565, + "step": 15429 + }, + { + "epoch": 0.6496842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.00014175713986924355, + "loss": 3.1828, + "step": 15430 + }, + { + "epoch": 0.6497263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.0001417267235018026, + "loss": 3.2198, + "step": 15431 + }, + { + "epoch": 0.6497684210526316, + "grad_norm": 0.435546875, + "learning_rate": 0.00014169630910697335, + "loss": 3.3999, + "step": 15432 + }, + { + "epoch": 0.6498105263157895, + "grad_norm": 0.453125, + "learning_rate": 0.0001416658966853102, + "loss": 3.5442, + "step": 15433 + }, + { + "epoch": 0.6498526315789473, + "grad_norm": 0.423828125, + "learning_rate": 0.0001416354862373671, + "loss": 3.0459, + "step": 15434 + }, + { + "epoch": 0.6498947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.00014160507776369812, + "loss": 3.2393, + "step": 15435 + }, + { + "epoch": 0.6499368421052631, + "grad_norm": 0.423828125, + "learning_rate": 0.00014157467126485728, + "loss": 2.8662, + "step": 15436 + }, + { + "epoch": 0.6499789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.00014154426674139843, + "loss": 3.4425, + "step": 15437 + }, + { + "epoch": 0.6500210526315789, + "grad_norm": 0.4453125, + "learning_rate": 0.0001415138641938758, + "loss": 3.1926, + "step": 15438 + }, + { + "epoch": 0.6500631578947368, + "grad_norm": 0.431640625, + "learning_rate": 0.00014148346362284292, + "loss": 3.2168, + "step": 15439 + }, + { + "epoch": 0.6501052631578947, + "grad_norm": 0.45703125, + "learning_rate": 0.0001414530650288539, + "loss": 2.6032, + "step": 15440 + }, + { + "epoch": 0.6501473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.00014142266841246255, + "loss": 3.1884, + "step": 15441 + }, + { + "epoch": 0.6501894736842105, + "grad_norm": 0.49609375, + "learning_rate": 0.00014139227377422253, + "loss": 3.2056, + "step": 15442 + }, + { + "epoch": 0.6502315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.00014136188111468773, + "loss": 3.1645, + "step": 15443 + }, + { + "epoch": 0.6502736842105263, + "grad_norm": 0.494140625, + "learning_rate": 0.0001413314904344117, + "loss": 3.0849, + "step": 15444 + }, + { + "epoch": 0.6503157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.00014130110173394845, + "loss": 2.6386, + "step": 15445 + }, + { + "epoch": 0.6503578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.0001412707150138512, + "loss": 2.9644, + "step": 15446 + }, + { + "epoch": 0.6504, + "grad_norm": 0.451171875, + "learning_rate": 0.00014124033027467384, + "loss": 3.0721, + "step": 15447 + }, + { + "epoch": 0.6504421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00014120994751696988, + "loss": 3.2342, + "step": 15448 + }, + { + "epoch": 0.6504842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.0001411795667412929, + "loss": 3.6151, + "step": 15449 + }, + { + "epoch": 0.6505263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.0001411491879481963, + "loss": 3.0623, + "step": 15450 + }, + { + "epoch": 0.6505684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.00014111881113823365, + "loss": 3.0185, + "step": 15451 + }, + { + "epoch": 0.6506105263157895, + "grad_norm": 0.494140625, + "learning_rate": 0.0001410884363119583, + "loss": 3.2916, + "step": 15452 + }, + { + "epoch": 0.6506526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00014105806346992358, + "loss": 3.3551, + "step": 15453 + }, + { + "epoch": 0.6506947368421052, + "grad_norm": 0.423828125, + "learning_rate": 0.00014102769261268306, + "loss": 3.0469, + "step": 15454 + }, + { + "epoch": 0.6507368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.00014099732374078992, + "loss": 2.9465, + "step": 15455 + }, + { + "epoch": 0.650778947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00014096695685479748, + "loss": 3.3318, + "step": 15456 + }, + { + "epoch": 0.650821052631579, + "grad_norm": 0.453125, + "learning_rate": 0.000140936591955259, + "loss": 3.4713, + "step": 15457 + }, + { + "epoch": 0.6508631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.00014090622904272762, + "loss": 3.5066, + "step": 15458 + }, + { + "epoch": 0.6509052631578948, + "grad_norm": 0.443359375, + "learning_rate": 0.0001408758681177566, + "loss": 3.0179, + "step": 15459 + }, + { + "epoch": 0.6509473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00014084550918089895, + "loss": 3.1468, + "step": 15460 + }, + { + "epoch": 0.6509894736842106, + "grad_norm": 0.423828125, + "learning_rate": 0.00014081515223270794, + "loss": 3.4201, + "step": 15461 + }, + { + "epoch": 0.6510315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.0001407847972737366, + "loss": 3.1901, + "step": 15462 + }, + { + "epoch": 0.6510736842105264, + "grad_norm": 0.482421875, + "learning_rate": 0.00014075444430453792, + "loss": 3.0143, + "step": 15463 + }, + { + "epoch": 0.6511157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.00014072409332566488, + "loss": 3.5901, + "step": 15464 + }, + { + "epoch": 0.6511578947368422, + "grad_norm": 0.43359375, + "learning_rate": 0.00014069374433767052, + "loss": 3.4214, + "step": 15465 + }, + { + "epoch": 0.6512, + "grad_norm": 0.423828125, + "learning_rate": 0.00014066339734110768, + "loss": 2.8183, + "step": 15466 + }, + { + "epoch": 0.6512421052631578, + "grad_norm": 0.423828125, + "learning_rate": 0.0001406330523365293, + "loss": 3.2492, + "step": 15467 + }, + { + "epoch": 0.6512842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.00014060270932448804, + "loss": 3.0104, + "step": 15468 + }, + { + "epoch": 0.6513263157894736, + "grad_norm": 0.48046875, + "learning_rate": 0.00014057236830553705, + "loss": 3.0741, + "step": 15469 + }, + { + "epoch": 0.6513684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.00014054202928022891, + "loss": 3.2965, + "step": 15470 + }, + { + "epoch": 0.6514105263157894, + "grad_norm": 0.42578125, + "learning_rate": 0.00014051169224911636, + "loss": 3.0974, + "step": 15471 + }, + { + "epoch": 0.6514526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.0001404813572127522, + "loss": 2.4895, + "step": 15472 + }, + { + "epoch": 0.6514947368421052, + "grad_norm": 0.44921875, + "learning_rate": 0.00014045102417168897, + "loss": 2.606, + "step": 15473 + }, + { + "epoch": 0.6515368421052632, + "grad_norm": 0.42578125, + "learning_rate": 0.0001404206931264794, + "loss": 2.9481, + "step": 15474 + }, + { + "epoch": 0.651578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00014039036407767594, + "loss": 3.2244, + "step": 15475 + }, + { + "epoch": 0.651621052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.00014036003702583137, + "loss": 2.9741, + "step": 15476 + }, + { + "epoch": 0.6516631578947368, + "grad_norm": 0.5546875, + "learning_rate": 0.00014032971197149808, + "loss": 3.1124, + "step": 15477 + }, + { + "epoch": 0.6517052631578948, + "grad_norm": 0.439453125, + "learning_rate": 0.0001402993889152286, + "loss": 3.477, + "step": 15478 + }, + { + "epoch": 0.6517473684210526, + "grad_norm": 0.5078125, + "learning_rate": 0.00014026906785757537, + "loss": 2.8771, + "step": 15479 + }, + { + "epoch": 0.6517894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.00014023874879909076, + "loss": 3.2579, + "step": 15480 + }, + { + "epoch": 0.6518315789473684, + "grad_norm": 0.44921875, + "learning_rate": 0.0001402084317403272, + "loss": 3.4636, + "step": 15481 + }, + { + "epoch": 0.6518736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00014017811668183693, + "loss": 3.0716, + "step": 15482 + }, + { + "epoch": 0.6519157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00014014780362417252, + "loss": 2.5203, + "step": 15483 + }, + { + "epoch": 0.6519578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.00014011749256788585, + "loss": 3.2773, + "step": 15484 + }, + { + "epoch": 0.652, + "grad_norm": 0.4296875, + "learning_rate": 0.00014008718351352952, + "loss": 3.2776, + "step": 15485 + }, + { + "epoch": 0.6520421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00014005687646165548, + "loss": 3.2386, + "step": 15486 + }, + { + "epoch": 0.6520842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.00014002657141281603, + "loss": 2.7903, + "step": 15487 + }, + { + "epoch": 0.6521263157894737, + "grad_norm": 0.44140625, + "learning_rate": 0.0001399962683675632, + "loss": 2.938, + "step": 15488 + }, + { + "epoch": 0.6521684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.00013996596732644904, + "loss": 3.3932, + "step": 15489 + }, + { + "epoch": 0.6522105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.00013993566829002585, + "loss": 3.4126, + "step": 15490 + }, + { + "epoch": 0.6522526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.0001399053712588453, + "loss": 3.0689, + "step": 15491 + }, + { + "epoch": 0.6522947368421053, + "grad_norm": 0.427734375, + "learning_rate": 0.00013987507623345957, + "loss": 2.8674, + "step": 15492 + }, + { + "epoch": 0.6523368421052631, + "grad_norm": 0.431640625, + "learning_rate": 0.0001398447832144206, + "loss": 3.3193, + "step": 15493 + }, + { + "epoch": 0.6523789473684211, + "grad_norm": 0.4140625, + "learning_rate": 0.00013981449220228025, + "loss": 2.9399, + "step": 15494 + }, + { + "epoch": 0.6524210526315789, + "grad_norm": 0.46875, + "learning_rate": 0.00013978420319759037, + "loss": 3.4297, + "step": 15495 + }, + { + "epoch": 0.6524631578947369, + "grad_norm": 0.431640625, + "learning_rate": 0.00013975391620090272, + "loss": 3.218, + "step": 15496 + }, + { + "epoch": 0.6525052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00013972363121276938, + "loss": 3.1009, + "step": 15497 + }, + { + "epoch": 0.6525473684210527, + "grad_norm": 0.443359375, + "learning_rate": 0.0001396933482337417, + "loss": 3.1822, + "step": 15498 + }, + { + "epoch": 0.6525894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.00013966306726437183, + "loss": 3.3537, + "step": 15499 + }, + { + "epoch": 0.6526315789473685, + "grad_norm": 0.4140625, + "learning_rate": 0.00013963278830521103, + "loss": 3.0388, + "step": 15500 + }, + { + "epoch": 0.6526736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00013960251135681126, + "loss": 3.3139, + "step": 15501 + }, + { + "epoch": 0.6527157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.000139572236419724, + "loss": 3.41, + "step": 15502 + }, + { + "epoch": 0.6527578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00013954196349450076, + "loss": 3.4537, + "step": 15503 + }, + { + "epoch": 0.6528, + "grad_norm": 0.421875, + "learning_rate": 0.00013951169258169337, + "loss": 3.1323, + "step": 15504 + }, + { + "epoch": 0.6528421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.0001394814236818529, + "loss": 3.36, + "step": 15505 + }, + { + "epoch": 0.6528842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00013945115679553127, + "loss": 3.1089, + "step": 15506 + }, + { + "epoch": 0.6529263157894737, + "grad_norm": 0.40625, + "learning_rate": 0.00013942089192327945, + "loss": 3.1394, + "step": 15507 + }, + { + "epoch": 0.6529684210526315, + "grad_norm": 0.421875, + "learning_rate": 0.00013939062906564913, + "loss": 3.2209, + "step": 15508 + }, + { + "epoch": 0.6530105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.00013936036822319165, + "loss": 3.4028, + "step": 15509 + }, + { + "epoch": 0.6530526315789473, + "grad_norm": 0.408203125, + "learning_rate": 0.0001393301093964582, + "loss": 3.1186, + "step": 15510 + }, + { + "epoch": 0.6530947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.00013929985258600018, + "loss": 3.4186, + "step": 15511 + }, + { + "epoch": 0.6531368421052631, + "grad_norm": 0.41796875, + "learning_rate": 0.00013926959779236864, + "loss": 2.9905, + "step": 15512 + }, + { + "epoch": 0.6531789473684211, + "grad_norm": 0.4453125, + "learning_rate": 0.00013923934501611514, + "loss": 3.0216, + "step": 15513 + }, + { + "epoch": 0.6532210526315789, + "grad_norm": 0.44140625, + "learning_rate": 0.00013920909425779043, + "loss": 3.045, + "step": 15514 + }, + { + "epoch": 0.6532631578947369, + "grad_norm": 0.435546875, + "learning_rate": 0.00013917884551794597, + "loss": 3.109, + "step": 15515 + }, + { + "epoch": 0.6533052631578947, + "grad_norm": 0.45703125, + "learning_rate": 0.00013914859879713272, + "loss": 3.2266, + "step": 15516 + }, + { + "epoch": 0.6533473684210527, + "grad_norm": 0.48828125, + "learning_rate": 0.00013911835409590172, + "loss": 3.1346, + "step": 15517 + }, + { + "epoch": 0.6533894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00013908811141480408, + "loss": 3.0094, + "step": 15518 + }, + { + "epoch": 0.6534315789473685, + "grad_norm": 0.46484375, + "learning_rate": 0.00013905787075439057, + "loss": 3.1286, + "step": 15519 + }, + { + "epoch": 0.6534736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.00013902763211521259, + "loss": 3.4562, + "step": 15520 + }, + { + "epoch": 0.6535157894736842, + "grad_norm": 0.400390625, + "learning_rate": 0.0001389973954978205, + "loss": 2.8064, + "step": 15521 + }, + { + "epoch": 0.6535578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00013896716090276557, + "loss": 2.9919, + "step": 15522 + }, + { + "epoch": 0.6536, + "grad_norm": 0.416015625, + "learning_rate": 0.00013893692833059852, + "loss": 3.1654, + "step": 15523 + }, + { + "epoch": 0.6536421052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.0001389066977818701, + "loss": 2.7809, + "step": 15524 + }, + { + "epoch": 0.6536842105263158, + "grad_norm": 0.458984375, + "learning_rate": 0.00013887646925713116, + "loss": 3.1881, + "step": 15525 + }, + { + "epoch": 0.6537263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00013884624275693237, + "loss": 2.8581, + "step": 15526 + }, + { + "epoch": 0.6537684210526316, + "grad_norm": 0.451171875, + "learning_rate": 0.00013881601828182445, + "loss": 3.0233, + "step": 15527 + }, + { + "epoch": 0.6538105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.0001387857958323579, + "loss": 3.559, + "step": 15528 + }, + { + "epoch": 0.6538526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.0001387555754090836, + "loss": 3.0345, + "step": 15529 + }, + { + "epoch": 0.6538947368421053, + "grad_norm": 0.42578125, + "learning_rate": 0.000138725357012552, + "loss": 3.4478, + "step": 15530 + }, + { + "epoch": 0.6539368421052631, + "grad_norm": 0.4296875, + "learning_rate": 0.00013869514064331369, + "loss": 3.0467, + "step": 15531 + }, + { + "epoch": 0.6539789473684211, + "grad_norm": 0.458984375, + "learning_rate": 0.00013866492630191913, + "loss": 3.0422, + "step": 15532 + }, + { + "epoch": 0.654021052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00013863471398891874, + "loss": 3.1498, + "step": 15533 + }, + { + "epoch": 0.6540631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.00013860450370486307, + "loss": 2.9495, + "step": 15534 + }, + { + "epoch": 0.6541052631578947, + "grad_norm": 0.44921875, + "learning_rate": 0.00013857429545030244, + "loss": 3.149, + "step": 15535 + }, + { + "epoch": 0.6541473684210526, + "grad_norm": 0.44140625, + "learning_rate": 0.00013854408922578715, + "loss": 3.1901, + "step": 15536 + }, + { + "epoch": 0.6541894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.0001385138850318677, + "loss": 3.2531, + "step": 15537 + }, + { + "epoch": 0.6542315789473684, + "grad_norm": 0.439453125, + "learning_rate": 0.00013848368286909423, + "loss": 3.0485, + "step": 15538 + }, + { + "epoch": 0.6542736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00013845348273801707, + "loss": 2.9535, + "step": 15539 + }, + { + "epoch": 0.6543157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.0001384232846391864, + "loss": 3.2329, + "step": 15540 + }, + { + "epoch": 0.6543578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00013839308857315226, + "loss": 3.3624, + "step": 15541 + }, + { + "epoch": 0.6544, + "grad_norm": 0.43359375, + "learning_rate": 0.0001383628945404651, + "loss": 2.7173, + "step": 15542 + }, + { + "epoch": 0.6544421052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00013833270254167465, + "loss": 2.9578, + "step": 15543 + }, + { + "epoch": 0.6544842105263158, + "grad_norm": 0.447265625, + "learning_rate": 0.00013830251257733128, + "loss": 3.4635, + "step": 15544 + }, + { + "epoch": 0.6545263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00013827232464798487, + "loss": 3.5202, + "step": 15545 + }, + { + "epoch": 0.6545684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.00013824213875418542, + "loss": 2.6278, + "step": 15546 + }, + { + "epoch": 0.6546105263157894, + "grad_norm": 0.466796875, + "learning_rate": 0.0001382119548964829, + "loss": 3.1953, + "step": 15547 + }, + { + "epoch": 0.6546526315789474, + "grad_norm": 0.4140625, + "learning_rate": 0.00013818177307542713, + "loss": 3.0636, + "step": 15548 + }, + { + "epoch": 0.6546947368421052, + "grad_norm": 0.41796875, + "learning_rate": 0.00013815159329156826, + "loss": 2.9344, + "step": 15549 + }, + { + "epoch": 0.6547368421052632, + "grad_norm": 0.4375, + "learning_rate": 0.00013812141554545577, + "loss": 3.1729, + "step": 15550 + }, + { + "epoch": 0.654778947368421, + "grad_norm": 0.41796875, + "learning_rate": 0.0001380912398376398, + "loss": 3.0613, + "step": 15551 + }, + { + "epoch": 0.654821052631579, + "grad_norm": 0.453125, + "learning_rate": 0.00013806106616866975, + "loss": 3.012, + "step": 15552 + }, + { + "epoch": 0.6548631578947368, + "grad_norm": 0.4375, + "learning_rate": 0.0001380308945390957, + "loss": 3.5513, + "step": 15553 + }, + { + "epoch": 0.6549052631578948, + "grad_norm": 0.462890625, + "learning_rate": 0.00013800072494946715, + "loss": 3.428, + "step": 15554 + }, + { + "epoch": 0.6549473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.00013797055740033372, + "loss": 2.5745, + "step": 15555 + }, + { + "epoch": 0.6549894736842106, + "grad_norm": 0.427734375, + "learning_rate": 0.00013794039189224527, + "loss": 2.5773, + "step": 15556 + }, + { + "epoch": 0.6550315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.00013791022842575104, + "loss": 3.2473, + "step": 15557 + }, + { + "epoch": 0.6550736842105264, + "grad_norm": 0.416015625, + "learning_rate": 0.00013788006700140096, + "loss": 2.9793, + "step": 15558 + }, + { + "epoch": 0.6551157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.0001378499076197441, + "loss": 2.6976, + "step": 15559 + }, + { + "epoch": 0.655157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.00013781975028133027, + "loss": 3.2225, + "step": 15560 + }, + { + "epoch": 0.6552, + "grad_norm": 0.443359375, + "learning_rate": 0.00013778959498670878, + "loss": 3.2579, + "step": 15561 + }, + { + "epoch": 0.6552421052631578, + "grad_norm": 0.427734375, + "learning_rate": 0.000137759441736429, + "loss": 2.7472, + "step": 15562 + }, + { + "epoch": 0.6552842105263158, + "grad_norm": 0.462890625, + "learning_rate": 0.00013772929053104037, + "loss": 2.9966, + "step": 15563 + }, + { + "epoch": 0.6553263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.00013769914137109202, + "loss": 3.472, + "step": 15564 + }, + { + "epoch": 0.6553684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00013766899425713354, + "loss": 3.0146, + "step": 15565 + }, + { + "epoch": 0.6554105263157894, + "grad_norm": 0.4375, + "learning_rate": 0.00013763884918971387, + "loss": 3.2299, + "step": 15566 + }, + { + "epoch": 0.6554526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.0001376087061693825, + "loss": 3.017, + "step": 15567 + }, + { + "epoch": 0.6554947368421052, + "grad_norm": 0.42578125, + "learning_rate": 0.00013757856519668827, + "loss": 3.5853, + "step": 15568 + }, + { + "epoch": 0.6555368421052632, + "grad_norm": 0.44140625, + "learning_rate": 0.00013754842627218056, + "loss": 3.4511, + "step": 15569 + }, + { + "epoch": 0.655578947368421, + "grad_norm": 0.453125, + "learning_rate": 0.00013751828939640841, + "loss": 3.4127, + "step": 15570 + }, + { + "epoch": 0.655621052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00013748815456992078, + "loss": 3.407, + "step": 15571 + }, + { + "epoch": 0.6556631578947368, + "grad_norm": 0.486328125, + "learning_rate": 0.000137458021793267, + "loss": 2.7952, + "step": 15572 + }, + { + "epoch": 0.6557052631578948, + "grad_norm": 0.484375, + "learning_rate": 0.0001374278910669956, + "loss": 3.0883, + "step": 15573 + }, + { + "epoch": 0.6557473684210526, + "grad_norm": 0.439453125, + "learning_rate": 0.00013739776239165598, + "loss": 2.6413, + "step": 15574 + }, + { + "epoch": 0.6557894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.0001373676357677966, + "loss": 2.8272, + "step": 15575 + }, + { + "epoch": 0.6558315789473684, + "grad_norm": 0.427734375, + "learning_rate": 0.00013733751119596665, + "loss": 3.0791, + "step": 15576 + }, + { + "epoch": 0.6558736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.0001373073886767149, + "loss": 3.1639, + "step": 15577 + }, + { + "epoch": 0.6559157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00013727726821059012, + "loss": 3.3191, + "step": 15578 + }, + { + "epoch": 0.6559578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00013724714979814105, + "loss": 3.3552, + "step": 15579 + }, + { + "epoch": 0.656, + "grad_norm": 0.41796875, + "learning_rate": 0.0001372170334399163, + "loss": 2.9483, + "step": 15580 + }, + { + "epoch": 0.6560421052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.00013718691913646496, + "loss": 3.4547, + "step": 15581 + }, + { + "epoch": 0.6560842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00013715680688833519, + "loss": 3.3725, + "step": 15582 + }, + { + "epoch": 0.6561263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00013712669669607586, + "loss": 2.7827, + "step": 15583 + }, + { + "epoch": 0.6561684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.00013709658856023556, + "loss": 3.3958, + "step": 15584 + }, + { + "epoch": 0.6562105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.00013706648248136273, + "loss": 3.3138, + "step": 15585 + }, + { + "epoch": 0.6562526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.0001370363784600059, + "loss": 3.574, + "step": 15586 + }, + { + "epoch": 0.6562947368421053, + "grad_norm": 0.50390625, + "learning_rate": 0.00013700627649671345, + "loss": 3.1679, + "step": 15587 + }, + { + "epoch": 0.6563368421052631, + "grad_norm": 0.421875, + "learning_rate": 0.00013697617659203394, + "loss": 2.8231, + "step": 15588 + }, + { + "epoch": 0.6563789473684211, + "grad_norm": 0.451171875, + "learning_rate": 0.00013694607874651572, + "loss": 3.2442, + "step": 15589 + }, + { + "epoch": 0.6564210526315789, + "grad_norm": 0.42578125, + "learning_rate": 0.00013691598296070706, + "loss": 3.329, + "step": 15590 + }, + { + "epoch": 0.6564631578947369, + "grad_norm": 0.419921875, + "learning_rate": 0.00013688588923515638, + "loss": 3.119, + "step": 15591 + }, + { + "epoch": 0.6565052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.00013685579757041189, + "loss": 3.4377, + "step": 15592 + }, + { + "epoch": 0.6565473684210527, + "grad_norm": 0.4296875, + "learning_rate": 0.00013682570796702182, + "loss": 2.9281, + "step": 15593 + }, + { + "epoch": 0.6565894736842105, + "grad_norm": 0.421875, + "learning_rate": 0.0001367956204255344, + "loss": 3.4774, + "step": 15594 + }, + { + "epoch": 0.6566315789473685, + "grad_norm": 0.427734375, + "learning_rate": 0.00013676553494649762, + "loss": 3.0702, + "step": 15595 + }, + { + "epoch": 0.6566736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.00013673545153045986, + "loss": 3.053, + "step": 15596 + }, + { + "epoch": 0.6567157894736843, + "grad_norm": 0.640625, + "learning_rate": 0.0001367053701779691, + "loss": 3.0937, + "step": 15597 + }, + { + "epoch": 0.6567578947368421, + "grad_norm": 0.392578125, + "learning_rate": 0.0001366752908895734, + "loss": 2.9983, + "step": 15598 + }, + { + "epoch": 0.6568, + "grad_norm": 0.42578125, + "learning_rate": 0.00013664521366582073, + "loss": 3.1999, + "step": 15599 + }, + { + "epoch": 0.6568421052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.0001366151385072591, + "loss": 2.7007, + "step": 15600 + }, + { + "epoch": 0.6568842105263157, + "grad_norm": 0.41796875, + "learning_rate": 0.0001365850654144364, + "loss": 3.0551, + "step": 15601 + }, + { + "epoch": 0.6569263157894737, + "grad_norm": 0.40234375, + "learning_rate": 0.00013655499438790044, + "loss": 3.0709, + "step": 15602 + }, + { + "epoch": 0.6569684210526315, + "grad_norm": 0.412109375, + "learning_rate": 0.00013652492542819933, + "loss": 3.3113, + "step": 15603 + }, + { + "epoch": 0.6570105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.00013649485853588073, + "loss": 3.3274, + "step": 15604 + }, + { + "epoch": 0.6570526315789473, + "grad_norm": 0.41015625, + "learning_rate": 0.00013646479371149246, + "loss": 2.5574, + "step": 15605 + }, + { + "epoch": 0.6570947368421053, + "grad_norm": 0.443359375, + "learning_rate": 0.0001364347309555822, + "loss": 3.3983, + "step": 15606 + }, + { + "epoch": 0.6571368421052631, + "grad_norm": 0.423828125, + "learning_rate": 0.00013640467026869774, + "loss": 2.9053, + "step": 15607 + }, + { + "epoch": 0.6571789473684211, + "grad_norm": 0.431640625, + "learning_rate": 0.00013637461165138674, + "loss": 3.0288, + "step": 15608 + }, + { + "epoch": 0.6572210526315789, + "grad_norm": 0.439453125, + "learning_rate": 0.00013634455510419668, + "loss": 2.7372, + "step": 15609 + }, + { + "epoch": 0.6572631578947369, + "grad_norm": 0.443359375, + "learning_rate": 0.0001363145006276755, + "loss": 3.243, + "step": 15610 + }, + { + "epoch": 0.6573052631578947, + "grad_norm": 0.41796875, + "learning_rate": 0.00013628444822237035, + "loss": 2.7422, + "step": 15611 + }, + { + "epoch": 0.6573473684210527, + "grad_norm": 0.4296875, + "learning_rate": 0.00013625439788882905, + "loss": 3.0412, + "step": 15612 + }, + { + "epoch": 0.6573894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.00013622434962759896, + "loss": 2.8283, + "step": 15613 + }, + { + "epoch": 0.6574315789473684, + "grad_norm": 0.4453125, + "learning_rate": 0.0001361943034392275, + "loss": 3.1585, + "step": 15614 + }, + { + "epoch": 0.6574736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00013616425932426218, + "loss": 3.245, + "step": 15615 + }, + { + "epoch": 0.6575157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00013613421728325017, + "loss": 3.2405, + "step": 15616 + }, + { + "epoch": 0.6575578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00013610417731673913, + "loss": 2.8941, + "step": 15617 + }, + { + "epoch": 0.6576, + "grad_norm": 0.43359375, + "learning_rate": 0.00013607413942527596, + "loss": 2.7251, + "step": 15618 + }, + { + "epoch": 0.6576421052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00013604410360940826, + "loss": 3.286, + "step": 15619 + }, + { + "epoch": 0.6576842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.00013601406986968307, + "loss": 3.3008, + "step": 15620 + }, + { + "epoch": 0.6577263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00013598403820664755, + "loss": 3.2681, + "step": 15621 + }, + { + "epoch": 0.6577684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.00013595400862084894, + "loss": 2.8217, + "step": 15622 + }, + { + "epoch": 0.6578105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.0001359239811128342, + "loss": 3.0123, + "step": 15623 + }, + { + "epoch": 0.6578526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00013589395568315067, + "loss": 3.5839, + "step": 15624 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.462890625, + "learning_rate": 0.00013586393233234502, + "loss": 3.2106, + "step": 15625 + }, + { + "epoch": 0.6579368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.0001358339110609646, + "loss": 3.6317, + "step": 15626 + }, + { + "epoch": 0.6579789473684211, + "grad_norm": 0.416015625, + "learning_rate": 0.000135803891869556, + "loss": 2.8747, + "step": 15627 + }, + { + "epoch": 0.658021052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.00013577387475866638, + "loss": 2.9033, + "step": 15628 + }, + { + "epoch": 0.6580631578947368, + "grad_norm": 0.44921875, + "learning_rate": 0.00013574385972884258, + "loss": 2.8642, + "step": 15629 + }, + { + "epoch": 0.6581052631578947, + "grad_norm": 0.439453125, + "learning_rate": 0.00013571384678063127, + "loss": 3.1112, + "step": 15630 + }, + { + "epoch": 0.6581473684210526, + "grad_norm": 0.4375, + "learning_rate": 0.00013568383591457963, + "loss": 3.1546, + "step": 15631 + }, + { + "epoch": 0.6581894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00013565382713123399, + "loss": 3.1903, + "step": 15632 + }, + { + "epoch": 0.6582315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.00013562382043114145, + "loss": 3.0894, + "step": 15633 + }, + { + "epoch": 0.6582736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.0001355938158148483, + "loss": 3.1193, + "step": 15634 + }, + { + "epoch": 0.6583157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00013556381328290153, + "loss": 3.3353, + "step": 15635 + }, + { + "epoch": 0.6583578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00013553381283584764, + "loss": 2.6582, + "step": 15636 + }, + { + "epoch": 0.6584, + "grad_norm": 0.388671875, + "learning_rate": 0.00013550381447423315, + "loss": 2.8553, + "step": 15637 + }, + { + "epoch": 0.6584421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.00013547381819860465, + "loss": 3.4918, + "step": 15638 + }, + { + "epoch": 0.6584842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.00013544382400950853, + "loss": 3.2636, + "step": 15639 + }, + { + "epoch": 0.6585263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.00013541383190749152, + "loss": 2.7619, + "step": 15640 + }, + { + "epoch": 0.6585684210526316, + "grad_norm": 0.458984375, + "learning_rate": 0.00013538384189309966, + "loss": 3.2107, + "step": 15641 + }, + { + "epoch": 0.6586105263157894, + "grad_norm": 0.431640625, + "learning_rate": 0.00013535385396687966, + "loss": 2.4485, + "step": 15642 + }, + { + "epoch": 0.6586526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.0001353238681293777, + "loss": 3.1182, + "step": 15643 + }, + { + "epoch": 0.6586947368421052, + "grad_norm": 0.4140625, + "learning_rate": 0.00013529388438114016, + "loss": 3.0032, + "step": 15644 + }, + { + "epoch": 0.6587368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.00013526390272271325, + "loss": 2.8326, + "step": 15645 + }, + { + "epoch": 0.658778947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.0001352339231546431, + "loss": 3.1958, + "step": 15646 + }, + { + "epoch": 0.658821052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00013520394567747627, + "loss": 2.7169, + "step": 15647 + }, + { + "epoch": 0.6588631578947368, + "grad_norm": 0.44140625, + "learning_rate": 0.00013517397029175844, + "loss": 2.9366, + "step": 15648 + }, + { + "epoch": 0.6589052631578948, + "grad_norm": 0.443359375, + "learning_rate": 0.00013514399699803608, + "loss": 2.972, + "step": 15649 + }, + { + "epoch": 0.6589473684210526, + "grad_norm": 0.451171875, + "learning_rate": 0.0001351140257968551, + "loss": 3.2461, + "step": 15650 + }, + { + "epoch": 0.6589894736842106, + "grad_norm": 0.42578125, + "learning_rate": 0.00013508405668876165, + "loss": 3.3538, + "step": 15651 + }, + { + "epoch": 0.6590315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.0001350540896743016, + "loss": 3.5014, + "step": 15652 + }, + { + "epoch": 0.6590736842105264, + "grad_norm": 0.427734375, + "learning_rate": 0.00013502412475402105, + "loss": 3.1084, + "step": 15653 + }, + { + "epoch": 0.6591157894736842, + "grad_norm": 0.4296875, + "learning_rate": 0.0001349941619284658, + "loss": 3.0358, + "step": 15654 + }, + { + "epoch": 0.659157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0001349642011981817, + "loss": 2.8567, + "step": 15655 + }, + { + "epoch": 0.6592, + "grad_norm": 0.40234375, + "learning_rate": 0.00013493424256371485, + "loss": 2.9034, + "step": 15656 + }, + { + "epoch": 0.6592421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00013490428602561084, + "loss": 3.0365, + "step": 15657 + }, + { + "epoch": 0.6592842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.00013487433158441552, + "loss": 3.2563, + "step": 15658 + }, + { + "epoch": 0.6593263157894736, + "grad_norm": 0.416015625, + "learning_rate": 0.0001348443792406746, + "loss": 3.2114, + "step": 15659 + }, + { + "epoch": 0.6593684210526316, + "grad_norm": 0.470703125, + "learning_rate": 0.00013481442899493382, + "loss": 3.5359, + "step": 15660 + }, + { + "epoch": 0.6594105263157894, + "grad_norm": 0.41796875, + "learning_rate": 0.0001347844808477388, + "loss": 3.1023, + "step": 15661 + }, + { + "epoch": 0.6594526315789474, + "grad_norm": 0.482421875, + "learning_rate": 0.00013475453479963506, + "loss": 2.9235, + "step": 15662 + }, + { + "epoch": 0.6594947368421052, + "grad_norm": 0.4296875, + "learning_rate": 0.00013472459085116839, + "loss": 2.7418, + "step": 15663 + }, + { + "epoch": 0.6595368421052632, + "grad_norm": 0.4375, + "learning_rate": 0.00013469464900288418, + "loss": 3.1674, + "step": 15664 + }, + { + "epoch": 0.659578947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.00013466470925532808, + "loss": 3.6334, + "step": 15665 + }, + { + "epoch": 0.659621052631579, + "grad_norm": 0.373046875, + "learning_rate": 0.00013463477160904541, + "loss": 2.5271, + "step": 15666 + }, + { + "epoch": 0.6596631578947368, + "grad_norm": 0.47265625, + "learning_rate": 0.00013460483606458166, + "loss": 2.8849, + "step": 15667 + }, + { + "epoch": 0.6597052631578947, + "grad_norm": 0.427734375, + "learning_rate": 0.0001345749026224822, + "loss": 3.0873, + "step": 15668 + }, + { + "epoch": 0.6597473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.0001345449712832924, + "loss": 3.1987, + "step": 15669 + }, + { + "epoch": 0.6597894736842105, + "grad_norm": 0.47265625, + "learning_rate": 0.0001345150420475575, + "loss": 2.7454, + "step": 15670 + }, + { + "epoch": 0.6598315789473684, + "grad_norm": 0.453125, + "learning_rate": 0.00013448511491582295, + "loss": 3.2422, + "step": 15671 + }, + { + "epoch": 0.6598736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.0001344551898886338, + "loss": 3.2176, + "step": 15672 + }, + { + "epoch": 0.6599157894736842, + "grad_norm": 0.455078125, + "learning_rate": 0.0001344252669665354, + "loss": 3.2383, + "step": 15673 + }, + { + "epoch": 0.6599578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00013439534615007284, + "loss": 3.0154, + "step": 15674 + }, + { + "epoch": 0.66, + "grad_norm": 0.453125, + "learning_rate": 0.00013436542743979125, + "loss": 3.4572, + "step": 15675 + }, + { + "epoch": 0.6600421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.0001343355108362357, + "loss": 3.5481, + "step": 15676 + }, + { + "epoch": 0.6600842105263158, + "grad_norm": 0.5625, + "learning_rate": 0.0001343055963399511, + "loss": 3.204, + "step": 15677 + }, + { + "epoch": 0.6601263157894737, + "grad_norm": 0.435546875, + "learning_rate": 0.00013427568395148271, + "loss": 3.3478, + "step": 15678 + }, + { + "epoch": 0.6601684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.0001342457736713754, + "loss": 3.2411, + "step": 15679 + }, + { + "epoch": 0.6602105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.0001342158655001741, + "loss": 2.8372, + "step": 15680 + }, + { + "epoch": 0.6602526315789473, + "grad_norm": 0.431640625, + "learning_rate": 0.00013418595943842372, + "loss": 3.1275, + "step": 15681 + }, + { + "epoch": 0.6602947368421053, + "grad_norm": 0.427734375, + "learning_rate": 0.00013415605548666905, + "loss": 3.1359, + "step": 15682 + }, + { + "epoch": 0.6603368421052631, + "grad_norm": 0.435546875, + "learning_rate": 0.0001341261536454549, + "loss": 3.167, + "step": 15683 + }, + { + "epoch": 0.6603789473684211, + "grad_norm": 0.455078125, + "learning_rate": 0.00013409625391532605, + "loss": 3.4055, + "step": 15684 + }, + { + "epoch": 0.6604210526315789, + "grad_norm": 0.4296875, + "learning_rate": 0.00013406635629682742, + "loss": 3.3793, + "step": 15685 + }, + { + "epoch": 0.6604631578947369, + "grad_norm": 0.431640625, + "learning_rate": 0.0001340364607905034, + "loss": 3.3549, + "step": 15686 + }, + { + "epoch": 0.6605052631578947, + "grad_norm": 0.453125, + "learning_rate": 0.00013400656739689888, + "loss": 3.3546, + "step": 15687 + }, + { + "epoch": 0.6605473684210527, + "grad_norm": 0.435546875, + "learning_rate": 0.00013397667611655844, + "loss": 3.0565, + "step": 15688 + }, + { + "epoch": 0.6605894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.00013394678695002654, + "loss": 3.0792, + "step": 15689 + }, + { + "epoch": 0.6606315789473685, + "grad_norm": 0.408203125, + "learning_rate": 0.000133916899897848, + "loss": 2.9204, + "step": 15690 + }, + { + "epoch": 0.6606736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00013388701496056698, + "loss": 3.0613, + "step": 15691 + }, + { + "epoch": 0.6607157894736843, + "grad_norm": 0.419921875, + "learning_rate": 0.0001338571321387283, + "loss": 3.2818, + "step": 15692 + }, + { + "epoch": 0.6607578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00013382725143287603, + "loss": 3.4337, + "step": 15693 + }, + { + "epoch": 0.6608, + "grad_norm": 0.43359375, + "learning_rate": 0.0001337973728435548, + "loss": 2.7607, + "step": 15694 + }, + { + "epoch": 0.6608421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00013376749637130896, + "loss": 2.9089, + "step": 15695 + }, + { + "epoch": 0.6608842105263157, + "grad_norm": 0.498046875, + "learning_rate": 0.00013373762201668275, + "loss": 3.4544, + "step": 15696 + }, + { + "epoch": 0.6609263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00013370774978022047, + "loss": 3.1535, + "step": 15697 + }, + { + "epoch": 0.6609684210526315, + "grad_norm": 0.419921875, + "learning_rate": 0.00013367787966246623, + "loss": 2.8811, + "step": 15698 + }, + { + "epoch": 0.6610105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.00013364801166396457, + "loss": 3.256, + "step": 15699 + }, + { + "epoch": 0.6610526315789473, + "grad_norm": 0.4453125, + "learning_rate": 0.00013361814578525922, + "loss": 3.2203, + "step": 15700 + }, + { + "epoch": 0.6610947368421053, + "grad_norm": 0.453125, + "learning_rate": 0.0001335882820268947, + "loss": 2.9717, + "step": 15701 + }, + { + "epoch": 0.6611368421052631, + "grad_norm": 0.423828125, + "learning_rate": 0.0001335584203894147, + "loss": 3.589, + "step": 15702 + }, + { + "epoch": 0.6611789473684211, + "grad_norm": 0.408203125, + "learning_rate": 0.00013352856087336357, + "loss": 3.4689, + "step": 15703 + }, + { + "epoch": 0.6612210526315789, + "grad_norm": 0.4296875, + "learning_rate": 0.00013349870347928523, + "loss": 2.3425, + "step": 15704 + }, + { + "epoch": 0.6612631578947369, + "grad_norm": 0.43359375, + "learning_rate": 0.00013346884820772354, + "loss": 3.1679, + "step": 15705 + }, + { + "epoch": 0.6613052631578947, + "grad_norm": 0.427734375, + "learning_rate": 0.0001334389950592227, + "loss": 3.2857, + "step": 15706 + }, + { + "epoch": 0.6613473684210527, + "grad_norm": 0.439453125, + "learning_rate": 0.00013340914403432618, + "loss": 3.3305, + "step": 15707 + }, + { + "epoch": 0.6613894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.00013337929513357831, + "loss": 3.0762, + "step": 15708 + }, + { + "epoch": 0.6614315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00013334944835752247, + "loss": 3.2639, + "step": 15709 + }, + { + "epoch": 0.6614736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.0001333196037067027, + "loss": 3.0184, + "step": 15710 + }, + { + "epoch": 0.6615157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00013328976118166263, + "loss": 3.3818, + "step": 15711 + }, + { + "epoch": 0.6615578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00013325992078294603, + "loss": 2.8022, + "step": 15712 + }, + { + "epoch": 0.6616, + "grad_norm": 0.41796875, + "learning_rate": 0.0001332300825110965, + "loss": 3.0591, + "step": 15713 + }, + { + "epoch": 0.6616421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00013320024636665756, + "loss": 2.5197, + "step": 15714 + }, + { + "epoch": 0.6616842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0001331704123501731, + "loss": 3.4446, + "step": 15715 + }, + { + "epoch": 0.6617263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.00013314058046218625, + "loss": 2.8752, + "step": 15716 + }, + { + "epoch": 0.6617684210526316, + "grad_norm": 0.47265625, + "learning_rate": 0.00013311075070324084, + "loss": 3.1053, + "step": 15717 + }, + { + "epoch": 0.6618105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.0001330809230738802, + "loss": 2.795, + "step": 15718 + }, + { + "epoch": 0.6618526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00013305109757464774, + "loss": 3.0697, + "step": 15719 + }, + { + "epoch": 0.6618947368421053, + "grad_norm": 0.42578125, + "learning_rate": 0.00013302127420608689, + "loss": 3.2741, + "step": 15720 + }, + { + "epoch": 0.6619368421052632, + "grad_norm": 0.48828125, + "learning_rate": 0.0001329914529687409, + "loss": 3.6601, + "step": 15721 + }, + { + "epoch": 0.661978947368421, + "grad_norm": 0.4765625, + "learning_rate": 0.00013296163386315336, + "loss": 2.5136, + "step": 15722 + }, + { + "epoch": 0.662021052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00013293181688986712, + "loss": 3.1676, + "step": 15723 + }, + { + "epoch": 0.6620631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.00013290200204942572, + "loss": 3.1506, + "step": 15724 + }, + { + "epoch": 0.6621052631578948, + "grad_norm": 0.423828125, + "learning_rate": 0.00013287218934237226, + "loss": 2.6463, + "step": 15725 + }, + { + "epoch": 0.6621473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.0001328423787692499, + "loss": 2.7638, + "step": 15726 + }, + { + "epoch": 0.6621894736842105, + "grad_norm": 0.439453125, + "learning_rate": 0.00013281257033060173, + "loss": 3.1715, + "step": 15727 + }, + { + "epoch": 0.6622315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00013278276402697086, + "loss": 3.212, + "step": 15728 + }, + { + "epoch": 0.6622736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.0001327529598589003, + "loss": 3.1292, + "step": 15729 + }, + { + "epoch": 0.6623157894736842, + "grad_norm": 0.435546875, + "learning_rate": 0.00013272315782693296, + "loss": 3.3163, + "step": 15730 + }, + { + "epoch": 0.6623578947368421, + "grad_norm": 0.470703125, + "learning_rate": 0.00013269335793161197, + "loss": 2.9168, + "step": 15731 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4375, + "learning_rate": 0.00013266356017348015, + "loss": 3.2731, + "step": 15732 + }, + { + "epoch": 0.6624421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00013263376455308043, + "loss": 3.0833, + "step": 15733 + }, + { + "epoch": 0.6624842105263158, + "grad_norm": 0.470703125, + "learning_rate": 0.00013260397107095563, + "loss": 2.8879, + "step": 15734 + }, + { + "epoch": 0.6625263157894736, + "grad_norm": 0.419921875, + "learning_rate": 0.0001325741797276485, + "loss": 3.2165, + "step": 15735 + }, + { + "epoch": 0.6625684210526316, + "grad_norm": 0.455078125, + "learning_rate": 0.0001325443905237018, + "loss": 2.933, + "step": 15736 + }, + { + "epoch": 0.6626105263157894, + "grad_norm": 0.44921875, + "learning_rate": 0.00013251460345965838, + "loss": 2.9521, + "step": 15737 + }, + { + "epoch": 0.6626526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.0001324848185360608, + "loss": 2.6726, + "step": 15738 + }, + { + "epoch": 0.6626947368421052, + "grad_norm": 0.431640625, + "learning_rate": 0.0001324550357534518, + "loss": 2.8963, + "step": 15739 + }, + { + "epoch": 0.6627368421052632, + "grad_norm": 0.4296875, + "learning_rate": 0.00013242525511237394, + "loss": 2.882, + "step": 15740 + }, + { + "epoch": 0.662778947368421, + "grad_norm": 0.4609375, + "learning_rate": 0.00013239547661336975, + "loss": 2.807, + "step": 15741 + }, + { + "epoch": 0.662821052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00013236570025698186, + "loss": 3.15, + "step": 15742 + }, + { + "epoch": 0.6628631578947368, + "grad_norm": 0.455078125, + "learning_rate": 0.00013233592604375256, + "loss": 3.0825, + "step": 15743 + }, + { + "epoch": 0.6629052631578948, + "grad_norm": 0.427734375, + "learning_rate": 0.00013230615397422462, + "loss": 3.0947, + "step": 15744 + }, + { + "epoch": 0.6629473684210526, + "grad_norm": 0.482421875, + "learning_rate": 0.0001322763840489401, + "loss": 3.1431, + "step": 15745 + }, + { + "epoch": 0.6629894736842106, + "grad_norm": 0.474609375, + "learning_rate": 0.00013224661626844164, + "loss": 3.3134, + "step": 15746 + }, + { + "epoch": 0.6630315789473684, + "grad_norm": 0.439453125, + "learning_rate": 0.00013221685063327144, + "loss": 3.2757, + "step": 15747 + }, + { + "epoch": 0.6630736842105264, + "grad_norm": 0.39453125, + "learning_rate": 0.00013218708714397186, + "loss": 3.5356, + "step": 15748 + }, + { + "epoch": 0.6631157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.00013215732580108513, + "loss": 3.2489, + "step": 15749 + }, + { + "epoch": 0.6631578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.00013212756660515337, + "loss": 2.8478, + "step": 15750 + }, + { + "epoch": 0.6632, + "grad_norm": 0.421875, + "learning_rate": 0.00013209780955671904, + "loss": 3.173, + "step": 15751 + }, + { + "epoch": 0.6632421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00013206805465632387, + "loss": 2.9065, + "step": 15752 + }, + { + "epoch": 0.6632842105263158, + "grad_norm": 0.41015625, + "learning_rate": 0.00013203830190451033, + "loss": 3.2499, + "step": 15753 + }, + { + "epoch": 0.6633263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00013200855130182033, + "loss": 2.926, + "step": 15754 + }, + { + "epoch": 0.6633684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00013197880284879582, + "loss": 3.4132, + "step": 15755 + }, + { + "epoch": 0.6634105263157895, + "grad_norm": 0.44921875, + "learning_rate": 0.0001319490565459789, + "loss": 2.9822, + "step": 15756 + }, + { + "epoch": 0.6634526315789474, + "grad_norm": 0.404296875, + "learning_rate": 0.00013191931239391137, + "loss": 2.9443, + "step": 15757 + }, + { + "epoch": 0.6634947368421052, + "grad_norm": 0.44921875, + "learning_rate": 0.00013188957039313542, + "loss": 3.2769, + "step": 15758 + }, + { + "epoch": 0.6635368421052632, + "grad_norm": 0.419921875, + "learning_rate": 0.00013185983054419254, + "loss": 3.15, + "step": 15759 + }, + { + "epoch": 0.663578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00013183009284762492, + "loss": 3.4536, + "step": 15760 + }, + { + "epoch": 0.663621052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00013180035730397396, + "loss": 3.0111, + "step": 15761 + }, + { + "epoch": 0.6636631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.00013177062391378172, + "loss": 3.0216, + "step": 15762 + }, + { + "epoch": 0.6637052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.00013174089267758982, + "loss": 3.2921, + "step": 15763 + }, + { + "epoch": 0.6637473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.00013171116359593976, + "loss": 2.9425, + "step": 15764 + }, + { + "epoch": 0.6637894736842105, + "grad_norm": 0.396484375, + "learning_rate": 0.00013168143666937354, + "loss": 3.3366, + "step": 15765 + }, + { + "epoch": 0.6638315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.00013165171189843234, + "loss": 3.2112, + "step": 15766 + }, + { + "epoch": 0.6638736842105263, + "grad_norm": 0.455078125, + "learning_rate": 0.0001316219892836581, + "loss": 3.1818, + "step": 15767 + }, + { + "epoch": 0.6639157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.0001315922688255919, + "loss": 2.6691, + "step": 15768 + }, + { + "epoch": 0.6639578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00013156255052477553, + "loss": 3.0826, + "step": 15769 + }, + { + "epoch": 0.664, + "grad_norm": 0.421875, + "learning_rate": 0.00013153283438175034, + "loss": 2.8736, + "step": 15770 + }, + { + "epoch": 0.6640421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00013150312039705774, + "loss": 2.9973, + "step": 15771 + }, + { + "epoch": 0.6640842105263158, + "grad_norm": 0.451171875, + "learning_rate": 0.00013147340857123902, + "loss": 2.7762, + "step": 15772 + }, + { + "epoch": 0.6641263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00013144369890483548, + "loss": 2.8798, + "step": 15773 + }, + { + "epoch": 0.6641684210526316, + "grad_norm": 0.453125, + "learning_rate": 0.0001314139913983886, + "loss": 2.8144, + "step": 15774 + }, + { + "epoch": 0.6642105263157895, + "grad_norm": 0.46875, + "learning_rate": 0.0001313842860524393, + "loss": 2.5495, + "step": 15775 + }, + { + "epoch": 0.6642526315789473, + "grad_norm": 0.447265625, + "learning_rate": 0.0001313545828675291, + "loss": 2.9559, + "step": 15776 + }, + { + "epoch": 0.6642947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00013132488184419884, + "loss": 2.9774, + "step": 15777 + }, + { + "epoch": 0.6643368421052631, + "grad_norm": 0.47265625, + "learning_rate": 0.00013129518298298987, + "loss": 2.7229, + "step": 15778 + }, + { + "epoch": 0.6643789473684211, + "grad_norm": 0.431640625, + "learning_rate": 0.00013126548628444323, + "loss": 2.816, + "step": 15779 + }, + { + "epoch": 0.6644210526315789, + "grad_norm": 0.45703125, + "learning_rate": 0.00013123579174909977, + "loss": 3.2036, + "step": 15780 + }, + { + "epoch": 0.6644631578947369, + "grad_norm": 0.431640625, + "learning_rate": 0.0001312060993775009, + "loss": 2.5084, + "step": 15781 + }, + { + "epoch": 0.6645052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.0001311764091701871, + "loss": 3.0876, + "step": 15782 + }, + { + "epoch": 0.6645473684210527, + "grad_norm": 0.482421875, + "learning_rate": 0.00013114672112769966, + "loss": 2.8164, + "step": 15783 + }, + { + "epoch": 0.6645894736842105, + "grad_norm": 0.453125, + "learning_rate": 0.00013111703525057916, + "loss": 2.9774, + "step": 15784 + }, + { + "epoch": 0.6646315789473685, + "grad_norm": 0.408203125, + "learning_rate": 0.00013108735153936668, + "loss": 2.8797, + "step": 15785 + }, + { + "epoch": 0.6646736842105263, + "grad_norm": 0.4375, + "learning_rate": 0.00013105766999460297, + "loss": 2.8749, + "step": 15786 + }, + { + "epoch": 0.6647157894736843, + "grad_norm": 0.427734375, + "learning_rate": 0.00013102799061682875, + "loss": 2.8532, + "step": 15787 + }, + { + "epoch": 0.6647578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00013099831340658474, + "loss": 2.6231, + "step": 15788 + }, + { + "epoch": 0.6648, + "grad_norm": 0.494140625, + "learning_rate": 0.00013096863836441154, + "loss": 2.8444, + "step": 15789 + }, + { + "epoch": 0.6648421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00013093896549085, + "loss": 3.0132, + "step": 15790 + }, + { + "epoch": 0.6648842105263157, + "grad_norm": 0.41796875, + "learning_rate": 0.0001309092947864406, + "loss": 2.4935, + "step": 15791 + }, + { + "epoch": 0.6649263157894737, + "grad_norm": 0.4453125, + "learning_rate": 0.00013087962625172396, + "loss": 3.209, + "step": 15792 + }, + { + "epoch": 0.6649684210526315, + "grad_norm": 0.49609375, + "learning_rate": 0.00013084995988724054, + "loss": 3.0388, + "step": 15793 + }, + { + "epoch": 0.6650105263157895, + "grad_norm": 0.408203125, + "learning_rate": 0.00013082029569353088, + "loss": 2.7978, + "step": 15794 + }, + { + "epoch": 0.6650526315789473, + "grad_norm": 0.443359375, + "learning_rate": 0.00013079063367113535, + "loss": 2.6761, + "step": 15795 + }, + { + "epoch": 0.6650947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00013076097382059437, + "loss": 3.4351, + "step": 15796 + }, + { + "epoch": 0.6651368421052631, + "grad_norm": 0.421875, + "learning_rate": 0.00013073131614244843, + "loss": 3.068, + "step": 15797 + }, + { + "epoch": 0.6651789473684211, + "grad_norm": 0.43359375, + "learning_rate": 0.00013070166063723775, + "loss": 3.0334, + "step": 15798 + }, + { + "epoch": 0.6652210526315789, + "grad_norm": 0.431640625, + "learning_rate": 0.00013067200730550266, + "loss": 2.8792, + "step": 15799 + }, + { + "epoch": 0.6652631578947369, + "grad_norm": 0.4375, + "learning_rate": 0.00013064235614778342, + "loss": 3.2272, + "step": 15800 + }, + { + "epoch": 0.6653052631578947, + "grad_norm": 0.416015625, + "learning_rate": 0.00013061270716462016, + "loss": 3.505, + "step": 15801 + }, + { + "epoch": 0.6653473684210526, + "grad_norm": 0.44140625, + "learning_rate": 0.00013058306035655312, + "loss": 2.9735, + "step": 15802 + }, + { + "epoch": 0.6653894736842105, + "grad_norm": 0.435546875, + "learning_rate": 0.00013055341572412244, + "loss": 2.9673, + "step": 15803 + }, + { + "epoch": 0.6654315789473684, + "grad_norm": 0.4453125, + "learning_rate": 0.00013052377326786804, + "loss": 3.0999, + "step": 15804 + }, + { + "epoch": 0.6654736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.00013049413298833024, + "loss": 3.18, + "step": 15805 + }, + { + "epoch": 0.6655157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.0001304644948860489, + "loss": 2.9492, + "step": 15806 + }, + { + "epoch": 0.6655578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00013043485896156403, + "loss": 3.547, + "step": 15807 + }, + { + "epoch": 0.6656, + "grad_norm": 0.435546875, + "learning_rate": 0.00013040522521541554, + "loss": 3.3056, + "step": 15808 + }, + { + "epoch": 0.6656421052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00013037559364814332, + "loss": 2.8112, + "step": 15809 + }, + { + "epoch": 0.6656842105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.00013034596426028729, + "loss": 3.0902, + "step": 15810 + }, + { + "epoch": 0.6657263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00013031633705238704, + "loss": 3.2839, + "step": 15811 + }, + { + "epoch": 0.6657684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.0001302867120249826, + "loss": 2.9039, + "step": 15812 + }, + { + "epoch": 0.6658105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.00013025708917861365, + "loss": 3.1066, + "step": 15813 + }, + { + "epoch": 0.6658526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.00013022746851381985, + "loss": 3.1412, + "step": 15814 + }, + { + "epoch": 0.6658947368421053, + "grad_norm": 0.3984375, + "learning_rate": 0.0001301978500311408, + "loss": 2.9414, + "step": 15815 + }, + { + "epoch": 0.6659368421052632, + "grad_norm": 0.51953125, + "learning_rate": 0.0001301682337311162, + "loss": 3.0336, + "step": 15816 + }, + { + "epoch": 0.665978947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.00013013861961428555, + "loss": 3.1247, + "step": 15817 + }, + { + "epoch": 0.666021052631579, + "grad_norm": 0.453125, + "learning_rate": 0.00013010900768118834, + "loss": 3.245, + "step": 15818 + }, + { + "epoch": 0.6660631578947368, + "grad_norm": 0.419921875, + "learning_rate": 0.00013007939793236435, + "loss": 3.4649, + "step": 15819 + }, + { + "epoch": 0.6661052631578948, + "grad_norm": 0.447265625, + "learning_rate": 0.00013004979036835264, + "loss": 2.8322, + "step": 15820 + }, + { + "epoch": 0.6661473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.00013002018498969287, + "loss": 3.3361, + "step": 15821 + }, + { + "epoch": 0.6661894736842106, + "grad_norm": 0.4140625, + "learning_rate": 0.00012999058179692438, + "loss": 2.9639, + "step": 15822 + }, + { + "epoch": 0.6662315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00012996098079058648, + "loss": 3.3286, + "step": 15823 + }, + { + "epoch": 0.6662736842105264, + "grad_norm": 0.4375, + "learning_rate": 0.00012993138197121848, + "loss": 3.2119, + "step": 15824 + }, + { + "epoch": 0.6663157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00012990178533935954, + "loss": 2.9074, + "step": 15825 + }, + { + "epoch": 0.6663578947368421, + "grad_norm": 0.447265625, + "learning_rate": 0.00012987219089554913, + "loss": 3.6743, + "step": 15826 + }, + { + "epoch": 0.6664, + "grad_norm": 0.439453125, + "learning_rate": 0.00012984259864032608, + "loss": 3.5049, + "step": 15827 + }, + { + "epoch": 0.666442105263158, + "grad_norm": 0.431640625, + "learning_rate": 0.0001298130085742299, + "loss": 2.8127, + "step": 15828 + }, + { + "epoch": 0.6664842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00012978342069779928, + "loss": 3.1235, + "step": 15829 + }, + { + "epoch": 0.6665263157894736, + "grad_norm": 0.40234375, + "learning_rate": 0.0001297538350115736, + "loss": 2.9444, + "step": 15830 + }, + { + "epoch": 0.6665684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00012972425151609173, + "loss": 2.8065, + "step": 15831 + }, + { + "epoch": 0.6666105263157894, + "grad_norm": 0.396484375, + "learning_rate": 0.00012969467021189263, + "loss": 3.1415, + "step": 15832 + }, + { + "epoch": 0.6666526315789474, + "grad_norm": 0.412109375, + "learning_rate": 0.00012966509109951546, + "loss": 2.776, + "step": 15833 + }, + { + "epoch": 0.6666947368421052, + "grad_norm": 0.4453125, + "learning_rate": 0.00012963551417949874, + "loss": 3.0222, + "step": 15834 + }, + { + "epoch": 0.6667368421052632, + "grad_norm": 0.45703125, + "learning_rate": 0.00012960593945238172, + "loss": 3.1074, + "step": 15835 + }, + { + "epoch": 0.666778947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00012957636691870284, + "loss": 3.2742, + "step": 15836 + }, + { + "epoch": 0.666821052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00012954679657900113, + "loss": 3.2158, + "step": 15837 + }, + { + "epoch": 0.6668631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.0001295172284338153, + "loss": 3.1544, + "step": 15838 + }, + { + "epoch": 0.6669052631578948, + "grad_norm": 0.41796875, + "learning_rate": 0.00012948766248368392, + "loss": 2.5699, + "step": 15839 + }, + { + "epoch": 0.6669473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.00012945809872914592, + "loss": 3.2143, + "step": 15840 + }, + { + "epoch": 0.6669894736842106, + "grad_norm": 0.443359375, + "learning_rate": 0.00012942853717073948, + "loss": 3.1279, + "step": 15841 + }, + { + "epoch": 0.6670315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.0001293989778090037, + "loss": 2.9067, + "step": 15842 + }, + { + "epoch": 0.6670736842105263, + "grad_norm": 0.53125, + "learning_rate": 0.0001293694206444766, + "loss": 3.4485, + "step": 15843 + }, + { + "epoch": 0.6671157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00012933986567769705, + "loss": 2.917, + "step": 15844 + }, + { + "epoch": 0.6671578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.00012931031290920337, + "loss": 2.8975, + "step": 15845 + }, + { + "epoch": 0.6672, + "grad_norm": 0.4375, + "learning_rate": 0.00012928076233953396, + "loss": 3.2469, + "step": 15846 + }, + { + "epoch": 0.6672421052631579, + "grad_norm": 0.455078125, + "learning_rate": 0.00012925121396922722, + "loss": 3.114, + "step": 15847 + }, + { + "epoch": 0.6672842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00012922166779882142, + "loss": 3.2593, + "step": 15848 + }, + { + "epoch": 0.6673263157894737, + "grad_norm": 0.458984375, + "learning_rate": 0.00012919212382885508, + "loss": 3.1166, + "step": 15849 + }, + { + "epoch": 0.6673684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.00012916258205986615, + "loss": 3.1908, + "step": 15850 + }, + { + "epoch": 0.6674105263157895, + "grad_norm": 0.4296875, + "learning_rate": 0.00012913304249239304, + "loss": 3.0478, + "step": 15851 + }, + { + "epoch": 0.6674526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.00012910350512697395, + "loss": 2.9303, + "step": 15852 + }, + { + "epoch": 0.6674947368421053, + "grad_norm": 0.49609375, + "learning_rate": 0.00012907396996414693, + "loss": 3.3043, + "step": 15853 + }, + { + "epoch": 0.6675368421052632, + "grad_norm": 0.5, + "learning_rate": 0.00012904443700445008, + "loss": 2.8537, + "step": 15854 + }, + { + "epoch": 0.667578947368421, + "grad_norm": 0.47265625, + "learning_rate": 0.00012901490624842143, + "loss": 2.9666, + "step": 15855 + }, + { + "epoch": 0.6676210526315789, + "grad_norm": 0.4453125, + "learning_rate": 0.0001289853776965991, + "loss": 3.0076, + "step": 15856 + }, + { + "epoch": 0.6676631578947368, + "grad_norm": 0.40234375, + "learning_rate": 0.00012895585134952087, + "loss": 2.7696, + "step": 15857 + }, + { + "epoch": 0.6677052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.00012892632720772494, + "loss": 3.2988, + "step": 15858 + }, + { + "epoch": 0.6677473684210526, + "grad_norm": 0.40625, + "learning_rate": 0.00012889680527174903, + "loss": 2.9033, + "step": 15859 + }, + { + "epoch": 0.6677894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.00012886728554213108, + "loss": 3.1483, + "step": 15860 + }, + { + "epoch": 0.6678315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.00012883776801940884, + "loss": 3.1064, + "step": 15861 + }, + { + "epoch": 0.6678736842105263, + "grad_norm": 0.5, + "learning_rate": 0.00012880825270412008, + "loss": 2.9161, + "step": 15862 + }, + { + "epoch": 0.6679157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00012877873959680258, + "loss": 2.645, + "step": 15863 + }, + { + "epoch": 0.6679578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00012874922869799394, + "loss": 2.843, + "step": 15864 + }, + { + "epoch": 0.668, + "grad_norm": 0.451171875, + "learning_rate": 0.00012871972000823195, + "loss": 3.0943, + "step": 15865 + }, + { + "epoch": 0.6680421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00012869021352805422, + "loss": 3.4077, + "step": 15866 + }, + { + "epoch": 0.6680842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.00012866070925799823, + "loss": 2.5298, + "step": 15867 + }, + { + "epoch": 0.6681263157894737, + "grad_norm": 0.466796875, + "learning_rate": 0.00012863120719860155, + "loss": 3.514, + "step": 15868 + }, + { + "epoch": 0.6681684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.00012860170735040167, + "loss": 2.8745, + "step": 15869 + }, + { + "epoch": 0.6682105263157895, + "grad_norm": 0.42578125, + "learning_rate": 0.00012857220971393605, + "loss": 2.6445, + "step": 15870 + }, + { + "epoch": 0.6682526315789473, + "grad_norm": 0.435546875, + "learning_rate": 0.0001285427142897421, + "loss": 2.6437, + "step": 15871 + }, + { + "epoch": 0.6682947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.00012851322107835709, + "loss": 2.8648, + "step": 15872 + }, + { + "epoch": 0.6683368421052631, + "grad_norm": 0.4609375, + "learning_rate": 0.00012848373008031856, + "loss": 3.3423, + "step": 15873 + }, + { + "epoch": 0.6683789473684211, + "grad_norm": 0.4375, + "learning_rate": 0.00012845424129616365, + "loss": 3.0569, + "step": 15874 + }, + { + "epoch": 0.6684210526315789, + "grad_norm": 0.423828125, + "learning_rate": 0.00012842475472642968, + "loss": 3.3816, + "step": 15875 + }, + { + "epoch": 0.6684631578947369, + "grad_norm": 0.423828125, + "learning_rate": 0.00012839527037165384, + "loss": 3.1974, + "step": 15876 + }, + { + "epoch": 0.6685052631578947, + "grad_norm": 0.4296875, + "learning_rate": 0.00012836578823237327, + "loss": 3.7311, + "step": 15877 + }, + { + "epoch": 0.6685473684210527, + "grad_norm": 0.4453125, + "learning_rate": 0.00012833630830912516, + "loss": 3.2166, + "step": 15878 + }, + { + "epoch": 0.6685894736842105, + "grad_norm": 0.44921875, + "learning_rate": 0.00012830683060244646, + "loss": 3.0744, + "step": 15879 + }, + { + "epoch": 0.6686315789473685, + "grad_norm": 0.427734375, + "learning_rate": 0.00012827735511287443, + "loss": 3.2041, + "step": 15880 + }, + { + "epoch": 0.6686736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00012824788184094592, + "loss": 2.9794, + "step": 15881 + }, + { + "epoch": 0.6687157894736843, + "grad_norm": 0.41796875, + "learning_rate": 0.00012821841078719797, + "loss": 3.0495, + "step": 15882 + }, + { + "epoch": 0.6687578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00012818894195216752, + "loss": 3.6279, + "step": 15883 + }, + { + "epoch": 0.6688, + "grad_norm": 0.43359375, + "learning_rate": 0.00012815947533639137, + "loss": 2.7101, + "step": 15884 + }, + { + "epoch": 0.6688421052631579, + "grad_norm": 0.40625, + "learning_rate": 0.0001281300109404065, + "loss": 3.3523, + "step": 15885 + }, + { + "epoch": 0.6688842105263157, + "grad_norm": 0.427734375, + "learning_rate": 0.00012810054876474946, + "loss": 3.1733, + "step": 15886 + }, + { + "epoch": 0.6689263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.00012807108880995744, + "loss": 2.7345, + "step": 15887 + }, + { + "epoch": 0.6689684210526315, + "grad_norm": 0.44921875, + "learning_rate": 0.00012804163107656675, + "loss": 3.5655, + "step": 15888 + }, + { + "epoch": 0.6690105263157895, + "grad_norm": 0.458984375, + "learning_rate": 0.0001280121755651143, + "loss": 2.7114, + "step": 15889 + }, + { + "epoch": 0.6690526315789473, + "grad_norm": 0.47265625, + "learning_rate": 0.00012798272227613667, + "loss": 3.3194, + "step": 15890 + }, + { + "epoch": 0.6690947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.0001279532712101704, + "loss": 3.0179, + "step": 15891 + }, + { + "epoch": 0.6691368421052631, + "grad_norm": 0.447265625, + "learning_rate": 0.00012792382236775235, + "loss": 3.1671, + "step": 15892 + }, + { + "epoch": 0.6691789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.0001278943757494186, + "loss": 3.1426, + "step": 15893 + }, + { + "epoch": 0.6692210526315789, + "grad_norm": 0.4296875, + "learning_rate": 0.00012786493135570604, + "loss": 3.3884, + "step": 15894 + }, + { + "epoch": 0.6692631578947369, + "grad_norm": 0.423828125, + "learning_rate": 0.00012783548918715077, + "loss": 2.9857, + "step": 15895 + }, + { + "epoch": 0.6693052631578947, + "grad_norm": 0.3984375, + "learning_rate": 0.0001278060492442894, + "loss": 2.8803, + "step": 15896 + }, + { + "epoch": 0.6693473684210526, + "grad_norm": 0.419921875, + "learning_rate": 0.00012777661152765826, + "loss": 2.961, + "step": 15897 + }, + { + "epoch": 0.6693894736842105, + "grad_norm": 0.419921875, + "learning_rate": 0.00012774717603779369, + "loss": 3.1851, + "step": 15898 + }, + { + "epoch": 0.6694315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00012771774277523188, + "loss": 2.7351, + "step": 15899 + }, + { + "epoch": 0.6694736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.00012768831174050905, + "loss": 2.7822, + "step": 15900 + }, + { + "epoch": 0.6695157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00012765888293416163, + "loss": 3.2094, + "step": 15901 + }, + { + "epoch": 0.6695578947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00012762945635672545, + "loss": 2.9743, + "step": 15902 + }, + { + "epoch": 0.6696, + "grad_norm": 0.451171875, + "learning_rate": 0.00012760003200873698, + "loss": 3.468, + "step": 15903 + }, + { + "epoch": 0.6696421052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00012757060989073194, + "loss": 3.1297, + "step": 15904 + }, + { + "epoch": 0.6696842105263158, + "grad_norm": 0.5, + "learning_rate": 0.00012754119000324662, + "loss": 2.5427, + "step": 15905 + }, + { + "epoch": 0.6697263157894737, + "grad_norm": 0.466796875, + "learning_rate": 0.00012751177234681695, + "loss": 3.3635, + "step": 15906 + }, + { + "epoch": 0.6697684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.00012748235692197873, + "loss": 2.9842, + "step": 15907 + }, + { + "epoch": 0.6698105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.00012745294372926825, + "loss": 3.0108, + "step": 15908 + }, + { + "epoch": 0.6698526315789474, + "grad_norm": 0.439453125, + "learning_rate": 0.00012742353276922098, + "loss": 3.1352, + "step": 15909 + }, + { + "epoch": 0.6698947368421052, + "grad_norm": 0.435546875, + "learning_rate": 0.00012739412404237305, + "loss": 3.3357, + "step": 15910 + }, + { + "epoch": 0.6699368421052632, + "grad_norm": 0.478515625, + "learning_rate": 0.00012736471754925995, + "loss": 2.9537, + "step": 15911 + }, + { + "epoch": 0.669978947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00012733531329041774, + "loss": 3.0622, + "step": 15912 + }, + { + "epoch": 0.670021052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00012730591126638196, + "loss": 2.9481, + "step": 15913 + }, + { + "epoch": 0.6700631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.0001272765114776883, + "loss": 3.0313, + "step": 15914 + }, + { + "epoch": 0.6701052631578948, + "grad_norm": 0.453125, + "learning_rate": 0.00012724711392487245, + "loss": 3.1777, + "step": 15915 + }, + { + "epoch": 0.6701473684210526, + "grad_norm": 0.435546875, + "learning_rate": 0.00012721771860846987, + "loss": 3.0189, + "step": 15916 + }, + { + "epoch": 0.6701894736842106, + "grad_norm": 0.447265625, + "learning_rate": 0.00012718832552901632, + "loss": 3.4344, + "step": 15917 + }, + { + "epoch": 0.6702315789473684, + "grad_norm": 0.41796875, + "learning_rate": 0.00012715893468704704, + "loss": 2.7198, + "step": 15918 + }, + { + "epoch": 0.6702736842105264, + "grad_norm": 0.490234375, + "learning_rate": 0.00012712954608309774, + "loss": 2.9259, + "step": 15919 + }, + { + "epoch": 0.6703157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00012710015971770372, + "loss": 3.0983, + "step": 15920 + }, + { + "epoch": 0.6703578947368422, + "grad_norm": 0.5546875, + "learning_rate": 0.0001270707755914004, + "loss": 2.9564, + "step": 15921 + }, + { + "epoch": 0.6704, + "grad_norm": 0.43359375, + "learning_rate": 0.00012704139370472308, + "loss": 2.9131, + "step": 15922 + }, + { + "epoch": 0.6704421052631578, + "grad_norm": 0.439453125, + "learning_rate": 0.00012701201405820704, + "loss": 3.1411, + "step": 15923 + }, + { + "epoch": 0.6704842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.0001269826366523878, + "loss": 2.3204, + "step": 15924 + }, + { + "epoch": 0.6705263157894736, + "grad_norm": 0.443359375, + "learning_rate": 0.00012695326148780012, + "loss": 3.0725, + "step": 15925 + }, + { + "epoch": 0.6705684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.0001269238885649796, + "loss": 3.1156, + "step": 15926 + }, + { + "epoch": 0.6706105263157894, + "grad_norm": 0.43359375, + "learning_rate": 0.0001268945178844612, + "loss": 3.3699, + "step": 15927 + }, + { + "epoch": 0.6706526315789474, + "grad_norm": 0.4375, + "learning_rate": 0.00012686514944678008, + "loss": 3.2315, + "step": 15928 + }, + { + "epoch": 0.6706947368421052, + "grad_norm": 0.43359375, + "learning_rate": 0.00012683578325247118, + "loss": 2.8811, + "step": 15929 + }, + { + "epoch": 0.6707368421052632, + "grad_norm": 0.4296875, + "learning_rate": 0.00012680641930206966, + "loss": 3.1503, + "step": 15930 + }, + { + "epoch": 0.670778947368421, + "grad_norm": 0.40625, + "learning_rate": 0.0001267770575961104, + "loss": 3.0572, + "step": 15931 + }, + { + "epoch": 0.670821052631579, + "grad_norm": 0.423828125, + "learning_rate": 0.00012674769813512832, + "loss": 3.3249, + "step": 15932 + }, + { + "epoch": 0.6708631578947368, + "grad_norm": 0.44140625, + "learning_rate": 0.00012671834091965842, + "loss": 2.9281, + "step": 15933 + }, + { + "epoch": 0.6709052631578948, + "grad_norm": 0.46875, + "learning_rate": 0.00012668898595023553, + "loss": 2.9572, + "step": 15934 + }, + { + "epoch": 0.6709473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.00012665963322739437, + "loss": 2.9123, + "step": 15935 + }, + { + "epoch": 0.6709894736842106, + "grad_norm": 0.43359375, + "learning_rate": 0.00012663028275166981, + "loss": 3.1905, + "step": 15936 + }, + { + "epoch": 0.6710315789473684, + "grad_norm": 0.45703125, + "learning_rate": 0.0001266009345235966, + "loss": 3.0376, + "step": 15937 + }, + { + "epoch": 0.6710736842105263, + "grad_norm": 0.41015625, + "learning_rate": 0.0001265715885437092, + "loss": 2.8841, + "step": 15938 + }, + { + "epoch": 0.6711157894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00012654224481254258, + "loss": 2.5057, + "step": 15939 + }, + { + "epoch": 0.6711578947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.00012651290333063117, + "loss": 2.7287, + "step": 15940 + }, + { + "epoch": 0.6712, + "grad_norm": 0.4375, + "learning_rate": 0.0001264835640985096, + "loss": 3.2412, + "step": 15941 + }, + { + "epoch": 0.6712421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00012645422711671234, + "loss": 3.3325, + "step": 15942 + }, + { + "epoch": 0.6712842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.0001264248923857739, + "loss": 2.9342, + "step": 15943 + }, + { + "epoch": 0.6713263157894737, + "grad_norm": 0.46875, + "learning_rate": 0.00012639555990622876, + "loss": 3.1991, + "step": 15944 + }, + { + "epoch": 0.6713684210526316, + "grad_norm": 0.4140625, + "learning_rate": 0.00012636622967861113, + "loss": 2.9292, + "step": 15945 + }, + { + "epoch": 0.6714105263157895, + "grad_norm": 0.419921875, + "learning_rate": 0.0001263369017034558, + "loss": 3.3595, + "step": 15946 + }, + { + "epoch": 0.6714526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00012630757598129655, + "loss": 2.779, + "step": 15947 + }, + { + "epoch": 0.6714947368421053, + "grad_norm": 0.46875, + "learning_rate": 0.00012627825251266812, + "loss": 2.7281, + "step": 15948 + }, + { + "epoch": 0.6715368421052632, + "grad_norm": 0.408203125, + "learning_rate": 0.0001262489312981045, + "loss": 3.003, + "step": 15949 + }, + { + "epoch": 0.671578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00012621961233813994, + "loss": 3.3618, + "step": 15950 + }, + { + "epoch": 0.6716210526315789, + "grad_norm": 0.447265625, + "learning_rate": 0.00012619029563330864, + "loss": 3.1853, + "step": 15951 + }, + { + "epoch": 0.6716631578947369, + "grad_norm": 0.43359375, + "learning_rate": 0.0001261609811841446, + "loss": 3.0302, + "step": 15952 + }, + { + "epoch": 0.6717052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.00012613166899118212, + "loss": 3.0751, + "step": 15953 + }, + { + "epoch": 0.6717473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00012610235905495494, + "loss": 2.8066, + "step": 15954 + }, + { + "epoch": 0.6717894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00012607305137599727, + "loss": 3.1997, + "step": 15955 + }, + { + "epoch": 0.6718315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.00012604374595484303, + "loss": 3.1338, + "step": 15956 + }, + { + "epoch": 0.6718736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.00012601444279202608, + "loss": 3.2804, + "step": 15957 + }, + { + "epoch": 0.6719157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.0001259851418880803, + "loss": 2.7821, + "step": 15958 + }, + { + "epoch": 0.6719578947368421, + "grad_norm": 0.455078125, + "learning_rate": 0.00012595584324353942, + "loss": 2.957, + "step": 15959 + }, + { + "epoch": 0.672, + "grad_norm": 0.43359375, + "learning_rate": 0.00012592654685893756, + "loss": 3.1528, + "step": 15960 + }, + { + "epoch": 0.6720421052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.00012589725273480802, + "loss": 3.1736, + "step": 15961 + }, + { + "epoch": 0.6720842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.0001258679608716849, + "loss": 3.5325, + "step": 15962 + }, + { + "epoch": 0.6721263157894737, + "grad_norm": 0.474609375, + "learning_rate": 0.0001258386712701015, + "loss": 2.9699, + "step": 15963 + }, + { + "epoch": 0.6721684210526315, + "grad_norm": 0.427734375, + "learning_rate": 0.00012580938393059175, + "loss": 3.2067, + "step": 15964 + }, + { + "epoch": 0.6722105263157895, + "grad_norm": 0.447265625, + "learning_rate": 0.00012578009885368907, + "loss": 2.9865, + "step": 15965 + }, + { + "epoch": 0.6722526315789473, + "grad_norm": 0.4296875, + "learning_rate": 0.00012575081603992695, + "loss": 3.4989, + "step": 15966 + }, + { + "epoch": 0.6722947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.00012572153548983915, + "loss": 3.1228, + "step": 15967 + }, + { + "epoch": 0.6723368421052631, + "grad_norm": 0.423828125, + "learning_rate": 0.00012569225720395882, + "loss": 2.8226, + "step": 15968 + }, + { + "epoch": 0.6723789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.00012566298118281966, + "loss": 2.6992, + "step": 15969 + }, + { + "epoch": 0.6724210526315789, + "grad_norm": 0.4609375, + "learning_rate": 0.00012563370742695468, + "loss": 3.7462, + "step": 15970 + }, + { + "epoch": 0.6724631578947369, + "grad_norm": 0.439453125, + "learning_rate": 0.00012560443593689755, + "loss": 3.1766, + "step": 15971 + }, + { + "epoch": 0.6725052631578947, + "grad_norm": 0.42578125, + "learning_rate": 0.00012557516671318143, + "loss": 2.9605, + "step": 15972 + }, + { + "epoch": 0.6725473684210527, + "grad_norm": 0.419921875, + "learning_rate": 0.00012554589975633958, + "loss": 3.0569, + "step": 15973 + }, + { + "epoch": 0.6725894736842105, + "grad_norm": 0.44921875, + "learning_rate": 0.00012551663506690523, + "loss": 3.1149, + "step": 15974 + }, + { + "epoch": 0.6726315789473685, + "grad_norm": 0.4375, + "learning_rate": 0.00012548737264541138, + "loss": 2.9173, + "step": 15975 + }, + { + "epoch": 0.6726736842105263, + "grad_norm": 0.41796875, + "learning_rate": 0.00012545811249239152, + "loss": 2.5799, + "step": 15976 + }, + { + "epoch": 0.6727157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.00012542885460837836, + "loss": 3.2402, + "step": 15977 + }, + { + "epoch": 0.6727578947368421, + "grad_norm": 0.44921875, + "learning_rate": 0.00012539959899390525, + "loss": 3.3866, + "step": 15978 + }, + { + "epoch": 0.6728, + "grad_norm": 0.458984375, + "learning_rate": 0.0001253703456495049, + "loss": 2.9762, + "step": 15979 + }, + { + "epoch": 0.6728421052631579, + "grad_norm": 0.451171875, + "learning_rate": 0.00012534109457571047, + "loss": 2.9285, + "step": 15980 + }, + { + "epoch": 0.6728842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.00012531184577305488, + "loss": 3.2687, + "step": 15981 + }, + { + "epoch": 0.6729263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.0001252825992420708, + "loss": 3.0216, + "step": 15982 + }, + { + "epoch": 0.6729684210526315, + "grad_norm": 0.439453125, + "learning_rate": 0.00012525335498329146, + "loss": 3.246, + "step": 15983 + }, + { + "epoch": 0.6730105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.00012522411299724923, + "loss": 2.8129, + "step": 15984 + }, + { + "epoch": 0.6730526315789473, + "grad_norm": 0.42578125, + "learning_rate": 0.00012519487328447712, + "loss": 3.116, + "step": 15985 + }, + { + "epoch": 0.6730947368421053, + "grad_norm": 0.46484375, + "learning_rate": 0.0001251656358455078, + "loss": 3.4185, + "step": 15986 + }, + { + "epoch": 0.6731368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.00012513640068087393, + "loss": 3.5245, + "step": 15987 + }, + { + "epoch": 0.6731789473684211, + "grad_norm": 0.439453125, + "learning_rate": 0.0001251071677911081, + "loss": 3.0252, + "step": 15988 + }, + { + "epoch": 0.6732210526315789, + "grad_norm": 0.435546875, + "learning_rate": 0.00012507793717674294, + "loss": 3.0259, + "step": 15989 + }, + { + "epoch": 0.6732631578947369, + "grad_norm": 0.443359375, + "learning_rate": 0.00012504870883831102, + "loss": 3.4577, + "step": 15990 + }, + { + "epoch": 0.6733052631578947, + "grad_norm": 0.447265625, + "learning_rate": 0.00012501948277634468, + "loss": 3.2727, + "step": 15991 + }, + { + "epoch": 0.6733473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.00012499025899137664, + "loss": 3.0052, + "step": 15992 + }, + { + "epoch": 0.6733894736842105, + "grad_norm": 0.455078125, + "learning_rate": 0.0001249610374839392, + "loss": 3.4769, + "step": 15993 + }, + { + "epoch": 0.6734315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.00012493181825456472, + "loss": 3.4875, + "step": 15994 + }, + { + "epoch": 0.6734736842105263, + "grad_norm": 0.47265625, + "learning_rate": 0.00012490260130378555, + "loss": 3.2303, + "step": 15995 + }, + { + "epoch": 0.6735157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00012487338663213403, + "loss": 2.9853, + "step": 15996 + }, + { + "epoch": 0.6735578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00012484417424014237, + "loss": 3.518, + "step": 15997 + }, + { + "epoch": 0.6736, + "grad_norm": 0.421875, + "learning_rate": 0.00012481496412834273, + "loss": 3.184, + "step": 15998 + }, + { + "epoch": 0.6736421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00012478575629726744, + "loss": 3.5022, + "step": 15999 + }, + { + "epoch": 0.6736842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00012475655074744855, + "loss": 3.3783, + "step": 16000 + }, + { + "epoch": 0.6737263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00012472734747941816, + "loss": 3.4482, + "step": 16001 + }, + { + "epoch": 0.6737684210526316, + "grad_norm": 0.388671875, + "learning_rate": 0.0001246981464937083, + "loss": 2.9048, + "step": 16002 + }, + { + "epoch": 0.6738105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.00012466894779085096, + "loss": 2.8888, + "step": 16003 + }, + { + "epoch": 0.6738526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.00012463975137137819, + "loss": 2.5122, + "step": 16004 + }, + { + "epoch": 0.6738947368421052, + "grad_norm": 0.4296875, + "learning_rate": 0.00012461055723582182, + "loss": 3.2735, + "step": 16005 + }, + { + "epoch": 0.6739368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.0001245813653847137, + "loss": 3.1916, + "step": 16006 + }, + { + "epoch": 0.673978947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.0001245521758185858, + "loss": 3.0942, + "step": 16007 + }, + { + "epoch": 0.674021052631579, + "grad_norm": 0.4453125, + "learning_rate": 0.0001245229885379699, + "loss": 3.4631, + "step": 16008 + }, + { + "epoch": 0.6740631578947368, + "grad_norm": 0.4375, + "learning_rate": 0.00012449380354339772, + "loss": 2.7936, + "step": 16009 + }, + { + "epoch": 0.6741052631578948, + "grad_norm": 0.43359375, + "learning_rate": 0.00012446462083540093, + "loss": 2.7816, + "step": 16010 + }, + { + "epoch": 0.6741473684210526, + "grad_norm": 0.43359375, + "learning_rate": 0.00012443544041451128, + "loss": 3.2789, + "step": 16011 + }, + { + "epoch": 0.6741894736842106, + "grad_norm": 0.462890625, + "learning_rate": 0.00012440626228126038, + "loss": 3.3192, + "step": 16012 + }, + { + "epoch": 0.6742315789473684, + "grad_norm": 0.4609375, + "learning_rate": 0.0001243770864361797, + "loss": 3.0779, + "step": 16013 + }, + { + "epoch": 0.6742736842105264, + "grad_norm": 0.44140625, + "learning_rate": 0.00012434791287980102, + "loss": 3.1375, + "step": 16014 + }, + { + "epoch": 0.6743157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00012431874161265572, + "loss": 3.0256, + "step": 16015 + }, + { + "epoch": 0.6743578947368422, + "grad_norm": 0.416015625, + "learning_rate": 0.00012428957263527527, + "loss": 3.4919, + "step": 16016 + }, + { + "epoch": 0.6744, + "grad_norm": 0.44921875, + "learning_rate": 0.00012426040594819114, + "loss": 3.1525, + "step": 16017 + }, + { + "epoch": 0.6744421052631578, + "grad_norm": 0.421875, + "learning_rate": 0.00012423124155193463, + "loss": 2.9154, + "step": 16018 + }, + { + "epoch": 0.6744842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00012420207944703715, + "loss": 2.8667, + "step": 16019 + }, + { + "epoch": 0.6745263157894736, + "grad_norm": 0.431640625, + "learning_rate": 0.00012417291963402987, + "loss": 3.2322, + "step": 16020 + }, + { + "epoch": 0.6745684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00012414376211344435, + "loss": 3.0335, + "step": 16021 + }, + { + "epoch": 0.6746105263157894, + "grad_norm": 0.48046875, + "learning_rate": 0.00012411460688581142, + "loss": 2.7226, + "step": 16022 + }, + { + "epoch": 0.6746526315789474, + "grad_norm": 0.419921875, + "learning_rate": 0.00012408545395166254, + "loss": 2.7735, + "step": 16023 + }, + { + "epoch": 0.6746947368421052, + "grad_norm": 0.431640625, + "learning_rate": 0.00012405630331152875, + "loss": 3.1164, + "step": 16024 + }, + { + "epoch": 0.6747368421052632, + "grad_norm": 0.40234375, + "learning_rate": 0.00012402715496594114, + "loss": 3.0255, + "step": 16025 + }, + { + "epoch": 0.674778947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00012399800891543073, + "loss": 2.4151, + "step": 16026 + }, + { + "epoch": 0.674821052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00012396886516052847, + "loss": 3.5808, + "step": 16027 + }, + { + "epoch": 0.6748631578947368, + "grad_norm": 0.41796875, + "learning_rate": 0.00012393972370176558, + "loss": 2.8057, + "step": 16028 + }, + { + "epoch": 0.6749052631578948, + "grad_norm": 0.4140625, + "learning_rate": 0.00012391058453967266, + "loss": 2.943, + "step": 16029 + }, + { + "epoch": 0.6749473684210526, + "grad_norm": 0.447265625, + "learning_rate": 0.00012388144767478082, + "loss": 3.0527, + "step": 16030 + }, + { + "epoch": 0.6749894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.00012385231310762078, + "loss": 3.149, + "step": 16031 + }, + { + "epoch": 0.6750315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.00012382318083872343, + "loss": 3.2126, + "step": 16032 + }, + { + "epoch": 0.6750736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.00012379405086861947, + "loss": 3.2796, + "step": 16033 + }, + { + "epoch": 0.6751157894736842, + "grad_norm": 0.4453125, + "learning_rate": 0.0001237649231978395, + "loss": 3.1493, + "step": 16034 + }, + { + "epoch": 0.6751578947368421, + "grad_norm": 0.4609375, + "learning_rate": 0.00012373579782691449, + "loss": 2.94, + "step": 16035 + }, + { + "epoch": 0.6752, + "grad_norm": 0.431640625, + "learning_rate": 0.00012370667475637473, + "loss": 3.397, + "step": 16036 + }, + { + "epoch": 0.6752421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.00012367755398675116, + "loss": 3.0456, + "step": 16037 + }, + { + "epoch": 0.6752842105263158, + "grad_norm": 0.4765625, + "learning_rate": 0.00012364843551857392, + "loss": 3.1292, + "step": 16038 + }, + { + "epoch": 0.6753263157894737, + "grad_norm": 0.45703125, + "learning_rate": 0.0001236193193523738, + "loss": 2.8186, + "step": 16039 + }, + { + "epoch": 0.6753684210526316, + "grad_norm": 0.41796875, + "learning_rate": 0.00012359020548868125, + "loss": 2.7601, + "step": 16040 + }, + { + "epoch": 0.6754105263157895, + "grad_norm": 0.453125, + "learning_rate": 0.00012356109392802647, + "loss": 2.9628, + "step": 16041 + }, + { + "epoch": 0.6754526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.00012353198467094024, + "loss": 2.6467, + "step": 16042 + }, + { + "epoch": 0.6754947368421053, + "grad_norm": 0.44921875, + "learning_rate": 0.00012350287771795246, + "loss": 3.7813, + "step": 16043 + }, + { + "epoch": 0.6755368421052632, + "grad_norm": 0.439453125, + "learning_rate": 0.00012347377306959377, + "loss": 3.2144, + "step": 16044 + }, + { + "epoch": 0.6755789473684211, + "grad_norm": 0.43359375, + "learning_rate": 0.0001234446707263941, + "loss": 3.1624, + "step": 16045 + }, + { + "epoch": 0.6756210526315789, + "grad_norm": 0.427734375, + "learning_rate": 0.00012341557068888396, + "loss": 2.9355, + "step": 16046 + }, + { + "epoch": 0.6756631578947369, + "grad_norm": 0.416015625, + "learning_rate": 0.0001233864729575934, + "loss": 3.156, + "step": 16047 + }, + { + "epoch": 0.6757052631578947, + "grad_norm": 0.453125, + "learning_rate": 0.00012335737753305247, + "loss": 3.7806, + "step": 16048 + }, + { + "epoch": 0.6757473684210527, + "grad_norm": 0.4375, + "learning_rate": 0.0001233282844157914, + "loss": 3.3783, + "step": 16049 + }, + { + "epoch": 0.6757894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.00012329919360634, + "loss": 2.9129, + "step": 16050 + }, + { + "epoch": 0.6758315789473684, + "grad_norm": 0.40234375, + "learning_rate": 0.00012327010510522862, + "loss": 2.5533, + "step": 16051 + }, + { + "epoch": 0.6758736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00012324101891298685, + "loss": 2.9665, + "step": 16052 + }, + { + "epoch": 0.6759157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0001232119350301449, + "loss": 2.595, + "step": 16053 + }, + { + "epoch": 0.6759578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00012318285345723249, + "loss": 2.9708, + "step": 16054 + }, + { + "epoch": 0.676, + "grad_norm": 0.421875, + "learning_rate": 0.0001231537741947795, + "loss": 3.4914, + "step": 16055 + }, + { + "epoch": 0.6760421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0001231246972433157, + "loss": 3.0139, + "step": 16056 + }, + { + "epoch": 0.6760842105263158, + "grad_norm": 0.48046875, + "learning_rate": 0.00012309562260337073, + "loss": 3.2564, + "step": 16057 + }, + { + "epoch": 0.6761263157894737, + "grad_norm": 0.41796875, + "learning_rate": 0.00012306655027547465, + "loss": 3.0608, + "step": 16058 + }, + { + "epoch": 0.6761684210526315, + "grad_norm": 0.455078125, + "learning_rate": 0.00012303748026015664, + "loss": 2.9952, + "step": 16059 + }, + { + "epoch": 0.6762105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.00012300841255794668, + "loss": 3.4193, + "step": 16060 + }, + { + "epoch": 0.6762526315789473, + "grad_norm": 0.416015625, + "learning_rate": 0.00012297934716937422, + "loss": 3.327, + "step": 16061 + }, + { + "epoch": 0.6762947368421053, + "grad_norm": 0.419921875, + "learning_rate": 0.00012295028409496886, + "loss": 2.9947, + "step": 16062 + }, + { + "epoch": 0.6763368421052631, + "grad_norm": 0.416015625, + "learning_rate": 0.00012292122333526002, + "loss": 3.1509, + "step": 16063 + }, + { + "epoch": 0.6763789473684211, + "grad_norm": 0.431640625, + "learning_rate": 0.00012289216489077715, + "loss": 3.1413, + "step": 16064 + }, + { + "epoch": 0.6764210526315789, + "grad_norm": 0.3984375, + "learning_rate": 0.00012286310876204973, + "loss": 3.0548, + "step": 16065 + }, + { + "epoch": 0.6764631578947369, + "grad_norm": 0.43359375, + "learning_rate": 0.00012283405494960698, + "loss": 3.6163, + "step": 16066 + }, + { + "epoch": 0.6765052631578947, + "grad_norm": 0.4609375, + "learning_rate": 0.0001228050034539784, + "loss": 2.5047, + "step": 16067 + }, + { + "epoch": 0.6765473684210527, + "grad_norm": 0.45703125, + "learning_rate": 0.00012277595427569323, + "loss": 2.901, + "step": 16068 + }, + { + "epoch": 0.6765894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.0001227469074152807, + "loss": 3.1595, + "step": 16069 + }, + { + "epoch": 0.6766315789473685, + "grad_norm": 0.416015625, + "learning_rate": 0.00012271786287327, + "loss": 3.2306, + "step": 16070 + }, + { + "epoch": 0.6766736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.0001226888206501903, + "loss": 3.2328, + "step": 16071 + }, + { + "epoch": 0.6767157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.0001226597807465707, + "loss": 3.0643, + "step": 16072 + }, + { + "epoch": 0.6767578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.00012263074316294016, + "loss": 3.0075, + "step": 16073 + }, + { + "epoch": 0.6768, + "grad_norm": 0.408203125, + "learning_rate": 0.00012260170789982794, + "loss": 2.8612, + "step": 16074 + }, + { + "epoch": 0.6768421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0001225726749577629, + "loss": 3.2311, + "step": 16075 + }, + { + "epoch": 0.6768842105263158, + "grad_norm": 0.4765625, + "learning_rate": 0.00012254364433727402, + "loss": 2.9804, + "step": 16076 + }, + { + "epoch": 0.6769263157894737, + "grad_norm": 0.47265625, + "learning_rate": 0.00012251461603889017, + "loss": 3.4231, + "step": 16077 + }, + { + "epoch": 0.6769684210526316, + "grad_norm": 0.40625, + "learning_rate": 0.00012248559006314023, + "loss": 2.9016, + "step": 16078 + }, + { + "epoch": 0.6770105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.00012245656641055302, + "loss": 3.2588, + "step": 16079 + }, + { + "epoch": 0.6770526315789474, + "grad_norm": 0.48046875, + "learning_rate": 0.00012242754508165729, + "loss": 3.2213, + "step": 16080 + }, + { + "epoch": 0.6770947368421053, + "grad_norm": 0.404296875, + "learning_rate": 0.00012239852607698174, + "loss": 3.1591, + "step": 16081 + }, + { + "epoch": 0.6771368421052631, + "grad_norm": 0.4140625, + "learning_rate": 0.00012236950939705518, + "loss": 2.8133, + "step": 16082 + }, + { + "epoch": 0.6771789473684211, + "grad_norm": 0.419921875, + "learning_rate": 0.0001223404950424062, + "loss": 3.2476, + "step": 16083 + }, + { + "epoch": 0.677221052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00012231148301356345, + "loss": 3.0044, + "step": 16084 + }, + { + "epoch": 0.6772631578947368, + "grad_norm": 0.4375, + "learning_rate": 0.00012228247331105542, + "loss": 3.0845, + "step": 16085 + }, + { + "epoch": 0.6773052631578947, + "grad_norm": 0.3984375, + "learning_rate": 0.00012225346593541057, + "loss": 3.3282, + "step": 16086 + }, + { + "epoch": 0.6773473684210526, + "grad_norm": 0.4375, + "learning_rate": 0.00012222446088715766, + "loss": 3.0728, + "step": 16087 + }, + { + "epoch": 0.6773894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.00012219545816682476, + "loss": 2.9561, + "step": 16088 + }, + { + "epoch": 0.6774315789473684, + "grad_norm": 0.482421875, + "learning_rate": 0.00012216645777494056, + "loss": 3.4179, + "step": 16089 + }, + { + "epoch": 0.6774736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00012213745971203328, + "loss": 2.75, + "step": 16090 + }, + { + "epoch": 0.6775157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00012210846397863126, + "loss": 3.3597, + "step": 16091 + }, + { + "epoch": 0.6775578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00012207947057526277, + "loss": 2.6293, + "step": 16092 + }, + { + "epoch": 0.6776, + "grad_norm": 0.431640625, + "learning_rate": 0.0001220504795024559, + "loss": 3.2396, + "step": 16093 + }, + { + "epoch": 0.6776421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.00012202149076073915, + "loss": 2.7772, + "step": 16094 + }, + { + "epoch": 0.6776842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.00012199250435064029, + "loss": 2.8994, + "step": 16095 + }, + { + "epoch": 0.6777263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.00012196352027268782, + "loss": 3.221, + "step": 16096 + }, + { + "epoch": 0.6777684210526316, + "grad_norm": 0.44140625, + "learning_rate": 0.00012193453852740935, + "loss": 3.1589, + "step": 16097 + }, + { + "epoch": 0.6778105263157894, + "grad_norm": 0.400390625, + "learning_rate": 0.00012190555911533322, + "loss": 2.5522, + "step": 16098 + }, + { + "epoch": 0.6778526315789474, + "grad_norm": 0.458984375, + "learning_rate": 0.00012187658203698729, + "loss": 3.1229, + "step": 16099 + }, + { + "epoch": 0.6778947368421052, + "grad_norm": 0.439453125, + "learning_rate": 0.0001218476072928994, + "loss": 3.0717, + "step": 16100 + }, + { + "epoch": 0.6779368421052632, + "grad_norm": 0.42578125, + "learning_rate": 0.00012181863488359774, + "loss": 3.14, + "step": 16101 + }, + { + "epoch": 0.677978947368421, + "grad_norm": 0.453125, + "learning_rate": 0.00012178966480960973, + "loss": 3.1837, + "step": 16102 + }, + { + "epoch": 0.678021052631579, + "grad_norm": 0.45703125, + "learning_rate": 0.00012176069707146362, + "loss": 3.235, + "step": 16103 + }, + { + "epoch": 0.6780631578947368, + "grad_norm": 0.421875, + "learning_rate": 0.0001217317316696867, + "loss": 3.1908, + "step": 16104 + }, + { + "epoch": 0.6781052631578948, + "grad_norm": 0.41796875, + "learning_rate": 0.00012170276860480703, + "loss": 3.2871, + "step": 16105 + }, + { + "epoch": 0.6781473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00012167380787735219, + "loss": 3.1537, + "step": 16106 + }, + { + "epoch": 0.6781894736842106, + "grad_norm": 0.46484375, + "learning_rate": 0.00012164484948784976, + "loss": 3.4178, + "step": 16107 + }, + { + "epoch": 0.6782315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00012161589343682739, + "loss": 3.1416, + "step": 16108 + }, + { + "epoch": 0.6782736842105264, + "grad_norm": 0.421875, + "learning_rate": 0.00012158693972481249, + "loss": 2.8099, + "step": 16109 + }, + { + "epoch": 0.6783157894736842, + "grad_norm": 0.44921875, + "learning_rate": 0.00012155798835233287, + "loss": 3.1204, + "step": 16110 + }, + { + "epoch": 0.6783578947368422, + "grad_norm": 0.427734375, + "learning_rate": 0.00012152903931991558, + "loss": 3.3064, + "step": 16111 + }, + { + "epoch": 0.6784, + "grad_norm": 0.423828125, + "learning_rate": 0.00012150009262808845, + "loss": 3.0845, + "step": 16112 + }, + { + "epoch": 0.6784421052631578, + "grad_norm": 0.44140625, + "learning_rate": 0.00012147114827737846, + "loss": 3.2118, + "step": 16113 + }, + { + "epoch": 0.6784842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.00012144220626831323, + "loss": 3.3953, + "step": 16114 + }, + { + "epoch": 0.6785263157894736, + "grad_norm": 0.392578125, + "learning_rate": 0.00012141326660141993, + "loss": 2.2515, + "step": 16115 + }, + { + "epoch": 0.6785684210526316, + "grad_norm": 0.408203125, + "learning_rate": 0.00012138432927722575, + "loss": 3.3975, + "step": 16116 + }, + { + "epoch": 0.6786105263157894, + "grad_norm": 0.453125, + "learning_rate": 0.0001213553942962582, + "loss": 2.9525, + "step": 16117 + }, + { + "epoch": 0.6786526315789474, + "grad_norm": 0.65625, + "learning_rate": 0.00012132646165904396, + "loss": 3.1446, + "step": 16118 + }, + { + "epoch": 0.6786947368421052, + "grad_norm": 0.427734375, + "learning_rate": 0.00012129753136611063, + "loss": 2.9322, + "step": 16119 + }, + { + "epoch": 0.6787368421052632, + "grad_norm": 0.423828125, + "learning_rate": 0.00012126860341798487, + "loss": 3.4651, + "step": 16120 + }, + { + "epoch": 0.678778947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00012123967781519398, + "loss": 3.1704, + "step": 16121 + }, + { + "epoch": 0.678821052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.00012121075455826488, + "loss": 3.4154, + "step": 16122 + }, + { + "epoch": 0.6788631578947368, + "grad_norm": 0.43359375, + "learning_rate": 0.00012118183364772453, + "loss": 2.9112, + "step": 16123 + }, + { + "epoch": 0.6789052631578948, + "grad_norm": 0.421875, + "learning_rate": 0.00012115291508409978, + "loss": 3.4438, + "step": 16124 + }, + { + "epoch": 0.6789473684210526, + "grad_norm": 0.44140625, + "learning_rate": 0.00012112399886791747, + "loss": 3.5528, + "step": 16125 + }, + { + "epoch": 0.6789894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.00012109508499970464, + "loss": 3.0166, + "step": 16126 + }, + { + "epoch": 0.6790315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00012106617347998775, + "loss": 2.7317, + "step": 16127 + }, + { + "epoch": 0.6790736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.00012103726430929374, + "loss": 3.0632, + "step": 16128 + }, + { + "epoch": 0.6791157894736842, + "grad_norm": 0.43359375, + "learning_rate": 0.00012100835748814929, + "loss": 2.9947, + "step": 16129 + }, + { + "epoch": 0.6791578947368421, + "grad_norm": 0.46484375, + "learning_rate": 0.00012097945301708097, + "loss": 3.4151, + "step": 16130 + }, + { + "epoch": 0.6792, + "grad_norm": 0.421875, + "learning_rate": 0.00012095055089661545, + "loss": 2.9139, + "step": 16131 + }, + { + "epoch": 0.6792421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.00012092165112727916, + "loss": 2.727, + "step": 16132 + }, + { + "epoch": 0.6792842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.00012089275370959882, + "loss": 3.3195, + "step": 16133 + }, + { + "epoch": 0.6793263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00012086385864410079, + "loss": 3.408, + "step": 16134 + }, + { + "epoch": 0.6793684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.00012083496593131154, + "loss": 3.1718, + "step": 16135 + }, + { + "epoch": 0.6794105263157895, + "grad_norm": 0.453125, + "learning_rate": 0.00012080607557175749, + "loss": 3.0446, + "step": 16136 + }, + { + "epoch": 0.6794526315789474, + "grad_norm": 0.4453125, + "learning_rate": 0.0001207771875659649, + "loss": 3.4796, + "step": 16137 + }, + { + "epoch": 0.6794947368421053, + "grad_norm": 0.41015625, + "learning_rate": 0.00012074830191446012, + "loss": 3.3173, + "step": 16138 + }, + { + "epoch": 0.6795368421052631, + "grad_norm": 0.44140625, + "learning_rate": 0.00012071941861776944, + "loss": 2.4586, + "step": 16139 + }, + { + "epoch": 0.6795789473684211, + "grad_norm": 0.44921875, + "learning_rate": 0.00012069053767641897, + "loss": 3.4476, + "step": 16140 + }, + { + "epoch": 0.6796210526315789, + "grad_norm": 0.44140625, + "learning_rate": 0.00012066165909093504, + "loss": 3.1829, + "step": 16141 + }, + { + "epoch": 0.6796631578947369, + "grad_norm": 0.435546875, + "learning_rate": 0.0001206327828618437, + "loss": 2.7153, + "step": 16142 + }, + { + "epoch": 0.6797052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.00012060390898967107, + "loss": 3.0918, + "step": 16143 + }, + { + "epoch": 0.6797473684210527, + "grad_norm": 0.43359375, + "learning_rate": 0.00012057503747494322, + "loss": 2.9281, + "step": 16144 + }, + { + "epoch": 0.6797894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.00012054616831818607, + "loss": 3.0911, + "step": 16145 + }, + { + "epoch": 0.6798315789473685, + "grad_norm": 0.4296875, + "learning_rate": 0.00012051730151992566, + "loss": 3.4514, + "step": 16146 + }, + { + "epoch": 0.6798736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.0001204884370806878, + "loss": 3.1015, + "step": 16147 + }, + { + "epoch": 0.6799157894736843, + "grad_norm": 0.4140625, + "learning_rate": 0.0001204595750009986, + "loss": 2.8403, + "step": 16148 + }, + { + "epoch": 0.6799578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00012043071528138355, + "loss": 3.4872, + "step": 16149 + }, + { + "epoch": 0.68, + "grad_norm": 0.447265625, + "learning_rate": 0.00012040185792236874, + "loss": 2.7525, + "step": 16150 + }, + { + "epoch": 0.6800421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00012037300292447981, + "loss": 3.2922, + "step": 16151 + }, + { + "epoch": 0.6800842105263157, + "grad_norm": 0.4296875, + "learning_rate": 0.00012034415028824247, + "loss": 3.3392, + "step": 16152 + }, + { + "epoch": 0.6801263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00012031530001418234, + "loss": 3.3113, + "step": 16153 + }, + { + "epoch": 0.6801684210526315, + "grad_norm": 0.44921875, + "learning_rate": 0.00012028645210282502, + "loss": 3.2078, + "step": 16154 + }, + { + "epoch": 0.6802105263157895, + "grad_norm": 0.423828125, + "learning_rate": 0.00012025760655469628, + "loss": 2.7872, + "step": 16155 + }, + { + "epoch": 0.6802526315789473, + "grad_norm": 0.4609375, + "learning_rate": 0.00012022876337032134, + "loss": 2.7714, + "step": 16156 + }, + { + "epoch": 0.6802947368421053, + "grad_norm": 0.400390625, + "learning_rate": 0.00012019992255022597, + "loss": 3.0251, + "step": 16157 + }, + { + "epoch": 0.6803368421052631, + "grad_norm": 0.43359375, + "learning_rate": 0.00012017108409493547, + "loss": 3.585, + "step": 16158 + }, + { + "epoch": 0.6803789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.00012014224800497528, + "loss": 3.2309, + "step": 16159 + }, + { + "epoch": 0.6804210526315789, + "grad_norm": 0.40625, + "learning_rate": 0.00012011341428087078, + "loss": 2.8453, + "step": 16160 + }, + { + "epoch": 0.6804631578947369, + "grad_norm": 0.443359375, + "learning_rate": 0.00012008458292314714, + "loss": 2.9876, + "step": 16161 + }, + { + "epoch": 0.6805052631578947, + "grad_norm": 0.427734375, + "learning_rate": 0.00012005575393232995, + "loss": 2.9682, + "step": 16162 + }, + { + "epoch": 0.6805473684210527, + "grad_norm": 0.4453125, + "learning_rate": 0.00012002692730894405, + "loss": 3.472, + "step": 16163 + }, + { + "epoch": 0.6805894736842105, + "grad_norm": 0.474609375, + "learning_rate": 0.00011999810305351502, + "loss": 3.3375, + "step": 16164 + }, + { + "epoch": 0.6806315789473685, + "grad_norm": 0.427734375, + "learning_rate": 0.00011996928116656761, + "loss": 3.1445, + "step": 16165 + }, + { + "epoch": 0.6806736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.00011994046164862724, + "loss": 3.2009, + "step": 16166 + }, + { + "epoch": 0.6807157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.0001199116445002188, + "loss": 3.0228, + "step": 16167 + }, + { + "epoch": 0.6807578947368421, + "grad_norm": 0.4921875, + "learning_rate": 0.00011988282972186729, + "loss": 3.1926, + "step": 16168 + }, + { + "epoch": 0.6808, + "grad_norm": 0.427734375, + "learning_rate": 0.00011985401731409792, + "loss": 3.2791, + "step": 16169 + }, + { + "epoch": 0.6808421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00011982520727743523, + "loss": 3.0767, + "step": 16170 + }, + { + "epoch": 0.6808842105263158, + "grad_norm": 0.416015625, + "learning_rate": 0.00011979639961240454, + "loss": 3.163, + "step": 16171 + }, + { + "epoch": 0.6809263157894737, + "grad_norm": 0.44921875, + "learning_rate": 0.00011976759431953027, + "loss": 2.5058, + "step": 16172 + }, + { + "epoch": 0.6809684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.0001197387913993375, + "loss": 2.7328, + "step": 16173 + }, + { + "epoch": 0.6810105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.00011970999085235088, + "loss": 2.9342, + "step": 16174 + }, + { + "epoch": 0.6810526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.00011968119267909517, + "loss": 2.9526, + "step": 16175 + }, + { + "epoch": 0.6810947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.00011965239688009499, + "loss": 3.4862, + "step": 16176 + }, + { + "epoch": 0.6811368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.00011962360345587491, + "loss": 2.9894, + "step": 16177 + }, + { + "epoch": 0.6811789473684211, + "grad_norm": 0.447265625, + "learning_rate": 0.00011959481240695977, + "loss": 3.0826, + "step": 16178 + }, + { + "epoch": 0.681221052631579, + "grad_norm": 0.38671875, + "learning_rate": 0.00011956602373387377, + "loss": 2.5881, + "step": 16179 + }, + { + "epoch": 0.6812631578947368, + "grad_norm": 0.443359375, + "learning_rate": 0.0001195372374371417, + "loss": 3.0944, + "step": 16180 + }, + { + "epoch": 0.6813052631578947, + "grad_norm": 0.412109375, + "learning_rate": 0.00011950845351728772, + "loss": 3.0159, + "step": 16181 + }, + { + "epoch": 0.6813473684210526, + "grad_norm": 0.43359375, + "learning_rate": 0.00011947967197483648, + "loss": 2.3523, + "step": 16182 + }, + { + "epoch": 0.6813894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.00011945089281031227, + "loss": 3.0046, + "step": 16183 + }, + { + "epoch": 0.6814315789473684, + "grad_norm": 0.416015625, + "learning_rate": 0.0001194221160242393, + "loss": 3.0739, + "step": 16184 + }, + { + "epoch": 0.6814736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00011939334161714215, + "loss": 3.2549, + "step": 16185 + }, + { + "epoch": 0.6815157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00011936456958954467, + "loss": 3.1204, + "step": 16186 + }, + { + "epoch": 0.6815578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00011933579994197133, + "loss": 3.1981, + "step": 16187 + }, + { + "epoch": 0.6816, + "grad_norm": 0.44921875, + "learning_rate": 0.00011930703267494619, + "loss": 3.4245, + "step": 16188 + }, + { + "epoch": 0.6816421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00011927826778899336, + "loss": 3.1462, + "step": 16189 + }, + { + "epoch": 0.6816842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.00011924950528463691, + "loss": 2.4574, + "step": 16190 + }, + { + "epoch": 0.6817263157894737, + "grad_norm": 0.453125, + "learning_rate": 0.00011922074516240083, + "loss": 2.5084, + "step": 16191 + }, + { + "epoch": 0.6817684210526316, + "grad_norm": 0.44140625, + "learning_rate": 0.00011919198742280912, + "loss": 2.8044, + "step": 16192 + }, + { + "epoch": 0.6818105263157894, + "grad_norm": 0.41796875, + "learning_rate": 0.0001191632320663856, + "loss": 3.0638, + "step": 16193 + }, + { + "epoch": 0.6818526315789474, + "grad_norm": 0.435546875, + "learning_rate": 0.00011913447909365438, + "loss": 3.1425, + "step": 16194 + }, + { + "epoch": 0.6818947368421052, + "grad_norm": 0.412109375, + "learning_rate": 0.00011910572850513918, + "loss": 2.9236, + "step": 16195 + }, + { + "epoch": 0.6819368421052632, + "grad_norm": 0.44140625, + "learning_rate": 0.00011907698030136383, + "loss": 2.8871, + "step": 16196 + }, + { + "epoch": 0.681978947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00011904823448285204, + "loss": 2.871, + "step": 16197 + }, + { + "epoch": 0.682021052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00011901949105012758, + "loss": 2.983, + "step": 16198 + }, + { + "epoch": 0.6820631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.00011899075000371409, + "loss": 3.4097, + "step": 16199 + }, + { + "epoch": 0.6821052631578948, + "grad_norm": 0.427734375, + "learning_rate": 0.00011896201134413512, + "loss": 3.0942, + "step": 16200 + }, + { + "epoch": 0.6821473684210526, + "grad_norm": 0.458984375, + "learning_rate": 0.0001189332750719144, + "loss": 2.9146, + "step": 16201 + }, + { + "epoch": 0.6821894736842106, + "grad_norm": 0.43359375, + "learning_rate": 0.00011890454118757544, + "loss": 3.0275, + "step": 16202 + }, + { + "epoch": 0.6822315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.00011887580969164171, + "loss": 3.6323, + "step": 16203 + }, + { + "epoch": 0.6822736842105264, + "grad_norm": 0.4375, + "learning_rate": 0.00011884708058463667, + "loss": 3.4811, + "step": 16204 + }, + { + "epoch": 0.6823157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0001188183538670837, + "loss": 2.984, + "step": 16205 + }, + { + "epoch": 0.682357894736842, + "grad_norm": 0.453125, + "learning_rate": 0.0001187896295395062, + "loss": 3.2407, + "step": 16206 + }, + { + "epoch": 0.6824, + "grad_norm": 0.4140625, + "learning_rate": 0.00011876090760242747, + "loss": 2.773, + "step": 16207 + }, + { + "epoch": 0.6824421052631578, + "grad_norm": 0.421875, + "learning_rate": 0.00011873218805637073, + "loss": 3.0291, + "step": 16208 + }, + { + "epoch": 0.6824842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00011870347090185937, + "loss": 3.1989, + "step": 16209 + }, + { + "epoch": 0.6825263157894736, + "grad_norm": 0.4453125, + "learning_rate": 0.00011867475613941652, + "loss": 3.469, + "step": 16210 + }, + { + "epoch": 0.6825684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00011864604376956531, + "loss": 3.3692, + "step": 16211 + }, + { + "epoch": 0.6826105263157894, + "grad_norm": 0.458984375, + "learning_rate": 0.00011861733379282885, + "loss": 3.0437, + "step": 16212 + }, + { + "epoch": 0.6826526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.00011858862620973019, + "loss": 3.2612, + "step": 16213 + }, + { + "epoch": 0.6826947368421052, + "grad_norm": 0.408203125, + "learning_rate": 0.00011855992102079235, + "loss": 3.2095, + "step": 16214 + }, + { + "epoch": 0.6827368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.00011853121822653823, + "loss": 3.2219, + "step": 16215 + }, + { + "epoch": 0.682778947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.00011850251782749094, + "loss": 2.9198, + "step": 16216 + }, + { + "epoch": 0.682821052631579, + "grad_norm": 0.408203125, + "learning_rate": 0.00011847381982417327, + "loss": 3.3388, + "step": 16217 + }, + { + "epoch": 0.6828631578947368, + "grad_norm": 0.427734375, + "learning_rate": 0.00011844512421710807, + "loss": 2.7277, + "step": 16218 + }, + { + "epoch": 0.6829052631578947, + "grad_norm": 0.44921875, + "learning_rate": 0.00011841643100681813, + "loss": 3.1026, + "step": 16219 + }, + { + "epoch": 0.6829473684210526, + "grad_norm": 0.439453125, + "learning_rate": 0.00011838774019382623, + "loss": 2.8913, + "step": 16220 + }, + { + "epoch": 0.6829894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00011835905177865505, + "loss": 3.2164, + "step": 16221 + }, + { + "epoch": 0.6830315789473684, + "grad_norm": 0.41015625, + "learning_rate": 0.00011833036576182718, + "loss": 2.4817, + "step": 16222 + }, + { + "epoch": 0.6830736842105263, + "grad_norm": 0.455078125, + "learning_rate": 0.00011830168214386553, + "loss": 2.9741, + "step": 16223 + }, + { + "epoch": 0.6831157894736842, + "grad_norm": 0.44140625, + "learning_rate": 0.00011827300092529231, + "loss": 3.0859, + "step": 16224 + }, + { + "epoch": 0.6831578947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.0001182443221066303, + "loss": 3.2365, + "step": 16225 + }, + { + "epoch": 0.6832, + "grad_norm": 0.4375, + "learning_rate": 0.00011821564568840199, + "loss": 3.3139, + "step": 16226 + }, + { + "epoch": 0.6832421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00011818697167112974, + "loss": 2.9414, + "step": 16227 + }, + { + "epoch": 0.6832842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.00011815830005533598, + "loss": 3.3312, + "step": 16228 + }, + { + "epoch": 0.6833263157894737, + "grad_norm": 0.427734375, + "learning_rate": 0.00011812963084154302, + "loss": 2.5778, + "step": 16229 + }, + { + "epoch": 0.6833684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.00011810096403027343, + "loss": 2.9523, + "step": 16230 + }, + { + "epoch": 0.6834105263157895, + "grad_norm": 0.490234375, + "learning_rate": 0.0001180722996220491, + "loss": 3.2114, + "step": 16231 + }, + { + "epoch": 0.6834526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.00011804363761739257, + "loss": 2.6708, + "step": 16232 + }, + { + "epoch": 0.6834947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.00011801497801682593, + "loss": 3.287, + "step": 16233 + }, + { + "epoch": 0.6835368421052631, + "grad_norm": 0.427734375, + "learning_rate": 0.0001179863208208713, + "loss": 2.8955, + "step": 16234 + }, + { + "epoch": 0.6835789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.00011795766603005078, + "loss": 3.1961, + "step": 16235 + }, + { + "epoch": 0.6836210526315789, + "grad_norm": 0.431640625, + "learning_rate": 0.00011792901364488637, + "loss": 3.46, + "step": 16236 + }, + { + "epoch": 0.6836631578947369, + "grad_norm": 0.419921875, + "learning_rate": 0.00011790036366590032, + "loss": 3.2679, + "step": 16237 + }, + { + "epoch": 0.6837052631578947, + "grad_norm": 0.45703125, + "learning_rate": 0.00011787171609361427, + "loss": 3.0178, + "step": 16238 + }, + { + "epoch": 0.6837473684210527, + "grad_norm": 0.43359375, + "learning_rate": 0.00011784307092855048, + "loss": 2.7105, + "step": 16239 + }, + { + "epoch": 0.6837894736842105, + "grad_norm": 0.45703125, + "learning_rate": 0.00011781442817123048, + "loss": 3.0033, + "step": 16240 + }, + { + "epoch": 0.6838315789473685, + "grad_norm": 0.46875, + "learning_rate": 0.00011778578782217636, + "loss": 2.8953, + "step": 16241 + }, + { + "epoch": 0.6838736842105263, + "grad_norm": 0.451171875, + "learning_rate": 0.00011775714988190985, + "loss": 3.3307, + "step": 16242 + }, + { + "epoch": 0.6839157894736843, + "grad_norm": 0.447265625, + "learning_rate": 0.00011772851435095261, + "loss": 2.8446, + "step": 16243 + }, + { + "epoch": 0.6839578947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00011769988122982662, + "loss": 3.152, + "step": 16244 + }, + { + "epoch": 0.684, + "grad_norm": 0.421875, + "learning_rate": 0.00011767125051905314, + "loss": 2.8681, + "step": 16245 + }, + { + "epoch": 0.6840421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.0001176426222191542, + "loss": 2.762, + "step": 16246 + }, + { + "epoch": 0.6840842105263157, + "grad_norm": 0.421875, + "learning_rate": 0.00011761399633065098, + "loss": 3.3489, + "step": 16247 + }, + { + "epoch": 0.6841263157894737, + "grad_norm": 0.400390625, + "learning_rate": 0.00011758537285406528, + "loss": 2.9021, + "step": 16248 + }, + { + "epoch": 0.6841684210526315, + "grad_norm": 0.427734375, + "learning_rate": 0.00011755675178991851, + "loss": 3.4785, + "step": 16249 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.00011752813313873211, + "loss": 2.7558, + "step": 16250 + }, + { + "epoch": 0.6842526315789473, + "grad_norm": 0.44921875, + "learning_rate": 0.00011749951690102748, + "loss": 3.3634, + "step": 16251 + }, + { + "epoch": 0.6842947368421053, + "grad_norm": 0.439453125, + "learning_rate": 0.00011747090307732589, + "loss": 3.4248, + "step": 16252 + }, + { + "epoch": 0.6843368421052631, + "grad_norm": 0.412109375, + "learning_rate": 0.00011744229166814888, + "loss": 3.1195, + "step": 16253 + }, + { + "epoch": 0.6843789473684211, + "grad_norm": 0.435546875, + "learning_rate": 0.00011741368267401742, + "loss": 3.1863, + "step": 16254 + }, + { + "epoch": 0.6844210526315789, + "grad_norm": 0.427734375, + "learning_rate": 0.00011738507609545293, + "loss": 3.2127, + "step": 16255 + }, + { + "epoch": 0.6844631578947369, + "grad_norm": 0.44140625, + "learning_rate": 0.00011735647193297658, + "loss": 3.1887, + "step": 16256 + }, + { + "epoch": 0.6845052631578947, + "grad_norm": 0.40234375, + "learning_rate": 0.00011732787018710945, + "loss": 2.7824, + "step": 16257 + }, + { + "epoch": 0.6845473684210527, + "grad_norm": 0.45703125, + "learning_rate": 0.00011729927085837264, + "loss": 3.0537, + "step": 16258 + }, + { + "epoch": 0.6845894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00011727067394728708, + "loss": 3.2699, + "step": 16259 + }, + { + "epoch": 0.6846315789473684, + "grad_norm": 0.396484375, + "learning_rate": 0.0001172420794543741, + "loss": 2.9023, + "step": 16260 + }, + { + "epoch": 0.6846736842105263, + "grad_norm": 0.455078125, + "learning_rate": 0.00011721348738015425, + "loss": 2.8951, + "step": 16261 + }, + { + "epoch": 0.6847157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.00011718489772514876, + "loss": 2.9293, + "step": 16262 + }, + { + "epoch": 0.6847578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.00011715631048987834, + "loss": 3.0959, + "step": 16263 + }, + { + "epoch": 0.6848, + "grad_norm": 0.427734375, + "learning_rate": 0.0001171277256748639, + "loss": 3.0854, + "step": 16264 + }, + { + "epoch": 0.6848421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00011709914328062616, + "loss": 3.3589, + "step": 16265 + }, + { + "epoch": 0.6848842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.0001170705633076859, + "loss": 3.5302, + "step": 16266 + }, + { + "epoch": 0.6849263157894737, + "grad_norm": 0.44921875, + "learning_rate": 0.00011704198575656374, + "loss": 3.1648, + "step": 16267 + }, + { + "epoch": 0.6849684210526316, + "grad_norm": 0.45703125, + "learning_rate": 0.00011701341062778034, + "loss": 3.3053, + "step": 16268 + }, + { + "epoch": 0.6850105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.0001169848379218564, + "loss": 2.9369, + "step": 16269 + }, + { + "epoch": 0.6850526315789474, + "grad_norm": 0.431640625, + "learning_rate": 0.00011695626763931247, + "loss": 3.2693, + "step": 16270 + }, + { + "epoch": 0.6850947368421053, + "grad_norm": 0.416015625, + "learning_rate": 0.00011692769978066897, + "loss": 3.2811, + "step": 16271 + }, + { + "epoch": 0.6851368421052632, + "grad_norm": 0.408203125, + "learning_rate": 0.00011689913434644648, + "loss": 2.9411, + "step": 16272 + }, + { + "epoch": 0.685178947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00011687057133716533, + "loss": 2.8029, + "step": 16273 + }, + { + "epoch": 0.685221052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00011684201075334597, + "loss": 2.9066, + "step": 16274 + }, + { + "epoch": 0.6852631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.00011681345259550863, + "loss": 3.1444, + "step": 16275 + }, + { + "epoch": 0.6853052631578948, + "grad_norm": 0.439453125, + "learning_rate": 0.00011678489686417379, + "loss": 3.1446, + "step": 16276 + }, + { + "epoch": 0.6853473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.0001167563435598616, + "loss": 3.2857, + "step": 16277 + }, + { + "epoch": 0.6853894736842105, + "grad_norm": 0.41015625, + "learning_rate": 0.00011672779268309227, + "loss": 3.0498, + "step": 16278 + }, + { + "epoch": 0.6854315789473684, + "grad_norm": 0.408203125, + "learning_rate": 0.00011669924423438599, + "loss": 2.7376, + "step": 16279 + }, + { + "epoch": 0.6854736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00011667069821426285, + "loss": 3.0223, + "step": 16280 + }, + { + "epoch": 0.6855157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.00011664215462324293, + "loss": 3.3344, + "step": 16281 + }, + { + "epoch": 0.6855578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00011661361346184627, + "loss": 2.9819, + "step": 16282 + }, + { + "epoch": 0.6856, + "grad_norm": 0.44140625, + "learning_rate": 0.00011658507473059274, + "loss": 3.4173, + "step": 16283 + }, + { + "epoch": 0.6856421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.0001165565384300025, + "loss": 2.9902, + "step": 16284 + }, + { + "epoch": 0.6856842105263158, + "grad_norm": 0.439453125, + "learning_rate": 0.00011652800456059532, + "loss": 3.3064, + "step": 16285 + }, + { + "epoch": 0.6857263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00011649947312289108, + "loss": 3.1919, + "step": 16286 + }, + { + "epoch": 0.6857684210526316, + "grad_norm": 0.41015625, + "learning_rate": 0.0001164709441174096, + "loss": 2.81, + "step": 16287 + }, + { + "epoch": 0.6858105263157894, + "grad_norm": 0.458984375, + "learning_rate": 0.00011644241754467053, + "loss": 2.9744, + "step": 16288 + }, + { + "epoch": 0.6858526315789474, + "grad_norm": 0.4296875, + "learning_rate": 0.00011641389340519387, + "loss": 3.1061, + "step": 16289 + }, + { + "epoch": 0.6858947368421052, + "grad_norm": 0.451171875, + "learning_rate": 0.00011638537169949892, + "loss": 2.8245, + "step": 16290 + }, + { + "epoch": 0.6859368421052632, + "grad_norm": 0.447265625, + "learning_rate": 0.00011635685242810562, + "loss": 3.1369, + "step": 16291 + }, + { + "epoch": 0.685978947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00011632833559153345, + "loss": 3.5735, + "step": 16292 + }, + { + "epoch": 0.686021052631579, + "grad_norm": 0.41796875, + "learning_rate": 0.00011629982119030191, + "loss": 3.2884, + "step": 16293 + }, + { + "epoch": 0.6860631578947368, + "grad_norm": 0.443359375, + "learning_rate": 0.00011627130922493057, + "loss": 3.2307, + "step": 16294 + }, + { + "epoch": 0.6861052631578948, + "grad_norm": 0.419921875, + "learning_rate": 0.00011624279969593873, + "loss": 3.2151, + "step": 16295 + }, + { + "epoch": 0.6861473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00011621429260384614, + "loss": 3.3047, + "step": 16296 + }, + { + "epoch": 0.6861894736842106, + "grad_norm": 0.408203125, + "learning_rate": 0.00011618578794917176, + "loss": 2.8561, + "step": 16297 + }, + { + "epoch": 0.6862315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.00011615728573243525, + "loss": 3.1308, + "step": 16298 + }, + { + "epoch": 0.6862736842105264, + "grad_norm": 0.419921875, + "learning_rate": 0.00011612878595415557, + "loss": 3.2669, + "step": 16299 + }, + { + "epoch": 0.6863157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.00011610028861485222, + "loss": 2.9908, + "step": 16300 + }, + { + "epoch": 0.6863578947368421, + "grad_norm": 0.40234375, + "learning_rate": 0.00011607179371504429, + "loss": 2.6822, + "step": 16301 + }, + { + "epoch": 0.6864, + "grad_norm": 0.421875, + "learning_rate": 0.00011604330125525078, + "loss": 2.8336, + "step": 16302 + }, + { + "epoch": 0.6864421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.00011601481123599117, + "loss": 3.1341, + "step": 16303 + }, + { + "epoch": 0.6864842105263158, + "grad_norm": 0.439453125, + "learning_rate": 0.00011598632365778406, + "loss": 3.1834, + "step": 16304 + }, + { + "epoch": 0.6865263157894737, + "grad_norm": 0.404296875, + "learning_rate": 0.00011595783852114885, + "loss": 2.9666, + "step": 16305 + }, + { + "epoch": 0.6865684210526316, + "grad_norm": 0.443359375, + "learning_rate": 0.00011592935582660416, + "loss": 2.9744, + "step": 16306 + }, + { + "epoch": 0.6866105263157894, + "grad_norm": 0.431640625, + "learning_rate": 0.00011590087557466916, + "loss": 3.0851, + "step": 16307 + }, + { + "epoch": 0.6866526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.00011587239776586266, + "loss": 2.933, + "step": 16308 + }, + { + "epoch": 0.6866947368421052, + "grad_norm": 0.4375, + "learning_rate": 0.00011584392240070349, + "loss": 3.1021, + "step": 16309 + }, + { + "epoch": 0.6867368421052632, + "grad_norm": 0.447265625, + "learning_rate": 0.0001158154494797104, + "loss": 3.2615, + "step": 16310 + }, + { + "epoch": 0.686778947368421, + "grad_norm": 0.470703125, + "learning_rate": 0.00011578697900340207, + "loss": 3.1459, + "step": 16311 + }, + { + "epoch": 0.686821052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.0001157585109722975, + "loss": 2.5403, + "step": 16312 + }, + { + "epoch": 0.6868631578947368, + "grad_norm": 0.431640625, + "learning_rate": 0.00011573004538691492, + "loss": 2.9488, + "step": 16313 + }, + { + "epoch": 0.6869052631578947, + "grad_norm": 0.400390625, + "learning_rate": 0.00011570158224777336, + "loss": 2.6024, + "step": 16314 + }, + { + "epoch": 0.6869473684210526, + "grad_norm": 0.44921875, + "learning_rate": 0.00011567312155539097, + "loss": 3.0249, + "step": 16315 + }, + { + "epoch": 0.6869894736842105, + "grad_norm": 0.416015625, + "learning_rate": 0.00011564466331028659, + "loss": 2.8846, + "step": 16316 + }, + { + "epoch": 0.6870315789473684, + "grad_norm": 0.439453125, + "learning_rate": 0.00011561620751297857, + "loss": 2.745, + "step": 16317 + }, + { + "epoch": 0.6870736842105263, + "grad_norm": 0.419921875, + "learning_rate": 0.00011558775416398529, + "loss": 3.01, + "step": 16318 + }, + { + "epoch": 0.6871157894736842, + "grad_norm": 0.48046875, + "learning_rate": 0.00011555930326382536, + "loss": 3.5908, + "step": 16319 + }, + { + "epoch": 0.6871578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00011553085481301678, + "loss": 3.2262, + "step": 16320 + }, + { + "epoch": 0.6872, + "grad_norm": 0.421875, + "learning_rate": 0.00011550240881207822, + "loss": 2.8055, + "step": 16321 + }, + { + "epoch": 0.6872421052631579, + "grad_norm": 0.39453125, + "learning_rate": 0.00011547396526152754, + "loss": 2.879, + "step": 16322 + }, + { + "epoch": 0.6872842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.00011544552416188325, + "loss": 3.464, + "step": 16323 + }, + { + "epoch": 0.6873263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00011541708551366345, + "loss": 3.1478, + "step": 16324 + }, + { + "epoch": 0.6873684210526316, + "grad_norm": 0.451171875, + "learning_rate": 0.00011538864931738618, + "loss": 3.1797, + "step": 16325 + }, + { + "epoch": 0.6874105263157895, + "grad_norm": 0.44921875, + "learning_rate": 0.00011536021557356957, + "loss": 3.2848, + "step": 16326 + }, + { + "epoch": 0.6874526315789473, + "grad_norm": 0.4453125, + "learning_rate": 0.00011533178428273156, + "loss": 2.6118, + "step": 16327 + }, + { + "epoch": 0.6874947368421053, + "grad_norm": 0.421875, + "learning_rate": 0.0001153033554453904, + "loss": 3.0494, + "step": 16328 + }, + { + "epoch": 0.6875368421052631, + "grad_norm": 0.46484375, + "learning_rate": 0.00011527492906206363, + "loss": 3.0882, + "step": 16329 + }, + { + "epoch": 0.6875789473684211, + "grad_norm": 0.41796875, + "learning_rate": 0.00011524650513326946, + "loss": 2.642, + "step": 16330 + }, + { + "epoch": 0.6876210526315789, + "grad_norm": 0.412109375, + "learning_rate": 0.00011521808365952566, + "loss": 2.6467, + "step": 16331 + }, + { + "epoch": 0.6876631578947369, + "grad_norm": 0.447265625, + "learning_rate": 0.00011518966464134997, + "loss": 3.1021, + "step": 16332 + }, + { + "epoch": 0.6877052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.00011516124807926024, + "loss": 3.1584, + "step": 16333 + }, + { + "epoch": 0.6877473684210527, + "grad_norm": 0.458984375, + "learning_rate": 0.00011513283397377403, + "loss": 2.9561, + "step": 16334 + }, + { + "epoch": 0.6877894736842105, + "grad_norm": 0.40625, + "learning_rate": 0.00011510442232540918, + "loss": 2.9454, + "step": 16335 + }, + { + "epoch": 0.6878315789473685, + "grad_norm": 0.42578125, + "learning_rate": 0.0001150760131346833, + "loss": 2.8391, + "step": 16336 + }, + { + "epoch": 0.6878736842105263, + "grad_norm": 0.4140625, + "learning_rate": 0.00011504760640211392, + "loss": 2.9557, + "step": 16337 + }, + { + "epoch": 0.6879157894736843, + "grad_norm": 0.48046875, + "learning_rate": 0.00011501920212821859, + "loss": 3.1737, + "step": 16338 + }, + { + "epoch": 0.6879578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00011499080031351484, + "loss": 2.9135, + "step": 16339 + }, + { + "epoch": 0.688, + "grad_norm": 0.4140625, + "learning_rate": 0.00011496240095852001, + "loss": 3.2937, + "step": 16340 + }, + { + "epoch": 0.6880421052631579, + "grad_norm": 0.44921875, + "learning_rate": 0.00011493400406375162, + "loss": 3.0662, + "step": 16341 + }, + { + "epoch": 0.6880842105263157, + "grad_norm": 0.41796875, + "learning_rate": 0.00011490560962972688, + "loss": 3.2675, + "step": 16342 + }, + { + "epoch": 0.6881263157894737, + "grad_norm": 0.43359375, + "learning_rate": 0.00011487721765696327, + "loss": 3.1199, + "step": 16343 + }, + { + "epoch": 0.6881684210526315, + "grad_norm": 0.43359375, + "learning_rate": 0.00011484882814597802, + "loss": 2.7314, + "step": 16344 + }, + { + "epoch": 0.6882105263157895, + "grad_norm": 0.439453125, + "learning_rate": 0.00011482044109728831, + "loss": 2.9474, + "step": 16345 + }, + { + "epoch": 0.6882526315789473, + "grad_norm": 0.4296875, + "learning_rate": 0.00011479205651141134, + "loss": 3.3588, + "step": 16346 + }, + { + "epoch": 0.6882947368421053, + "grad_norm": 0.4140625, + "learning_rate": 0.00011476367438886424, + "loss": 3.021, + "step": 16347 + }, + { + "epoch": 0.6883368421052631, + "grad_norm": 0.416015625, + "learning_rate": 0.00011473529473016408, + "loss": 3.2984, + "step": 16348 + }, + { + "epoch": 0.6883789473684211, + "grad_norm": 0.427734375, + "learning_rate": 0.00011470691753582784, + "loss": 3.074, + "step": 16349 + }, + { + "epoch": 0.6884210526315789, + "grad_norm": 0.421875, + "learning_rate": 0.00011467854280637269, + "loss": 3.3534, + "step": 16350 + }, + { + "epoch": 0.6884631578947369, + "grad_norm": 0.390625, + "learning_rate": 0.0001146501705423155, + "loss": 2.8463, + "step": 16351 + }, + { + "epoch": 0.6885052631578947, + "grad_norm": 0.435546875, + "learning_rate": 0.00011462180074417317, + "loss": 2.8634, + "step": 16352 + }, + { + "epoch": 0.6885473684210527, + "grad_norm": 0.435546875, + "learning_rate": 0.00011459343341246255, + "loss": 3.2214, + "step": 16353 + }, + { + "epoch": 0.6885894736842105, + "grad_norm": 0.42578125, + "learning_rate": 0.00011456506854770051, + "loss": 3.0087, + "step": 16354 + }, + { + "epoch": 0.6886315789473684, + "grad_norm": 0.404296875, + "learning_rate": 0.00011453670615040376, + "loss": 2.8773, + "step": 16355 + }, + { + "epoch": 0.6886736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.000114508346221089, + "loss": 3.0765, + "step": 16356 + }, + { + "epoch": 0.6887157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.00011447998876027316, + "loss": 3.6035, + "step": 16357 + }, + { + "epoch": 0.6887578947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.00011445163376847248, + "loss": 3.2539, + "step": 16358 + }, + { + "epoch": 0.6888, + "grad_norm": 0.57421875, + "learning_rate": 0.00011442328124620388, + "loss": 3.0006, + "step": 16359 + }, + { + "epoch": 0.6888421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.00011439493119398381, + "loss": 2.8496, + "step": 16360 + }, + { + "epoch": 0.6888842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.00011436658361232873, + "loss": 3.1002, + "step": 16361 + }, + { + "epoch": 0.6889263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.00011433823850175514, + "loss": 3.2487, + "step": 16362 + }, + { + "epoch": 0.6889684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.0001143098958627794, + "loss": 2.8381, + "step": 16363 + }, + { + "epoch": 0.6890105263157895, + "grad_norm": 0.41015625, + "learning_rate": 0.00011428155569591809, + "loss": 2.7803, + "step": 16364 + }, + { + "epoch": 0.6890526315789474, + "grad_norm": 0.44140625, + "learning_rate": 0.00011425321800168717, + "loss": 3.1177, + "step": 16365 + }, + { + "epoch": 0.6890947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00011422488278060322, + "loss": 2.5929, + "step": 16366 + }, + { + "epoch": 0.6891368421052632, + "grad_norm": 0.44921875, + "learning_rate": 0.00011419655003318239, + "loss": 3.3445, + "step": 16367 + }, + { + "epoch": 0.689178947368421, + "grad_norm": 0.4453125, + "learning_rate": 0.00011416821975994084, + "loss": 3.3862, + "step": 16368 + }, + { + "epoch": 0.689221052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00011413989196139473, + "loss": 3.4973, + "step": 16369 + }, + { + "epoch": 0.6892631578947368, + "grad_norm": 0.4453125, + "learning_rate": 0.0001141115666380601, + "loss": 2.8978, + "step": 16370 + }, + { + "epoch": 0.6893052631578948, + "grad_norm": 0.453125, + "learning_rate": 0.00011408324379045321, + "loss": 3.2052, + "step": 16371 + }, + { + "epoch": 0.6893473684210526, + "grad_norm": 0.43359375, + "learning_rate": 0.00011405492341908977, + "loss": 3.3215, + "step": 16372 + }, + { + "epoch": 0.6893894736842106, + "grad_norm": 0.419921875, + "learning_rate": 0.00011402660552448604, + "loss": 3.1445, + "step": 16373 + }, + { + "epoch": 0.6894315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.00011399829010715765, + "loss": 3.2772, + "step": 16374 + }, + { + "epoch": 0.6894736842105263, + "grad_norm": 0.4296875, + "learning_rate": 0.0001139699771676207, + "loss": 3.0099, + "step": 16375 + }, + { + "epoch": 0.6895157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.00011394166670639091, + "loss": 2.8397, + "step": 16376 + }, + { + "epoch": 0.6895578947368421, + "grad_norm": 0.421875, + "learning_rate": 0.00011391335872398404, + "loss": 2.8979, + "step": 16377 + }, + { + "epoch": 0.6896, + "grad_norm": 0.423828125, + "learning_rate": 0.00011388505322091605, + "loss": 2.9595, + "step": 16378 + }, + { + "epoch": 0.689642105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.00011385675019770225, + "loss": 2.9557, + "step": 16379 + }, + { + "epoch": 0.6896842105263158, + "grad_norm": 0.43359375, + "learning_rate": 0.00011382844965485873, + "loss": 3.1993, + "step": 16380 + }, + { + "epoch": 0.6897263157894736, + "grad_norm": 0.423828125, + "learning_rate": 0.00011380015159290064, + "loss": 3.0748, + "step": 16381 + }, + { + "epoch": 0.6897684210526316, + "grad_norm": 0.48046875, + "learning_rate": 0.00011377185601234386, + "loss": 3.0308, + "step": 16382 + }, + { + "epoch": 0.6898105263157894, + "grad_norm": 0.412109375, + "learning_rate": 0.00011374356291370383, + "loss": 3.2683, + "step": 16383 + }, + { + "epoch": 0.6898526315789474, + "grad_norm": 0.408203125, + "learning_rate": 0.00011371527229749596, + "loss": 2.7047, + "step": 16384 + }, + { + "epoch": 0.6898947368421052, + "grad_norm": 0.4140625, + "learning_rate": 0.0001136869841642357, + "loss": 3.1632, + "step": 16385 + }, + { + "epoch": 0.6899368421052632, + "grad_norm": 0.416015625, + "learning_rate": 0.00011365869851443834, + "loss": 3.2978, + "step": 16386 + }, + { + "epoch": 0.689978947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.00011363041534861948, + "loss": 3.0313, + "step": 16387 + }, + { + "epoch": 0.690021052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00011360213466729405, + "loss": 3.3926, + "step": 16388 + }, + { + "epoch": 0.6900631578947368, + "grad_norm": 0.42578125, + "learning_rate": 0.00011357385647097756, + "loss": 3.0301, + "step": 16389 + }, + { + "epoch": 0.6901052631578948, + "grad_norm": 0.42578125, + "learning_rate": 0.00011354558076018508, + "loss": 3.2553, + "step": 16390 + }, + { + "epoch": 0.6901473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.00011351730753543183, + "loss": 2.6601, + "step": 16391 + }, + { + "epoch": 0.6901894736842106, + "grad_norm": 0.458984375, + "learning_rate": 0.00011348903679723289, + "loss": 3.2735, + "step": 16392 + }, + { + "epoch": 0.6902315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.00011346076854610316, + "loss": 3.595, + "step": 16393 + }, + { + "epoch": 0.6902736842105263, + "grad_norm": 0.470703125, + "learning_rate": 0.00011343250278255801, + "loss": 2.9079, + "step": 16394 + }, + { + "epoch": 0.6903157894736842, + "grad_norm": 0.443359375, + "learning_rate": 0.000113404239507112, + "loss": 3.2109, + "step": 16395 + }, + { + "epoch": 0.6903578947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00011337597872028038, + "loss": 2.8577, + "step": 16396 + }, + { + "epoch": 0.6904, + "grad_norm": 0.419921875, + "learning_rate": 0.00011334772042257787, + "loss": 3.4765, + "step": 16397 + }, + { + "epoch": 0.6904421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00011331946461451936, + "loss": 3.1288, + "step": 16398 + }, + { + "epoch": 0.6904842105263158, + "grad_norm": 0.4140625, + "learning_rate": 0.00011329121129661959, + "loss": 3.2431, + "step": 16399 + }, + { + "epoch": 0.6905263157894737, + "grad_norm": 0.431640625, + "learning_rate": 0.00011326296046939332, + "loss": 2.7721, + "step": 16400 + }, + { + "epoch": 0.6905684210526316, + "grad_norm": 0.43359375, + "learning_rate": 0.00011323471213335526, + "loss": 3.0284, + "step": 16401 + }, + { + "epoch": 0.6906105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.00011320646628901998, + "loss": 3.1735, + "step": 16402 + }, + { + "epoch": 0.6906526315789474, + "grad_norm": 0.427734375, + "learning_rate": 0.00011317822293690222, + "loss": 3.4179, + "step": 16403 + }, + { + "epoch": 0.6906947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.0001131499820775165, + "loss": 3.2273, + "step": 16404 + }, + { + "epoch": 0.6907368421052632, + "grad_norm": 0.451171875, + "learning_rate": 0.00011312174371137729, + "loss": 2.8546, + "step": 16405 + }, + { + "epoch": 0.690778947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00011309350783899911, + "loss": 3.0019, + "step": 16406 + }, + { + "epoch": 0.690821052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.00011306527446089634, + "loss": 3.4377, + "step": 16407 + }, + { + "epoch": 0.6908631578947368, + "grad_norm": 0.447265625, + "learning_rate": 0.0001130370435775834, + "loss": 3.2937, + "step": 16408 + }, + { + "epoch": 0.6909052631578947, + "grad_norm": 0.423828125, + "learning_rate": 0.00011300881518957448, + "loss": 2.9478, + "step": 16409 + }, + { + "epoch": 0.6909473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.00011298058929738412, + "loss": 3.4034, + "step": 16410 + }, + { + "epoch": 0.6909894736842105, + "grad_norm": 0.462890625, + "learning_rate": 0.00011295236590152639, + "loss": 3.264, + "step": 16411 + }, + { + "epoch": 0.6910315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.00011292414500251555, + "loss": 2.7518, + "step": 16412 + }, + { + "epoch": 0.6910736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00011289592660086573, + "loss": 2.9315, + "step": 16413 + }, + { + "epoch": 0.6911157894736842, + "grad_norm": 0.447265625, + "learning_rate": 0.00011286771069709104, + "loss": 3.0569, + "step": 16414 + }, + { + "epoch": 0.6911578947368421, + "grad_norm": 0.41015625, + "learning_rate": 0.00011283949729170553, + "loss": 3.3152, + "step": 16415 + }, + { + "epoch": 0.6912, + "grad_norm": 0.427734375, + "learning_rate": 0.00011281128638522322, + "loss": 2.7624, + "step": 16416 + }, + { + "epoch": 0.6912421052631579, + "grad_norm": 0.431640625, + "learning_rate": 0.00011278307797815798, + "loss": 3.1863, + "step": 16417 + }, + { + "epoch": 0.6912842105263158, + "grad_norm": 0.443359375, + "learning_rate": 0.00011275487207102394, + "loss": 2.9469, + "step": 16418 + }, + { + "epoch": 0.6913263157894737, + "grad_norm": 0.41015625, + "learning_rate": 0.0001127266686643349, + "loss": 3.2813, + "step": 16419 + }, + { + "epoch": 0.6913684210526316, + "grad_norm": 0.4375, + "learning_rate": 0.00011269846775860465, + "loss": 2.6729, + "step": 16420 + }, + { + "epoch": 0.6914105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.00011267026935434699, + "loss": 3.4617, + "step": 16421 + }, + { + "epoch": 0.6914526315789473, + "grad_norm": 0.427734375, + "learning_rate": 0.00011264207345207564, + "loss": 3.06, + "step": 16422 + }, + { + "epoch": 0.6914947368421053, + "grad_norm": 0.451171875, + "learning_rate": 0.00011261388005230436, + "loss": 3.1881, + "step": 16423 + }, + { + "epoch": 0.6915368421052631, + "grad_norm": 0.390625, + "learning_rate": 0.00011258568915554666, + "loss": 2.7078, + "step": 16424 + }, + { + "epoch": 0.6915789473684211, + "grad_norm": 0.435546875, + "learning_rate": 0.00011255750076231632, + "loss": 2.9213, + "step": 16425 + }, + { + "epoch": 0.6916210526315789, + "grad_norm": 0.44921875, + "learning_rate": 0.00011252931487312687, + "loss": 2.9451, + "step": 16426 + }, + { + "epoch": 0.6916631578947369, + "grad_norm": 0.423828125, + "learning_rate": 0.00011250113148849175, + "loss": 3.046, + "step": 16427 + }, + { + "epoch": 0.6917052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.00011247295060892448, + "loss": 2.7479, + "step": 16428 + }, + { + "epoch": 0.6917473684210527, + "grad_norm": 0.453125, + "learning_rate": 0.00011244477223493845, + "loss": 2.9175, + "step": 16429 + }, + { + "epoch": 0.6917894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.00011241659636704704, + "loss": 3.3704, + "step": 16430 + }, + { + "epoch": 0.6918315789473685, + "grad_norm": 0.4296875, + "learning_rate": 0.00011238842300576352, + "loss": 2.9725, + "step": 16431 + }, + { + "epoch": 0.6918736842105263, + "grad_norm": 0.427734375, + "learning_rate": 0.00011236025215160143, + "loss": 3.3457, + "step": 16432 + }, + { + "epoch": 0.6919157894736843, + "grad_norm": 0.447265625, + "learning_rate": 0.00011233208380507362, + "loss": 3.2574, + "step": 16433 + }, + { + "epoch": 0.6919578947368421, + "grad_norm": 0.4921875, + "learning_rate": 0.0001123039179666936, + "loss": 3.3999, + "step": 16434 + }, + { + "epoch": 0.692, + "grad_norm": 0.447265625, + "learning_rate": 0.0001122757546369744, + "loss": 3.1016, + "step": 16435 + }, + { + "epoch": 0.6920421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00011224759381642902, + "loss": 3.3405, + "step": 16436 + }, + { + "epoch": 0.6920842105263157, + "grad_norm": 0.458984375, + "learning_rate": 0.00011221943550557085, + "loss": 2.5575, + "step": 16437 + }, + { + "epoch": 0.6921263157894737, + "grad_norm": 0.48046875, + "learning_rate": 0.00011219127970491247, + "loss": 3.1215, + "step": 16438 + }, + { + "epoch": 0.6921684210526315, + "grad_norm": 0.4375, + "learning_rate": 0.00011216312641496726, + "loss": 3.3596, + "step": 16439 + }, + { + "epoch": 0.6922105263157895, + "grad_norm": 0.421875, + "learning_rate": 0.00011213497563624772, + "loss": 3.0254, + "step": 16440 + }, + { + "epoch": 0.6922526315789473, + "grad_norm": 0.44921875, + "learning_rate": 0.00011210682736926713, + "loss": 3.0889, + "step": 16441 + }, + { + "epoch": 0.6922947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.00011207868161453796, + "loss": 3.1656, + "step": 16442 + }, + { + "epoch": 0.6923368421052631, + "grad_norm": 0.42578125, + "learning_rate": 0.00011205053837257326, + "loss": 3.1241, + "step": 16443 + }, + { + "epoch": 0.6923789473684211, + "grad_norm": 0.431640625, + "learning_rate": 0.00011202239764388567, + "loss": 2.8511, + "step": 16444 + }, + { + "epoch": 0.6924210526315789, + "grad_norm": 0.470703125, + "learning_rate": 0.00011199425942898774, + "loss": 3.0178, + "step": 16445 + }, + { + "epoch": 0.6924631578947369, + "grad_norm": 0.4609375, + "learning_rate": 0.00011196612372839249, + "loss": 3.0678, + "step": 16446 + }, + { + "epoch": 0.6925052631578947, + "grad_norm": 0.443359375, + "learning_rate": 0.00011193799054261205, + "loss": 3.2099, + "step": 16447 + }, + { + "epoch": 0.6925473684210526, + "grad_norm": 0.439453125, + "learning_rate": 0.00011190985987215943, + "loss": 3.0619, + "step": 16448 + }, + { + "epoch": 0.6925894736842105, + "grad_norm": 0.451171875, + "learning_rate": 0.0001118817317175467, + "loss": 3.1331, + "step": 16449 + }, + { + "epoch": 0.6926315789473684, + "grad_norm": 0.453125, + "learning_rate": 0.00011185360607928665, + "loss": 3.1066, + "step": 16450 + }, + { + "epoch": 0.6926736842105263, + "grad_norm": 0.466796875, + "learning_rate": 0.00011182548295789158, + "loss": 3.1472, + "step": 16451 + }, + { + "epoch": 0.6927157894736842, + "grad_norm": 0.453125, + "learning_rate": 0.00011179736235387378, + "loss": 2.87, + "step": 16452 + }, + { + "epoch": 0.6927578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00011176924426774582, + "loss": 2.8794, + "step": 16453 + }, + { + "epoch": 0.6928, + "grad_norm": 0.435546875, + "learning_rate": 0.00011174112870001962, + "loss": 2.951, + "step": 16454 + }, + { + "epoch": 0.6928421052631579, + "grad_norm": 0.419921875, + "learning_rate": 0.00011171301565120781, + "loss": 3.0599, + "step": 16455 + }, + { + "epoch": 0.6928842105263158, + "grad_norm": 0.412109375, + "learning_rate": 0.00011168490512182219, + "loss": 3.0366, + "step": 16456 + }, + { + "epoch": 0.6929263157894737, + "grad_norm": 0.42578125, + "learning_rate": 0.0001116567971123752, + "loss": 2.6173, + "step": 16457 + }, + { + "epoch": 0.6929684210526316, + "grad_norm": 0.416015625, + "learning_rate": 0.00011162869162337878, + "loss": 3.2064, + "step": 16458 + }, + { + "epoch": 0.6930105263157895, + "grad_norm": 0.484375, + "learning_rate": 0.00011160058865534502, + "loss": 3.0706, + "step": 16459 + }, + { + "epoch": 0.6930526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00011157248820878596, + "loss": 3.1579, + "step": 16460 + }, + { + "epoch": 0.6930947368421052, + "grad_norm": 0.416015625, + "learning_rate": 0.0001115443902842134, + "loss": 3.3192, + "step": 16461 + }, + { + "epoch": 0.6931368421052632, + "grad_norm": 0.423828125, + "learning_rate": 0.00011151629488213955, + "loss": 3.4768, + "step": 16462 + }, + { + "epoch": 0.693178947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00011148820200307594, + "loss": 3.3651, + "step": 16463 + }, + { + "epoch": 0.693221052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00011146011164753461, + "loss": 2.7716, + "step": 16464 + }, + { + "epoch": 0.6932631578947368, + "grad_norm": 0.439453125, + "learning_rate": 0.00011143202381602726, + "loss": 3.1964, + "step": 16465 + }, + { + "epoch": 0.6933052631578948, + "grad_norm": 0.412109375, + "learning_rate": 0.00011140393850906566, + "loss": 3.1017, + "step": 16466 + }, + { + "epoch": 0.6933473684210526, + "grad_norm": 0.408203125, + "learning_rate": 0.00011137585572716144, + "loss": 3.4841, + "step": 16467 + }, + { + "epoch": 0.6933894736842106, + "grad_norm": 0.443359375, + "learning_rate": 0.00011134777547082626, + "loss": 3.0423, + "step": 16468 + }, + { + "epoch": 0.6934315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.00011131969774057166, + "loss": 3.0539, + "step": 16469 + }, + { + "epoch": 0.6934736842105264, + "grad_norm": 0.421875, + "learning_rate": 0.00011129162253690916, + "loss": 3.3442, + "step": 16470 + }, + { + "epoch": 0.6935157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.00011126354986035042, + "loss": 3.2052, + "step": 16471 + }, + { + "epoch": 0.6935578947368422, + "grad_norm": 0.4296875, + "learning_rate": 0.00011123547971140679, + "loss": 2.8121, + "step": 16472 + }, + { + "epoch": 0.6936, + "grad_norm": 0.421875, + "learning_rate": 0.00011120741209058969, + "loss": 3.225, + "step": 16473 + }, + { + "epoch": 0.693642105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.00011117934699841045, + "loss": 3.1394, + "step": 16474 + }, + { + "epoch": 0.6936842105263158, + "grad_norm": 0.39453125, + "learning_rate": 0.00011115128443538042, + "loss": 2.7482, + "step": 16475 + }, + { + "epoch": 0.6937263157894736, + "grad_norm": 0.447265625, + "learning_rate": 0.00011112322440201081, + "loss": 2.7058, + "step": 16476 + }, + { + "epoch": 0.6937684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00011109516689881282, + "loss": 3.2368, + "step": 16477 + }, + { + "epoch": 0.6938105263157894, + "grad_norm": 0.421875, + "learning_rate": 0.00011106711192629776, + "loss": 2.9772, + "step": 16478 + }, + { + "epoch": 0.6938526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.0001110390594849767, + "loss": 2.3507, + "step": 16479 + }, + { + "epoch": 0.6938947368421052, + "grad_norm": 0.44921875, + "learning_rate": 0.00011101100957536067, + "loss": 3.2843, + "step": 16480 + }, + { + "epoch": 0.6939368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.00011098296219796075, + "loss": 2.9698, + "step": 16481 + }, + { + "epoch": 0.693978947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00011095491735328795, + "loss": 3.3932, + "step": 16482 + }, + { + "epoch": 0.694021052631579, + "grad_norm": 0.4375, + "learning_rate": 0.00011092687504185307, + "loss": 2.9106, + "step": 16483 + }, + { + "epoch": 0.6940631578947368, + "grad_norm": 0.40625, + "learning_rate": 0.0001108988352641673, + "loss": 3.0251, + "step": 16484 + }, + { + "epoch": 0.6941052631578948, + "grad_norm": 0.435546875, + "learning_rate": 0.00011087079802074112, + "loss": 3.283, + "step": 16485 + }, + { + "epoch": 0.6941473684210526, + "grad_norm": 0.4296875, + "learning_rate": 0.00011084276331208565, + "loss": 3.0841, + "step": 16486 + }, + { + "epoch": 0.6941894736842106, + "grad_norm": 0.4296875, + "learning_rate": 0.00011081473113871152, + "loss": 3.01, + "step": 16487 + }, + { + "epoch": 0.6942315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.00011078670150112946, + "loss": 2.869, + "step": 16488 + }, + { + "epoch": 0.6942736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00011075867439985013, + "loss": 3.5004, + "step": 16489 + }, + { + "epoch": 0.6943157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.00011073064983538402, + "loss": 2.6513, + "step": 16490 + }, + { + "epoch": 0.6943578947368421, + "grad_norm": 0.412109375, + "learning_rate": 0.00011070262780824205, + "loss": 2.7632, + "step": 16491 + }, + { + "epoch": 0.6944, + "grad_norm": 0.45703125, + "learning_rate": 0.00011067460831893436, + "loss": 2.9029, + "step": 16492 + }, + { + "epoch": 0.6944421052631579, + "grad_norm": 0.427734375, + "learning_rate": 0.0001106465913679717, + "loss": 2.8965, + "step": 16493 + }, + { + "epoch": 0.6944842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.00011061857695586438, + "loss": 3.4039, + "step": 16494 + }, + { + "epoch": 0.6945263157894737, + "grad_norm": 0.419921875, + "learning_rate": 0.00011059056508312284, + "loss": 3.1164, + "step": 16495 + }, + { + "epoch": 0.6945684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.00011056255575025742, + "loss": 3.1529, + "step": 16496 + }, + { + "epoch": 0.6946105263157895, + "grad_norm": 0.44140625, + "learning_rate": 0.00011053454895777829, + "loss": 2.9859, + "step": 16497 + }, + { + "epoch": 0.6946526315789474, + "grad_norm": 0.423828125, + "learning_rate": 0.000110506544706196, + "loss": 3.0734, + "step": 16498 + }, + { + "epoch": 0.6946947368421053, + "grad_norm": 0.435546875, + "learning_rate": 0.00011047854299602039, + "loss": 2.6992, + "step": 16499 + }, + { + "epoch": 0.6947368421052632, + "grad_norm": 0.423828125, + "learning_rate": 0.00011045054382776199, + "loss": 2.7945, + "step": 16500 + }, + { + "epoch": 0.694778947368421, + "grad_norm": 0.408203125, + "learning_rate": 0.00011042254720193052, + "loss": 3.1227, + "step": 16501 + }, + { + "epoch": 0.6948210526315789, + "grad_norm": 0.4296875, + "learning_rate": 0.00011039455311903632, + "loss": 2.9842, + "step": 16502 + }, + { + "epoch": 0.6948631578947368, + "grad_norm": 0.427734375, + "learning_rate": 0.00011036656157958936, + "loss": 3.3261, + "step": 16503 + }, + { + "epoch": 0.6949052631578947, + "grad_norm": 3.84375, + "learning_rate": 0.00011033857258409946, + "loss": 3.4224, + "step": 16504 + }, + { + "epoch": 0.6949473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.00011031058613307687, + "loss": 2.8688, + "step": 16505 + }, + { + "epoch": 0.6949894736842105, + "grad_norm": 0.453125, + "learning_rate": 0.0001102826022270311, + "loss": 3.2516, + "step": 16506 + }, + { + "epoch": 0.6950315789473684, + "grad_norm": 0.435546875, + "learning_rate": 0.00011025462086647233, + "loss": 3.2864, + "step": 16507 + }, + { + "epoch": 0.6950736842105263, + "grad_norm": 0.71875, + "learning_rate": 0.00011022664205191002, + "loss": 3.4576, + "step": 16508 + }, + { + "epoch": 0.6951157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00011019866578385412, + "loss": 3.2072, + "step": 16509 + }, + { + "epoch": 0.6951578947368421, + "grad_norm": 0.45703125, + "learning_rate": 0.00011017069206281427, + "loss": 3.1646, + "step": 16510 + }, + { + "epoch": 0.6952, + "grad_norm": 0.4140625, + "learning_rate": 0.00011014272088930013, + "loss": 3.0003, + "step": 16511 + }, + { + "epoch": 0.6952421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00011011475226382131, + "loss": 3.0334, + "step": 16512 + }, + { + "epoch": 0.6952842105263158, + "grad_norm": 0.423828125, + "learning_rate": 0.00011008678618688723, + "loss": 3.2111, + "step": 16513 + }, + { + "epoch": 0.6953263157894737, + "grad_norm": 0.4375, + "learning_rate": 0.00011005882265900768, + "loss": 2.942, + "step": 16514 + }, + { + "epoch": 0.6953684210526315, + "grad_norm": 0.4140625, + "learning_rate": 0.0001100308616806918, + "loss": 2.5132, + "step": 16515 + }, + { + "epoch": 0.6954105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.00011000290325244935, + "loss": 2.8922, + "step": 16516 + }, + { + "epoch": 0.6954526315789473, + "grad_norm": 0.423828125, + "learning_rate": 0.00010997494737478931, + "loss": 3.2324, + "step": 16517 + }, + { + "epoch": 0.6954947368421053, + "grad_norm": 0.400390625, + "learning_rate": 0.00010994699404822131, + "loss": 2.7703, + "step": 16518 + }, + { + "epoch": 0.6955368421052631, + "grad_norm": 0.453125, + "learning_rate": 0.00010991904327325453, + "loss": 3.0457, + "step": 16519 + }, + { + "epoch": 0.6955789473684211, + "grad_norm": 0.4375, + "learning_rate": 0.00010989109505039808, + "loss": 3.0829, + "step": 16520 + }, + { + "epoch": 0.6956210526315789, + "grad_norm": 0.41015625, + "learning_rate": 0.00010986314938016145, + "loss": 2.9194, + "step": 16521 + }, + { + "epoch": 0.6956631578947369, + "grad_norm": 0.427734375, + "learning_rate": 0.00010983520626305341, + "loss": 3.1658, + "step": 16522 + }, + { + "epoch": 0.6957052631578947, + "grad_norm": 0.408203125, + "learning_rate": 0.0001098072656995834, + "loss": 3.1023, + "step": 16523 + }, + { + "epoch": 0.6957473684210527, + "grad_norm": 0.43359375, + "learning_rate": 0.0001097793276902601, + "loss": 3.1457, + "step": 16524 + }, + { + "epoch": 0.6957894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.0001097513922355928, + "loss": 2.7371, + "step": 16525 + }, + { + "epoch": 0.6958315789473685, + "grad_norm": 0.4296875, + "learning_rate": 0.00010972345933609034, + "loss": 3.0368, + "step": 16526 + }, + { + "epoch": 0.6958736842105263, + "grad_norm": 0.439453125, + "learning_rate": 0.00010969552899226166, + "loss": 3.023, + "step": 16527 + }, + { + "epoch": 0.6959157894736843, + "grad_norm": 0.453125, + "learning_rate": 0.00010966760120461556, + "loss": 2.8969, + "step": 16528 + }, + { + "epoch": 0.6959578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00010963967597366078, + "loss": 3.2524, + "step": 16529 + }, + { + "epoch": 0.696, + "grad_norm": 0.412109375, + "learning_rate": 0.0001096117532999063, + "loss": 3.1935, + "step": 16530 + }, + { + "epoch": 0.6960421052631579, + "grad_norm": 0.439453125, + "learning_rate": 0.00010958383318386073, + "loss": 3.4846, + "step": 16531 + }, + { + "epoch": 0.6960842105263157, + "grad_norm": 0.431640625, + "learning_rate": 0.00010955591562603273, + "loss": 2.7995, + "step": 16532 + }, + { + "epoch": 0.6961263157894737, + "grad_norm": 0.3984375, + "learning_rate": 0.00010952800062693095, + "loss": 2.3446, + "step": 16533 + }, + { + "epoch": 0.6961684210526315, + "grad_norm": 0.435546875, + "learning_rate": 0.00010950008818706391, + "loss": 3.0496, + "step": 16534 + }, + { + "epoch": 0.6962105263157895, + "grad_norm": 0.416015625, + "learning_rate": 0.00010947217830694023, + "loss": 2.975, + "step": 16535 + }, + { + "epoch": 0.6962526315789473, + "grad_norm": 0.40625, + "learning_rate": 0.00010944427098706821, + "loss": 3.2035, + "step": 16536 + }, + { + "epoch": 0.6962947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.00010941636622795655, + "loss": 3.3659, + "step": 16537 + }, + { + "epoch": 0.6963368421052631, + "grad_norm": 0.419921875, + "learning_rate": 0.00010938846403011352, + "loss": 3.1652, + "step": 16538 + }, + { + "epoch": 0.6963789473684211, + "grad_norm": 0.40625, + "learning_rate": 0.00010936056439404746, + "loss": 3.0609, + "step": 16539 + }, + { + "epoch": 0.6964210526315789, + "grad_norm": 0.435546875, + "learning_rate": 0.00010933266732026665, + "loss": 3.314, + "step": 16540 + }, + { + "epoch": 0.6964631578947369, + "grad_norm": 0.431640625, + "learning_rate": 0.00010930477280927939, + "loss": 3.3458, + "step": 16541 + }, + { + "epoch": 0.6965052631578947, + "grad_norm": 0.43359375, + "learning_rate": 0.00010927688086159385, + "loss": 3.1161, + "step": 16542 + }, + { + "epoch": 0.6965473684210526, + "grad_norm": 0.427734375, + "learning_rate": 0.00010924899147771819, + "loss": 2.9989, + "step": 16543 + }, + { + "epoch": 0.6965894736842105, + "grad_norm": 0.427734375, + "learning_rate": 0.00010922110465816044, + "loss": 3.3675, + "step": 16544 + }, + { + "epoch": 0.6966315789473684, + "grad_norm": 0.404296875, + "learning_rate": 0.00010919322040342886, + "loss": 2.9962, + "step": 16545 + }, + { + "epoch": 0.6966736842105263, + "grad_norm": 0.44140625, + "learning_rate": 0.00010916533871403134, + "loss": 2.9618, + "step": 16546 + }, + { + "epoch": 0.6967157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.00010913745959047589, + "loss": 3.24, + "step": 16547 + }, + { + "epoch": 0.6967578947368421, + "grad_norm": 0.419921875, + "learning_rate": 0.00010910958303327043, + "loss": 3.0315, + "step": 16548 + }, + { + "epoch": 0.6968, + "grad_norm": 0.447265625, + "learning_rate": 0.0001090817090429228, + "loss": 2.5952, + "step": 16549 + }, + { + "epoch": 0.6968421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00010905383761994087, + "loss": 3.202, + "step": 16550 + }, + { + "epoch": 0.6968842105263158, + "grad_norm": 0.404296875, + "learning_rate": 0.00010902596876483235, + "loss": 2.9901, + "step": 16551 + }, + { + "epoch": 0.6969263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.0001089981024781051, + "loss": 3.1672, + "step": 16552 + }, + { + "epoch": 0.6969684210526316, + "grad_norm": 0.42578125, + "learning_rate": 0.00010897023876026673, + "loss": 2.943, + "step": 16553 + }, + { + "epoch": 0.6970105263157895, + "grad_norm": 0.435546875, + "learning_rate": 0.00010894237761182496, + "loss": 3.2603, + "step": 16554 + }, + { + "epoch": 0.6970526315789474, + "grad_norm": 0.466796875, + "learning_rate": 0.0001089145190332873, + "loss": 3.3624, + "step": 16555 + }, + { + "epoch": 0.6970947368421052, + "grad_norm": 0.443359375, + "learning_rate": 0.00010888666302516137, + "loss": 2.8968, + "step": 16556 + }, + { + "epoch": 0.6971368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.0001088588095879546, + "loss": 3.1117, + "step": 16557 + }, + { + "epoch": 0.697178947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00010883095872217442, + "loss": 3.0859, + "step": 16558 + }, + { + "epoch": 0.697221052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00010880311042832846, + "loss": 2.9567, + "step": 16559 + }, + { + "epoch": 0.6972631578947368, + "grad_norm": 0.408203125, + "learning_rate": 0.00010877526470692378, + "loss": 3.006, + "step": 16560 + }, + { + "epoch": 0.6973052631578948, + "grad_norm": 0.5078125, + "learning_rate": 0.00010874742155846792, + "loss": 3.0657, + "step": 16561 + }, + { + "epoch": 0.6973473684210526, + "grad_norm": 0.421875, + "learning_rate": 0.00010871958098346807, + "loss": 3.3572, + "step": 16562 + }, + { + "epoch": 0.6973894736842106, + "grad_norm": 0.42578125, + "learning_rate": 0.00010869174298243146, + "loss": 2.764, + "step": 16563 + }, + { + "epoch": 0.6974315789473684, + "grad_norm": 0.49609375, + "learning_rate": 0.00010866390755586524, + "loss": 2.7574, + "step": 16564 + }, + { + "epoch": 0.6974736842105264, + "grad_norm": 0.4296875, + "learning_rate": 0.00010863607470427653, + "loss": 2.8312, + "step": 16565 + }, + { + "epoch": 0.6975157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.00010860824442817258, + "loss": 3.2233, + "step": 16566 + }, + { + "epoch": 0.6975578947368422, + "grad_norm": 0.419921875, + "learning_rate": 0.0001085804167280601, + "loss": 2.9248, + "step": 16567 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4296875, + "learning_rate": 0.0001085525916044464, + "loss": 2.6604, + "step": 16568 + }, + { + "epoch": 0.6976421052631578, + "grad_norm": 0.4296875, + "learning_rate": 0.00010852476905783823, + "loss": 3.0337, + "step": 16569 + }, + { + "epoch": 0.6976842105263158, + "grad_norm": 0.47265625, + "learning_rate": 0.00010849694908874259, + "loss": 3.2376, + "step": 16570 + }, + { + "epoch": 0.6977263157894736, + "grad_norm": 0.435546875, + "learning_rate": 0.00010846913169766626, + "loss": 3.2844, + "step": 16571 + }, + { + "epoch": 0.6977684210526316, + "grad_norm": 0.412109375, + "learning_rate": 0.00010844131688511594, + "loss": 3.1323, + "step": 16572 + }, + { + "epoch": 0.6978105263157894, + "grad_norm": 0.4453125, + "learning_rate": 0.00010841350465159871, + "loss": 3.3473, + "step": 16573 + }, + { + "epoch": 0.6978526315789474, + "grad_norm": 0.4140625, + "learning_rate": 0.00010838569499762088, + "loss": 3.2903, + "step": 16574 + }, + { + "epoch": 0.6978947368421052, + "grad_norm": 0.45703125, + "learning_rate": 0.0001083578879236895, + "loss": 2.8586, + "step": 16575 + }, + { + "epoch": 0.6979368421052632, + "grad_norm": 0.421875, + "learning_rate": 0.00010833008343031078, + "loss": 3.3354, + "step": 16576 + }, + { + "epoch": 0.697978947368421, + "grad_norm": 0.451171875, + "learning_rate": 0.00010830228151799157, + "loss": 3.1322, + "step": 16577 + }, + { + "epoch": 0.698021052631579, + "grad_norm": 0.43359375, + "learning_rate": 0.00010827448218723831, + "loss": 3.2725, + "step": 16578 + }, + { + "epoch": 0.6980631578947368, + "grad_norm": 0.4609375, + "learning_rate": 0.00010824668543855737, + "loss": 3.2174, + "step": 16579 + }, + { + "epoch": 0.6981052631578948, + "grad_norm": 0.42578125, + "learning_rate": 0.00010821889127245546, + "loss": 2.7626, + "step": 16580 + }, + { + "epoch": 0.6981473684210526, + "grad_norm": 0.416015625, + "learning_rate": 0.00010819109968943858, + "loss": 3.3108, + "step": 16581 + }, + { + "epoch": 0.6981894736842105, + "grad_norm": 0.447265625, + "learning_rate": 0.00010816331069001342, + "loss": 3.48, + "step": 16582 + }, + { + "epoch": 0.6982315789473684, + "grad_norm": 0.412109375, + "learning_rate": 0.00010813552427468593, + "loss": 3.2027, + "step": 16583 + }, + { + "epoch": 0.6982736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00010810774044396257, + "loss": 3.1309, + "step": 16584 + }, + { + "epoch": 0.6983157894736842, + "grad_norm": 0.4140625, + "learning_rate": 0.00010807995919834946, + "loss": 2.8929, + "step": 16585 + }, + { + "epoch": 0.6983578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.00010805218053835276, + "loss": 3.2349, + "step": 16586 + }, + { + "epoch": 0.6984, + "grad_norm": 0.43359375, + "learning_rate": 0.00010802440446447853, + "loss": 3.4102, + "step": 16587 + }, + { + "epoch": 0.6984421052631579, + "grad_norm": 0.44140625, + "learning_rate": 0.00010799663097723276, + "loss": 2.8622, + "step": 16588 + }, + { + "epoch": 0.6984842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.0001079688600771217, + "loss": 3.0378, + "step": 16589 + }, + { + "epoch": 0.6985263157894737, + "grad_norm": 0.408203125, + "learning_rate": 0.00010794109176465095, + "loss": 3.0569, + "step": 16590 + }, + { + "epoch": 0.6985684210526316, + "grad_norm": 0.46875, + "learning_rate": 0.00010791332604032669, + "loss": 3.3399, + "step": 16591 + }, + { + "epoch": 0.6986105263157895, + "grad_norm": 0.4453125, + "learning_rate": 0.00010788556290465468, + "loss": 3.3416, + "step": 16592 + }, + { + "epoch": 0.6986526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00010785780235814074, + "loss": 3.1937, + "step": 16593 + }, + { + "epoch": 0.6986947368421053, + "grad_norm": 0.43359375, + "learning_rate": 0.00010783004440129063, + "loss": 3.0957, + "step": 16594 + }, + { + "epoch": 0.6987368421052632, + "grad_norm": 0.43359375, + "learning_rate": 0.00010780228903460998, + "loss": 3.5741, + "step": 16595 + }, + { + "epoch": 0.698778947368421, + "grad_norm": 0.4140625, + "learning_rate": 0.00010777453625860473, + "loss": 3.1415, + "step": 16596 + }, + { + "epoch": 0.6988210526315789, + "grad_norm": 0.396484375, + "learning_rate": 0.00010774678607378013, + "loss": 2.5353, + "step": 16597 + }, + { + "epoch": 0.6988631578947369, + "grad_norm": 0.458984375, + "learning_rate": 0.00010771903848064204, + "loss": 3.1803, + "step": 16598 + }, + { + "epoch": 0.6989052631578947, + "grad_norm": 0.419921875, + "learning_rate": 0.00010769129347969589, + "loss": 3.0482, + "step": 16599 + }, + { + "epoch": 0.6989473684210527, + "grad_norm": 0.423828125, + "learning_rate": 0.00010766355107144718, + "loss": 3.4186, + "step": 16600 + }, + { + "epoch": 0.6989894736842105, + "grad_norm": 0.431640625, + "learning_rate": 0.00010763581125640134, + "loss": 3.096, + "step": 16601 + }, + { + "epoch": 0.6990315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.00010760807403506373, + "loss": 2.8746, + "step": 16602 + }, + { + "epoch": 0.6990736842105263, + "grad_norm": 0.423828125, + "learning_rate": 0.00010758033940793971, + "loss": 3.2631, + "step": 16603 + }, + { + "epoch": 0.6991157894736842, + "grad_norm": 0.4375, + "learning_rate": 0.00010755260737553449, + "loss": 3.1139, + "step": 16604 + }, + { + "epoch": 0.6991578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00010752487793835347, + "loss": 3.4771, + "step": 16605 + }, + { + "epoch": 0.6992, + "grad_norm": 0.421875, + "learning_rate": 0.00010749715109690179, + "loss": 3.1211, + "step": 16606 + }, + { + "epoch": 0.6992421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00010746942685168457, + "loss": 3.1463, + "step": 16607 + }, + { + "epoch": 0.6992842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.00010744170520320692, + "loss": 3.243, + "step": 16608 + }, + { + "epoch": 0.6993263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.00010741398615197392, + "loss": 3.1078, + "step": 16609 + }, + { + "epoch": 0.6993684210526315, + "grad_norm": 0.427734375, + "learning_rate": 0.00010738626969849056, + "loss": 3.1656, + "step": 16610 + }, + { + "epoch": 0.6994105263157895, + "grad_norm": 0.453125, + "learning_rate": 0.0001073585558432617, + "loss": 3.5144, + "step": 16611 + }, + { + "epoch": 0.6994526315789473, + "grad_norm": 0.447265625, + "learning_rate": 0.00010733084458679246, + "loss": 2.9088, + "step": 16612 + }, + { + "epoch": 0.6994947368421053, + "grad_norm": 0.41796875, + "learning_rate": 0.00010730313592958757, + "loss": 3.1005, + "step": 16613 + }, + { + "epoch": 0.6995368421052631, + "grad_norm": 0.41796875, + "learning_rate": 0.00010727542987215192, + "loss": 2.8828, + "step": 16614 + }, + { + "epoch": 0.6995789473684211, + "grad_norm": 0.4375, + "learning_rate": 0.00010724772641499022, + "loss": 3.2217, + "step": 16615 + }, + { + "epoch": 0.6996210526315789, + "grad_norm": 0.46875, + "learning_rate": 0.00010722002555860721, + "loss": 3.4868, + "step": 16616 + }, + { + "epoch": 0.6996631578947369, + "grad_norm": 0.43359375, + "learning_rate": 0.00010719232730350762, + "loss": 3.2557, + "step": 16617 + }, + { + "epoch": 0.6997052631578947, + "grad_norm": 0.451171875, + "learning_rate": 0.000107164631650196, + "loss": 2.9334, + "step": 16618 + }, + { + "epoch": 0.6997473684210527, + "grad_norm": 0.4296875, + "learning_rate": 0.00010713693859917687, + "loss": 3.3321, + "step": 16619 + }, + { + "epoch": 0.6997894736842105, + "grad_norm": 0.4296875, + "learning_rate": 0.00010710924815095496, + "loss": 2.9559, + "step": 16620 + }, + { + "epoch": 0.6998315789473685, + "grad_norm": 0.44921875, + "learning_rate": 0.00010708156030603467, + "loss": 3.4068, + "step": 16621 + }, + { + "epoch": 0.6998736842105263, + "grad_norm": 0.3984375, + "learning_rate": 0.00010705387506492042, + "loss": 2.6694, + "step": 16622 + }, + { + "epoch": 0.6999157894736842, + "grad_norm": 0.412109375, + "learning_rate": 0.00010702619242811659, + "loss": 2.721, + "step": 16623 + }, + { + "epoch": 0.6999578947368421, + "grad_norm": 0.435546875, + "learning_rate": 0.0001069985123961276, + "loss": 2.7245, + "step": 16624 + }, + { + "epoch": 0.7, + "grad_norm": 0.421875, + "learning_rate": 0.00010697083496945764, + "loss": 3.3076, + "step": 16625 + }, + { + "epoch": 0.7000421052631579, + "grad_norm": 0.4609375, + "learning_rate": 0.00010694316014861093, + "loss": 3.3274, + "step": 16626 + }, + { + "epoch": 0.7000842105263158, + "grad_norm": 0.515625, + "learning_rate": 0.0001069154879340919, + "loss": 3.111, + "step": 16627 + }, + { + "epoch": 0.7001263157894737, + "grad_norm": 0.3984375, + "learning_rate": 0.00010688781832640452, + "loss": 2.4551, + "step": 16628 + }, + { + "epoch": 0.7001684210526316, + "grad_norm": 0.4609375, + "learning_rate": 0.00010686015132605296, + "loss": 2.5352, + "step": 16629 + }, + { + "epoch": 0.7002105263157895, + "grad_norm": 0.46875, + "learning_rate": 0.00010683248693354123, + "loss": 3.0144, + "step": 16630 + }, + { + "epoch": 0.7002526315789473, + "grad_norm": 0.408203125, + "learning_rate": 0.00010680482514937328, + "loss": 3.0624, + "step": 16631 + }, + { + "epoch": 0.7002947368421053, + "grad_norm": 0.41796875, + "learning_rate": 0.0001067771659740534, + "loss": 3.3789, + "step": 16632 + }, + { + "epoch": 0.7003368421052631, + "grad_norm": 0.4140625, + "learning_rate": 0.00010674950940808505, + "loss": 3.3257, + "step": 16633 + }, + { + "epoch": 0.7003789473684211, + "grad_norm": 0.447265625, + "learning_rate": 0.00010672185545197252, + "loss": 3.298, + "step": 16634 + }, + { + "epoch": 0.7004210526315789, + "grad_norm": 0.44140625, + "learning_rate": 0.00010669420410621928, + "loss": 3.125, + "step": 16635 + }, + { + "epoch": 0.7004631578947368, + "grad_norm": 0.482421875, + "learning_rate": 0.00010666655537132933, + "loss": 2.8726, + "step": 16636 + }, + { + "epoch": 0.7005052631578947, + "grad_norm": 0.4375, + "learning_rate": 0.00010663890924780634, + "loss": 3.3646, + "step": 16637 + }, + { + "epoch": 0.7005473684210526, + "grad_norm": 0.443359375, + "learning_rate": 0.00010661126573615388, + "loss": 2.6596, + "step": 16638 + }, + { + "epoch": 0.7005894736842105, + "grad_norm": 0.44140625, + "learning_rate": 0.00010658362483687586, + "loss": 3.4095, + "step": 16639 + }, + { + "epoch": 0.7006315789473684, + "grad_norm": 0.4375, + "learning_rate": 0.00010655598655047552, + "loss": 2.8012, + "step": 16640 + }, + { + "epoch": 0.7006736842105263, + "grad_norm": 0.435546875, + "learning_rate": 0.00010652835087745674, + "loss": 2.7025, + "step": 16641 + }, + { + "epoch": 0.7007157894736842, + "grad_norm": 0.3984375, + "learning_rate": 0.00010650071781832266, + "loss": 2.8998, + "step": 16642 + }, + { + "epoch": 0.7007578947368421, + "grad_norm": 0.431640625, + "learning_rate": 0.00010647308737357697, + "loss": 3.3097, + "step": 16643 + }, + { + "epoch": 0.7008, + "grad_norm": 0.443359375, + "learning_rate": 0.00010644545954372304, + "loss": 2.5418, + "step": 16644 + }, + { + "epoch": 0.7008421052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.0001064178343292641, + "loss": 3.0269, + "step": 16645 + }, + { + "epoch": 0.7008842105263158, + "grad_norm": 0.44921875, + "learning_rate": 0.00010639021173070357, + "loss": 2.8906, + "step": 16646 + }, + { + "epoch": 0.7009263157894737, + "grad_norm": 0.46484375, + "learning_rate": 0.00010636259174854454, + "loss": 2.9361, + "step": 16647 + }, + { + "epoch": 0.7009684210526316, + "grad_norm": 0.427734375, + "learning_rate": 0.00010633497438329049, + "loss": 2.8861, + "step": 16648 + }, + { + "epoch": 0.7010105263157895, + "grad_norm": 0.43359375, + "learning_rate": 0.00010630735963544425, + "loss": 3.0437, + "step": 16649 + }, + { + "epoch": 0.7010526315789474, + "grad_norm": 0.408203125, + "learning_rate": 0.00010627974750550922, + "loss": 2.6417, + "step": 16650 + }, + { + "epoch": 0.7010947368421052, + "grad_norm": 0.427734375, + "learning_rate": 0.0001062521379939882, + "loss": 3.2037, + "step": 16651 + }, + { + "epoch": 0.7011368421052632, + "grad_norm": 0.41796875, + "learning_rate": 0.00010622453110138439, + "loss": 3.2096, + "step": 16652 + }, + { + "epoch": 0.701178947368421, + "grad_norm": 0.44140625, + "learning_rate": 0.0001061969268282007, + "loss": 3.1297, + "step": 16653 + }, + { + "epoch": 0.701221052631579, + "grad_norm": 0.435546875, + "learning_rate": 0.00010616932517493996, + "loss": 2.9453, + "step": 16654 + }, + { + "epoch": 0.7012631578947368, + "grad_norm": 0.451171875, + "learning_rate": 0.00010614172614210529, + "loss": 3.1269, + "step": 16655 + }, + { + "epoch": 0.7013052631578948, + "grad_norm": 0.404296875, + "learning_rate": 0.00010611412973019915, + "loss": 3.0827, + "step": 16656 + }, + { + "epoch": 0.7013473684210526, + "grad_norm": 0.4453125, + "learning_rate": 0.00010608653593972473, + "loss": 3.1703, + "step": 16657 + }, + { + "epoch": 0.7013894736842106, + "grad_norm": 0.474609375, + "learning_rate": 0.00010605894477118431, + "loss": 3.0012, + "step": 16658 + }, + { + "epoch": 0.7014315789473684, + "grad_norm": 0.419921875, + "learning_rate": 0.00010603135622508089, + "loss": 3.0868, + "step": 16659 + }, + { + "epoch": 0.7014736842105264, + "grad_norm": 0.416015625, + "learning_rate": 0.00010600377030191701, + "loss": 3.0522, + "step": 16660 + }, + { + "epoch": 0.7015157894736842, + "grad_norm": 0.408203125, + "learning_rate": 0.00010597618700219527, + "loss": 3.1718, + "step": 16661 + }, + { + "epoch": 0.7015578947368422, + "grad_norm": 0.41015625, + "learning_rate": 0.00010594860632641817, + "loss": 2.8979, + "step": 16662 + }, + { + "epoch": 0.7016, + "grad_norm": 0.46484375, + "learning_rate": 0.0001059210282750881, + "loss": 3.0655, + "step": 16663 + }, + { + "epoch": 0.7016421052631578, + "grad_norm": 0.4296875, + "learning_rate": 0.0001058934528487078, + "loss": 2.7588, + "step": 16664 + }, + { + "epoch": 0.7016842105263158, + "grad_norm": 0.44140625, + "learning_rate": 0.00010586588004777931, + "loss": 3.0551, + "step": 16665 + }, + { + "epoch": 0.7017263157894736, + "grad_norm": 0.431640625, + "learning_rate": 0.00010583830987280519, + "loss": 3.1109, + "step": 16666 + }, + { + "epoch": 0.7017684210526316, + "grad_norm": 0.44921875, + "learning_rate": 0.0001058107423242877, + "loss": 2.6443, + "step": 16667 + }, + { + "epoch": 0.7018105263157894, + "grad_norm": 0.416015625, + "learning_rate": 0.00010578317740272905, + "loss": 3.0563, + "step": 16668 + }, + { + "epoch": 0.7018526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00010575561510863149, + "loss": 2.9679, + "step": 16669 + }, + { + "epoch": 0.7018947368421052, + "grad_norm": 0.408203125, + "learning_rate": 0.00010572805544249703, + "loss": 2.9888, + "step": 16670 + }, + { + "epoch": 0.7019368421052632, + "grad_norm": 0.431640625, + "learning_rate": 0.00010570049840482809, + "loss": 2.8357, + "step": 16671 + }, + { + "epoch": 0.701978947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00010567294399612632, + "loss": 3.021, + "step": 16672 + }, + { + "epoch": 0.702021052631579, + "grad_norm": 0.5625, + "learning_rate": 0.00010564539221689404, + "loss": 3.0833, + "step": 16673 + }, + { + "epoch": 0.7020631578947368, + "grad_norm": 0.451171875, + "learning_rate": 0.00010561784306763308, + "loss": 3.4316, + "step": 16674 + }, + { + "epoch": 0.7021052631578948, + "grad_norm": 0.40625, + "learning_rate": 0.00010559029654884541, + "loss": 2.9175, + "step": 16675 + }, + { + "epoch": 0.7021473684210526, + "grad_norm": 0.435546875, + "learning_rate": 0.00010556275266103285, + "loss": 3.4217, + "step": 16676 + }, + { + "epoch": 0.7021894736842105, + "grad_norm": 0.41796875, + "learning_rate": 0.00010553521140469721, + "loss": 2.6792, + "step": 16677 + }, + { + "epoch": 0.7022315789473684, + "grad_norm": 0.4453125, + "learning_rate": 0.00010550767278034031, + "loss": 2.9779, + "step": 16678 + }, + { + "epoch": 0.7022736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.00010548013678846375, + "loss": 2.9548, + "step": 16679 + }, + { + "epoch": 0.7023157894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00010545260342956937, + "loss": 2.6873, + "step": 16680 + }, + { + "epoch": 0.7023578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00010542507270415872, + "loss": 3.3689, + "step": 16681 + }, + { + "epoch": 0.7024, + "grad_norm": 0.43359375, + "learning_rate": 0.00010539754461273338, + "loss": 3.3119, + "step": 16682 + }, + { + "epoch": 0.7024421052631579, + "grad_norm": 0.4296875, + "learning_rate": 0.00010537001915579486, + "loss": 3.1748, + "step": 16683 + }, + { + "epoch": 0.7024842105263158, + "grad_norm": 0.41796875, + "learning_rate": 0.00010534249633384465, + "loss": 2.8221, + "step": 16684 + }, + { + "epoch": 0.7025263157894737, + "grad_norm": 0.439453125, + "learning_rate": 0.00010531497614738414, + "loss": 2.5808, + "step": 16685 + }, + { + "epoch": 0.7025684210526316, + "grad_norm": 0.439453125, + "learning_rate": 0.00010528745859691482, + "loss": 3.0362, + "step": 16686 + }, + { + "epoch": 0.7026105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.000105259943682938, + "loss": 3.2403, + "step": 16687 + }, + { + "epoch": 0.7026526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00010523243140595496, + "loss": 3.1517, + "step": 16688 + }, + { + "epoch": 0.7026947368421053, + "grad_norm": 0.412109375, + "learning_rate": 0.0001052049217664669, + "loss": 2.9609, + "step": 16689 + }, + { + "epoch": 0.7027368421052631, + "grad_norm": 0.462890625, + "learning_rate": 0.00010517741476497502, + "loss": 3.1796, + "step": 16690 + }, + { + "epoch": 0.7027789473684211, + "grad_norm": 0.44921875, + "learning_rate": 0.00010514991040198052, + "loss": 3.1659, + "step": 16691 + }, + { + "epoch": 0.7028210526315789, + "grad_norm": 0.435546875, + "learning_rate": 0.00010512240867798434, + "loss": 3.3833, + "step": 16692 + }, + { + "epoch": 0.7028631578947369, + "grad_norm": 0.470703125, + "learning_rate": 0.00010509490959348786, + "loss": 2.8139, + "step": 16693 + }, + { + "epoch": 0.7029052631578947, + "grad_norm": 0.447265625, + "learning_rate": 0.00010506741314899165, + "loss": 3.3989, + "step": 16694 + }, + { + "epoch": 0.7029473684210527, + "grad_norm": 0.419921875, + "learning_rate": 0.00010503991934499702, + "loss": 3.3479, + "step": 16695 + }, + { + "epoch": 0.7029894736842105, + "grad_norm": 0.49609375, + "learning_rate": 0.00010501242818200471, + "loss": 2.8311, + "step": 16696 + }, + { + "epoch": 0.7030315789473685, + "grad_norm": 0.419921875, + "learning_rate": 0.00010498493966051561, + "loss": 3.0758, + "step": 16697 + }, + { + "epoch": 0.7030736842105263, + "grad_norm": 0.390625, + "learning_rate": 0.00010495745378103052, + "loss": 2.6687, + "step": 16698 + }, + { + "epoch": 0.7031157894736842, + "grad_norm": 0.4609375, + "learning_rate": 0.00010492997054405013, + "loss": 2.9243, + "step": 16699 + }, + { + "epoch": 0.7031578947368421, + "grad_norm": 0.427734375, + "learning_rate": 0.0001049024899500754, + "loss": 3.3541, + "step": 16700 + }, + { + "epoch": 0.7032, + "grad_norm": 0.423828125, + "learning_rate": 0.00010487501199960662, + "loss": 2.8791, + "step": 16701 + }, + { + "epoch": 0.7032421052631579, + "grad_norm": 0.4375, + "learning_rate": 0.0001048475366931447, + "loss": 3.3868, + "step": 16702 + }, + { + "epoch": 0.7032842105263158, + "grad_norm": 0.42578125, + "learning_rate": 0.00010482006403119016, + "loss": 2.4396, + "step": 16703 + }, + { + "epoch": 0.7033263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.00010479259401424343, + "loss": 2.8902, + "step": 16704 + }, + { + "epoch": 0.7033684210526315, + "grad_norm": 0.421875, + "learning_rate": 0.00010476512664280503, + "loss": 3.3431, + "step": 16705 + }, + { + "epoch": 0.7034105263157895, + "grad_norm": 0.4375, + "learning_rate": 0.0001047376619173753, + "loss": 3.2178, + "step": 16706 + }, + { + "epoch": 0.7034526315789473, + "grad_norm": 0.427734375, + "learning_rate": 0.00010471019983845487, + "loss": 2.7693, + "step": 16707 + }, + { + "epoch": 0.7034947368421053, + "grad_norm": 0.404296875, + "learning_rate": 0.00010468274040654369, + "loss": 3.2433, + "step": 16708 + }, + { + "epoch": 0.7035368421052631, + "grad_norm": 0.44921875, + "learning_rate": 0.00010465528362214243, + "loss": 3.004, + "step": 16709 + }, + { + "epoch": 0.7035789473684211, + "grad_norm": 0.4609375, + "learning_rate": 0.00010462782948575097, + "loss": 3.0806, + "step": 16710 + }, + { + "epoch": 0.7036210526315789, + "grad_norm": 0.451171875, + "learning_rate": 0.00010460037799786973, + "loss": 3.3326, + "step": 16711 + }, + { + "epoch": 0.7036631578947369, + "grad_norm": 0.435546875, + "learning_rate": 0.00010457292915899877, + "loss": 3.0864, + "step": 16712 + }, + { + "epoch": 0.7037052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.00010454548296963806, + "loss": 3.4395, + "step": 16713 + }, + { + "epoch": 0.7037473684210527, + "grad_norm": 0.4375, + "learning_rate": 0.00010451803943028795, + "loss": 3.0074, + "step": 16714 + }, + { + "epoch": 0.7037894736842105, + "grad_norm": 0.38671875, + "learning_rate": 0.00010449059854144805, + "loss": 2.716, + "step": 16715 + }, + { + "epoch": 0.7038315789473685, + "grad_norm": 0.447265625, + "learning_rate": 0.00010446316030361867, + "loss": 2.7013, + "step": 16716 + }, + { + "epoch": 0.7038736842105263, + "grad_norm": 0.443359375, + "learning_rate": 0.00010443572471729929, + "loss": 2.702, + "step": 16717 + }, + { + "epoch": 0.7039157894736842, + "grad_norm": 0.439453125, + "learning_rate": 0.00010440829178299008, + "loss": 3.0986, + "step": 16718 + }, + { + "epoch": 0.7039578947368421, + "grad_norm": 0.4296875, + "learning_rate": 0.00010438086150119073, + "loss": 3.1983, + "step": 16719 + }, + { + "epoch": 0.704, + "grad_norm": 0.44140625, + "learning_rate": 0.00010435343387240099, + "loss": 2.7745, + "step": 16720 + }, + { + "epoch": 0.7040421052631579, + "grad_norm": 0.4140625, + "learning_rate": 0.00010432600889712057, + "loss": 2.8786, + "step": 16721 + }, + { + "epoch": 0.7040842105263158, + "grad_norm": 0.453125, + "learning_rate": 0.000104298586575849, + "loss": 2.8894, + "step": 16722 + }, + { + "epoch": 0.7041263157894737, + "grad_norm": 0.4296875, + "learning_rate": 0.00010427116690908617, + "loss": 2.9818, + "step": 16723 + }, + { + "epoch": 0.7041684210526316, + "grad_norm": 0.419921875, + "learning_rate": 0.0001042437498973313, + "loss": 3.048, + "step": 16724 + }, + { + "epoch": 0.7042105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.0001042163355410842, + "loss": 3.4527, + "step": 16725 + }, + { + "epoch": 0.7042526315789474, + "grad_norm": 0.42578125, + "learning_rate": 0.00010418892384084402, + "loss": 3.247, + "step": 16726 + }, + { + "epoch": 0.7042947368421053, + "grad_norm": 0.431640625, + "learning_rate": 0.00010416151479711042, + "loss": 3.0934, + "step": 16727 + }, + { + "epoch": 0.7043368421052632, + "grad_norm": 0.41796875, + "learning_rate": 0.00010413410841038264, + "loss": 2.9772, + "step": 16728 + }, + { + "epoch": 0.7043789473684211, + "grad_norm": 0.41015625, + "learning_rate": 0.00010410670468115996, + "loss": 2.6052, + "step": 16729 + }, + { + "epoch": 0.704421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.00010407930360994187, + "loss": 2.9569, + "step": 16730 + }, + { + "epoch": 0.7044631578947368, + "grad_norm": 0.44921875, + "learning_rate": 0.0001040519051972272, + "loss": 3.1986, + "step": 16731 + }, + { + "epoch": 0.7045052631578947, + "grad_norm": 0.51953125, + "learning_rate": 0.0001040245094435155, + "loss": 2.6324, + "step": 16732 + }, + { + "epoch": 0.7045473684210526, + "grad_norm": 0.431640625, + "learning_rate": 0.00010399711634930567, + "loss": 2.819, + "step": 16733 + }, + { + "epoch": 0.7045894736842105, + "grad_norm": 0.4375, + "learning_rate": 0.00010396972591509685, + "loss": 3.4435, + "step": 16734 + }, + { + "epoch": 0.7046315789473684, + "grad_norm": 0.43359375, + "learning_rate": 0.00010394233814138803, + "loss": 3.2124, + "step": 16735 + }, + { + "epoch": 0.7046736842105263, + "grad_norm": 0.43359375, + "learning_rate": 0.0001039149530286782, + "loss": 3.3302, + "step": 16736 + }, + { + "epoch": 0.7047157894736842, + "grad_norm": 0.423828125, + "learning_rate": 0.00010388757057746631, + "loss": 3.1718, + "step": 16737 + }, + { + "epoch": 0.7047578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.0001038601907882511, + "loss": 3.1592, + "step": 16738 + }, + { + "epoch": 0.7048, + "grad_norm": 0.447265625, + "learning_rate": 0.0001038328136615316, + "loss": 3.2752, + "step": 16739 + }, + { + "epoch": 0.7048421052631579, + "grad_norm": 0.412109375, + "learning_rate": 0.0001038054391978065, + "loss": 2.9082, + "step": 16740 + }, + { + "epoch": 0.7048842105263158, + "grad_norm": 0.435546875, + "learning_rate": 0.00010377806739757454, + "loss": 3.2233, + "step": 16741 + }, + { + "epoch": 0.7049263157894737, + "grad_norm": 0.423828125, + "learning_rate": 0.00010375069826133435, + "loss": 3.0101, + "step": 16742 + }, + { + "epoch": 0.7049684210526316, + "grad_norm": 0.44140625, + "learning_rate": 0.00010372333178958463, + "loss": 3.1636, + "step": 16743 + }, + { + "epoch": 0.7050105263157894, + "grad_norm": 0.404296875, + "learning_rate": 0.00010369596798282393, + "loss": 2.7833, + "step": 16744 + }, + { + "epoch": 0.7050526315789474, + "grad_norm": 0.439453125, + "learning_rate": 0.00010366860684155072, + "loss": 3.2063, + "step": 16745 + }, + { + "epoch": 0.7050947368421052, + "grad_norm": 0.4296875, + "learning_rate": 0.00010364124836626362, + "loss": 3.6625, + "step": 16746 + }, + { + "epoch": 0.7051368421052632, + "grad_norm": 0.4609375, + "learning_rate": 0.00010361389255746103, + "loss": 3.1599, + "step": 16747 + }, + { + "epoch": 0.705178947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.00010358653941564131, + "loss": 2.9522, + "step": 16748 + }, + { + "epoch": 0.705221052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00010355918894130282, + "loss": 2.8161, + "step": 16749 + }, + { + "epoch": 0.7052631578947368, + "grad_norm": 0.46875, + "learning_rate": 0.00010353184113494385, + "loss": 3.1789, + "step": 16750 + }, + { + "epoch": 0.7053052631578948, + "grad_norm": 0.416015625, + "learning_rate": 0.00010350449599706263, + "loss": 3.0638, + "step": 16751 + }, + { + "epoch": 0.7053473684210526, + "grad_norm": 0.423828125, + "learning_rate": 0.00010347715352815738, + "loss": 2.812, + "step": 16752 + }, + { + "epoch": 0.7053894736842106, + "grad_norm": 0.462890625, + "learning_rate": 0.00010344981372872614, + "loss": 3.1208, + "step": 16753 + }, + { + "epoch": 0.7054315789473684, + "grad_norm": 0.423828125, + "learning_rate": 0.00010342247659926717, + "loss": 2.6972, + "step": 16754 + }, + { + "epoch": 0.7054736842105264, + "grad_norm": 0.4296875, + "learning_rate": 0.0001033951421402785, + "loss": 3.346, + "step": 16755 + }, + { + "epoch": 0.7055157894736842, + "grad_norm": 0.416015625, + "learning_rate": 0.00010336781035225806, + "loss": 3.0928, + "step": 16756 + }, + { + "epoch": 0.705557894736842, + "grad_norm": 0.419921875, + "learning_rate": 0.00010334048123570383, + "loss": 2.5824, + "step": 16757 + }, + { + "epoch": 0.7056, + "grad_norm": 0.39453125, + "learning_rate": 0.00010331315479111373, + "loss": 2.865, + "step": 16758 + }, + { + "epoch": 0.7056421052631578, + "grad_norm": 0.44921875, + "learning_rate": 0.00010328583101898558, + "loss": 3.2539, + "step": 16759 + }, + { + "epoch": 0.7056842105263158, + "grad_norm": 0.490234375, + "learning_rate": 0.00010325850991981714, + "loss": 3.1026, + "step": 16760 + }, + { + "epoch": 0.7057263157894736, + "grad_norm": 0.431640625, + "learning_rate": 0.0001032311914941064, + "loss": 3.4005, + "step": 16761 + }, + { + "epoch": 0.7057684210526316, + "grad_norm": 0.4609375, + "learning_rate": 0.00010320387574235071, + "loss": 3.1221, + "step": 16762 + }, + { + "epoch": 0.7058105263157894, + "grad_norm": 0.439453125, + "learning_rate": 0.00010317656266504805, + "loss": 3.0309, + "step": 16763 + }, + { + "epoch": 0.7058526315789474, + "grad_norm": 0.44921875, + "learning_rate": 0.00010314925226269589, + "loss": 3.3726, + "step": 16764 + }, + { + "epoch": 0.7058947368421052, + "grad_norm": 0.408203125, + "learning_rate": 0.0001031219445357918, + "loss": 3.3472, + "step": 16765 + }, + { + "epoch": 0.7059368421052632, + "grad_norm": 0.42578125, + "learning_rate": 0.00010309463948483333, + "loss": 2.7601, + "step": 16766 + }, + { + "epoch": 0.705978947368421, + "grad_norm": 0.423828125, + "learning_rate": 0.00010306733711031782, + "loss": 3.2505, + "step": 16767 + }, + { + "epoch": 0.706021052631579, + "grad_norm": 0.416015625, + "learning_rate": 0.00010304003741274295, + "loss": 3.1844, + "step": 16768 + }, + { + "epoch": 0.7060631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.00010301274039260578, + "loss": 3.8285, + "step": 16769 + }, + { + "epoch": 0.7061052631578948, + "grad_norm": 0.43359375, + "learning_rate": 0.00010298544605040385, + "loss": 2.7247, + "step": 16770 + }, + { + "epoch": 0.7061473684210526, + "grad_norm": 0.412109375, + "learning_rate": 0.00010295815438663436, + "loss": 2.8939, + "step": 16771 + }, + { + "epoch": 0.7061894736842105, + "grad_norm": 0.412109375, + "learning_rate": 0.00010293086540179453, + "loss": 2.368, + "step": 16772 + }, + { + "epoch": 0.7062315789473684, + "grad_norm": 0.46875, + "learning_rate": 0.00010290357909638154, + "loss": 3.0345, + "step": 16773 + }, + { + "epoch": 0.7062736842105263, + "grad_norm": 0.44140625, + "learning_rate": 0.00010287629547089239, + "loss": 3.2272, + "step": 16774 + }, + { + "epoch": 0.7063157894736842, + "grad_norm": 0.41796875, + "learning_rate": 0.00010284901452582449, + "loss": 2.9205, + "step": 16775 + }, + { + "epoch": 0.7063578947368421, + "grad_norm": 0.43359375, + "learning_rate": 0.00010282173626167445, + "loss": 2.8684, + "step": 16776 + }, + { + "epoch": 0.7064, + "grad_norm": 0.42578125, + "learning_rate": 0.00010279446067893964, + "loss": 3.116, + "step": 16777 + }, + { + "epoch": 0.7064421052631579, + "grad_norm": 0.447265625, + "learning_rate": 0.00010276718777811661, + "loss": 2.9388, + "step": 16778 + }, + { + "epoch": 0.7064842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.0001027399175597025, + "loss": 3.6932, + "step": 16779 + }, + { + "epoch": 0.7065263157894737, + "grad_norm": 0.45703125, + "learning_rate": 0.00010271265002419409, + "loss": 3.127, + "step": 16780 + }, + { + "epoch": 0.7065684210526316, + "grad_norm": 0.431640625, + "learning_rate": 0.00010268538517208806, + "loss": 3.1645, + "step": 16781 + }, + { + "epoch": 0.7066105263157895, + "grad_norm": 0.431640625, + "learning_rate": 0.00010265812300388139, + "loss": 2.7272, + "step": 16782 + }, + { + "epoch": 0.7066526315789474, + "grad_norm": 0.421875, + "learning_rate": 0.00010263086352007042, + "loss": 3.2609, + "step": 16783 + }, + { + "epoch": 0.7066947368421053, + "grad_norm": 0.451171875, + "learning_rate": 0.00010260360672115213, + "loss": 3.1046, + "step": 16784 + }, + { + "epoch": 0.7067368421052631, + "grad_norm": 0.458984375, + "learning_rate": 0.0001025763526076228, + "loss": 2.7069, + "step": 16785 + }, + { + "epoch": 0.7067789473684211, + "grad_norm": 0.421875, + "learning_rate": 0.00010254910117997915, + "loss": 2.3588, + "step": 16786 + }, + { + "epoch": 0.7068210526315789, + "grad_norm": 0.431640625, + "learning_rate": 0.00010252185243871767, + "loss": 2.6971, + "step": 16787 + }, + { + "epoch": 0.7068631578947369, + "grad_norm": 0.427734375, + "learning_rate": 0.00010249460638433467, + "loss": 3.2409, + "step": 16788 + }, + { + "epoch": 0.7069052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.00010246736301732679, + "loss": 2.9812, + "step": 16789 + }, + { + "epoch": 0.7069473684210527, + "grad_norm": 0.4140625, + "learning_rate": 0.00010244012233819002, + "loss": 2.8533, + "step": 16790 + }, + { + "epoch": 0.7069894736842105, + "grad_norm": 0.44921875, + "learning_rate": 0.00010241288434742105, + "loss": 3.3971, + "step": 16791 + }, + { + "epoch": 0.7070315789473685, + "grad_norm": 0.451171875, + "learning_rate": 0.00010238564904551573, + "loss": 3.5716, + "step": 16792 + }, + { + "epoch": 0.7070736842105263, + "grad_norm": 0.421875, + "learning_rate": 0.00010235841643297053, + "loss": 3.0048, + "step": 16793 + }, + { + "epoch": 0.7071157894736843, + "grad_norm": 0.412109375, + "learning_rate": 0.00010233118651028154, + "loss": 3.353, + "step": 16794 + }, + { + "epoch": 0.7071578947368421, + "grad_norm": 0.439453125, + "learning_rate": 0.00010230395927794478, + "loss": 3.1493, + "step": 16795 + }, + { + "epoch": 0.7072, + "grad_norm": 0.453125, + "learning_rate": 0.00010227673473645638, + "loss": 3.3531, + "step": 16796 + }, + { + "epoch": 0.7072421052631579, + "grad_norm": 0.484375, + "learning_rate": 0.00010224951288631221, + "loss": 3.0967, + "step": 16797 + }, + { + "epoch": 0.7072842105263157, + "grad_norm": 0.40625, + "learning_rate": 0.00010222229372800848, + "loss": 2.6947, + "step": 16798 + }, + { + "epoch": 0.7073263157894737, + "grad_norm": 0.416015625, + "learning_rate": 0.00010219507726204078, + "loss": 3.2281, + "step": 16799 + }, + { + "epoch": 0.7073684210526315, + "grad_norm": 0.447265625, + "learning_rate": 0.00010216786348890516, + "loss": 3.3852, + "step": 16800 + }, + { + "epoch": 0.7074105263157895, + "grad_norm": 0.443359375, + "learning_rate": 0.00010214065240909739, + "loss": 3.1668, + "step": 16801 + }, + { + "epoch": 0.7074526315789473, + "grad_norm": 0.43359375, + "learning_rate": 0.0001021134440231132, + "loss": 2.7059, + "step": 16802 + }, + { + "epoch": 0.7074947368421053, + "grad_norm": 0.42578125, + "learning_rate": 0.00010208623833144831, + "loss": 3.3877, + "step": 16803 + }, + { + "epoch": 0.7075368421052631, + "grad_norm": 0.423828125, + "learning_rate": 0.00010205903533459837, + "loss": 2.8692, + "step": 16804 + }, + { + "epoch": 0.7075789473684211, + "grad_norm": 0.42578125, + "learning_rate": 0.00010203183503305896, + "loss": 2.8384, + "step": 16805 + }, + { + "epoch": 0.7076210526315789, + "grad_norm": 0.431640625, + "learning_rate": 0.0001020046374273256, + "loss": 3.067, + "step": 16806 + }, + { + "epoch": 0.7076631578947369, + "grad_norm": 0.421875, + "learning_rate": 0.00010197744251789393, + "loss": 3.3237, + "step": 16807 + }, + { + "epoch": 0.7077052631578947, + "grad_norm": 0.421875, + "learning_rate": 0.00010195025030525935, + "loss": 3.2135, + "step": 16808 + }, + { + "epoch": 0.7077473684210527, + "grad_norm": 0.412109375, + "learning_rate": 0.00010192306078991723, + "loss": 2.6191, + "step": 16809 + }, + { + "epoch": 0.7077894736842105, + "grad_norm": 0.40234375, + "learning_rate": 0.000101895873972363, + "loss": 2.8221, + "step": 16810 + }, + { + "epoch": 0.7078315789473684, + "grad_norm": 0.451171875, + "learning_rate": 0.0001018686898530919, + "loss": 3.3585, + "step": 16811 + }, + { + "epoch": 0.7078736842105263, + "grad_norm": 0.431640625, + "learning_rate": 0.00010184150843259923, + "loss": 2.7731, + "step": 16812 + }, + { + "epoch": 0.7079157894736842, + "grad_norm": 0.42578125, + "learning_rate": 0.0001018143297113801, + "loss": 2.7212, + "step": 16813 + }, + { + "epoch": 0.7079578947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.00010178715368992989, + "loss": 2.8268, + "step": 16814 + }, + { + "epoch": 0.708, + "grad_norm": 0.421875, + "learning_rate": 0.00010175998036874357, + "loss": 3.4696, + "step": 16815 + }, + { + "epoch": 0.7080421052631579, + "grad_norm": 0.421875, + "learning_rate": 0.00010173280974831625, + "loss": 2.6628, + "step": 16816 + }, + { + "epoch": 0.7080842105263158, + "grad_norm": 0.4375, + "learning_rate": 0.00010170564182914291, + "loss": 3.7094, + "step": 16817 + }, + { + "epoch": 0.7081263157894737, + "grad_norm": 0.443359375, + "learning_rate": 0.00010167847661171855, + "loss": 3.027, + "step": 16818 + }, + { + "epoch": 0.7081684210526316, + "grad_norm": 0.451171875, + "learning_rate": 0.00010165131409653808, + "loss": 2.8369, + "step": 16819 + }, + { + "epoch": 0.7082105263157895, + "grad_norm": 0.41796875, + "learning_rate": 0.00010162415428409635, + "loss": 3.4182, + "step": 16820 + }, + { + "epoch": 0.7082526315789474, + "grad_norm": 0.41796875, + "learning_rate": 0.00010159699717488813, + "loss": 2.8661, + "step": 16821 + }, + { + "epoch": 0.7082947368421053, + "grad_norm": 0.416015625, + "learning_rate": 0.00010156984276940831, + "loss": 2.8583, + "step": 16822 + }, + { + "epoch": 0.7083368421052632, + "grad_norm": 0.427734375, + "learning_rate": 0.0001015426910681516, + "loss": 3.0961, + "step": 16823 + }, + { + "epoch": 0.7083789473684211, + "grad_norm": 0.431640625, + "learning_rate": 0.00010151554207161259, + "loss": 3.1078, + "step": 16824 + }, + { + "epoch": 0.708421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00010148839578028596, + "loss": 3.5337, + "step": 16825 + }, + { + "epoch": 0.7084631578947368, + "grad_norm": 0.4296875, + "learning_rate": 0.00010146125219466626, + "loss": 2.8675, + "step": 16826 + }, + { + "epoch": 0.7085052631578947, + "grad_norm": 0.3984375, + "learning_rate": 0.00010143411131524801, + "loss": 3.1385, + "step": 16827 + }, + { + "epoch": 0.7085473684210526, + "grad_norm": 0.40625, + "learning_rate": 0.00010140697314252561, + "loss": 3.0397, + "step": 16828 + }, + { + "epoch": 0.7085894736842105, + "grad_norm": 0.423828125, + "learning_rate": 0.00010137983767699365, + "loss": 2.4184, + "step": 16829 + }, + { + "epoch": 0.7086315789473684, + "grad_norm": 0.42578125, + "learning_rate": 0.00010135270491914644, + "loss": 3.1687, + "step": 16830 + }, + { + "epoch": 0.7086736842105263, + "grad_norm": 0.408203125, + "learning_rate": 0.0001013255748694783, + "loss": 3.2474, + "step": 16831 + }, + { + "epoch": 0.7087157894736842, + "grad_norm": 0.427734375, + "learning_rate": 0.0001012984475284835, + "loss": 3.0533, + "step": 16832 + }, + { + "epoch": 0.7087578947368421, + "grad_norm": 0.4375, + "learning_rate": 0.00010127132289665616, + "loss": 3.0474, + "step": 16833 + }, + { + "epoch": 0.7088, + "grad_norm": 0.412109375, + "learning_rate": 0.00010124420097449077, + "loss": 2.8751, + "step": 16834 + }, + { + "epoch": 0.7088421052631579, + "grad_norm": 0.443359375, + "learning_rate": 0.00010121708176248107, + "loss": 3.0923, + "step": 16835 + }, + { + "epoch": 0.7088842105263158, + "grad_norm": 0.4296875, + "learning_rate": 0.0001011899652611215, + "loss": 2.9358, + "step": 16836 + }, + { + "epoch": 0.7089263157894737, + "grad_norm": 0.451171875, + "learning_rate": 0.00010116285147090574, + "loss": 3.2144, + "step": 16837 + }, + { + "epoch": 0.7089684210526316, + "grad_norm": 0.455078125, + "learning_rate": 0.00010113574039232804, + "loss": 3.0642, + "step": 16838 + }, + { + "epoch": 0.7090105263157894, + "grad_norm": 0.43359375, + "learning_rate": 0.00010110863202588224, + "loss": 2.6987, + "step": 16839 + }, + { + "epoch": 0.7090526315789474, + "grad_norm": 0.43359375, + "learning_rate": 0.00010108152637206214, + "loss": 3.0445, + "step": 16840 + }, + { + "epoch": 0.7090947368421052, + "grad_norm": 0.419921875, + "learning_rate": 0.00010105442343136184, + "loss": 2.546, + "step": 16841 + }, + { + "epoch": 0.7091368421052632, + "grad_norm": 0.416015625, + "learning_rate": 0.00010102732320427477, + "loss": 2.9341, + "step": 16842 + }, + { + "epoch": 0.709178947368421, + "grad_norm": 0.42578125, + "learning_rate": 0.000101000225691295, + "loss": 3.1991, + "step": 16843 + }, + { + "epoch": 0.709221052631579, + "grad_norm": 0.453125, + "learning_rate": 0.00010097313089291587, + "loss": 3.4971, + "step": 16844 + }, + { + "epoch": 0.7092631578947368, + "grad_norm": 0.43359375, + "learning_rate": 0.0001009460388096313, + "loss": 2.8716, + "step": 16845 + }, + { + "epoch": 0.7093052631578948, + "grad_norm": 0.4296875, + "learning_rate": 0.00010091894944193475, + "loss": 3.3469, + "step": 16846 + }, + { + "epoch": 0.7093473684210526, + "grad_norm": 0.439453125, + "learning_rate": 0.00010089186279031979, + "loss": 2.8271, + "step": 16847 + }, + { + "epoch": 0.7093894736842106, + "grad_norm": 0.44921875, + "learning_rate": 0.00010086477885527989, + "loss": 3.139, + "step": 16848 + }, + { + "epoch": 0.7094315789473684, + "grad_norm": 0.4296875, + "learning_rate": 0.0001008376976373084, + "loss": 3.1309, + "step": 16849 + }, + { + "epoch": 0.7094736842105264, + "grad_norm": 0.482421875, + "learning_rate": 0.00010081061913689898, + "loss": 3.1511, + "step": 16850 + }, + { + "epoch": 0.7095157894736842, + "grad_norm": 0.421875, + "learning_rate": 0.00010078354335454463, + "loss": 3.1276, + "step": 16851 + }, + { + "epoch": 0.709557894736842, + "grad_norm": 0.431640625, + "learning_rate": 0.00010075647029073895, + "loss": 3.2364, + "step": 16852 + }, + { + "epoch": 0.7096, + "grad_norm": 0.423828125, + "learning_rate": 0.00010072939994597487, + "loss": 3.1303, + "step": 16853 + }, + { + "epoch": 0.7096421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00010070233232074583, + "loss": 3.3399, + "step": 16854 + }, + { + "epoch": 0.7096842105263158, + "grad_norm": 0.4453125, + "learning_rate": 0.00010067526741554486, + "loss": 3.0039, + "step": 16855 + }, + { + "epoch": 0.7097263157894736, + "grad_norm": 0.416015625, + "learning_rate": 0.00010064820523086499, + "loss": 3.0008, + "step": 16856 + }, + { + "epoch": 0.7097684210526316, + "grad_norm": 0.4296875, + "learning_rate": 0.0001006211457671995, + "loss": 3.0404, + "step": 16857 + }, + { + "epoch": 0.7098105263157894, + "grad_norm": 0.439453125, + "learning_rate": 0.00010059408902504105, + "loss": 3.3679, + "step": 16858 + }, + { + "epoch": 0.7098526315789474, + "grad_norm": 0.443359375, + "learning_rate": 0.00010056703500488296, + "loss": 3.4215, + "step": 16859 + }, + { + "epoch": 0.7098947368421052, + "grad_norm": 0.462890625, + "learning_rate": 0.0001005399837072177, + "loss": 3.1663, + "step": 16860 + }, + { + "epoch": 0.7099368421052632, + "grad_norm": 0.53515625, + "learning_rate": 0.00010051293513253842, + "loss": 3.4876, + "step": 16861 + }, + { + "epoch": 0.709978947368421, + "grad_norm": 0.443359375, + "learning_rate": 0.00010048588928133784, + "loss": 3.0212, + "step": 16862 + }, + { + "epoch": 0.710021052631579, + "grad_norm": 0.41015625, + "learning_rate": 0.00010045884615410869, + "loss": 3.3315, + "step": 16863 + }, + { + "epoch": 0.7100631578947368, + "grad_norm": 0.4375, + "learning_rate": 0.00010043180575134365, + "loss": 3.072, + "step": 16864 + }, + { + "epoch": 0.7101052631578947, + "grad_norm": 0.484375, + "learning_rate": 0.0001004047680735353, + "loss": 2.8151, + "step": 16865 + }, + { + "epoch": 0.7101473684210526, + "grad_norm": 0.40625, + "learning_rate": 0.00010037773312117646, + "loss": 2.5591, + "step": 16866 + }, + { + "epoch": 0.7101894736842105, + "grad_norm": 0.447265625, + "learning_rate": 0.00010035070089475935, + "loss": 3.6372, + "step": 16867 + }, + { + "epoch": 0.7102315789473684, + "grad_norm": 0.443359375, + "learning_rate": 0.00010032367139477674, + "loss": 3.0567, + "step": 16868 + }, + { + "epoch": 0.7102736842105263, + "grad_norm": 0.45703125, + "learning_rate": 0.00010029664462172097, + "loss": 2.8877, + "step": 16869 + }, + { + "epoch": 0.7103157894736842, + "grad_norm": 0.39453125, + "learning_rate": 0.00010026962057608444, + "loss": 2.9324, + "step": 16870 + }, + { + "epoch": 0.7103578947368421, + "grad_norm": 0.416015625, + "learning_rate": 0.0001002425992583595, + "loss": 3.0796, + "step": 16871 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4453125, + "learning_rate": 0.00010021558066903835, + "loss": 2.9856, + "step": 16872 + }, + { + "epoch": 0.7104421052631579, + "grad_norm": 0.42578125, + "learning_rate": 0.00010018856480861349, + "loss": 3.1319, + "step": 16873 + }, + { + "epoch": 0.7104842105263158, + "grad_norm": 0.427734375, + "learning_rate": 0.00010016155167757682, + "loss": 3.0465, + "step": 16874 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.46484375, + "learning_rate": 0.00010013454127642068, + "loss": 3.1929, + "step": 16875 + }, + { + "epoch": 0.7105684210526316, + "grad_norm": 0.423828125, + "learning_rate": 0.00010010753360563713, + "loss": 3.2114, + "step": 16876 + }, + { + "epoch": 0.7106105263157895, + "grad_norm": 0.412109375, + "learning_rate": 0.00010008052866571821, + "loss": 3.0843, + "step": 16877 + }, + { + "epoch": 0.7106526315789473, + "grad_norm": 0.4296875, + "learning_rate": 0.0001000535264571559, + "loss": 2.9095, + "step": 16878 + }, + { + "epoch": 0.7106947368421053, + "grad_norm": 0.423828125, + "learning_rate": 0.00010002652698044215, + "loss": 3.1875, + "step": 16879 + }, + { + "epoch": 0.7107368421052631, + "grad_norm": 0.4453125, + "learning_rate": 9.999953023606879e-05, + "loss": 3.4109, + "step": 16880 + }, + { + "epoch": 0.7107789473684211, + "grad_norm": 0.4140625, + "learning_rate": 9.997253622452782e-05, + "loss": 3.0866, + "step": 16881 + }, + { + "epoch": 0.7108210526315789, + "grad_norm": 0.412109375, + "learning_rate": 9.994554494631097e-05, + "loss": 3.0554, + "step": 16882 + }, + { + "epoch": 0.7108631578947369, + "grad_norm": 0.412109375, + "learning_rate": 9.991855640191e-05, + "loss": 3.4006, + "step": 16883 + }, + { + "epoch": 0.7109052631578947, + "grad_norm": 0.494140625, + "learning_rate": 9.989157059181659e-05, + "loss": 3.1484, + "step": 16884 + }, + { + "epoch": 0.7109473684210527, + "grad_norm": 0.42578125, + "learning_rate": 9.986458751652239e-05, + "loss": 2.9575, + "step": 16885 + }, + { + "epoch": 0.7109894736842105, + "grad_norm": 0.43359375, + "learning_rate": 9.9837607176519e-05, + "loss": 2.8618, + "step": 16886 + }, + { + "epoch": 0.7110315789473685, + "grad_norm": 0.4375, + "learning_rate": 9.98106295722979e-05, + "loss": 3.0809, + "step": 16887 + }, + { + "epoch": 0.7110736842105263, + "grad_norm": 0.482421875, + "learning_rate": 9.978365470435071e-05, + "loss": 3.3581, + "step": 16888 + }, + { + "epoch": 0.7111157894736843, + "grad_norm": 0.453125, + "learning_rate": 9.975668257316886e-05, + "loss": 2.9859, + "step": 16889 + }, + { + "epoch": 0.7111578947368421, + "grad_norm": 0.443359375, + "learning_rate": 9.972971317924374e-05, + "loss": 2.9541, + "step": 16890 + }, + { + "epoch": 0.7112, + "grad_norm": 0.421875, + "learning_rate": 9.970274652306666e-05, + "loss": 2.8482, + "step": 16891 + }, + { + "epoch": 0.7112421052631579, + "grad_norm": 0.4453125, + "learning_rate": 9.967578260512897e-05, + "loss": 3.0412, + "step": 16892 + }, + { + "epoch": 0.7112842105263157, + "grad_norm": 0.439453125, + "learning_rate": 9.964882142592188e-05, + "loss": 3.1506, + "step": 16893 + }, + { + "epoch": 0.7113263157894737, + "grad_norm": 0.43359375, + "learning_rate": 9.962186298593654e-05, + "loss": 3.5694, + "step": 16894 + }, + { + "epoch": 0.7113684210526315, + "grad_norm": 0.4375, + "learning_rate": 9.959490728566433e-05, + "loss": 3.4042, + "step": 16895 + }, + { + "epoch": 0.7114105263157895, + "grad_norm": 0.404296875, + "learning_rate": 9.956795432559601e-05, + "loss": 3.2916, + "step": 16896 + }, + { + "epoch": 0.7114526315789473, + "grad_norm": 0.4140625, + "learning_rate": 9.95410041062229e-05, + "loss": 3.0627, + "step": 16897 + }, + { + "epoch": 0.7114947368421053, + "grad_norm": 0.4296875, + "learning_rate": 9.951405662803592e-05, + "loss": 3.3171, + "step": 16898 + }, + { + "epoch": 0.7115368421052631, + "grad_norm": 0.443359375, + "learning_rate": 9.948711189152601e-05, + "loss": 3.1674, + "step": 16899 + }, + { + "epoch": 0.7115789473684211, + "grad_norm": 0.41796875, + "learning_rate": 9.946016989718409e-05, + "loss": 2.9179, + "step": 16900 + }, + { + "epoch": 0.7116210526315789, + "grad_norm": 0.44921875, + "learning_rate": 9.943323064550089e-05, + "loss": 2.9274, + "step": 16901 + }, + { + "epoch": 0.7116631578947369, + "grad_norm": 0.4921875, + "learning_rate": 9.940629413696753e-05, + "loss": 3.222, + "step": 16902 + }, + { + "epoch": 0.7117052631578947, + "grad_norm": 0.423828125, + "learning_rate": 9.937936037207434e-05, + "loss": 3.0021, + "step": 16903 + }, + { + "epoch": 0.7117473684210527, + "grad_norm": 0.451171875, + "learning_rate": 9.935242935131234e-05, + "loss": 3.306, + "step": 16904 + }, + { + "epoch": 0.7117894736842105, + "grad_norm": 0.4375, + "learning_rate": 9.932550107517205e-05, + "loss": 3.1326, + "step": 16905 + }, + { + "epoch": 0.7118315789473684, + "grad_norm": 0.419921875, + "learning_rate": 9.929857554414409e-05, + "loss": 3.1925, + "step": 16906 + }, + { + "epoch": 0.7118736842105263, + "grad_norm": 0.431640625, + "learning_rate": 9.927165275871904e-05, + "loss": 2.9996, + "step": 16907 + }, + { + "epoch": 0.7119157894736842, + "grad_norm": 0.412109375, + "learning_rate": 9.924473271938731e-05, + "loss": 3.0013, + "step": 16908 + }, + { + "epoch": 0.7119578947368421, + "grad_norm": 0.4296875, + "learning_rate": 9.921781542663955e-05, + "loss": 3.5803, + "step": 16909 + }, + { + "epoch": 0.712, + "grad_norm": 0.419921875, + "learning_rate": 9.919090088096589e-05, + "loss": 2.9812, + "step": 16910 + }, + { + "epoch": 0.7120421052631579, + "grad_norm": 0.61328125, + "learning_rate": 9.916398908285698e-05, + "loss": 3.3846, + "step": 16911 + }, + { + "epoch": 0.7120842105263158, + "grad_norm": 0.4375, + "learning_rate": 9.913708003280278e-05, + "loss": 3.0663, + "step": 16912 + }, + { + "epoch": 0.7121263157894737, + "grad_norm": 0.42578125, + "learning_rate": 9.911017373129383e-05, + "loss": 2.9371, + "step": 16913 + }, + { + "epoch": 0.7121684210526316, + "grad_norm": 0.42578125, + "learning_rate": 9.908327017882021e-05, + "loss": 2.9263, + "step": 16914 + }, + { + "epoch": 0.7122105263157895, + "grad_norm": 0.4296875, + "learning_rate": 9.905636937587203e-05, + "loss": 2.782, + "step": 16915 + }, + { + "epoch": 0.7122526315789474, + "grad_norm": 0.42578125, + "learning_rate": 9.90294713229396e-05, + "loss": 3.0034, + "step": 16916 + }, + { + "epoch": 0.7122947368421053, + "grad_norm": 0.42578125, + "learning_rate": 9.900257602051266e-05, + "loss": 3.2003, + "step": 16917 + }, + { + "epoch": 0.7123368421052632, + "grad_norm": 0.400390625, + "learning_rate": 9.897568346908154e-05, + "loss": 2.9292, + "step": 16918 + }, + { + "epoch": 0.712378947368421, + "grad_norm": 0.474609375, + "learning_rate": 9.894879366913581e-05, + "loss": 2.8176, + "step": 16919 + }, + { + "epoch": 0.712421052631579, + "grad_norm": 0.408203125, + "learning_rate": 9.892190662116571e-05, + "loss": 2.5378, + "step": 16920 + }, + { + "epoch": 0.7124631578947368, + "grad_norm": 0.45703125, + "learning_rate": 9.889502232566094e-05, + "loss": 2.9101, + "step": 16921 + }, + { + "epoch": 0.7125052631578948, + "grad_norm": 0.42578125, + "learning_rate": 9.886814078311135e-05, + "loss": 3.0179, + "step": 16922 + }, + { + "epoch": 0.7125473684210526, + "grad_norm": 0.421875, + "learning_rate": 9.884126199400665e-05, + "loss": 2.9971, + "step": 16923 + }, + { + "epoch": 0.7125894736842106, + "grad_norm": 0.404296875, + "learning_rate": 9.881438595883646e-05, + "loss": 2.6038, + "step": 16924 + }, + { + "epoch": 0.7126315789473684, + "grad_norm": 0.4140625, + "learning_rate": 9.87875126780907e-05, + "loss": 2.7271, + "step": 16925 + }, + { + "epoch": 0.7126736842105263, + "grad_norm": 0.447265625, + "learning_rate": 9.876064215225861e-05, + "loss": 2.7675, + "step": 16926 + }, + { + "epoch": 0.7127157894736842, + "grad_norm": 0.4375, + "learning_rate": 9.873377438183001e-05, + "loss": 3.4314, + "step": 16927 + }, + { + "epoch": 0.7127578947368421, + "grad_norm": 0.40625, + "learning_rate": 9.870690936729432e-05, + "loss": 3.6537, + "step": 16928 + }, + { + "epoch": 0.7128, + "grad_norm": 0.421875, + "learning_rate": 9.868004710914097e-05, + "loss": 3.0912, + "step": 16929 + }, + { + "epoch": 0.7128421052631579, + "grad_norm": 0.4296875, + "learning_rate": 9.865318760785937e-05, + "loss": 3.1416, + "step": 16930 + }, + { + "epoch": 0.7128842105263158, + "grad_norm": 0.4140625, + "learning_rate": 9.862633086393877e-05, + "loss": 3.2296, + "step": 16931 + }, + { + "epoch": 0.7129263157894736, + "grad_norm": 0.439453125, + "learning_rate": 9.859947687786877e-05, + "loss": 3.1374, + "step": 16932 + }, + { + "epoch": 0.7129684210526316, + "grad_norm": 0.4453125, + "learning_rate": 9.857262565013819e-05, + "loss": 3.3868, + "step": 16933 + }, + { + "epoch": 0.7130105263157894, + "grad_norm": 0.4296875, + "learning_rate": 9.854577718123659e-05, + "loss": 3.0208, + "step": 16934 + }, + { + "epoch": 0.7130526315789474, + "grad_norm": 0.4375, + "learning_rate": 9.851893147165295e-05, + "loss": 3.3971, + "step": 16935 + }, + { + "epoch": 0.7130947368421052, + "grad_norm": 0.4140625, + "learning_rate": 9.849208852187638e-05, + "loss": 3.0257, + "step": 16936 + }, + { + "epoch": 0.7131368421052632, + "grad_norm": 0.421875, + "learning_rate": 9.846524833239597e-05, + "loss": 3.1356, + "step": 16937 + }, + { + "epoch": 0.713178947368421, + "grad_norm": 0.41796875, + "learning_rate": 9.843841090370071e-05, + "loss": 2.7626, + "step": 16938 + }, + { + "epoch": 0.713221052631579, + "grad_norm": 0.400390625, + "learning_rate": 9.841157623627947e-05, + "loss": 2.4258, + "step": 16939 + }, + { + "epoch": 0.7132631578947368, + "grad_norm": 0.435546875, + "learning_rate": 9.838474433062114e-05, + "loss": 3.1583, + "step": 16940 + }, + { + "epoch": 0.7133052631578948, + "grad_norm": 0.4375, + "learning_rate": 9.835791518721474e-05, + "loss": 2.873, + "step": 16941 + }, + { + "epoch": 0.7133473684210526, + "grad_norm": 0.455078125, + "learning_rate": 9.833108880654892e-05, + "loss": 3.0108, + "step": 16942 + }, + { + "epoch": 0.7133894736842106, + "grad_norm": 0.462890625, + "learning_rate": 9.830426518911248e-05, + "loss": 2.9234, + "step": 16943 + }, + { + "epoch": 0.7134315789473684, + "grad_norm": 0.466796875, + "learning_rate": 9.827744433539406e-05, + "loss": 2.8782, + "step": 16944 + }, + { + "epoch": 0.7134736842105264, + "grad_norm": 0.419921875, + "learning_rate": 9.825062624588238e-05, + "loss": 3.3175, + "step": 16945 + }, + { + "epoch": 0.7135157894736842, + "grad_norm": 0.439453125, + "learning_rate": 9.822381092106597e-05, + "loss": 3.2667, + "step": 16946 + }, + { + "epoch": 0.7135578947368421, + "grad_norm": 0.421875, + "learning_rate": 9.819699836143331e-05, + "loss": 2.9353, + "step": 16947 + }, + { + "epoch": 0.7136, + "grad_norm": 0.43359375, + "learning_rate": 9.817018856747307e-05, + "loss": 2.937, + "step": 16948 + }, + { + "epoch": 0.7136421052631579, + "grad_norm": 0.43359375, + "learning_rate": 9.814338153967359e-05, + "loss": 3.0775, + "step": 16949 + }, + { + "epoch": 0.7136842105263158, + "grad_norm": 0.458984375, + "learning_rate": 9.811657727852328e-05, + "loss": 3.2195, + "step": 16950 + }, + { + "epoch": 0.7137263157894737, + "grad_norm": 0.4453125, + "learning_rate": 9.808977578451047e-05, + "loss": 3.1562, + "step": 16951 + }, + { + "epoch": 0.7137684210526316, + "grad_norm": 0.439453125, + "learning_rate": 9.806297705812348e-05, + "loss": 2.9758, + "step": 16952 + }, + { + "epoch": 0.7138105263157895, + "grad_norm": 0.48046875, + "learning_rate": 9.80361810998505e-05, + "loss": 2.7876, + "step": 16953 + }, + { + "epoch": 0.7138526315789474, + "grad_norm": 0.4140625, + "learning_rate": 9.800938791017977e-05, + "loss": 3.2477, + "step": 16954 + }, + { + "epoch": 0.7138947368421052, + "grad_norm": 0.458984375, + "learning_rate": 9.798259748959931e-05, + "loss": 2.933, + "step": 16955 + }, + { + "epoch": 0.7139368421052632, + "grad_norm": 0.419921875, + "learning_rate": 9.795580983859742e-05, + "loss": 3.3745, + "step": 16956 + }, + { + "epoch": 0.713978947368421, + "grad_norm": 0.43359375, + "learning_rate": 9.792902495766201e-05, + "loss": 3.1138, + "step": 16957 + }, + { + "epoch": 0.714021052631579, + "grad_norm": 0.4140625, + "learning_rate": 9.790224284728107e-05, + "loss": 2.6987, + "step": 16958 + }, + { + "epoch": 0.7140631578947368, + "grad_norm": 0.46484375, + "learning_rate": 9.78754635079426e-05, + "loss": 3.5947, + "step": 16959 + }, + { + "epoch": 0.7141052631578947, + "grad_norm": 0.400390625, + "learning_rate": 9.784868694013443e-05, + "loss": 2.6584, + "step": 16960 + }, + { + "epoch": 0.7141473684210526, + "grad_norm": 0.427734375, + "learning_rate": 9.782191314434441e-05, + "loss": 2.9984, + "step": 16961 + }, + { + "epoch": 0.7141894736842105, + "grad_norm": 0.443359375, + "learning_rate": 9.779514212106025e-05, + "loss": 3.096, + "step": 16962 + }, + { + "epoch": 0.7142315789473684, + "grad_norm": 0.421875, + "learning_rate": 9.776837387076986e-05, + "loss": 3.4277, + "step": 16963 + }, + { + "epoch": 0.7142736842105263, + "grad_norm": 0.4296875, + "learning_rate": 9.774160839396082e-05, + "loss": 2.7251, + "step": 16964 + }, + { + "epoch": 0.7143157894736842, + "grad_norm": 0.41796875, + "learning_rate": 9.771484569112079e-05, + "loss": 3.1881, + "step": 16965 + }, + { + "epoch": 0.7143578947368421, + "grad_norm": 0.4140625, + "learning_rate": 9.768808576273733e-05, + "loss": 3.2741, + "step": 16966 + }, + { + "epoch": 0.7144, + "grad_norm": 0.419921875, + "learning_rate": 9.766132860929799e-05, + "loss": 3.158, + "step": 16967 + }, + { + "epoch": 0.7144421052631579, + "grad_norm": 0.462890625, + "learning_rate": 9.763457423129027e-05, + "loss": 2.6538, + "step": 16968 + }, + { + "epoch": 0.7144842105263158, + "grad_norm": 0.4375, + "learning_rate": 9.760782262920148e-05, + "loss": 3.0077, + "step": 16969 + }, + { + "epoch": 0.7145263157894737, + "grad_norm": 0.4296875, + "learning_rate": 9.75810738035193e-05, + "loss": 3.1471, + "step": 16970 + }, + { + "epoch": 0.7145684210526316, + "grad_norm": 0.458984375, + "learning_rate": 9.755432775473067e-05, + "loss": 3.1062, + "step": 16971 + }, + { + "epoch": 0.7146105263157895, + "grad_norm": 0.408203125, + "learning_rate": 9.752758448332317e-05, + "loss": 2.8726, + "step": 16972 + }, + { + "epoch": 0.7146526315789473, + "grad_norm": 0.42578125, + "learning_rate": 9.750084398978393e-05, + "loss": 3.1971, + "step": 16973 + }, + { + "epoch": 0.7146947368421053, + "grad_norm": 0.470703125, + "learning_rate": 9.747410627460016e-05, + "loss": 3.1966, + "step": 16974 + }, + { + "epoch": 0.7147368421052631, + "grad_norm": 0.423828125, + "learning_rate": 9.744737133825893e-05, + "loss": 2.4889, + "step": 16975 + }, + { + "epoch": 0.7147789473684211, + "grad_norm": 0.42578125, + "learning_rate": 9.742063918124728e-05, + "loss": 3.0717, + "step": 16976 + }, + { + "epoch": 0.7148210526315789, + "grad_norm": 0.4296875, + "learning_rate": 9.73939098040525e-05, + "loss": 3.141, + "step": 16977 + }, + { + "epoch": 0.7148631578947369, + "grad_norm": 0.451171875, + "learning_rate": 9.736718320716117e-05, + "loss": 3.0969, + "step": 16978 + }, + { + "epoch": 0.7149052631578947, + "grad_norm": 0.4453125, + "learning_rate": 9.734045939106054e-05, + "loss": 2.9496, + "step": 16979 + }, + { + "epoch": 0.7149473684210527, + "grad_norm": 0.45703125, + "learning_rate": 9.731373835623735e-05, + "loss": 3.3816, + "step": 16980 + }, + { + "epoch": 0.7149894736842105, + "grad_norm": 0.41015625, + "learning_rate": 9.728702010317847e-05, + "loss": 3.1532, + "step": 16981 + }, + { + "epoch": 0.7150315789473685, + "grad_norm": 0.58203125, + "learning_rate": 9.726030463237065e-05, + "loss": 3.2694, + "step": 16982 + }, + { + "epoch": 0.7150736842105263, + "grad_norm": 0.41796875, + "learning_rate": 9.723359194430051e-05, + "loss": 3.2479, + "step": 16983 + }, + { + "epoch": 0.7151157894736843, + "grad_norm": 0.41796875, + "learning_rate": 9.720688203945501e-05, + "loss": 3.1115, + "step": 16984 + }, + { + "epoch": 0.7151578947368421, + "grad_norm": 0.451171875, + "learning_rate": 9.718017491832046e-05, + "loss": 3.0338, + "step": 16985 + }, + { + "epoch": 0.7152, + "grad_norm": 0.443359375, + "learning_rate": 9.715347058138371e-05, + "loss": 3.0961, + "step": 16986 + }, + { + "epoch": 0.7152421052631579, + "grad_norm": 0.388671875, + "learning_rate": 9.712676902913095e-05, + "loss": 2.9653, + "step": 16987 + }, + { + "epoch": 0.7152842105263157, + "grad_norm": 0.578125, + "learning_rate": 9.710007026204895e-05, + "loss": 2.9415, + "step": 16988 + }, + { + "epoch": 0.7153263157894737, + "grad_norm": 0.4296875, + "learning_rate": 9.707337428062401e-05, + "loss": 2.3012, + "step": 16989 + }, + { + "epoch": 0.7153684210526315, + "grad_norm": 0.4609375, + "learning_rate": 9.704668108534243e-05, + "loss": 3.1839, + "step": 16990 + }, + { + "epoch": 0.7154105263157895, + "grad_norm": 0.435546875, + "learning_rate": 9.701999067669079e-05, + "loss": 2.948, + "step": 16991 + }, + { + "epoch": 0.7154526315789473, + "grad_norm": 0.44140625, + "learning_rate": 9.699330305515497e-05, + "loss": 3.2594, + "step": 16992 + }, + { + "epoch": 0.7154947368421053, + "grad_norm": 0.4140625, + "learning_rate": 9.69666182212216e-05, + "loss": 3.2303, + "step": 16993 + }, + { + "epoch": 0.7155368421052631, + "grad_norm": 0.419921875, + "learning_rate": 9.693993617537644e-05, + "loss": 3.3469, + "step": 16994 + }, + { + "epoch": 0.7155789473684211, + "grad_norm": 0.4453125, + "learning_rate": 9.69132569181059e-05, + "loss": 3.1142, + "step": 16995 + }, + { + "epoch": 0.7156210526315789, + "grad_norm": 0.439453125, + "learning_rate": 9.688658044989596e-05, + "loss": 3.2032, + "step": 16996 + }, + { + "epoch": 0.7156631578947369, + "grad_norm": 0.41015625, + "learning_rate": 9.685990677123263e-05, + "loss": 3.1566, + "step": 16997 + }, + { + "epoch": 0.7157052631578947, + "grad_norm": 0.41796875, + "learning_rate": 9.683323588260185e-05, + "loss": 3.0318, + "step": 16998 + }, + { + "epoch": 0.7157473684210526, + "grad_norm": 0.43359375, + "learning_rate": 9.680656778448946e-05, + "loss": 3.6259, + "step": 16999 + }, + { + "epoch": 0.7157894736842105, + "grad_norm": 0.416015625, + "learning_rate": 9.677990247738158e-05, + "loss": 3.1723, + "step": 17000 + }, + { + "epoch": 0.7158315789473684, + "grad_norm": 0.431640625, + "learning_rate": 9.675323996176369e-05, + "loss": 2.608, + "step": 17001 + }, + { + "epoch": 0.7158736842105263, + "grad_norm": 0.412109375, + "learning_rate": 9.672658023812179e-05, + "loss": 3.4809, + "step": 17002 + }, + { + "epoch": 0.7159157894736842, + "grad_norm": 0.43359375, + "learning_rate": 9.669992330694153e-05, + "loss": 3.2492, + "step": 17003 + }, + { + "epoch": 0.7159578947368421, + "grad_norm": 0.423828125, + "learning_rate": 9.66732691687085e-05, + "loss": 2.7765, + "step": 17004 + }, + { + "epoch": 0.716, + "grad_norm": 0.44140625, + "learning_rate": 9.66466178239084e-05, + "loss": 3.43, + "step": 17005 + }, + { + "epoch": 0.7160421052631579, + "grad_norm": 0.4140625, + "learning_rate": 9.661996927302663e-05, + "loss": 2.7444, + "step": 17006 + }, + { + "epoch": 0.7160842105263158, + "grad_norm": 0.41796875, + "learning_rate": 9.659332351654899e-05, + "loss": 2.959, + "step": 17007 + }, + { + "epoch": 0.7161263157894737, + "grad_norm": 0.4296875, + "learning_rate": 9.656668055496057e-05, + "loss": 3.1073, + "step": 17008 + }, + { + "epoch": 0.7161684210526316, + "grad_norm": 0.421875, + "learning_rate": 9.654004038874703e-05, + "loss": 3.2833, + "step": 17009 + }, + { + "epoch": 0.7162105263157895, + "grad_norm": 0.43359375, + "learning_rate": 9.651340301839367e-05, + "loss": 3.2678, + "step": 17010 + }, + { + "epoch": 0.7162526315789474, + "grad_norm": 0.419921875, + "learning_rate": 9.648676844438576e-05, + "loss": 3.0995, + "step": 17011 + }, + { + "epoch": 0.7162947368421053, + "grad_norm": 0.4453125, + "learning_rate": 9.646013666720854e-05, + "loss": 3.4039, + "step": 17012 + }, + { + "epoch": 0.7163368421052632, + "grad_norm": 0.4609375, + "learning_rate": 9.643350768734727e-05, + "loss": 3.4853, + "step": 17013 + }, + { + "epoch": 0.716378947368421, + "grad_norm": 0.421875, + "learning_rate": 9.6406881505287e-05, + "loss": 3.2767, + "step": 17014 + }, + { + "epoch": 0.716421052631579, + "grad_norm": 0.41796875, + "learning_rate": 9.638025812151282e-05, + "loss": 3.3066, + "step": 17015 + }, + { + "epoch": 0.7164631578947368, + "grad_norm": 0.392578125, + "learning_rate": 9.635363753650994e-05, + "loss": 2.878, + "step": 17016 + }, + { + "epoch": 0.7165052631578948, + "grad_norm": 0.421875, + "learning_rate": 9.632701975076327e-05, + "loss": 3.0943, + "step": 17017 + }, + { + "epoch": 0.7165473684210526, + "grad_norm": 0.42578125, + "learning_rate": 9.63004047647577e-05, + "loss": 3.1208, + "step": 17018 + }, + { + "epoch": 0.7165894736842106, + "grad_norm": 0.423828125, + "learning_rate": 9.627379257897823e-05, + "loss": 3.2549, + "step": 17019 + }, + { + "epoch": 0.7166315789473684, + "grad_norm": 0.4296875, + "learning_rate": 9.624718319390957e-05, + "loss": 3.2786, + "step": 17020 + }, + { + "epoch": 0.7166736842105264, + "grad_norm": 0.439453125, + "learning_rate": 9.622057661003661e-05, + "loss": 3.5141, + "step": 17021 + }, + { + "epoch": 0.7167157894736842, + "grad_norm": 0.4375, + "learning_rate": 9.619397282784398e-05, + "loss": 2.9217, + "step": 17022 + }, + { + "epoch": 0.7167578947368421, + "grad_norm": 0.4609375, + "learning_rate": 9.616737184781654e-05, + "loss": 3.0516, + "step": 17023 + }, + { + "epoch": 0.7168, + "grad_norm": 0.44140625, + "learning_rate": 9.614077367043883e-05, + "loss": 3.3174, + "step": 17024 + }, + { + "epoch": 0.716842105263158, + "grad_norm": 0.44140625, + "learning_rate": 9.611417829619543e-05, + "loss": 2.8571, + "step": 17025 + }, + { + "epoch": 0.7168842105263158, + "grad_norm": 0.42578125, + "learning_rate": 9.608758572557091e-05, + "loss": 3.3848, + "step": 17026 + }, + { + "epoch": 0.7169263157894736, + "grad_norm": 0.421875, + "learning_rate": 9.606099595904974e-05, + "loss": 3.1613, + "step": 17027 + }, + { + "epoch": 0.7169684210526316, + "grad_norm": 0.4453125, + "learning_rate": 9.603440899711627e-05, + "loss": 2.915, + "step": 17028 + }, + { + "epoch": 0.7170105263157894, + "grad_norm": 0.4296875, + "learning_rate": 9.600782484025513e-05, + "loss": 3.3253, + "step": 17029 + }, + { + "epoch": 0.7170526315789474, + "grad_norm": 0.40625, + "learning_rate": 9.598124348895032e-05, + "loss": 2.6367, + "step": 17030 + }, + { + "epoch": 0.7170947368421052, + "grad_norm": 0.421875, + "learning_rate": 9.595466494368634e-05, + "loss": 3.4796, + "step": 17031 + }, + { + "epoch": 0.7171368421052632, + "grad_norm": 0.44140625, + "learning_rate": 9.592808920494742e-05, + "loss": 3.4416, + "step": 17032 + }, + { + "epoch": 0.717178947368421, + "grad_norm": 0.4296875, + "learning_rate": 9.590151627321766e-05, + "loss": 2.9603, + "step": 17033 + }, + { + "epoch": 0.717221052631579, + "grad_norm": 0.61328125, + "learning_rate": 9.587494614898124e-05, + "loss": 2.5606, + "step": 17034 + }, + { + "epoch": 0.7172631578947368, + "grad_norm": 0.423828125, + "learning_rate": 9.58483788327221e-05, + "loss": 2.9297, + "step": 17035 + }, + { + "epoch": 0.7173052631578948, + "grad_norm": 0.4140625, + "learning_rate": 9.582181432492457e-05, + "loss": 3.0156, + "step": 17036 + }, + { + "epoch": 0.7173473684210526, + "grad_norm": 0.4921875, + "learning_rate": 9.579525262607224e-05, + "loss": 2.9094, + "step": 17037 + }, + { + "epoch": 0.7173894736842106, + "grad_norm": 0.4140625, + "learning_rate": 9.576869373664931e-05, + "loss": 2.6853, + "step": 17038 + }, + { + "epoch": 0.7174315789473684, + "grad_norm": 0.4140625, + "learning_rate": 9.574213765713957e-05, + "loss": 3.0654, + "step": 17039 + }, + { + "epoch": 0.7174736842105263, + "grad_norm": 0.4609375, + "learning_rate": 9.571558438802685e-05, + "loss": 3.0479, + "step": 17040 + }, + { + "epoch": 0.7175157894736842, + "grad_norm": 0.421875, + "learning_rate": 9.568903392979488e-05, + "loss": 3.2583, + "step": 17041 + }, + { + "epoch": 0.7175578947368421, + "grad_norm": 0.435546875, + "learning_rate": 9.566248628292734e-05, + "loss": 3.2792, + "step": 17042 + }, + { + "epoch": 0.7176, + "grad_norm": 0.4453125, + "learning_rate": 9.563594144790813e-05, + "loss": 2.8641, + "step": 17043 + }, + { + "epoch": 0.7176421052631579, + "grad_norm": 0.44921875, + "learning_rate": 9.560939942522055e-05, + "loss": 3.2026, + "step": 17044 + }, + { + "epoch": 0.7176842105263158, + "grad_norm": 0.4375, + "learning_rate": 9.558286021534842e-05, + "loss": 3.4816, + "step": 17045 + }, + { + "epoch": 0.7177263157894737, + "grad_norm": 0.50390625, + "learning_rate": 9.555632381877502e-05, + "loss": 2.7224, + "step": 17046 + }, + { + "epoch": 0.7177684210526316, + "grad_norm": 0.453125, + "learning_rate": 9.552979023598402e-05, + "loss": 3.4709, + "step": 17047 + }, + { + "epoch": 0.7178105263157895, + "grad_norm": 0.419921875, + "learning_rate": 9.550325946745872e-05, + "loss": 3.1153, + "step": 17048 + }, + { + "epoch": 0.7178526315789474, + "grad_norm": 0.40234375, + "learning_rate": 9.547673151368244e-05, + "loss": 3.1406, + "step": 17049 + }, + { + "epoch": 0.7178947368421053, + "grad_norm": 0.447265625, + "learning_rate": 9.545020637513871e-05, + "loss": 2.9752, + "step": 17050 + }, + { + "epoch": 0.7179368421052632, + "grad_norm": 0.447265625, + "learning_rate": 9.542368405231047e-05, + "loss": 3.0054, + "step": 17051 + }, + { + "epoch": 0.717978947368421, + "grad_norm": 0.42578125, + "learning_rate": 9.539716454568129e-05, + "loss": 3.3265, + "step": 17052 + }, + { + "epoch": 0.7180210526315789, + "grad_norm": 0.431640625, + "learning_rate": 9.53706478557339e-05, + "loss": 3.4637, + "step": 17053 + }, + { + "epoch": 0.7180631578947368, + "grad_norm": 0.416015625, + "learning_rate": 9.534413398295184e-05, + "loss": 3.163, + "step": 17054 + }, + { + "epoch": 0.7181052631578947, + "grad_norm": 0.4296875, + "learning_rate": 9.531762292781774e-05, + "loss": 2.8401, + "step": 17055 + }, + { + "epoch": 0.7181473684210526, + "grad_norm": 0.44140625, + "learning_rate": 9.529111469081489e-05, + "loss": 2.6626, + "step": 17056 + }, + { + "epoch": 0.7181894736842105, + "grad_norm": 0.41015625, + "learning_rate": 9.526460927242617e-05, + "loss": 3.2765, + "step": 17057 + }, + { + "epoch": 0.7182315789473684, + "grad_norm": 0.400390625, + "learning_rate": 9.523810667313437e-05, + "loss": 2.6277, + "step": 17058 + }, + { + "epoch": 0.7182736842105263, + "grad_norm": 0.427734375, + "learning_rate": 9.52116068934226e-05, + "loss": 3.2643, + "step": 17059 + }, + { + "epoch": 0.7183157894736842, + "grad_norm": 0.443359375, + "learning_rate": 9.51851099337733e-05, + "loss": 2.7817, + "step": 17060 + }, + { + "epoch": 0.7183578947368421, + "grad_norm": 0.4140625, + "learning_rate": 9.515861579466961e-05, + "loss": 3.1166, + "step": 17061 + }, + { + "epoch": 0.7184, + "grad_norm": 0.40234375, + "learning_rate": 9.513212447659381e-05, + "loss": 3.1691, + "step": 17062 + }, + { + "epoch": 0.7184421052631579, + "grad_norm": 0.416015625, + "learning_rate": 9.51056359800288e-05, + "loss": 2.6046, + "step": 17063 + }, + { + "epoch": 0.7184842105263158, + "grad_norm": 0.44140625, + "learning_rate": 9.507915030545716e-05, + "loss": 3.1064, + "step": 17064 + }, + { + "epoch": 0.7185263157894737, + "grad_norm": 0.46875, + "learning_rate": 9.505266745336124e-05, + "loss": 3.3288, + "step": 17065 + }, + { + "epoch": 0.7185684210526316, + "grad_norm": 0.412109375, + "learning_rate": 9.50261874242239e-05, + "loss": 3.1016, + "step": 17066 + }, + { + "epoch": 0.7186105263157895, + "grad_norm": 0.44140625, + "learning_rate": 9.49997102185271e-05, + "loss": 2.8839, + "step": 17067 + }, + { + "epoch": 0.7186526315789473, + "grad_norm": 0.423828125, + "learning_rate": 9.497323583675366e-05, + "loss": 2.9743, + "step": 17068 + }, + { + "epoch": 0.7186947368421053, + "grad_norm": 0.435546875, + "learning_rate": 9.494676427938556e-05, + "loss": 3.0364, + "step": 17069 + }, + { + "epoch": 0.7187368421052631, + "grad_norm": 0.443359375, + "learning_rate": 9.492029554690529e-05, + "loss": 3.1935, + "step": 17070 + }, + { + "epoch": 0.7187789473684211, + "grad_norm": 0.435546875, + "learning_rate": 9.489382963979504e-05, + "loss": 3.477, + "step": 17071 + }, + { + "epoch": 0.7188210526315789, + "grad_norm": 0.4453125, + "learning_rate": 9.486736655853698e-05, + "loss": 3.2681, + "step": 17072 + }, + { + "epoch": 0.7188631578947369, + "grad_norm": 0.431640625, + "learning_rate": 9.484090630361322e-05, + "loss": 2.7928, + "step": 17073 + }, + { + "epoch": 0.7189052631578947, + "grad_norm": 0.42578125, + "learning_rate": 9.481444887550575e-05, + "loss": 3.314, + "step": 17074 + }, + { + "epoch": 0.7189473684210527, + "grad_norm": 0.421875, + "learning_rate": 9.478799427469686e-05, + "loss": 3.1794, + "step": 17075 + }, + { + "epoch": 0.7189894736842105, + "grad_norm": 0.43359375, + "learning_rate": 9.476154250166819e-05, + "loss": 3.1201, + "step": 17076 + }, + { + "epoch": 0.7190315789473685, + "grad_norm": 0.41796875, + "learning_rate": 9.473509355690188e-05, + "loss": 3.118, + "step": 17077 + }, + { + "epoch": 0.7190736842105263, + "grad_norm": 0.45703125, + "learning_rate": 9.470864744087976e-05, + "loss": 2.9984, + "step": 17078 + }, + { + "epoch": 0.7191157894736843, + "grad_norm": 0.435546875, + "learning_rate": 9.46822041540836e-05, + "loss": 2.9723, + "step": 17079 + }, + { + "epoch": 0.7191578947368421, + "grad_norm": 0.431640625, + "learning_rate": 9.465576369699522e-05, + "loss": 2.8231, + "step": 17080 + }, + { + "epoch": 0.7192, + "grad_norm": 0.40234375, + "learning_rate": 9.46293260700962e-05, + "loss": 2.7715, + "step": 17081 + }, + { + "epoch": 0.7192421052631579, + "grad_norm": 0.51171875, + "learning_rate": 9.46028912738684e-05, + "loss": 3.0794, + "step": 17082 + }, + { + "epoch": 0.7192842105263157, + "grad_norm": 0.435546875, + "learning_rate": 9.457645930879333e-05, + "loss": 3.0214, + "step": 17083 + }, + { + "epoch": 0.7193263157894737, + "grad_norm": 0.419921875, + "learning_rate": 9.455003017535255e-05, + "loss": 3.108, + "step": 17084 + }, + { + "epoch": 0.7193684210526315, + "grad_norm": 0.462890625, + "learning_rate": 9.452360387402755e-05, + "loss": 3.1912, + "step": 17085 + }, + { + "epoch": 0.7194105263157895, + "grad_norm": 0.443359375, + "learning_rate": 9.449718040529987e-05, + "loss": 2.9224, + "step": 17086 + }, + { + "epoch": 0.7194526315789473, + "grad_norm": 0.4375, + "learning_rate": 9.447075976965081e-05, + "loss": 3.287, + "step": 17087 + }, + { + "epoch": 0.7194947368421053, + "grad_norm": 0.44140625, + "learning_rate": 9.44443419675618e-05, + "loss": 3.1698, + "step": 17088 + }, + { + "epoch": 0.7195368421052631, + "grad_norm": 0.416015625, + "learning_rate": 9.441792699951399e-05, + "loss": 3.3985, + "step": 17089 + }, + { + "epoch": 0.7195789473684211, + "grad_norm": 0.45703125, + "learning_rate": 9.439151486598884e-05, + "loss": 3.7491, + "step": 17090 + }, + { + "epoch": 0.7196210526315789, + "grad_norm": 0.451171875, + "learning_rate": 9.436510556746747e-05, + "loss": 3.0031, + "step": 17091 + }, + { + "epoch": 0.7196631578947369, + "grad_norm": 0.45703125, + "learning_rate": 9.433869910443102e-05, + "loss": 3.1601, + "step": 17092 + }, + { + "epoch": 0.7197052631578947, + "grad_norm": 0.443359375, + "learning_rate": 9.431229547736056e-05, + "loss": 3.3263, + "step": 17093 + }, + { + "epoch": 0.7197473684210526, + "grad_norm": 0.466796875, + "learning_rate": 9.428589468673718e-05, + "loss": 3.1814, + "step": 17094 + }, + { + "epoch": 0.7197894736842105, + "grad_norm": 0.416015625, + "learning_rate": 9.425949673304179e-05, + "loss": 3.1573, + "step": 17095 + }, + { + "epoch": 0.7198315789473684, + "grad_norm": 0.42578125, + "learning_rate": 9.423310161675532e-05, + "loss": 2.5584, + "step": 17096 + }, + { + "epoch": 0.7198736842105263, + "grad_norm": 0.45703125, + "learning_rate": 9.420670933835892e-05, + "loss": 3.0186, + "step": 17097 + }, + { + "epoch": 0.7199157894736842, + "grad_norm": 0.439453125, + "learning_rate": 9.418031989833304e-05, + "loss": 2.8352, + "step": 17098 + }, + { + "epoch": 0.7199578947368421, + "grad_norm": 0.431640625, + "learning_rate": 9.415393329715872e-05, + "loss": 2.9753, + "step": 17099 + }, + { + "epoch": 0.72, + "grad_norm": 0.451171875, + "learning_rate": 9.412754953531663e-05, + "loss": 3.0613, + "step": 17100 + }, + { + "epoch": 0.7200421052631579, + "grad_norm": 0.51171875, + "learning_rate": 9.410116861328746e-05, + "loss": 2.9335, + "step": 17101 + }, + { + "epoch": 0.7200842105263158, + "grad_norm": 0.412109375, + "learning_rate": 9.40747905315518e-05, + "loss": 2.9339, + "step": 17102 + }, + { + "epoch": 0.7201263157894737, + "grad_norm": 0.443359375, + "learning_rate": 9.404841529059018e-05, + "loss": 3.0512, + "step": 17103 + }, + { + "epoch": 0.7201684210526316, + "grad_norm": 0.4375, + "learning_rate": 9.402204289088337e-05, + "loss": 3.3844, + "step": 17104 + }, + { + "epoch": 0.7202105263157895, + "grad_norm": 0.4453125, + "learning_rate": 9.39956733329115e-05, + "loss": 3.4266, + "step": 17105 + }, + { + "epoch": 0.7202526315789474, + "grad_norm": 0.41796875, + "learning_rate": 9.396930661715524e-05, + "loss": 2.9153, + "step": 17106 + }, + { + "epoch": 0.7202947368421052, + "grad_norm": 0.44921875, + "learning_rate": 9.39429427440949e-05, + "loss": 3.3782, + "step": 17107 + }, + { + "epoch": 0.7203368421052632, + "grad_norm": 0.412109375, + "learning_rate": 9.391658171421078e-05, + "loss": 3.1637, + "step": 17108 + }, + { + "epoch": 0.720378947368421, + "grad_norm": 0.435546875, + "learning_rate": 9.389022352798315e-05, + "loss": 2.6935, + "step": 17109 + }, + { + "epoch": 0.720421052631579, + "grad_norm": 0.412109375, + "learning_rate": 9.386386818589212e-05, + "loss": 2.9736, + "step": 17110 + }, + { + "epoch": 0.7204631578947368, + "grad_norm": 0.44921875, + "learning_rate": 9.383751568841817e-05, + "loss": 3.0077, + "step": 17111 + }, + { + "epoch": 0.7205052631578948, + "grad_norm": 0.43359375, + "learning_rate": 9.381116603604101e-05, + "loss": 3.0732, + "step": 17112 + }, + { + "epoch": 0.7205473684210526, + "grad_norm": 0.44140625, + "learning_rate": 9.378481922924107e-05, + "loss": 3.6222, + "step": 17113 + }, + { + "epoch": 0.7205894736842106, + "grad_norm": 0.421875, + "learning_rate": 9.375847526849798e-05, + "loss": 3.396, + "step": 17114 + }, + { + "epoch": 0.7206315789473684, + "grad_norm": 0.427734375, + "learning_rate": 9.3732134154292e-05, + "loss": 2.8817, + "step": 17115 + }, + { + "epoch": 0.7206736842105264, + "grad_norm": 0.40625, + "learning_rate": 9.370579588710292e-05, + "loss": 3.5487, + "step": 17116 + }, + { + "epoch": 0.7207157894736842, + "grad_norm": 0.453125, + "learning_rate": 9.367946046741052e-05, + "loss": 2.7398, + "step": 17117 + }, + { + "epoch": 0.7207578947368422, + "grad_norm": 0.40625, + "learning_rate": 9.365312789569485e-05, + "loss": 3.0182, + "step": 17118 + }, + { + "epoch": 0.7208, + "grad_norm": 0.423828125, + "learning_rate": 9.362679817243533e-05, + "loss": 2.675, + "step": 17119 + }, + { + "epoch": 0.7208421052631578, + "grad_norm": 0.470703125, + "learning_rate": 9.360047129811197e-05, + "loss": 3.1011, + "step": 17120 + }, + { + "epoch": 0.7208842105263158, + "grad_norm": 0.439453125, + "learning_rate": 9.35741472732041e-05, + "loss": 3.2048, + "step": 17121 + }, + { + "epoch": 0.7209263157894736, + "grad_norm": 0.42578125, + "learning_rate": 9.354782609819157e-05, + "loss": 3.2017, + "step": 17122 + }, + { + "epoch": 0.7209684210526316, + "grad_norm": 0.41015625, + "learning_rate": 9.352150777355381e-05, + "loss": 3.137, + "step": 17123 + }, + { + "epoch": 0.7210105263157894, + "grad_norm": 0.466796875, + "learning_rate": 9.349519229977033e-05, + "loss": 2.791, + "step": 17124 + }, + { + "epoch": 0.7210526315789474, + "grad_norm": 0.435546875, + "learning_rate": 9.346887967732056e-05, + "loss": 3.3161, + "step": 17125 + }, + { + "epoch": 0.7210947368421052, + "grad_norm": 0.443359375, + "learning_rate": 9.344256990668379e-05, + "loss": 3.0221, + "step": 17126 + }, + { + "epoch": 0.7211368421052632, + "grad_norm": 0.43359375, + "learning_rate": 9.341626298833963e-05, + "loss": 3.2131, + "step": 17127 + }, + { + "epoch": 0.721178947368421, + "grad_norm": 0.453125, + "learning_rate": 9.3389958922767e-05, + "loss": 3.4351, + "step": 17128 + }, + { + "epoch": 0.721221052631579, + "grad_norm": 0.447265625, + "learning_rate": 9.336365771044541e-05, + "loss": 2.3956, + "step": 17129 + }, + { + "epoch": 0.7212631578947368, + "grad_norm": 0.443359375, + "learning_rate": 9.333735935185391e-05, + "loss": 2.6837, + "step": 17130 + }, + { + "epoch": 0.7213052631578948, + "grad_norm": 0.40234375, + "learning_rate": 9.331106384747168e-05, + "loss": 3.2801, + "step": 17131 + }, + { + "epoch": 0.7213473684210526, + "grad_norm": 0.447265625, + "learning_rate": 9.328477119777775e-05, + "loss": 3.4917, + "step": 17132 + }, + { + "epoch": 0.7213894736842106, + "grad_norm": 0.423828125, + "learning_rate": 9.325848140325108e-05, + "loss": 2.9655, + "step": 17133 + }, + { + "epoch": 0.7214315789473684, + "grad_norm": 0.435546875, + "learning_rate": 9.32321944643709e-05, + "loss": 2.9106, + "step": 17134 + }, + { + "epoch": 0.7214736842105263, + "grad_norm": 0.42578125, + "learning_rate": 9.320591038161574e-05, + "loss": 2.7763, + "step": 17135 + }, + { + "epoch": 0.7215157894736842, + "grad_norm": 0.44140625, + "learning_rate": 9.317962915546477e-05, + "loss": 3.1005, + "step": 17136 + }, + { + "epoch": 0.7215578947368421, + "grad_norm": 0.43359375, + "learning_rate": 9.31533507863967e-05, + "loss": 2.7304, + "step": 17137 + }, + { + "epoch": 0.7216, + "grad_norm": 0.431640625, + "learning_rate": 9.312707527489029e-05, + "loss": 3.3067, + "step": 17138 + }, + { + "epoch": 0.7216421052631579, + "grad_norm": 0.41015625, + "learning_rate": 9.310080262142422e-05, + "loss": 2.9438, + "step": 17139 + }, + { + "epoch": 0.7216842105263158, + "grad_norm": 0.431640625, + "learning_rate": 9.307453282647723e-05, + "loss": 3.3292, + "step": 17140 + }, + { + "epoch": 0.7217263157894737, + "grad_norm": 0.435546875, + "learning_rate": 9.304826589052784e-05, + "loss": 3.1027, + "step": 17141 + }, + { + "epoch": 0.7217684210526316, + "grad_norm": 0.4296875, + "learning_rate": 9.302200181405457e-05, + "loss": 2.7469, + "step": 17142 + }, + { + "epoch": 0.7218105263157895, + "grad_norm": 0.44921875, + "learning_rate": 9.299574059753604e-05, + "loss": 2.9746, + "step": 17143 + }, + { + "epoch": 0.7218526315789474, + "grad_norm": 0.51953125, + "learning_rate": 9.296948224145065e-05, + "loss": 3.4325, + "step": 17144 + }, + { + "epoch": 0.7218947368421053, + "grad_norm": 0.44921875, + "learning_rate": 9.29432267462768e-05, + "loss": 2.8044, + "step": 17145 + }, + { + "epoch": 0.7219368421052632, + "grad_norm": 0.44140625, + "learning_rate": 9.291697411249283e-05, + "loss": 3.1655, + "step": 17146 + }, + { + "epoch": 0.721978947368421, + "grad_norm": 0.453125, + "learning_rate": 9.289072434057705e-05, + "loss": 3.0306, + "step": 17147 + }, + { + "epoch": 0.7220210526315789, + "grad_norm": 0.419921875, + "learning_rate": 9.286447743100762e-05, + "loss": 3.2504, + "step": 17148 + }, + { + "epoch": 0.7220631578947369, + "grad_norm": 0.40625, + "learning_rate": 9.283823338426276e-05, + "loss": 2.8076, + "step": 17149 + }, + { + "epoch": 0.7221052631578947, + "grad_norm": 0.482421875, + "learning_rate": 9.281199220082068e-05, + "loss": 2.9643, + "step": 17150 + }, + { + "epoch": 0.7221473684210526, + "grad_norm": 0.443359375, + "learning_rate": 9.278575388115939e-05, + "loss": 3.3393, + "step": 17151 + }, + { + "epoch": 0.7221894736842105, + "grad_norm": 0.451171875, + "learning_rate": 9.2759518425757e-05, + "loss": 3.103, + "step": 17152 + }, + { + "epoch": 0.7222315789473684, + "grad_norm": 0.4296875, + "learning_rate": 9.273328583509139e-05, + "loss": 3.4286, + "step": 17153 + }, + { + "epoch": 0.7222736842105263, + "grad_norm": 0.435546875, + "learning_rate": 9.270705610964055e-05, + "loss": 3.5033, + "step": 17154 + }, + { + "epoch": 0.7223157894736842, + "grad_norm": 0.431640625, + "learning_rate": 9.268082924988233e-05, + "loss": 3.0154, + "step": 17155 + }, + { + "epoch": 0.7223578947368421, + "grad_norm": 0.427734375, + "learning_rate": 9.265460525629457e-05, + "loss": 2.8436, + "step": 17156 + }, + { + "epoch": 0.7224, + "grad_norm": 0.41015625, + "learning_rate": 9.262838412935492e-05, + "loss": 3.1102, + "step": 17157 + }, + { + "epoch": 0.7224421052631579, + "grad_norm": 0.408203125, + "learning_rate": 9.260216586954132e-05, + "loss": 2.54, + "step": 17158 + }, + { + "epoch": 0.7224842105263158, + "grad_norm": 0.388671875, + "learning_rate": 9.257595047733134e-05, + "loss": 2.9371, + "step": 17159 + }, + { + "epoch": 0.7225263157894737, + "grad_norm": 0.4296875, + "learning_rate": 9.254973795320254e-05, + "loss": 3.3212, + "step": 17160 + }, + { + "epoch": 0.7225684210526315, + "grad_norm": 0.42578125, + "learning_rate": 9.252352829763252e-05, + "loss": 3.1005, + "step": 17161 + }, + { + "epoch": 0.7226105263157895, + "grad_norm": 0.421875, + "learning_rate": 9.24973215110988e-05, + "loss": 3.0158, + "step": 17162 + }, + { + "epoch": 0.7226526315789473, + "grad_norm": 0.4765625, + "learning_rate": 9.24711175940788e-05, + "loss": 3.2439, + "step": 17163 + }, + { + "epoch": 0.7226947368421053, + "grad_norm": 0.416015625, + "learning_rate": 9.24449165470499e-05, + "loss": 2.9197, + "step": 17164 + }, + { + "epoch": 0.7227368421052631, + "grad_norm": 0.4375, + "learning_rate": 9.241871837048957e-05, + "loss": 3.1942, + "step": 17165 + }, + { + "epoch": 0.7227789473684211, + "grad_norm": 0.421875, + "learning_rate": 9.239252306487503e-05, + "loss": 3.3429, + "step": 17166 + }, + { + "epoch": 0.7228210526315789, + "grad_norm": 0.408203125, + "learning_rate": 9.236633063068356e-05, + "loss": 3.2426, + "step": 17167 + }, + { + "epoch": 0.7228631578947369, + "grad_norm": 0.421875, + "learning_rate": 9.23401410683923e-05, + "loss": 2.9097, + "step": 17168 + }, + { + "epoch": 0.7229052631578947, + "grad_norm": 0.421875, + "learning_rate": 9.231395437847845e-05, + "loss": 3.4837, + "step": 17169 + }, + { + "epoch": 0.7229473684210527, + "grad_norm": 0.41796875, + "learning_rate": 9.22877705614191e-05, + "loss": 3.2581, + "step": 17170 + }, + { + "epoch": 0.7229894736842105, + "grad_norm": 0.412109375, + "learning_rate": 9.226158961769115e-05, + "loss": 2.9161, + "step": 17171 + }, + { + "epoch": 0.7230315789473685, + "grad_norm": 0.447265625, + "learning_rate": 9.223541154777187e-05, + "loss": 3.7349, + "step": 17172 + }, + { + "epoch": 0.7230736842105263, + "grad_norm": 0.455078125, + "learning_rate": 9.220923635213788e-05, + "loss": 2.7016, + "step": 17173 + }, + { + "epoch": 0.7231157894736842, + "grad_norm": 0.44140625, + "learning_rate": 9.218306403126627e-05, + "loss": 3.1164, + "step": 17174 + }, + { + "epoch": 0.7231578947368421, + "grad_norm": 0.431640625, + "learning_rate": 9.21568945856338e-05, + "loss": 3.0624, + "step": 17175 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4296875, + "learning_rate": 9.21307280157172e-05, + "loss": 3.6387, + "step": 17176 + }, + { + "epoch": 0.7232421052631579, + "grad_norm": 0.431640625, + "learning_rate": 9.210456432199338e-05, + "loss": 3.3541, + "step": 17177 + }, + { + "epoch": 0.7232842105263158, + "grad_norm": 0.42578125, + "learning_rate": 9.207840350493874e-05, + "loss": 3.2809, + "step": 17178 + }, + { + "epoch": 0.7233263157894737, + "grad_norm": 0.5390625, + "learning_rate": 9.205224556503019e-05, + "loss": 2.5804, + "step": 17179 + }, + { + "epoch": 0.7233684210526315, + "grad_norm": 0.388671875, + "learning_rate": 9.202609050274399e-05, + "loss": 3.0256, + "step": 17180 + }, + { + "epoch": 0.7234105263157895, + "grad_norm": 0.435546875, + "learning_rate": 9.199993831855688e-05, + "loss": 3.4492, + "step": 17181 + }, + { + "epoch": 0.7234526315789473, + "grad_norm": 0.4375, + "learning_rate": 9.197378901294525e-05, + "loss": 3.411, + "step": 17182 + }, + { + "epoch": 0.7234947368421053, + "grad_norm": 0.423828125, + "learning_rate": 9.194764258638552e-05, + "loss": 3.0743, + "step": 17183 + }, + { + "epoch": 0.7235368421052631, + "grad_norm": 0.42578125, + "learning_rate": 9.192149903935403e-05, + "loss": 3.2384, + "step": 17184 + }, + { + "epoch": 0.7235789473684211, + "grad_norm": 0.44921875, + "learning_rate": 9.189535837232701e-05, + "loss": 3.2272, + "step": 17185 + }, + { + "epoch": 0.7236210526315789, + "grad_norm": 0.443359375, + "learning_rate": 9.186922058578099e-05, + "loss": 3.1117, + "step": 17186 + }, + { + "epoch": 0.7236631578947369, + "grad_norm": 0.4375, + "learning_rate": 9.184308568019176e-05, + "loss": 3.0523, + "step": 17187 + }, + { + "epoch": 0.7237052631578947, + "grad_norm": 0.4453125, + "learning_rate": 9.181695365603587e-05, + "loss": 3.0491, + "step": 17188 + }, + { + "epoch": 0.7237473684210526, + "grad_norm": 0.443359375, + "learning_rate": 9.179082451378906e-05, + "loss": 3.2683, + "step": 17189 + }, + { + "epoch": 0.7237894736842105, + "grad_norm": 0.42578125, + "learning_rate": 9.176469825392763e-05, + "loss": 2.7752, + "step": 17190 + }, + { + "epoch": 0.7238315789473684, + "grad_norm": 0.41015625, + "learning_rate": 9.173857487692747e-05, + "loss": 3.1661, + "step": 17191 + }, + { + "epoch": 0.7238736842105263, + "grad_norm": 0.419921875, + "learning_rate": 9.171245438326445e-05, + "loss": 2.9622, + "step": 17192 + }, + { + "epoch": 0.7239157894736842, + "grad_norm": 0.4296875, + "learning_rate": 9.16863367734147e-05, + "loss": 3.0648, + "step": 17193 + }, + { + "epoch": 0.7239578947368421, + "grad_norm": 0.404296875, + "learning_rate": 9.166022204785371e-05, + "loss": 3.04, + "step": 17194 + }, + { + "epoch": 0.724, + "grad_norm": 0.40625, + "learning_rate": 9.163411020705762e-05, + "loss": 2.9497, + "step": 17195 + }, + { + "epoch": 0.7240421052631579, + "grad_norm": 0.427734375, + "learning_rate": 9.160800125150176e-05, + "loss": 3.2913, + "step": 17196 + }, + { + "epoch": 0.7240842105263158, + "grad_norm": 0.443359375, + "learning_rate": 9.158189518166213e-05, + "loss": 2.4641, + "step": 17197 + }, + { + "epoch": 0.7241263157894737, + "grad_norm": 0.474609375, + "learning_rate": 9.155579199801422e-05, + "loss": 2.8075, + "step": 17198 + }, + { + "epoch": 0.7241684210526316, + "grad_norm": 0.447265625, + "learning_rate": 9.152969170103362e-05, + "loss": 2.8622, + "step": 17199 + }, + { + "epoch": 0.7242105263157895, + "grad_norm": 0.40234375, + "learning_rate": 9.150359429119587e-05, + "loss": 3.0946, + "step": 17200 + }, + { + "epoch": 0.7242526315789474, + "grad_norm": 0.466796875, + "learning_rate": 9.147749976897629e-05, + "loss": 2.7788, + "step": 17201 + }, + { + "epoch": 0.7242947368421052, + "grad_norm": 0.4140625, + "learning_rate": 9.145140813485058e-05, + "loss": 3.1854, + "step": 17202 + }, + { + "epoch": 0.7243368421052632, + "grad_norm": 0.42578125, + "learning_rate": 9.142531938929377e-05, + "loss": 2.6205, + "step": 17203 + }, + { + "epoch": 0.724378947368421, + "grad_norm": 0.4296875, + "learning_rate": 9.139923353278141e-05, + "loss": 2.8062, + "step": 17204 + }, + { + "epoch": 0.724421052631579, + "grad_norm": 0.431640625, + "learning_rate": 9.137315056578863e-05, + "loss": 2.8614, + "step": 17205 + }, + { + "epoch": 0.7244631578947368, + "grad_norm": 0.439453125, + "learning_rate": 9.134707048879071e-05, + "loss": 2.8881, + "step": 17206 + }, + { + "epoch": 0.7245052631578948, + "grad_norm": 0.41796875, + "learning_rate": 9.132099330226273e-05, + "loss": 2.9783, + "step": 17207 + }, + { + "epoch": 0.7245473684210526, + "grad_norm": 0.44140625, + "learning_rate": 9.129491900667974e-05, + "loss": 2.9735, + "step": 17208 + }, + { + "epoch": 0.7245894736842106, + "grad_norm": 0.4296875, + "learning_rate": 9.126884760251703e-05, + "loss": 3.0815, + "step": 17209 + }, + { + "epoch": 0.7246315789473684, + "grad_norm": 0.412109375, + "learning_rate": 9.124277909024923e-05, + "loss": 3.0697, + "step": 17210 + }, + { + "epoch": 0.7246736842105264, + "grad_norm": 0.40625, + "learning_rate": 9.121671347035157e-05, + "loss": 2.9519, + "step": 17211 + }, + { + "epoch": 0.7247157894736842, + "grad_norm": 0.423828125, + "learning_rate": 9.119065074329879e-05, + "loss": 3.4233, + "step": 17212 + }, + { + "epoch": 0.7247578947368422, + "grad_norm": 0.400390625, + "learning_rate": 9.116459090956581e-05, + "loss": 3.2722, + "step": 17213 + }, + { + "epoch": 0.7248, + "grad_norm": 0.427734375, + "learning_rate": 9.113853396962732e-05, + "loss": 3.2711, + "step": 17214 + }, + { + "epoch": 0.7248421052631578, + "grad_norm": 0.421875, + "learning_rate": 9.111247992395811e-05, + "loss": 3.095, + "step": 17215 + }, + { + "epoch": 0.7248842105263158, + "grad_norm": 0.43359375, + "learning_rate": 9.10864287730328e-05, + "loss": 3.3393, + "step": 17216 + }, + { + "epoch": 0.7249263157894736, + "grad_norm": 0.4375, + "learning_rate": 9.106038051732598e-05, + "loss": 3.1158, + "step": 17217 + }, + { + "epoch": 0.7249684210526316, + "grad_norm": 0.47265625, + "learning_rate": 9.103433515731238e-05, + "loss": 2.8447, + "step": 17218 + }, + { + "epoch": 0.7250105263157894, + "grad_norm": 0.47265625, + "learning_rate": 9.10082926934664e-05, + "loss": 3.0099, + "step": 17219 + }, + { + "epoch": 0.7250526315789474, + "grad_norm": 0.416015625, + "learning_rate": 9.098225312626252e-05, + "loss": 3.02, + "step": 17220 + }, + { + "epoch": 0.7250947368421052, + "grad_norm": 0.4375, + "learning_rate": 9.095621645617513e-05, + "loss": 2.9472, + "step": 17221 + }, + { + "epoch": 0.7251368421052632, + "grad_norm": 0.4375, + "learning_rate": 9.093018268367863e-05, + "loss": 3.4754, + "step": 17222 + }, + { + "epoch": 0.725178947368421, + "grad_norm": 0.4453125, + "learning_rate": 9.09041518092473e-05, + "loss": 3.0367, + "step": 17223 + }, + { + "epoch": 0.725221052631579, + "grad_norm": 0.48828125, + "learning_rate": 9.08781238333553e-05, + "loss": 3.1135, + "step": 17224 + }, + { + "epoch": 0.7252631578947368, + "grad_norm": 0.431640625, + "learning_rate": 9.085209875647704e-05, + "loss": 3.2696, + "step": 17225 + }, + { + "epoch": 0.7253052631578948, + "grad_norm": 0.4140625, + "learning_rate": 9.082607657908651e-05, + "loss": 2.9587, + "step": 17226 + }, + { + "epoch": 0.7253473684210526, + "grad_norm": 0.453125, + "learning_rate": 9.080005730165786e-05, + "loss": 2.5119, + "step": 17227 + }, + { + "epoch": 0.7253894736842105, + "grad_norm": 0.419921875, + "learning_rate": 9.077404092466513e-05, + "loss": 3.1148, + "step": 17228 + }, + { + "epoch": 0.7254315789473684, + "grad_norm": 0.44921875, + "learning_rate": 9.074802744858227e-05, + "loss": 3.1082, + "step": 17229 + }, + { + "epoch": 0.7254736842105263, + "grad_norm": 0.42578125, + "learning_rate": 9.072201687388315e-05, + "loss": 2.963, + "step": 17230 + }, + { + "epoch": 0.7255157894736842, + "grad_norm": 0.439453125, + "learning_rate": 9.06960092010419e-05, + "loss": 3.2361, + "step": 17231 + }, + { + "epoch": 0.7255578947368421, + "grad_norm": 0.41015625, + "learning_rate": 9.067000443053206e-05, + "loss": 2.7243, + "step": 17232 + }, + { + "epoch": 0.7256, + "grad_norm": 0.4453125, + "learning_rate": 9.064400256282756e-05, + "loss": 3.2412, + "step": 17233 + }, + { + "epoch": 0.7256421052631579, + "grad_norm": 0.421875, + "learning_rate": 9.061800359840214e-05, + "loss": 2.4901, + "step": 17234 + }, + { + "epoch": 0.7256842105263158, + "grad_norm": 0.42578125, + "learning_rate": 9.059200753772942e-05, + "loss": 3.2408, + "step": 17235 + }, + { + "epoch": 0.7257263157894737, + "grad_norm": 0.439453125, + "learning_rate": 9.056601438128303e-05, + "loss": 2.5786, + "step": 17236 + }, + { + "epoch": 0.7257684210526316, + "grad_norm": 0.42578125, + "learning_rate": 9.054002412953643e-05, + "loss": 3.2745, + "step": 17237 + }, + { + "epoch": 0.7258105263157895, + "grad_norm": 0.42578125, + "learning_rate": 9.051403678296342e-05, + "loss": 2.9857, + "step": 17238 + }, + { + "epoch": 0.7258526315789474, + "grad_norm": 0.451171875, + "learning_rate": 9.048805234203708e-05, + "loss": 3.4904, + "step": 17239 + }, + { + "epoch": 0.7258947368421053, + "grad_norm": 0.45703125, + "learning_rate": 9.046207080723112e-05, + "loss": 3.1568, + "step": 17240 + }, + { + "epoch": 0.7259368421052631, + "grad_norm": 0.423828125, + "learning_rate": 9.043609217901877e-05, + "loss": 3.0911, + "step": 17241 + }, + { + "epoch": 0.7259789473684211, + "grad_norm": 0.439453125, + "learning_rate": 9.041011645787334e-05, + "loss": 2.9859, + "step": 17242 + }, + { + "epoch": 0.7260210526315789, + "grad_norm": 0.44140625, + "learning_rate": 9.038414364426806e-05, + "loss": 2.7096, + "step": 17243 + }, + { + "epoch": 0.7260631578947369, + "grad_norm": 0.453125, + "learning_rate": 9.035817373867608e-05, + "loss": 3.2808, + "step": 17244 + }, + { + "epoch": 0.7261052631578947, + "grad_norm": 0.416015625, + "learning_rate": 9.033220674157075e-05, + "loss": 3.2713, + "step": 17245 + }, + { + "epoch": 0.7261473684210527, + "grad_norm": 0.4453125, + "learning_rate": 9.030624265342486e-05, + "loss": 3.3782, + "step": 17246 + }, + { + "epoch": 0.7261894736842105, + "grad_norm": 0.431640625, + "learning_rate": 9.028028147471177e-05, + "loss": 2.887, + "step": 17247 + }, + { + "epoch": 0.7262315789473685, + "grad_norm": 0.419921875, + "learning_rate": 9.02543232059041e-05, + "loss": 2.6477, + "step": 17248 + }, + { + "epoch": 0.7262736842105263, + "grad_norm": 0.427734375, + "learning_rate": 9.022836784747507e-05, + "loss": 2.5102, + "step": 17249 + }, + { + "epoch": 0.7263157894736842, + "grad_norm": 0.423828125, + "learning_rate": 9.020241539989746e-05, + "loss": 2.9786, + "step": 17250 + }, + { + "epoch": 0.7263578947368421, + "grad_norm": 0.5078125, + "learning_rate": 9.017646586364398e-05, + "loss": 2.9606, + "step": 17251 + }, + { + "epoch": 0.7264, + "grad_norm": 0.482421875, + "learning_rate": 9.015051923918768e-05, + "loss": 3.2284, + "step": 17252 + }, + { + "epoch": 0.7264421052631579, + "grad_norm": 0.396484375, + "learning_rate": 9.012457552700095e-05, + "loss": 2.8222, + "step": 17253 + }, + { + "epoch": 0.7264842105263158, + "grad_norm": 0.44140625, + "learning_rate": 9.009863472755677e-05, + "loss": 3.0449, + "step": 17254 + }, + { + "epoch": 0.7265263157894737, + "grad_norm": 0.455078125, + "learning_rate": 9.007269684132741e-05, + "loss": 2.7784, + "step": 17255 + }, + { + "epoch": 0.7265684210526315, + "grad_norm": 0.42578125, + "learning_rate": 9.004676186878571e-05, + "loss": 3.2042, + "step": 17256 + }, + { + "epoch": 0.7266105263157895, + "grad_norm": 0.408203125, + "learning_rate": 9.002082981040405e-05, + "loss": 3.128, + "step": 17257 + }, + { + "epoch": 0.7266526315789473, + "grad_norm": 0.44921875, + "learning_rate": 8.999490066665491e-05, + "loss": 3.1982, + "step": 17258 + }, + { + "epoch": 0.7266947368421053, + "grad_norm": 0.474609375, + "learning_rate": 8.996897443801069e-05, + "loss": 3.3585, + "step": 17259 + }, + { + "epoch": 0.7267368421052631, + "grad_norm": 0.439453125, + "learning_rate": 8.994305112494364e-05, + "loss": 3.3966, + "step": 17260 + }, + { + "epoch": 0.7267789473684211, + "grad_norm": 0.427734375, + "learning_rate": 8.991713072792631e-05, + "loss": 3.1515, + "step": 17261 + }, + { + "epoch": 0.7268210526315789, + "grad_norm": 0.42578125, + "learning_rate": 8.98912132474306e-05, + "loss": 3.3688, + "step": 17262 + }, + { + "epoch": 0.7268631578947369, + "grad_norm": 0.404296875, + "learning_rate": 8.986529868392904e-05, + "loss": 2.9692, + "step": 17263 + }, + { + "epoch": 0.7269052631578947, + "grad_norm": 0.42578125, + "learning_rate": 8.983938703789337e-05, + "loss": 3.3511, + "step": 17264 + }, + { + "epoch": 0.7269473684210527, + "grad_norm": 0.478515625, + "learning_rate": 8.981347830979603e-05, + "loss": 3.0733, + "step": 17265 + }, + { + "epoch": 0.7269894736842105, + "grad_norm": 0.431640625, + "learning_rate": 8.978757250010888e-05, + "loss": 2.7986, + "step": 17266 + }, + { + "epoch": 0.7270315789473685, + "grad_norm": 0.4765625, + "learning_rate": 8.976166960930385e-05, + "loss": 2.9375, + "step": 17267 + }, + { + "epoch": 0.7270736842105263, + "grad_norm": 0.4140625, + "learning_rate": 8.973576963785308e-05, + "loss": 3.3754, + "step": 17268 + }, + { + "epoch": 0.7271157894736842, + "grad_norm": 0.408203125, + "learning_rate": 8.970987258622812e-05, + "loss": 3.2108, + "step": 17269 + }, + { + "epoch": 0.7271578947368421, + "grad_norm": 0.439453125, + "learning_rate": 8.968397845490111e-05, + "loss": 3.1547, + "step": 17270 + }, + { + "epoch": 0.7272, + "grad_norm": 0.462890625, + "learning_rate": 8.965808724434349e-05, + "loss": 2.6631, + "step": 17271 + }, + { + "epoch": 0.7272421052631579, + "grad_norm": 0.423828125, + "learning_rate": 8.963219895502719e-05, + "loss": 2.826, + "step": 17272 + }, + { + "epoch": 0.7272842105263158, + "grad_norm": 0.45703125, + "learning_rate": 8.960631358742383e-05, + "loss": 3.4719, + "step": 17273 + }, + { + "epoch": 0.7273263157894737, + "grad_norm": 0.4375, + "learning_rate": 8.958043114200496e-05, + "loss": 3.2278, + "step": 17274 + }, + { + "epoch": 0.7273684210526316, + "grad_norm": 0.431640625, + "learning_rate": 8.955455161924216e-05, + "loss": 2.8796, + "step": 17275 + }, + { + "epoch": 0.7274105263157895, + "grad_norm": 0.458984375, + "learning_rate": 8.952867501960682e-05, + "loss": 2.7719, + "step": 17276 + }, + { + "epoch": 0.7274526315789474, + "grad_norm": 0.40234375, + "learning_rate": 8.950280134357059e-05, + "loss": 2.3865, + "step": 17277 + }, + { + "epoch": 0.7274947368421053, + "grad_norm": 0.439453125, + "learning_rate": 8.947693059160469e-05, + "loss": 3.2381, + "step": 17278 + }, + { + "epoch": 0.7275368421052631, + "grad_norm": 0.443359375, + "learning_rate": 8.945106276418055e-05, + "loss": 3.184, + "step": 17279 + }, + { + "epoch": 0.7275789473684211, + "grad_norm": 0.439453125, + "learning_rate": 8.942519786176939e-05, + "loss": 3.1452, + "step": 17280 + }, + { + "epoch": 0.727621052631579, + "grad_norm": 0.408203125, + "learning_rate": 8.939933588484247e-05, + "loss": 2.9437, + "step": 17281 + }, + { + "epoch": 0.7276631578947368, + "grad_norm": 0.412109375, + "learning_rate": 8.937347683387096e-05, + "loss": 3.0372, + "step": 17282 + }, + { + "epoch": 0.7277052631578947, + "grad_norm": 0.43359375, + "learning_rate": 8.934762070932587e-05, + "loss": 2.8693, + "step": 17283 + }, + { + "epoch": 0.7277473684210526, + "grad_norm": 0.45703125, + "learning_rate": 8.932176751167845e-05, + "loss": 3.6909, + "step": 17284 + }, + { + "epoch": 0.7277894736842105, + "grad_norm": 0.4375, + "learning_rate": 8.929591724139966e-05, + "loss": 3.1891, + "step": 17285 + }, + { + "epoch": 0.7278315789473684, + "grad_norm": 0.416015625, + "learning_rate": 8.927006989896042e-05, + "loss": 3.0754, + "step": 17286 + }, + { + "epoch": 0.7278736842105263, + "grad_norm": 0.447265625, + "learning_rate": 8.924422548483165e-05, + "loss": 3.2261, + "step": 17287 + }, + { + "epoch": 0.7279157894736842, + "grad_norm": 0.416015625, + "learning_rate": 8.921838399948423e-05, + "loss": 2.528, + "step": 17288 + }, + { + "epoch": 0.7279578947368421, + "grad_norm": 0.431640625, + "learning_rate": 8.919254544338895e-05, + "loss": 3.0695, + "step": 17289 + }, + { + "epoch": 0.728, + "grad_norm": 0.431640625, + "learning_rate": 8.916670981701655e-05, + "loss": 3.2911, + "step": 17290 + }, + { + "epoch": 0.7280421052631579, + "grad_norm": 0.45703125, + "learning_rate": 8.914087712083762e-05, + "loss": 3.251, + "step": 17291 + }, + { + "epoch": 0.7280842105263158, + "grad_norm": 0.439453125, + "learning_rate": 8.9115047355323e-05, + "loss": 2.9643, + "step": 17292 + }, + { + "epoch": 0.7281263157894737, + "grad_norm": 0.408203125, + "learning_rate": 8.908922052094318e-05, + "loss": 3.0655, + "step": 17293 + }, + { + "epoch": 0.7281684210526316, + "grad_norm": 0.435546875, + "learning_rate": 8.90633966181687e-05, + "loss": 3.4648, + "step": 17294 + }, + { + "epoch": 0.7282105263157894, + "grad_norm": 0.466796875, + "learning_rate": 8.903757564747006e-05, + "loss": 3.0558, + "step": 17295 + }, + { + "epoch": 0.7282526315789474, + "grad_norm": 0.41796875, + "learning_rate": 8.901175760931765e-05, + "loss": 2.9578, + "step": 17296 + }, + { + "epoch": 0.7282947368421052, + "grad_norm": 0.4296875, + "learning_rate": 8.898594250418188e-05, + "loss": 3.3682, + "step": 17297 + }, + { + "epoch": 0.7283368421052632, + "grad_norm": 0.412109375, + "learning_rate": 8.896013033253295e-05, + "loss": 2.8451, + "step": 17298 + }, + { + "epoch": 0.728378947368421, + "grad_norm": 0.4140625, + "learning_rate": 8.893432109484135e-05, + "loss": 2.5537, + "step": 17299 + }, + { + "epoch": 0.728421052631579, + "grad_norm": 0.4140625, + "learning_rate": 8.890851479157713e-05, + "loss": 3.04, + "step": 17300 + }, + { + "epoch": 0.7284631578947368, + "grad_norm": 0.44140625, + "learning_rate": 8.888271142321052e-05, + "loss": 3.4336, + "step": 17301 + }, + { + "epoch": 0.7285052631578948, + "grad_norm": 0.431640625, + "learning_rate": 8.885691099021159e-05, + "loss": 3.5854, + "step": 17302 + }, + { + "epoch": 0.7285473684210526, + "grad_norm": 0.431640625, + "learning_rate": 8.883111349305042e-05, + "loss": 3.2895, + "step": 17303 + }, + { + "epoch": 0.7285894736842106, + "grad_norm": 0.43359375, + "learning_rate": 8.880531893219698e-05, + "loss": 3.4168, + "step": 17304 + }, + { + "epoch": 0.7286315789473684, + "grad_norm": 0.40234375, + "learning_rate": 8.877952730812117e-05, + "loss": 2.5948, + "step": 17305 + }, + { + "epoch": 0.7286736842105264, + "grad_norm": 0.44921875, + "learning_rate": 8.87537386212931e-05, + "loss": 3.1602, + "step": 17306 + }, + { + "epoch": 0.7287157894736842, + "grad_norm": 0.4296875, + "learning_rate": 8.87279528721823e-05, + "loss": 3.2576, + "step": 17307 + }, + { + "epoch": 0.7287578947368422, + "grad_norm": 0.43359375, + "learning_rate": 8.870217006125877e-05, + "loss": 2.5548, + "step": 17308 + }, + { + "epoch": 0.7288, + "grad_norm": 0.443359375, + "learning_rate": 8.86763901889922e-05, + "loss": 2.7953, + "step": 17309 + }, + { + "epoch": 0.7288421052631578, + "grad_norm": 0.5, + "learning_rate": 8.865061325585227e-05, + "loss": 3.199, + "step": 17310 + }, + { + "epoch": 0.7288842105263158, + "grad_norm": 0.41796875, + "learning_rate": 8.862483926230857e-05, + "loss": 2.7377, + "step": 17311 + }, + { + "epoch": 0.7289263157894736, + "grad_norm": 0.41015625, + "learning_rate": 8.859906820883063e-05, + "loss": 3.3574, + "step": 17312 + }, + { + "epoch": 0.7289684210526316, + "grad_norm": 0.408203125, + "learning_rate": 8.857330009588816e-05, + "loss": 3.0753, + "step": 17313 + }, + { + "epoch": 0.7290105263157894, + "grad_norm": 0.41015625, + "learning_rate": 8.854753492395034e-05, + "loss": 3.0465, + "step": 17314 + }, + { + "epoch": 0.7290526315789474, + "grad_norm": 0.498046875, + "learning_rate": 8.852177269348682e-05, + "loss": 3.3348, + "step": 17315 + }, + { + "epoch": 0.7290947368421052, + "grad_norm": 0.494140625, + "learning_rate": 8.849601340496688e-05, + "loss": 3.3594, + "step": 17316 + }, + { + "epoch": 0.7291368421052632, + "grad_norm": 0.404296875, + "learning_rate": 8.847025705885983e-05, + "loss": 2.9826, + "step": 17317 + }, + { + "epoch": 0.729178947368421, + "grad_norm": 0.451171875, + "learning_rate": 8.84445036556349e-05, + "loss": 3.0789, + "step": 17318 + }, + { + "epoch": 0.729221052631579, + "grad_norm": 0.42578125, + "learning_rate": 8.841875319576118e-05, + "loss": 3.0824, + "step": 17319 + }, + { + "epoch": 0.7292631578947368, + "grad_norm": 0.427734375, + "learning_rate": 8.839300567970809e-05, + "loss": 3.3129, + "step": 17320 + }, + { + "epoch": 0.7293052631578948, + "grad_norm": 0.42578125, + "learning_rate": 8.836726110794443e-05, + "loss": 2.7362, + "step": 17321 + }, + { + "epoch": 0.7293473684210526, + "grad_norm": 0.439453125, + "learning_rate": 8.834151948093952e-05, + "loss": 3.0541, + "step": 17322 + }, + { + "epoch": 0.7293894736842105, + "grad_norm": 0.43359375, + "learning_rate": 8.8315780799162e-05, + "loss": 2.881, + "step": 17323 + }, + { + "epoch": 0.7294315789473684, + "grad_norm": 0.5234375, + "learning_rate": 8.829004506308106e-05, + "loss": 2.8521, + "step": 17324 + }, + { + "epoch": 0.7294736842105263, + "grad_norm": 0.4375, + "learning_rate": 8.826431227316553e-05, + "loss": 3.1212, + "step": 17325 + }, + { + "epoch": 0.7295157894736842, + "grad_norm": 0.443359375, + "learning_rate": 8.823858242988408e-05, + "loss": 2.97, + "step": 17326 + }, + { + "epoch": 0.7295578947368421, + "grad_norm": 0.41796875, + "learning_rate": 8.821285553370578e-05, + "loss": 2.902, + "step": 17327 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4140625, + "learning_rate": 8.8187131585099e-05, + "loss": 3.1538, + "step": 17328 + }, + { + "epoch": 0.7296421052631579, + "grad_norm": 0.419921875, + "learning_rate": 8.816141058453272e-05, + "loss": 3.1456, + "step": 17329 + }, + { + "epoch": 0.7296842105263158, + "grad_norm": 0.41796875, + "learning_rate": 8.813569253247522e-05, + "loss": 2.7389, + "step": 17330 + }, + { + "epoch": 0.7297263157894737, + "grad_norm": 0.43359375, + "learning_rate": 8.810997742939531e-05, + "loss": 3.4947, + "step": 17331 + }, + { + "epoch": 0.7297684210526316, + "grad_norm": 0.44140625, + "learning_rate": 8.808426527576138e-05, + "loss": 3.0299, + "step": 17332 + }, + { + "epoch": 0.7298105263157895, + "grad_norm": 0.4140625, + "learning_rate": 8.805855607204192e-05, + "loss": 2.8992, + "step": 17333 + }, + { + "epoch": 0.7298526315789474, + "grad_norm": 0.431640625, + "learning_rate": 8.803284981870527e-05, + "loss": 3.1772, + "step": 17334 + }, + { + "epoch": 0.7298947368421053, + "grad_norm": 0.4296875, + "learning_rate": 8.800714651621972e-05, + "loss": 3.0338, + "step": 17335 + }, + { + "epoch": 0.7299368421052631, + "grad_norm": 0.439453125, + "learning_rate": 8.798144616505377e-05, + "loss": 3.018, + "step": 17336 + }, + { + "epoch": 0.7299789473684211, + "grad_norm": 0.43359375, + "learning_rate": 8.795574876567539e-05, + "loss": 2.7872, + "step": 17337 + }, + { + "epoch": 0.7300210526315789, + "grad_norm": 0.443359375, + "learning_rate": 8.793005431855292e-05, + "loss": 3.0086, + "step": 17338 + }, + { + "epoch": 0.7300631578947369, + "grad_norm": 0.4453125, + "learning_rate": 8.790436282415443e-05, + "loss": 3.4956, + "step": 17339 + }, + { + "epoch": 0.7301052631578947, + "grad_norm": 0.443359375, + "learning_rate": 8.787867428294801e-05, + "loss": 3.4822, + "step": 17340 + }, + { + "epoch": 0.7301473684210527, + "grad_norm": 0.3984375, + "learning_rate": 8.785298869540167e-05, + "loss": 3.0295, + "step": 17341 + }, + { + "epoch": 0.7301894736842105, + "grad_norm": 0.443359375, + "learning_rate": 8.782730606198325e-05, + "loss": 3.0432, + "step": 17342 + }, + { + "epoch": 0.7302315789473685, + "grad_norm": 0.431640625, + "learning_rate": 8.780162638316097e-05, + "loss": 3.2298, + "step": 17343 + }, + { + "epoch": 0.7302736842105263, + "grad_norm": 0.41796875, + "learning_rate": 8.777594965940231e-05, + "loss": 3.3024, + "step": 17344 + }, + { + "epoch": 0.7303157894736843, + "grad_norm": 0.4375, + "learning_rate": 8.77502758911753e-05, + "loss": 3.5554, + "step": 17345 + }, + { + "epoch": 0.7303578947368421, + "grad_norm": 0.431640625, + "learning_rate": 8.772460507894764e-05, + "loss": 3.0164, + "step": 17346 + }, + { + "epoch": 0.7304, + "grad_norm": 0.416015625, + "learning_rate": 8.7698937223187e-05, + "loss": 3.1862, + "step": 17347 + }, + { + "epoch": 0.7304421052631579, + "grad_norm": 0.443359375, + "learning_rate": 8.767327232436103e-05, + "loss": 3.5387, + "step": 17348 + }, + { + "epoch": 0.7304842105263157, + "grad_norm": 0.427734375, + "learning_rate": 8.764761038293733e-05, + "loss": 2.7363, + "step": 17349 + }, + { + "epoch": 0.7305263157894737, + "grad_norm": 0.421875, + "learning_rate": 8.762195139938342e-05, + "loss": 2.899, + "step": 17350 + }, + { + "epoch": 0.7305684210526315, + "grad_norm": 0.41796875, + "learning_rate": 8.759629537416666e-05, + "loss": 2.8102, + "step": 17351 + }, + { + "epoch": 0.7306105263157895, + "grad_norm": 0.423828125, + "learning_rate": 8.757064230775466e-05, + "loss": 3.327, + "step": 17352 + }, + { + "epoch": 0.7306526315789473, + "grad_norm": 0.4609375, + "learning_rate": 8.754499220061473e-05, + "loss": 3.1067, + "step": 17353 + }, + { + "epoch": 0.7306947368421053, + "grad_norm": 0.40234375, + "learning_rate": 8.751934505321415e-05, + "loss": 2.5714, + "step": 17354 + }, + { + "epoch": 0.7307368421052631, + "grad_norm": 0.400390625, + "learning_rate": 8.74937008660202e-05, + "loss": 2.8484, + "step": 17355 + }, + { + "epoch": 0.7307789473684211, + "grad_norm": 0.412109375, + "learning_rate": 8.746805963950011e-05, + "loss": 3.0872, + "step": 17356 + }, + { + "epoch": 0.7308210526315789, + "grad_norm": 0.4375, + "learning_rate": 8.744242137412097e-05, + "loss": 2.6025, + "step": 17357 + }, + { + "epoch": 0.7308631578947369, + "grad_norm": 0.419921875, + "learning_rate": 8.741678607034986e-05, + "loss": 2.9304, + "step": 17358 + }, + { + "epoch": 0.7309052631578947, + "grad_norm": 0.421875, + "learning_rate": 8.739115372865394e-05, + "loss": 2.837, + "step": 17359 + }, + { + "epoch": 0.7309473684210527, + "grad_norm": 0.423828125, + "learning_rate": 8.736552434950015e-05, + "loss": 2.8668, + "step": 17360 + }, + { + "epoch": 0.7309894736842105, + "grad_norm": 0.4140625, + "learning_rate": 8.733989793335542e-05, + "loss": 2.9285, + "step": 17361 + }, + { + "epoch": 0.7310315789473684, + "grad_norm": 0.435546875, + "learning_rate": 8.731427448068666e-05, + "loss": 3.2715, + "step": 17362 + }, + { + "epoch": 0.7310736842105263, + "grad_norm": 0.453125, + "learning_rate": 8.728865399196062e-05, + "loss": 3.0269, + "step": 17363 + }, + { + "epoch": 0.7311157894736842, + "grad_norm": 0.419921875, + "learning_rate": 8.726303646764417e-05, + "loss": 2.8758, + "step": 17364 + }, + { + "epoch": 0.7311578947368421, + "grad_norm": 0.4140625, + "learning_rate": 8.723742190820396e-05, + "loss": 3.4082, + "step": 17365 + }, + { + "epoch": 0.7312, + "grad_norm": 0.396484375, + "learning_rate": 8.72118103141066e-05, + "loss": 3.1981, + "step": 17366 + }, + { + "epoch": 0.7312421052631579, + "grad_norm": 0.41015625, + "learning_rate": 8.71862016858189e-05, + "loss": 3.0439, + "step": 17367 + }, + { + "epoch": 0.7312842105263158, + "grad_norm": 0.431640625, + "learning_rate": 8.716059602380727e-05, + "loss": 3.0945, + "step": 17368 + }, + { + "epoch": 0.7313263157894737, + "grad_norm": 0.4296875, + "learning_rate": 8.713499332853828e-05, + "loss": 2.9448, + "step": 17369 + }, + { + "epoch": 0.7313684210526316, + "grad_norm": 0.455078125, + "learning_rate": 8.710939360047834e-05, + "loss": 3.3814, + "step": 17370 + }, + { + "epoch": 0.7314105263157895, + "grad_norm": 0.46484375, + "learning_rate": 8.708379684009387e-05, + "loss": 3.5724, + "step": 17371 + }, + { + "epoch": 0.7314526315789474, + "grad_norm": 0.41015625, + "learning_rate": 8.70582030478512e-05, + "loss": 3.1196, + "step": 17372 + }, + { + "epoch": 0.7314947368421053, + "grad_norm": 0.43359375, + "learning_rate": 8.703261222421652e-05, + "loss": 3.4754, + "step": 17373 + }, + { + "epoch": 0.7315368421052632, + "grad_norm": 0.427734375, + "learning_rate": 8.700702436965635e-05, + "loss": 3.0133, + "step": 17374 + }, + { + "epoch": 0.7315789473684211, + "grad_norm": 0.443359375, + "learning_rate": 8.698143948463652e-05, + "loss": 3.3963, + "step": 17375 + }, + { + "epoch": 0.731621052631579, + "grad_norm": 0.447265625, + "learning_rate": 8.69558575696234e-05, + "loss": 3.1036, + "step": 17376 + }, + { + "epoch": 0.7316631578947368, + "grad_norm": 0.5234375, + "learning_rate": 8.693027862508299e-05, + "loss": 2.8572, + "step": 17377 + }, + { + "epoch": 0.7317052631578947, + "grad_norm": 0.435546875, + "learning_rate": 8.690470265148124e-05, + "loss": 3.2866, + "step": 17378 + }, + { + "epoch": 0.7317473684210526, + "grad_norm": 0.4296875, + "learning_rate": 8.687912964928432e-05, + "loss": 2.8592, + "step": 17379 + }, + { + "epoch": 0.7317894736842105, + "grad_norm": 0.4375, + "learning_rate": 8.685355961895783e-05, + "loss": 3.1077, + "step": 17380 + }, + { + "epoch": 0.7318315789473684, + "grad_norm": 0.42578125, + "learning_rate": 8.682799256096796e-05, + "loss": 2.8482, + "step": 17381 + }, + { + "epoch": 0.7318736842105263, + "grad_norm": 0.40234375, + "learning_rate": 8.68024284757802e-05, + "loss": 2.9467, + "step": 17382 + }, + { + "epoch": 0.7319157894736842, + "grad_norm": 0.466796875, + "learning_rate": 8.67768673638605e-05, + "loss": 3.4864, + "step": 17383 + }, + { + "epoch": 0.7319578947368421, + "grad_norm": 0.419921875, + "learning_rate": 8.675130922567454e-05, + "loss": 2.9723, + "step": 17384 + }, + { + "epoch": 0.732, + "grad_norm": 0.427734375, + "learning_rate": 8.672575406168781e-05, + "loss": 3.2359, + "step": 17385 + }, + { + "epoch": 0.7320421052631579, + "grad_norm": 0.44140625, + "learning_rate": 8.670020187236616e-05, + "loss": 2.9765, + "step": 17386 + }, + { + "epoch": 0.7320842105263158, + "grad_norm": 0.42578125, + "learning_rate": 8.667465265817478e-05, + "loss": 3.1162, + "step": 17387 + }, + { + "epoch": 0.7321263157894737, + "grad_norm": 0.44140625, + "learning_rate": 8.664910641957955e-05, + "loss": 2.7987, + "step": 17388 + }, + { + "epoch": 0.7321684210526316, + "grad_norm": 0.435546875, + "learning_rate": 8.662356315704545e-05, + "loss": 3.2022, + "step": 17389 + }, + { + "epoch": 0.7322105263157894, + "grad_norm": 0.4296875, + "learning_rate": 8.659802287103827e-05, + "loss": 3.2669, + "step": 17390 + }, + { + "epoch": 0.7322526315789474, + "grad_norm": 0.427734375, + "learning_rate": 8.657248556202293e-05, + "loss": 2.9025, + "step": 17391 + }, + { + "epoch": 0.7322947368421052, + "grad_norm": 0.478515625, + "learning_rate": 8.6546951230465e-05, + "loss": 3.0792, + "step": 17392 + }, + { + "epoch": 0.7323368421052632, + "grad_norm": 0.439453125, + "learning_rate": 8.652141987682955e-05, + "loss": 2.8577, + "step": 17393 + }, + { + "epoch": 0.732378947368421, + "grad_norm": 0.435546875, + "learning_rate": 8.649589150158165e-05, + "loss": 3.2648, + "step": 17394 + }, + { + "epoch": 0.732421052631579, + "grad_norm": 0.421875, + "learning_rate": 8.647036610518671e-05, + "loss": 3.1647, + "step": 17395 + }, + { + "epoch": 0.7324631578947368, + "grad_norm": 0.453125, + "learning_rate": 8.644484368810937e-05, + "loss": 2.877, + "step": 17396 + }, + { + "epoch": 0.7325052631578948, + "grad_norm": 0.44921875, + "learning_rate": 8.641932425081497e-05, + "loss": 3.1518, + "step": 17397 + }, + { + "epoch": 0.7325473684210526, + "grad_norm": 0.427734375, + "learning_rate": 8.63938077937681e-05, + "loss": 2.8324, + "step": 17398 + }, + { + "epoch": 0.7325894736842106, + "grad_norm": 0.423828125, + "learning_rate": 8.636829431743393e-05, + "loss": 2.6568, + "step": 17399 + }, + { + "epoch": 0.7326315789473684, + "grad_norm": 0.455078125, + "learning_rate": 8.634278382227717e-05, + "loss": 3.2829, + "step": 17400 + }, + { + "epoch": 0.7326736842105264, + "grad_norm": 0.423828125, + "learning_rate": 8.631727630876251e-05, + "loss": 2.9695, + "step": 17401 + }, + { + "epoch": 0.7327157894736842, + "grad_norm": 0.4140625, + "learning_rate": 8.629177177735495e-05, + "loss": 2.8694, + "step": 17402 + }, + { + "epoch": 0.732757894736842, + "grad_norm": 0.44921875, + "learning_rate": 8.626627022851876e-05, + "loss": 2.9942, + "step": 17403 + }, + { + "epoch": 0.7328, + "grad_norm": 0.419921875, + "learning_rate": 8.624077166271893e-05, + "loss": 3.0147, + "step": 17404 + }, + { + "epoch": 0.7328421052631579, + "grad_norm": 0.4296875, + "learning_rate": 8.621527608041968e-05, + "loss": 3.1181, + "step": 17405 + }, + { + "epoch": 0.7328842105263158, + "grad_norm": 0.408203125, + "learning_rate": 8.618978348208573e-05, + "loss": 2.9844, + "step": 17406 + }, + { + "epoch": 0.7329263157894736, + "grad_norm": 0.443359375, + "learning_rate": 8.616429386818145e-05, + "loss": 2.9553, + "step": 17407 + }, + { + "epoch": 0.7329684210526316, + "grad_norm": 0.421875, + "learning_rate": 8.613880723917123e-05, + "loss": 3.0633, + "step": 17408 + }, + { + "epoch": 0.7330105263157894, + "grad_norm": 0.41796875, + "learning_rate": 8.611332359551943e-05, + "loss": 3.2271, + "step": 17409 + }, + { + "epoch": 0.7330526315789474, + "grad_norm": 0.42578125, + "learning_rate": 8.608784293769023e-05, + "loss": 3.3197, + "step": 17410 + }, + { + "epoch": 0.7330947368421052, + "grad_norm": 0.4296875, + "learning_rate": 8.60623652661481e-05, + "loss": 3.4767, + "step": 17411 + }, + { + "epoch": 0.7331368421052632, + "grad_norm": 0.451171875, + "learning_rate": 8.603689058135688e-05, + "loss": 3.3598, + "step": 17412 + }, + { + "epoch": 0.733178947368421, + "grad_norm": 0.4765625, + "learning_rate": 8.601141888378097e-05, + "loss": 3.1161, + "step": 17413 + }, + { + "epoch": 0.733221052631579, + "grad_norm": 0.44140625, + "learning_rate": 8.598595017388433e-05, + "loss": 3.0147, + "step": 17414 + }, + { + "epoch": 0.7332631578947368, + "grad_norm": 0.4375, + "learning_rate": 8.596048445213096e-05, + "loss": 2.6017, + "step": 17415 + }, + { + "epoch": 0.7333052631578947, + "grad_norm": 0.412109375, + "learning_rate": 8.593502171898482e-05, + "loss": 3.2113, + "step": 17416 + }, + { + "epoch": 0.7333473684210526, + "grad_norm": 0.453125, + "learning_rate": 8.590956197490981e-05, + "loss": 3.2438, + "step": 17417 + }, + { + "epoch": 0.7333894736842105, + "grad_norm": 0.453125, + "learning_rate": 8.588410522036979e-05, + "loss": 3.2906, + "step": 17418 + }, + { + "epoch": 0.7334315789473684, + "grad_norm": 0.40234375, + "learning_rate": 8.585865145582847e-05, + "loss": 3.1117, + "step": 17419 + }, + { + "epoch": 0.7334736842105263, + "grad_norm": 0.51953125, + "learning_rate": 8.583320068174974e-05, + "loss": 3.1481, + "step": 17420 + }, + { + "epoch": 0.7335157894736842, + "grad_norm": 0.427734375, + "learning_rate": 8.580775289859721e-05, + "loss": 3.1312, + "step": 17421 + }, + { + "epoch": 0.7335578947368421, + "grad_norm": 0.431640625, + "learning_rate": 8.578230810683449e-05, + "loss": 3.1325, + "step": 17422 + }, + { + "epoch": 0.7336, + "grad_norm": 0.44921875, + "learning_rate": 8.57568663069252e-05, + "loss": 2.9373, + "step": 17423 + }, + { + "epoch": 0.7336421052631579, + "grad_norm": 0.46484375, + "learning_rate": 8.57314274993328e-05, + "loss": 2.9642, + "step": 17424 + }, + { + "epoch": 0.7336842105263158, + "grad_norm": 0.4140625, + "learning_rate": 8.57059916845207e-05, + "loss": 2.8016, + "step": 17425 + }, + { + "epoch": 0.7337263157894737, + "grad_norm": 0.45703125, + "learning_rate": 8.568055886295248e-05, + "loss": 3.4876, + "step": 17426 + }, + { + "epoch": 0.7337684210526316, + "grad_norm": 0.453125, + "learning_rate": 8.565512903509143e-05, + "loss": 3.1125, + "step": 17427 + }, + { + "epoch": 0.7338105263157895, + "grad_norm": 0.455078125, + "learning_rate": 8.562970220140079e-05, + "loss": 2.6941, + "step": 17428 + }, + { + "epoch": 0.7338526315789474, + "grad_norm": 0.421875, + "learning_rate": 8.560427836234388e-05, + "loss": 3.1189, + "step": 17429 + }, + { + "epoch": 0.7338947368421053, + "grad_norm": 0.458984375, + "learning_rate": 8.557885751838387e-05, + "loss": 3.0854, + "step": 17430 + }, + { + "epoch": 0.7339368421052631, + "grad_norm": 0.41796875, + "learning_rate": 8.555343966998385e-05, + "loss": 3.1352, + "step": 17431 + }, + { + "epoch": 0.7339789473684211, + "grad_norm": 0.453125, + "learning_rate": 8.552802481760691e-05, + "loss": 3.4583, + "step": 17432 + }, + { + "epoch": 0.7340210526315789, + "grad_norm": 0.451171875, + "learning_rate": 8.550261296171624e-05, + "loss": 3.0273, + "step": 17433 + }, + { + "epoch": 0.7340631578947369, + "grad_norm": 0.435546875, + "learning_rate": 8.547720410277454e-05, + "loss": 2.8165, + "step": 17434 + }, + { + "epoch": 0.7341052631578947, + "grad_norm": 0.439453125, + "learning_rate": 8.545179824124493e-05, + "loss": 2.7775, + "step": 17435 + }, + { + "epoch": 0.7341473684210527, + "grad_norm": 0.427734375, + "learning_rate": 8.542639537759025e-05, + "loss": 3.1867, + "step": 17436 + }, + { + "epoch": 0.7341894736842105, + "grad_norm": 0.43359375, + "learning_rate": 8.540099551227326e-05, + "loss": 2.9713, + "step": 17437 + }, + { + "epoch": 0.7342315789473685, + "grad_norm": 0.71484375, + "learning_rate": 8.537559864575675e-05, + "loss": 2.9672, + "step": 17438 + }, + { + "epoch": 0.7342736842105263, + "grad_norm": 0.458984375, + "learning_rate": 8.535020477850333e-05, + "loss": 2.9943, + "step": 17439 + }, + { + "epoch": 0.7343157894736843, + "grad_norm": 0.42578125, + "learning_rate": 8.532481391097587e-05, + "loss": 3.2691, + "step": 17440 + }, + { + "epoch": 0.7343578947368421, + "grad_norm": 0.4609375, + "learning_rate": 8.529942604363666e-05, + "loss": 3.1355, + "step": 17441 + }, + { + "epoch": 0.7344, + "grad_norm": 0.427734375, + "learning_rate": 8.527404117694849e-05, + "loss": 3.1819, + "step": 17442 + }, + { + "epoch": 0.7344421052631579, + "grad_norm": 0.43359375, + "learning_rate": 8.52486593113737e-05, + "loss": 3.0089, + "step": 17443 + }, + { + "epoch": 0.7344842105263157, + "grad_norm": 0.427734375, + "learning_rate": 8.522328044737479e-05, + "loss": 3.1019, + "step": 17444 + }, + { + "epoch": 0.7345263157894737, + "grad_norm": 0.4453125, + "learning_rate": 8.519790458541409e-05, + "loss": 3.2983, + "step": 17445 + }, + { + "epoch": 0.7345684210526315, + "grad_norm": 0.42578125, + "learning_rate": 8.517253172595388e-05, + "loss": 3.4151, + "step": 17446 + }, + { + "epoch": 0.7346105263157895, + "grad_norm": 0.439453125, + "learning_rate": 8.514716186945664e-05, + "loss": 3.4249, + "step": 17447 + }, + { + "epoch": 0.7346526315789473, + "grad_norm": 0.416015625, + "learning_rate": 8.512179501638423e-05, + "loss": 2.7593, + "step": 17448 + }, + { + "epoch": 0.7346947368421053, + "grad_norm": 0.4296875, + "learning_rate": 8.509643116719918e-05, + "loss": 2.776, + "step": 17449 + }, + { + "epoch": 0.7347368421052631, + "grad_norm": 0.412109375, + "learning_rate": 8.507107032236322e-05, + "loss": 3.0074, + "step": 17450 + }, + { + "epoch": 0.7347789473684211, + "grad_norm": 0.404296875, + "learning_rate": 8.504571248233867e-05, + "loss": 2.7538, + "step": 17451 + }, + { + "epoch": 0.7348210526315789, + "grad_norm": 0.4296875, + "learning_rate": 8.502035764758739e-05, + "loss": 3.3589, + "step": 17452 + }, + { + "epoch": 0.7348631578947369, + "grad_norm": 0.44921875, + "learning_rate": 8.499500581857131e-05, + "loss": 3.1937, + "step": 17453 + }, + { + "epoch": 0.7349052631578947, + "grad_norm": 0.4296875, + "learning_rate": 8.496965699575249e-05, + "loss": 2.8483, + "step": 17454 + }, + { + "epoch": 0.7349473684210527, + "grad_norm": 0.458984375, + "learning_rate": 8.494431117959247e-05, + "loss": 3.2585, + "step": 17455 + }, + { + "epoch": 0.7349894736842105, + "grad_norm": 0.421875, + "learning_rate": 8.491896837055333e-05, + "loss": 2.7329, + "step": 17456 + }, + { + "epoch": 0.7350315789473684, + "grad_norm": 0.47265625, + "learning_rate": 8.489362856909643e-05, + "loss": 3.1627, + "step": 17457 + }, + { + "epoch": 0.7350736842105263, + "grad_norm": 0.41796875, + "learning_rate": 8.486829177568375e-05, + "loss": 3.3207, + "step": 17458 + }, + { + "epoch": 0.7351157894736842, + "grad_norm": 0.44921875, + "learning_rate": 8.484295799077674e-05, + "loss": 2.8375, + "step": 17459 + }, + { + "epoch": 0.7351578947368421, + "grad_norm": 0.42578125, + "learning_rate": 8.481762721483699e-05, + "loss": 2.9576, + "step": 17460 + }, + { + "epoch": 0.7352, + "grad_norm": 0.4609375, + "learning_rate": 8.479229944832601e-05, + "loss": 2.9608, + "step": 17461 + }, + { + "epoch": 0.7352421052631579, + "grad_norm": 0.4375, + "learning_rate": 8.476697469170514e-05, + "loss": 2.7885, + "step": 17462 + }, + { + "epoch": 0.7352842105263158, + "grad_norm": 0.43359375, + "learning_rate": 8.474165294543598e-05, + "loss": 3.1125, + "step": 17463 + }, + { + "epoch": 0.7353263157894737, + "grad_norm": 0.408203125, + "learning_rate": 8.471633420997959e-05, + "loss": 3.2301, + "step": 17464 + }, + { + "epoch": 0.7353684210526316, + "grad_norm": 0.4140625, + "learning_rate": 8.469101848579757e-05, + "loss": 3.1992, + "step": 17465 + }, + { + "epoch": 0.7354105263157895, + "grad_norm": 0.435546875, + "learning_rate": 8.466570577335078e-05, + "loss": 2.7817, + "step": 17466 + }, + { + "epoch": 0.7354526315789474, + "grad_norm": 0.42578125, + "learning_rate": 8.464039607310067e-05, + "loss": 2.8892, + "step": 17467 + }, + { + "epoch": 0.7354947368421053, + "grad_norm": 0.435546875, + "learning_rate": 8.461508938550825e-05, + "loss": 2.4723, + "step": 17468 + }, + { + "epoch": 0.7355368421052632, + "grad_norm": 0.4140625, + "learning_rate": 8.458978571103448e-05, + "loss": 3.1733, + "step": 17469 + }, + { + "epoch": 0.735578947368421, + "grad_norm": 0.427734375, + "learning_rate": 8.456448505014064e-05, + "loss": 3.1467, + "step": 17470 + }, + { + "epoch": 0.735621052631579, + "grad_norm": 0.4609375, + "learning_rate": 8.453918740328733e-05, + "loss": 3.214, + "step": 17471 + }, + { + "epoch": 0.7356631578947368, + "grad_norm": 0.4140625, + "learning_rate": 8.45138927709358e-05, + "loss": 3.3429, + "step": 17472 + }, + { + "epoch": 0.7357052631578948, + "grad_norm": 0.42578125, + "learning_rate": 8.448860115354653e-05, + "loss": 3.1089, + "step": 17473 + }, + { + "epoch": 0.7357473684210526, + "grad_norm": 0.484375, + "learning_rate": 8.446331255158057e-05, + "loss": 2.9917, + "step": 17474 + }, + { + "epoch": 0.7357894736842105, + "grad_norm": 0.431640625, + "learning_rate": 8.443802696549857e-05, + "loss": 3.214, + "step": 17475 + }, + { + "epoch": 0.7358315789473684, + "grad_norm": 0.42578125, + "learning_rate": 8.441274439576115e-05, + "loss": 2.8693, + "step": 17476 + }, + { + "epoch": 0.7358736842105263, + "grad_norm": 0.44140625, + "learning_rate": 8.4387464842829e-05, + "loss": 3.1647, + "step": 17477 + }, + { + "epoch": 0.7359157894736842, + "grad_norm": 0.408203125, + "learning_rate": 8.436218830716258e-05, + "loss": 2.6969, + "step": 17478 + }, + { + "epoch": 0.7359578947368421, + "grad_norm": 0.43359375, + "learning_rate": 8.433691478922254e-05, + "loss": 2.813, + "step": 17479 + }, + { + "epoch": 0.736, + "grad_norm": 0.427734375, + "learning_rate": 8.431164428946927e-05, + "loss": 3.3372, + "step": 17480 + }, + { + "epoch": 0.7360421052631579, + "grad_norm": 0.458984375, + "learning_rate": 8.428637680836316e-05, + "loss": 3.085, + "step": 17481 + }, + { + "epoch": 0.7360842105263158, + "grad_norm": 0.4375, + "learning_rate": 8.426111234636455e-05, + "loss": 3.0584, + "step": 17482 + }, + { + "epoch": 0.7361263157894737, + "grad_norm": 0.435546875, + "learning_rate": 8.423585090393376e-05, + "loss": 3.6094, + "step": 17483 + }, + { + "epoch": 0.7361684210526316, + "grad_norm": 0.439453125, + "learning_rate": 8.421059248153098e-05, + "loss": 2.8345, + "step": 17484 + }, + { + "epoch": 0.7362105263157894, + "grad_norm": 0.4296875, + "learning_rate": 8.418533707961634e-05, + "loss": 3.0573, + "step": 17485 + }, + { + "epoch": 0.7362526315789474, + "grad_norm": 0.443359375, + "learning_rate": 8.416008469865013e-05, + "loss": 3.5585, + "step": 17486 + }, + { + "epoch": 0.7362947368421052, + "grad_norm": 0.609375, + "learning_rate": 8.413483533909231e-05, + "loss": 3.2086, + "step": 17487 + }, + { + "epoch": 0.7363368421052632, + "grad_norm": 0.423828125, + "learning_rate": 8.41095890014029e-05, + "loss": 3.4099, + "step": 17488 + }, + { + "epoch": 0.736378947368421, + "grad_norm": 0.435546875, + "learning_rate": 8.408434568604187e-05, + "loss": 3.4943, + "step": 17489 + }, + { + "epoch": 0.736421052631579, + "grad_norm": 0.423828125, + "learning_rate": 8.405910539346911e-05, + "loss": 3.2247, + "step": 17490 + }, + { + "epoch": 0.7364631578947368, + "grad_norm": 0.451171875, + "learning_rate": 8.403386812414448e-05, + "loss": 3.1184, + "step": 17491 + }, + { + "epoch": 0.7365052631578948, + "grad_norm": 0.439453125, + "learning_rate": 8.400863387852778e-05, + "loss": 3.0231, + "step": 17492 + }, + { + "epoch": 0.7365473684210526, + "grad_norm": 0.4140625, + "learning_rate": 8.398340265707866e-05, + "loss": 2.3049, + "step": 17493 + }, + { + "epoch": 0.7365894736842106, + "grad_norm": 0.431640625, + "learning_rate": 8.395817446025698e-05, + "loss": 2.8502, + "step": 17494 + }, + { + "epoch": 0.7366315789473684, + "grad_norm": 0.435546875, + "learning_rate": 8.393294928852224e-05, + "loss": 3.0826, + "step": 17495 + }, + { + "epoch": 0.7366736842105264, + "grad_norm": 0.462890625, + "learning_rate": 8.390772714233405e-05, + "loss": 2.8338, + "step": 17496 + }, + { + "epoch": 0.7367157894736842, + "grad_norm": 0.4453125, + "learning_rate": 8.388250802215191e-05, + "loss": 2.6943, + "step": 17497 + }, + { + "epoch": 0.7367578947368421, + "grad_norm": 0.3984375, + "learning_rate": 8.38572919284353e-05, + "loss": 2.7742, + "step": 17498 + }, + { + "epoch": 0.7368, + "grad_norm": 0.455078125, + "learning_rate": 8.383207886164365e-05, + "loss": 3.2979, + "step": 17499 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.439453125, + "learning_rate": 8.380686882223618e-05, + "loss": 3.0861, + "step": 17500 + }, + { + "epoch": 0.7368842105263158, + "grad_norm": 0.4375, + "learning_rate": 8.378166181067238e-05, + "loss": 2.9101, + "step": 17501 + }, + { + "epoch": 0.7369263157894737, + "grad_norm": 0.44140625, + "learning_rate": 8.375645782741137e-05, + "loss": 3.4054, + "step": 17502 + }, + { + "epoch": 0.7369684210526316, + "grad_norm": 0.419921875, + "learning_rate": 8.373125687291241e-05, + "loss": 3.0726, + "step": 17503 + }, + { + "epoch": 0.7370105263157894, + "grad_norm": 0.412109375, + "learning_rate": 8.370605894763459e-05, + "loss": 2.8369, + "step": 17504 + }, + { + "epoch": 0.7370526315789474, + "grad_norm": 0.4296875, + "learning_rate": 8.368086405203695e-05, + "loss": 3.2243, + "step": 17505 + }, + { + "epoch": 0.7370947368421052, + "grad_norm": 0.41015625, + "learning_rate": 8.36556721865786e-05, + "loss": 2.6013, + "step": 17506 + }, + { + "epoch": 0.7371368421052632, + "grad_norm": 0.423828125, + "learning_rate": 8.363048335171833e-05, + "loss": 3.0765, + "step": 17507 + }, + { + "epoch": 0.737178947368421, + "grad_norm": 0.4296875, + "learning_rate": 8.360529754791538e-05, + "loss": 3.3717, + "step": 17508 + }, + { + "epoch": 0.737221052631579, + "grad_norm": 0.40625, + "learning_rate": 8.35801147756282e-05, + "loss": 2.8376, + "step": 17509 + }, + { + "epoch": 0.7372631578947368, + "grad_norm": 0.451171875, + "learning_rate": 8.355493503531592e-05, + "loss": 2.9544, + "step": 17510 + }, + { + "epoch": 0.7373052631578947, + "grad_norm": 0.416015625, + "learning_rate": 8.352975832743712e-05, + "loss": 3.192, + "step": 17511 + }, + { + "epoch": 0.7373473684210526, + "grad_norm": 0.451171875, + "learning_rate": 8.350458465245056e-05, + "loss": 3.2392, + "step": 17512 + }, + { + "epoch": 0.7373894736842105, + "grad_norm": 0.43359375, + "learning_rate": 8.347941401081483e-05, + "loss": 3.1794, + "step": 17513 + }, + { + "epoch": 0.7374315789473684, + "grad_norm": 0.43359375, + "learning_rate": 8.345424640298843e-05, + "loss": 2.9829, + "step": 17514 + }, + { + "epoch": 0.7374736842105263, + "grad_norm": 0.423828125, + "learning_rate": 8.342908182943016e-05, + "loss": 2.9809, + "step": 17515 + }, + { + "epoch": 0.7375157894736842, + "grad_norm": 0.435546875, + "learning_rate": 8.340392029059812e-05, + "loss": 2.8339, + "step": 17516 + }, + { + "epoch": 0.7375578947368421, + "grad_norm": 0.482421875, + "learning_rate": 8.3378761786951e-05, + "loss": 3.1362, + "step": 17517 + }, + { + "epoch": 0.7376, + "grad_norm": 0.416015625, + "learning_rate": 8.335360631894709e-05, + "loss": 2.7523, + "step": 17518 + }, + { + "epoch": 0.7376421052631579, + "grad_norm": 0.421875, + "learning_rate": 8.332845388704466e-05, + "loss": 3.086, + "step": 17519 + }, + { + "epoch": 0.7376842105263158, + "grad_norm": 0.431640625, + "learning_rate": 8.330330449170195e-05, + "loss": 3.1042, + "step": 17520 + }, + { + "epoch": 0.7377263157894737, + "grad_norm": 0.4140625, + "learning_rate": 8.32781581333771e-05, + "loss": 2.8653, + "step": 17521 + }, + { + "epoch": 0.7377684210526316, + "grad_norm": 0.416015625, + "learning_rate": 8.325301481252848e-05, + "loss": 2.6848, + "step": 17522 + }, + { + "epoch": 0.7378105263157895, + "grad_norm": 0.4453125, + "learning_rate": 8.322787452961387e-05, + "loss": 3.2506, + "step": 17523 + }, + { + "epoch": 0.7378526315789473, + "grad_norm": 0.42578125, + "learning_rate": 8.320273728509159e-05, + "loss": 2.9591, + "step": 17524 + }, + { + "epoch": 0.7378947368421053, + "grad_norm": 0.69921875, + "learning_rate": 8.317760307941927e-05, + "loss": 2.7788, + "step": 17525 + }, + { + "epoch": 0.7379368421052631, + "grad_norm": 0.44140625, + "learning_rate": 8.315247191305511e-05, + "loss": 3.026, + "step": 17526 + }, + { + "epoch": 0.7379789473684211, + "grad_norm": 0.412109375, + "learning_rate": 8.31273437864569e-05, + "loss": 2.8661, + "step": 17527 + }, + { + "epoch": 0.7380210526315789, + "grad_norm": 0.451171875, + "learning_rate": 8.31022187000823e-05, + "loss": 3.0473, + "step": 17528 + }, + { + "epoch": 0.7380631578947369, + "grad_norm": 0.427734375, + "learning_rate": 8.307709665438937e-05, + "loss": 3.0359, + "step": 17529 + }, + { + "epoch": 0.7381052631578947, + "grad_norm": 0.44140625, + "learning_rate": 8.305197764983541e-05, + "loss": 2.9506, + "step": 17530 + }, + { + "epoch": 0.7381473684210527, + "grad_norm": 0.451171875, + "learning_rate": 8.302686168687845e-05, + "loss": 3.1856, + "step": 17531 + }, + { + "epoch": 0.7381894736842105, + "grad_norm": 0.400390625, + "learning_rate": 8.300174876597575e-05, + "loss": 3.0519, + "step": 17532 + }, + { + "epoch": 0.7382315789473685, + "grad_norm": 0.443359375, + "learning_rate": 8.2976638887585e-05, + "loss": 3.1356, + "step": 17533 + }, + { + "epoch": 0.7382736842105263, + "grad_norm": 0.43359375, + "learning_rate": 8.295153205216369e-05, + "loss": 2.8951, + "step": 17534 + }, + { + "epoch": 0.7383157894736843, + "grad_norm": 0.4375, + "learning_rate": 8.292642826016916e-05, + "loss": 2.9037, + "step": 17535 + }, + { + "epoch": 0.7383578947368421, + "grad_norm": 0.439453125, + "learning_rate": 8.290132751205881e-05, + "loss": 3.191, + "step": 17536 + }, + { + "epoch": 0.7384, + "grad_norm": 0.431640625, + "learning_rate": 8.287622980828988e-05, + "loss": 3.085, + "step": 17537 + }, + { + "epoch": 0.7384421052631579, + "grad_norm": 0.412109375, + "learning_rate": 8.285113514931983e-05, + "loss": 3.3099, + "step": 17538 + }, + { + "epoch": 0.7384842105263157, + "grad_norm": 0.412109375, + "learning_rate": 8.282604353560552e-05, + "loss": 3.357, + "step": 17539 + }, + { + "epoch": 0.7385263157894737, + "grad_norm": 0.4296875, + "learning_rate": 8.280095496760439e-05, + "loss": 3.0421, + "step": 17540 + }, + { + "epoch": 0.7385684210526315, + "grad_norm": 0.42578125, + "learning_rate": 8.27758694457734e-05, + "loss": 3.1583, + "step": 17541 + }, + { + "epoch": 0.7386105263157895, + "grad_norm": 0.416015625, + "learning_rate": 8.27507869705696e-05, + "loss": 2.7443, + "step": 17542 + }, + { + "epoch": 0.7386526315789473, + "grad_norm": 0.46484375, + "learning_rate": 8.272570754244993e-05, + "loss": 3.1097, + "step": 17543 + }, + { + "epoch": 0.7386947368421053, + "grad_norm": 0.439453125, + "learning_rate": 8.270063116187127e-05, + "loss": 3.0966, + "step": 17544 + }, + { + "epoch": 0.7387368421052631, + "grad_norm": 0.416015625, + "learning_rate": 8.267555782929067e-05, + "loss": 3.2179, + "step": 17545 + }, + { + "epoch": 0.7387789473684211, + "grad_norm": 0.453125, + "learning_rate": 8.265048754516466e-05, + "loss": 2.6872, + "step": 17546 + }, + { + "epoch": 0.7388210526315789, + "grad_norm": 0.41796875, + "learning_rate": 8.262542030995024e-05, + "loss": 3.0463, + "step": 17547 + }, + { + "epoch": 0.7388631578947369, + "grad_norm": 0.40625, + "learning_rate": 8.260035612410399e-05, + "loss": 3.2567, + "step": 17548 + }, + { + "epoch": 0.7389052631578947, + "grad_norm": 0.431640625, + "learning_rate": 8.257529498808256e-05, + "loss": 2.7769, + "step": 17549 + }, + { + "epoch": 0.7389473684210527, + "grad_norm": 0.4296875, + "learning_rate": 8.255023690234257e-05, + "loss": 3.2597, + "step": 17550 + }, + { + "epoch": 0.7389894736842105, + "grad_norm": 0.42578125, + "learning_rate": 8.252518186734048e-05, + "loss": 3.1473, + "step": 17551 + }, + { + "epoch": 0.7390315789473684, + "grad_norm": 0.455078125, + "learning_rate": 8.250012988353282e-05, + "loss": 2.8691, + "step": 17552 + }, + { + "epoch": 0.7390736842105263, + "grad_norm": 0.427734375, + "learning_rate": 8.24750809513759e-05, + "loss": 3.0088, + "step": 17553 + }, + { + "epoch": 0.7391157894736842, + "grad_norm": 0.41796875, + "learning_rate": 8.245003507132625e-05, + "loss": 3.2294, + "step": 17554 + }, + { + "epoch": 0.7391578947368421, + "grad_norm": 0.4375, + "learning_rate": 8.24249922438401e-05, + "loss": 2.9939, + "step": 17555 + }, + { + "epoch": 0.7392, + "grad_norm": 0.447265625, + "learning_rate": 8.239995246937371e-05, + "loss": 2.546, + "step": 17556 + }, + { + "epoch": 0.7392421052631579, + "grad_norm": 0.453125, + "learning_rate": 8.237491574838327e-05, + "loss": 3.105, + "step": 17557 + }, + { + "epoch": 0.7392842105263158, + "grad_norm": 0.431640625, + "learning_rate": 8.234988208132487e-05, + "loss": 3.1455, + "step": 17558 + }, + { + "epoch": 0.7393263157894737, + "grad_norm": 0.427734375, + "learning_rate": 8.232485146865468e-05, + "loss": 2.7145, + "step": 17559 + }, + { + "epoch": 0.7393684210526316, + "grad_norm": 0.41015625, + "learning_rate": 8.22998239108286e-05, + "loss": 2.9565, + "step": 17560 + }, + { + "epoch": 0.7394105263157895, + "grad_norm": 0.41796875, + "learning_rate": 8.227479940830277e-05, + "loss": 2.9454, + "step": 17561 + }, + { + "epoch": 0.7394526315789474, + "grad_norm": 0.419921875, + "learning_rate": 8.224977796153302e-05, + "loss": 2.2829, + "step": 17562 + }, + { + "epoch": 0.7394947368421053, + "grad_norm": 0.43359375, + "learning_rate": 8.222475957097523e-05, + "loss": 3.337, + "step": 17563 + }, + { + "epoch": 0.7395368421052632, + "grad_norm": 0.427734375, + "learning_rate": 8.219974423708514e-05, + "loss": 3.1087, + "step": 17564 + }, + { + "epoch": 0.739578947368421, + "grad_norm": 0.423828125, + "learning_rate": 8.21747319603186e-05, + "loss": 2.9359, + "step": 17565 + }, + { + "epoch": 0.739621052631579, + "grad_norm": 0.40625, + "learning_rate": 8.214972274113125e-05, + "loss": 2.619, + "step": 17566 + }, + { + "epoch": 0.7396631578947368, + "grad_norm": 0.388671875, + "learning_rate": 8.212471657997875e-05, + "loss": 2.6439, + "step": 17567 + }, + { + "epoch": 0.7397052631578948, + "grad_norm": 0.421875, + "learning_rate": 8.209971347731656e-05, + "loss": 2.7097, + "step": 17568 + }, + { + "epoch": 0.7397473684210526, + "grad_norm": 0.45703125, + "learning_rate": 8.20747134336004e-05, + "loss": 3.1776, + "step": 17569 + }, + { + "epoch": 0.7397894736842106, + "grad_norm": 0.453125, + "learning_rate": 8.20497164492857e-05, + "loss": 3.1628, + "step": 17570 + }, + { + "epoch": 0.7398315789473684, + "grad_norm": 0.4453125, + "learning_rate": 8.20247225248278e-05, + "loss": 2.8349, + "step": 17571 + }, + { + "epoch": 0.7398736842105264, + "grad_norm": 0.40625, + "learning_rate": 8.199973166068206e-05, + "loss": 2.7887, + "step": 17572 + }, + { + "epoch": 0.7399157894736842, + "grad_norm": 0.416015625, + "learning_rate": 8.197474385730377e-05, + "loss": 2.8789, + "step": 17573 + }, + { + "epoch": 0.7399578947368421, + "grad_norm": 0.40625, + "learning_rate": 8.19497591151484e-05, + "loss": 2.6945, + "step": 17574 + }, + { + "epoch": 0.74, + "grad_norm": 0.4140625, + "learning_rate": 8.192477743467078e-05, + "loss": 2.8433, + "step": 17575 + }, + { + "epoch": 0.740042105263158, + "grad_norm": 0.4765625, + "learning_rate": 8.189979881632634e-05, + "loss": 2.8476, + "step": 17576 + }, + { + "epoch": 0.7400842105263158, + "grad_norm": 0.431640625, + "learning_rate": 8.187482326057002e-05, + "loss": 2.9385, + "step": 17577 + }, + { + "epoch": 0.7401263157894736, + "grad_norm": 0.41796875, + "learning_rate": 8.184985076785692e-05, + "loss": 3.172, + "step": 17578 + }, + { + "epoch": 0.7401684210526316, + "grad_norm": 0.4140625, + "learning_rate": 8.182488133864197e-05, + "loss": 2.9247, + "step": 17579 + }, + { + "epoch": 0.7402105263157894, + "grad_norm": 0.4609375, + "learning_rate": 8.179991497337996e-05, + "loss": 3.6467, + "step": 17580 + }, + { + "epoch": 0.7402526315789474, + "grad_norm": 0.4140625, + "learning_rate": 8.177495167252608e-05, + "loss": 2.848, + "step": 17581 + }, + { + "epoch": 0.7402947368421052, + "grad_norm": 0.43359375, + "learning_rate": 8.174999143653475e-05, + "loss": 3.0746, + "step": 17582 + }, + { + "epoch": 0.7403368421052632, + "grad_norm": 0.42578125, + "learning_rate": 8.172503426586103e-05, + "loss": 3.3865, + "step": 17583 + }, + { + "epoch": 0.740378947368421, + "grad_norm": 0.451171875, + "learning_rate": 8.170008016095934e-05, + "loss": 2.9487, + "step": 17584 + }, + { + "epoch": 0.740421052631579, + "grad_norm": 0.41015625, + "learning_rate": 8.16751291222845e-05, + "loss": 3.0877, + "step": 17585 + }, + { + "epoch": 0.7404631578947368, + "grad_norm": 0.439453125, + "learning_rate": 8.165018115029105e-05, + "loss": 3.1764, + "step": 17586 + }, + { + "epoch": 0.7405052631578948, + "grad_norm": 0.44140625, + "learning_rate": 8.162523624543342e-05, + "loss": 2.9305, + "step": 17587 + }, + { + "epoch": 0.7405473684210526, + "grad_norm": 0.455078125, + "learning_rate": 8.16002944081663e-05, + "loss": 2.6253, + "step": 17588 + }, + { + "epoch": 0.7405894736842106, + "grad_norm": 0.43359375, + "learning_rate": 8.157535563894381e-05, + "loss": 3.064, + "step": 17589 + }, + { + "epoch": 0.7406315789473684, + "grad_norm": 0.44921875, + "learning_rate": 8.15504199382206e-05, + "loss": 2.7958, + "step": 17590 + }, + { + "epoch": 0.7406736842105263, + "grad_norm": 0.3984375, + "learning_rate": 8.152548730645065e-05, + "loss": 2.5647, + "step": 17591 + }, + { + "epoch": 0.7407157894736842, + "grad_norm": 0.45703125, + "learning_rate": 8.150055774408847e-05, + "loss": 3.0793, + "step": 17592 + }, + { + "epoch": 0.7407578947368421, + "grad_norm": 0.46875, + "learning_rate": 8.147563125158814e-05, + "loss": 3.1509, + "step": 17593 + }, + { + "epoch": 0.7408, + "grad_norm": 0.439453125, + "learning_rate": 8.145070782940377e-05, + "loss": 2.9246, + "step": 17594 + }, + { + "epoch": 0.7408421052631579, + "grad_norm": 0.4375, + "learning_rate": 8.142578747798951e-05, + "loss": 3.0869, + "step": 17595 + }, + { + "epoch": 0.7408842105263158, + "grad_norm": 0.423828125, + "learning_rate": 8.140087019779924e-05, + "loss": 3.3136, + "step": 17596 + }, + { + "epoch": 0.7409263157894737, + "grad_norm": 0.43359375, + "learning_rate": 8.137595598928718e-05, + "loss": 3.2361, + "step": 17597 + }, + { + "epoch": 0.7409684210526316, + "grad_norm": 0.4296875, + "learning_rate": 8.135104485290693e-05, + "loss": 3.0041, + "step": 17598 + }, + { + "epoch": 0.7410105263157895, + "grad_norm": 0.439453125, + "learning_rate": 8.132613678911263e-05, + "loss": 2.9811, + "step": 17599 + }, + { + "epoch": 0.7410526315789474, + "grad_norm": 0.4140625, + "learning_rate": 8.130123179835777e-05, + "loss": 3.1249, + "step": 17600 + }, + { + "epoch": 0.7410947368421053, + "grad_norm": 0.427734375, + "learning_rate": 8.127632988109634e-05, + "loss": 3.2064, + "step": 17601 + }, + { + "epoch": 0.7411368421052632, + "grad_norm": 0.4296875, + "learning_rate": 8.125143103778196e-05, + "loss": 2.7307, + "step": 17602 + }, + { + "epoch": 0.741178947368421, + "grad_norm": 0.408203125, + "learning_rate": 8.122653526886814e-05, + "loss": 3.3496, + "step": 17603 + }, + { + "epoch": 0.741221052631579, + "grad_norm": 0.412109375, + "learning_rate": 8.12016425748087e-05, + "loss": 2.8468, + "step": 17604 + }, + { + "epoch": 0.7412631578947368, + "grad_norm": 0.427734375, + "learning_rate": 8.117675295605684e-05, + "loss": 2.8568, + "step": 17605 + }, + { + "epoch": 0.7413052631578947, + "grad_norm": 0.439453125, + "learning_rate": 8.115186641306636e-05, + "loss": 3.268, + "step": 17606 + }, + { + "epoch": 0.7413473684210526, + "grad_norm": 0.404296875, + "learning_rate": 8.112698294629032e-05, + "loss": 3.4852, + "step": 17607 + }, + { + "epoch": 0.7413894736842105, + "grad_norm": 0.4375, + "learning_rate": 8.110210255618231e-05, + "loss": 3.6075, + "step": 17608 + }, + { + "epoch": 0.7414315789473684, + "grad_norm": 0.423828125, + "learning_rate": 8.107722524319552e-05, + "loss": 3.1941, + "step": 17609 + }, + { + "epoch": 0.7414736842105263, + "grad_norm": 0.43359375, + "learning_rate": 8.105235100778325e-05, + "loss": 3.1158, + "step": 17610 + }, + { + "epoch": 0.7415157894736842, + "grad_norm": 0.431640625, + "learning_rate": 8.10274798503986e-05, + "loss": 3.2473, + "step": 17611 + }, + { + "epoch": 0.7415578947368421, + "grad_norm": 0.458984375, + "learning_rate": 8.100261177149468e-05, + "loss": 3.1386, + "step": 17612 + }, + { + "epoch": 0.7416, + "grad_norm": 0.435546875, + "learning_rate": 8.097774677152475e-05, + "loss": 3.4269, + "step": 17613 + }, + { + "epoch": 0.7416421052631579, + "grad_norm": 0.419921875, + "learning_rate": 8.095288485094155e-05, + "loss": 3.2482, + "step": 17614 + }, + { + "epoch": 0.7416842105263158, + "grad_norm": 0.42578125, + "learning_rate": 8.092802601019822e-05, + "loss": 2.6936, + "step": 17615 + }, + { + "epoch": 0.7417263157894737, + "grad_norm": 0.431640625, + "learning_rate": 8.09031702497476e-05, + "loss": 3.1977, + "step": 17616 + }, + { + "epoch": 0.7417684210526316, + "grad_norm": 0.4296875, + "learning_rate": 8.087831757004258e-05, + "loss": 3.1649, + "step": 17617 + }, + { + "epoch": 0.7418105263157895, + "grad_norm": 0.412109375, + "learning_rate": 8.085346797153586e-05, + "loss": 3.0625, + "step": 17618 + }, + { + "epoch": 0.7418526315789473, + "grad_norm": 0.40625, + "learning_rate": 8.082862145468015e-05, + "loss": 3.4446, + "step": 17619 + }, + { + "epoch": 0.7418947368421053, + "grad_norm": 0.427734375, + "learning_rate": 8.080377801992835e-05, + "loss": 3.2164, + "step": 17620 + }, + { + "epoch": 0.7419368421052631, + "grad_norm": 0.451171875, + "learning_rate": 8.077893766773276e-05, + "loss": 2.9525, + "step": 17621 + }, + { + "epoch": 0.7419789473684211, + "grad_norm": 0.482421875, + "learning_rate": 8.075410039854616e-05, + "loss": 3.4812, + "step": 17622 + }, + { + "epoch": 0.7420210526315789, + "grad_norm": 0.4296875, + "learning_rate": 8.072926621282104e-05, + "loss": 3.3057, + "step": 17623 + }, + { + "epoch": 0.7420631578947369, + "grad_norm": 0.423828125, + "learning_rate": 8.070443511100975e-05, + "loss": 3.3029, + "step": 17624 + }, + { + "epoch": 0.7421052631578947, + "grad_norm": 0.44140625, + "learning_rate": 8.067960709356478e-05, + "loss": 3.3961, + "step": 17625 + }, + { + "epoch": 0.7421473684210527, + "grad_norm": 4.75, + "learning_rate": 8.065478216093841e-05, + "loss": 3.2198, + "step": 17626 + }, + { + "epoch": 0.7421894736842105, + "grad_norm": 0.4375, + "learning_rate": 8.062996031358286e-05, + "loss": 3.3833, + "step": 17627 + }, + { + "epoch": 0.7422315789473685, + "grad_norm": 0.4453125, + "learning_rate": 8.060514155195048e-05, + "loss": 2.692, + "step": 17628 + }, + { + "epoch": 0.7422736842105263, + "grad_norm": 0.43359375, + "learning_rate": 8.058032587649341e-05, + "loss": 3.2869, + "step": 17629 + }, + { + "epoch": 0.7423157894736843, + "grad_norm": 0.43359375, + "learning_rate": 8.055551328766377e-05, + "loss": 3.0522, + "step": 17630 + }, + { + "epoch": 0.7423578947368421, + "grad_norm": 0.44921875, + "learning_rate": 8.053070378591357e-05, + "loss": 3.1317, + "step": 17631 + }, + { + "epoch": 0.7424, + "grad_norm": 0.421875, + "learning_rate": 8.050589737169483e-05, + "loss": 2.8788, + "step": 17632 + }, + { + "epoch": 0.7424421052631579, + "grad_norm": 0.4453125, + "learning_rate": 8.048109404545948e-05, + "loss": 3.075, + "step": 17633 + }, + { + "epoch": 0.7424842105263157, + "grad_norm": 0.427734375, + "learning_rate": 8.045629380765934e-05, + "loss": 3.1869, + "step": 17634 + }, + { + "epoch": 0.7425263157894737, + "grad_norm": 0.423828125, + "learning_rate": 8.043149665874641e-05, + "loss": 2.9315, + "step": 17635 + }, + { + "epoch": 0.7425684210526315, + "grad_norm": 0.435546875, + "learning_rate": 8.040670259917238e-05, + "loss": 3.0078, + "step": 17636 + }, + { + "epoch": 0.7426105263157895, + "grad_norm": 0.431640625, + "learning_rate": 8.038191162938896e-05, + "loss": 3.2411, + "step": 17637 + }, + { + "epoch": 0.7426526315789473, + "grad_norm": 0.443359375, + "learning_rate": 8.035712374984783e-05, + "loss": 3.0164, + "step": 17638 + }, + { + "epoch": 0.7426947368421053, + "grad_norm": 0.4140625, + "learning_rate": 8.033233896100056e-05, + "loss": 3.2539, + "step": 17639 + }, + { + "epoch": 0.7427368421052631, + "grad_norm": 0.4375, + "learning_rate": 8.030755726329874e-05, + "loss": 2.764, + "step": 17640 + }, + { + "epoch": 0.7427789473684211, + "grad_norm": 0.427734375, + "learning_rate": 8.028277865719378e-05, + "loss": 2.9396, + "step": 17641 + }, + { + "epoch": 0.7428210526315789, + "grad_norm": 0.419921875, + "learning_rate": 8.02580031431373e-05, + "loss": 2.8634, + "step": 17642 + }, + { + "epoch": 0.7428631578947369, + "grad_norm": 0.42578125, + "learning_rate": 8.023323072158042e-05, + "loss": 2.7683, + "step": 17643 + }, + { + "epoch": 0.7429052631578947, + "grad_norm": 0.43359375, + "learning_rate": 8.020846139297472e-05, + "loss": 3.4987, + "step": 17644 + }, + { + "epoch": 0.7429473684210526, + "grad_norm": 0.41796875, + "learning_rate": 8.018369515777133e-05, + "loss": 2.975, + "step": 17645 + }, + { + "epoch": 0.7429894736842105, + "grad_norm": 0.4140625, + "learning_rate": 8.015893201642149e-05, + "loss": 2.9077, + "step": 17646 + }, + { + "epoch": 0.7430315789473684, + "grad_norm": 0.46484375, + "learning_rate": 8.013417196937636e-05, + "loss": 3.2693, + "step": 17647 + }, + { + "epoch": 0.7430736842105263, + "grad_norm": 0.43359375, + "learning_rate": 8.010941501708693e-05, + "loss": 3.349, + "step": 17648 + }, + { + "epoch": 0.7431157894736842, + "grad_norm": 0.419921875, + "learning_rate": 8.008466116000454e-05, + "loss": 3.3587, + "step": 17649 + }, + { + "epoch": 0.7431578947368421, + "grad_norm": 0.419921875, + "learning_rate": 8.005991039857979e-05, + "loss": 3.1469, + "step": 17650 + }, + { + "epoch": 0.7432, + "grad_norm": 0.43359375, + "learning_rate": 8.003516273326392e-05, + "loss": 3.2724, + "step": 17651 + }, + { + "epoch": 0.7432421052631579, + "grad_norm": 0.4140625, + "learning_rate": 8.001041816450763e-05, + "loss": 2.8888, + "step": 17652 + }, + { + "epoch": 0.7432842105263158, + "grad_norm": 0.4296875, + "learning_rate": 7.998567669276183e-05, + "loss": 2.745, + "step": 17653 + }, + { + "epoch": 0.7433263157894737, + "grad_norm": 0.45703125, + "learning_rate": 7.996093831847723e-05, + "loss": 2.9117, + "step": 17654 + }, + { + "epoch": 0.7433684210526316, + "grad_norm": 0.421875, + "learning_rate": 7.993620304210447e-05, + "loss": 3.3102, + "step": 17655 + }, + { + "epoch": 0.7434105263157895, + "grad_norm": 0.451171875, + "learning_rate": 7.991147086409445e-05, + "loss": 3.4329, + "step": 17656 + }, + { + "epoch": 0.7434526315789474, + "grad_norm": 0.439453125, + "learning_rate": 7.988674178489741e-05, + "loss": 3.1856, + "step": 17657 + }, + { + "epoch": 0.7434947368421052, + "grad_norm": 0.458984375, + "learning_rate": 7.986201580496422e-05, + "loss": 3.6201, + "step": 17658 + }, + { + "epoch": 0.7435368421052632, + "grad_norm": 0.3984375, + "learning_rate": 7.983729292474504e-05, + "loss": 2.9935, + "step": 17659 + }, + { + "epoch": 0.743578947368421, + "grad_norm": 0.44921875, + "learning_rate": 7.981257314469051e-05, + "loss": 3.1357, + "step": 17660 + }, + { + "epoch": 0.743621052631579, + "grad_norm": 0.4375, + "learning_rate": 7.978785646525097e-05, + "loss": 3.3248, + "step": 17661 + }, + { + "epoch": 0.7436631578947368, + "grad_norm": 0.43359375, + "learning_rate": 7.976314288687658e-05, + "loss": 2.3929, + "step": 17662 + }, + { + "epoch": 0.7437052631578948, + "grad_norm": 0.4375, + "learning_rate": 7.973843241001788e-05, + "loss": 3.0709, + "step": 17663 + }, + { + "epoch": 0.7437473684210526, + "grad_norm": 0.408203125, + "learning_rate": 7.971372503512475e-05, + "loss": 2.5831, + "step": 17664 + }, + { + "epoch": 0.7437894736842106, + "grad_norm": 0.458984375, + "learning_rate": 7.968902076264761e-05, + "loss": 2.8337, + "step": 17665 + }, + { + "epoch": 0.7438315789473684, + "grad_norm": 0.431640625, + "learning_rate": 7.966431959303624e-05, + "loss": 3.4095, + "step": 17666 + }, + { + "epoch": 0.7438736842105264, + "grad_norm": 0.423828125, + "learning_rate": 7.9639621526741e-05, + "loss": 2.9965, + "step": 17667 + }, + { + "epoch": 0.7439157894736842, + "grad_norm": 0.4140625, + "learning_rate": 7.961492656421155e-05, + "loss": 3.3854, + "step": 17668 + }, + { + "epoch": 0.7439578947368422, + "grad_norm": 0.43359375, + "learning_rate": 7.959023470589799e-05, + "loss": 2.9396, + "step": 17669 + }, + { + "epoch": 0.744, + "grad_norm": 0.419921875, + "learning_rate": 7.956554595225015e-05, + "loss": 3.1431, + "step": 17670 + }, + { + "epoch": 0.744042105263158, + "grad_norm": 0.44140625, + "learning_rate": 7.954086030371774e-05, + "loss": 3.0147, + "step": 17671 + }, + { + "epoch": 0.7440842105263158, + "grad_norm": 0.427734375, + "learning_rate": 7.95161777607507e-05, + "loss": 2.9955, + "step": 17672 + }, + { + "epoch": 0.7441263157894736, + "grad_norm": 0.4296875, + "learning_rate": 7.949149832379846e-05, + "loss": 2.6265, + "step": 17673 + }, + { + "epoch": 0.7441684210526316, + "grad_norm": 0.431640625, + "learning_rate": 7.946682199331088e-05, + "loss": 3.0518, + "step": 17674 + }, + { + "epoch": 0.7442105263157894, + "grad_norm": 0.451171875, + "learning_rate": 7.944214876973738e-05, + "loss": 3.3039, + "step": 17675 + }, + { + "epoch": 0.7442526315789474, + "grad_norm": 0.4453125, + "learning_rate": 7.941747865352758e-05, + "loss": 2.5217, + "step": 17676 + }, + { + "epoch": 0.7442947368421052, + "grad_norm": 0.41015625, + "learning_rate": 7.939281164513088e-05, + "loss": 3.2892, + "step": 17677 + }, + { + "epoch": 0.7443368421052632, + "grad_norm": 0.416015625, + "learning_rate": 7.936814774499659e-05, + "loss": 2.758, + "step": 17678 + }, + { + "epoch": 0.744378947368421, + "grad_norm": 0.4296875, + "learning_rate": 7.934348695357435e-05, + "loss": 3.3586, + "step": 17679 + }, + { + "epoch": 0.744421052631579, + "grad_norm": 0.447265625, + "learning_rate": 7.931882927131309e-05, + "loss": 2.8103, + "step": 17680 + }, + { + "epoch": 0.7444631578947368, + "grad_norm": 0.455078125, + "learning_rate": 7.929417469866232e-05, + "loss": 3.2136, + "step": 17681 + }, + { + "epoch": 0.7445052631578948, + "grad_norm": 0.4375, + "learning_rate": 7.926952323607109e-05, + "loss": 2.8392, + "step": 17682 + }, + { + "epoch": 0.7445473684210526, + "grad_norm": 0.439453125, + "learning_rate": 7.924487488398857e-05, + "loss": 3.2488, + "step": 17683 + }, + { + "epoch": 0.7445894736842106, + "grad_norm": 0.412109375, + "learning_rate": 7.92202296428638e-05, + "loss": 2.9607, + "step": 17684 + }, + { + "epoch": 0.7446315789473684, + "grad_norm": 0.4296875, + "learning_rate": 7.919558751314576e-05, + "loss": 3.4319, + "step": 17685 + }, + { + "epoch": 0.7446736842105263, + "grad_norm": 0.41015625, + "learning_rate": 7.917094849528344e-05, + "loss": 3.2068, + "step": 17686 + }, + { + "epoch": 0.7447157894736842, + "grad_norm": 0.42578125, + "learning_rate": 7.914631258972566e-05, + "loss": 3.3719, + "step": 17687 + }, + { + "epoch": 0.7447578947368421, + "grad_norm": 0.443359375, + "learning_rate": 7.91216797969214e-05, + "loss": 3.1213, + "step": 17688 + }, + { + "epoch": 0.7448, + "grad_norm": 0.45703125, + "learning_rate": 7.909705011731936e-05, + "loss": 2.9213, + "step": 17689 + }, + { + "epoch": 0.7448421052631579, + "grad_norm": 0.416015625, + "learning_rate": 7.907242355136823e-05, + "loss": 2.9728, + "step": 17690 + }, + { + "epoch": 0.7448842105263158, + "grad_norm": 0.458984375, + "learning_rate": 7.904780009951673e-05, + "loss": 2.916, + "step": 17691 + }, + { + "epoch": 0.7449263157894737, + "grad_norm": 0.41015625, + "learning_rate": 7.902317976221346e-05, + "loss": 3.3855, + "step": 17692 + }, + { + "epoch": 0.7449684210526316, + "grad_norm": 0.4453125, + "learning_rate": 7.899856253990698e-05, + "loss": 3.0976, + "step": 17693 + }, + { + "epoch": 0.7450105263157895, + "grad_norm": 0.431640625, + "learning_rate": 7.897394843304567e-05, + "loss": 3.2046, + "step": 17694 + }, + { + "epoch": 0.7450526315789474, + "grad_norm": 0.42578125, + "learning_rate": 7.894933744207816e-05, + "loss": 3.3728, + "step": 17695 + }, + { + "epoch": 0.7450947368421053, + "grad_norm": 0.427734375, + "learning_rate": 7.892472956745272e-05, + "loss": 3.4017, + "step": 17696 + }, + { + "epoch": 0.7451368421052632, + "grad_norm": 0.421875, + "learning_rate": 7.890012480961773e-05, + "loss": 3.0002, + "step": 17697 + }, + { + "epoch": 0.745178947368421, + "grad_norm": 0.453125, + "learning_rate": 7.887552316902144e-05, + "loss": 3.3021, + "step": 17698 + }, + { + "epoch": 0.7452210526315789, + "grad_norm": 0.416015625, + "learning_rate": 7.885092464611204e-05, + "loss": 2.9502, + "step": 17699 + }, + { + "epoch": 0.7452631578947368, + "grad_norm": 0.431640625, + "learning_rate": 7.88263292413377e-05, + "loss": 3.3809, + "step": 17700 + }, + { + "epoch": 0.7453052631578947, + "grad_norm": 0.443359375, + "learning_rate": 7.880173695514653e-05, + "loss": 3.1285, + "step": 17701 + }, + { + "epoch": 0.7453473684210526, + "grad_norm": 0.443359375, + "learning_rate": 7.877714778798645e-05, + "loss": 2.9476, + "step": 17702 + }, + { + "epoch": 0.7453894736842105, + "grad_norm": 0.412109375, + "learning_rate": 7.875256174030567e-05, + "loss": 3.11, + "step": 17703 + }, + { + "epoch": 0.7454315789473684, + "grad_norm": 0.41796875, + "learning_rate": 7.8727978812552e-05, + "loss": 3.4689, + "step": 17704 + }, + { + "epoch": 0.7454736842105263, + "grad_norm": 0.43359375, + "learning_rate": 7.870339900517331e-05, + "loss": 3.4221, + "step": 17705 + }, + { + "epoch": 0.7455157894736842, + "grad_norm": 0.404296875, + "learning_rate": 7.867882231861745e-05, + "loss": 3.1046, + "step": 17706 + }, + { + "epoch": 0.7455578947368421, + "grad_norm": 0.44140625, + "learning_rate": 7.865424875333216e-05, + "loss": 3.3729, + "step": 17707 + }, + { + "epoch": 0.7456, + "grad_norm": 0.419921875, + "learning_rate": 7.862967830976511e-05, + "loss": 2.9193, + "step": 17708 + }, + { + "epoch": 0.7456421052631579, + "grad_norm": 0.44921875, + "learning_rate": 7.860511098836387e-05, + "loss": 3.326, + "step": 17709 + }, + { + "epoch": 0.7456842105263158, + "grad_norm": 0.443359375, + "learning_rate": 7.858054678957633e-05, + "loss": 3.0627, + "step": 17710 + }, + { + "epoch": 0.7457263157894737, + "grad_norm": 0.4140625, + "learning_rate": 7.855598571384964e-05, + "loss": 2.9963, + "step": 17711 + }, + { + "epoch": 0.7457684210526315, + "grad_norm": 0.44140625, + "learning_rate": 7.853142776163152e-05, + "loss": 3.0309, + "step": 17712 + }, + { + "epoch": 0.7458105263157895, + "grad_norm": 0.427734375, + "learning_rate": 7.850687293336933e-05, + "loss": 3.2342, + "step": 17713 + }, + { + "epoch": 0.7458526315789473, + "grad_norm": 0.4296875, + "learning_rate": 7.848232122951041e-05, + "loss": 3.0369, + "step": 17714 + }, + { + "epoch": 0.7458947368421053, + "grad_norm": 0.419921875, + "learning_rate": 7.845777265050208e-05, + "loss": 3.0712, + "step": 17715 + }, + { + "epoch": 0.7459368421052631, + "grad_norm": 0.42578125, + "learning_rate": 7.843322719679149e-05, + "loss": 3.3188, + "step": 17716 + }, + { + "epoch": 0.7459789473684211, + "grad_norm": 0.421875, + "learning_rate": 7.84086848688261e-05, + "loss": 2.8848, + "step": 17717 + }, + { + "epoch": 0.7460210526315789, + "grad_norm": 0.412109375, + "learning_rate": 7.838414566705268e-05, + "loss": 2.8543, + "step": 17718 + }, + { + "epoch": 0.7460631578947369, + "grad_norm": 0.625, + "learning_rate": 7.83596095919186e-05, + "loss": 3.1319, + "step": 17719 + }, + { + "epoch": 0.7461052631578947, + "grad_norm": 0.423828125, + "learning_rate": 7.833507664387071e-05, + "loss": 3.0337, + "step": 17720 + }, + { + "epoch": 0.7461473684210527, + "grad_norm": 0.41796875, + "learning_rate": 7.831054682335598e-05, + "loss": 2.845, + "step": 17721 + }, + { + "epoch": 0.7461894736842105, + "grad_norm": 0.396484375, + "learning_rate": 7.82860201308215e-05, + "loss": 2.5879, + "step": 17722 + }, + { + "epoch": 0.7462315789473685, + "grad_norm": 0.431640625, + "learning_rate": 7.826149656671386e-05, + "loss": 2.9478, + "step": 17723 + }, + { + "epoch": 0.7462736842105263, + "grad_norm": 0.431640625, + "learning_rate": 7.823697613148009e-05, + "loss": 3.3218, + "step": 17724 + }, + { + "epoch": 0.7463157894736843, + "grad_norm": 0.42578125, + "learning_rate": 7.821245882556665e-05, + "loss": 3.0221, + "step": 17725 + }, + { + "epoch": 0.7463578947368421, + "grad_norm": 0.412109375, + "learning_rate": 7.818794464942052e-05, + "loss": 3.1068, + "step": 17726 + }, + { + "epoch": 0.7464, + "grad_norm": 0.43359375, + "learning_rate": 7.816343360348801e-05, + "loss": 2.767, + "step": 17727 + }, + { + "epoch": 0.7464421052631579, + "grad_norm": 0.451171875, + "learning_rate": 7.813892568821593e-05, + "loss": 3.6161, + "step": 17728 + }, + { + "epoch": 0.7464842105263158, + "grad_norm": 0.427734375, + "learning_rate": 7.811442090405069e-05, + "loss": 3.2526, + "step": 17729 + }, + { + "epoch": 0.7465263157894737, + "grad_norm": 0.4296875, + "learning_rate": 7.808991925143869e-05, + "loss": 3.4836, + "step": 17730 + }, + { + "epoch": 0.7465684210526315, + "grad_norm": 0.419921875, + "learning_rate": 7.806542073082649e-05, + "loss": 3.4092, + "step": 17731 + }, + { + "epoch": 0.7466105263157895, + "grad_norm": 0.421875, + "learning_rate": 7.804092534266017e-05, + "loss": 2.8611, + "step": 17732 + }, + { + "epoch": 0.7466526315789473, + "grad_norm": 0.43359375, + "learning_rate": 7.801643308738631e-05, + "loss": 3.3969, + "step": 17733 + }, + { + "epoch": 0.7466947368421053, + "grad_norm": 0.400390625, + "learning_rate": 7.799194396545082e-05, + "loss": 2.8292, + "step": 17734 + }, + { + "epoch": 0.7467368421052631, + "grad_norm": 0.455078125, + "learning_rate": 7.796745797730007e-05, + "loss": 3.3956, + "step": 17735 + }, + { + "epoch": 0.7467789473684211, + "grad_norm": 0.416015625, + "learning_rate": 7.794297512338008e-05, + "loss": 3.2819, + "step": 17736 + }, + { + "epoch": 0.7468210526315789, + "grad_norm": 0.44140625, + "learning_rate": 7.791849540413698e-05, + "loss": 3.1499, + "step": 17737 + }, + { + "epoch": 0.7468631578947369, + "grad_norm": 0.4375, + "learning_rate": 7.789401882001668e-05, + "loss": 3.3349, + "step": 17738 + }, + { + "epoch": 0.7469052631578947, + "grad_norm": 0.41796875, + "learning_rate": 7.786954537146504e-05, + "loss": 3.4044, + "step": 17739 + }, + { + "epoch": 0.7469473684210526, + "grad_norm": 0.4296875, + "learning_rate": 7.784507505892824e-05, + "loss": 3.0916, + "step": 17740 + }, + { + "epoch": 0.7469894736842105, + "grad_norm": 0.451171875, + "learning_rate": 7.782060788285169e-05, + "loss": 2.8171, + "step": 17741 + }, + { + "epoch": 0.7470315789473684, + "grad_norm": 0.421875, + "learning_rate": 7.779614384368147e-05, + "loss": 2.8135, + "step": 17742 + }, + { + "epoch": 0.7470736842105263, + "grad_norm": 0.43359375, + "learning_rate": 7.777168294186317e-05, + "loss": 3.1717, + "step": 17743 + }, + { + "epoch": 0.7471157894736842, + "grad_norm": 0.431640625, + "learning_rate": 7.774722517784246e-05, + "loss": 3.1439, + "step": 17744 + }, + { + "epoch": 0.7471578947368421, + "grad_norm": 0.42578125, + "learning_rate": 7.772277055206492e-05, + "loss": 3.2358, + "step": 17745 + }, + { + "epoch": 0.7472, + "grad_norm": 0.431640625, + "learning_rate": 7.769831906497602e-05, + "loss": 3.0483, + "step": 17746 + }, + { + "epoch": 0.7472421052631579, + "grad_norm": 0.41796875, + "learning_rate": 7.767387071702142e-05, + "loss": 2.9274, + "step": 17747 + }, + { + "epoch": 0.7472842105263158, + "grad_norm": 0.439453125, + "learning_rate": 7.764942550864629e-05, + "loss": 3.3444, + "step": 17748 + }, + { + "epoch": 0.7473263157894737, + "grad_norm": 0.466796875, + "learning_rate": 7.762498344029617e-05, + "loss": 2.9817, + "step": 17749 + }, + { + "epoch": 0.7473684210526316, + "grad_norm": 0.546875, + "learning_rate": 7.760054451241636e-05, + "loss": 2.7918, + "step": 17750 + }, + { + "epoch": 0.7474105263157895, + "grad_norm": 0.419921875, + "learning_rate": 7.757610872545204e-05, + "loss": 3.4325, + "step": 17751 + }, + { + "epoch": 0.7474526315789474, + "grad_norm": 0.42578125, + "learning_rate": 7.755167607984845e-05, + "loss": 3.4448, + "step": 17752 + }, + { + "epoch": 0.7474947368421052, + "grad_norm": 0.408203125, + "learning_rate": 7.752724657605068e-05, + "loss": 3.2694, + "step": 17753 + }, + { + "epoch": 0.7475368421052632, + "grad_norm": 0.419921875, + "learning_rate": 7.750282021450384e-05, + "loss": 3.0342, + "step": 17754 + }, + { + "epoch": 0.747578947368421, + "grad_norm": 0.44140625, + "learning_rate": 7.747839699565287e-05, + "loss": 3.2658, + "step": 17755 + }, + { + "epoch": 0.747621052631579, + "grad_norm": 0.470703125, + "learning_rate": 7.745397691994285e-05, + "loss": 3.0521, + "step": 17756 + }, + { + "epoch": 0.7476631578947368, + "grad_norm": 0.451171875, + "learning_rate": 7.742955998781867e-05, + "loss": 3.279, + "step": 17757 + }, + { + "epoch": 0.7477052631578948, + "grad_norm": 0.421875, + "learning_rate": 7.740514619972513e-05, + "loss": 3.0341, + "step": 17758 + }, + { + "epoch": 0.7477473684210526, + "grad_norm": 0.427734375, + "learning_rate": 7.738073555610705e-05, + "loss": 3.1511, + "step": 17759 + }, + { + "epoch": 0.7477894736842106, + "grad_norm": 0.4296875, + "learning_rate": 7.735632805740914e-05, + "loss": 3.4554, + "step": 17760 + }, + { + "epoch": 0.7478315789473684, + "grad_norm": 0.412109375, + "learning_rate": 7.733192370407607e-05, + "loss": 3.0547, + "step": 17761 + }, + { + "epoch": 0.7478736842105264, + "grad_norm": 0.462890625, + "learning_rate": 7.730752249655242e-05, + "loss": 3.2991, + "step": 17762 + }, + { + "epoch": 0.7479157894736842, + "grad_norm": 0.408203125, + "learning_rate": 7.728312443528288e-05, + "loss": 3.0757, + "step": 17763 + }, + { + "epoch": 0.7479578947368422, + "grad_norm": 0.42578125, + "learning_rate": 7.725872952071186e-05, + "loss": 2.4171, + "step": 17764 + }, + { + "epoch": 0.748, + "grad_norm": 0.42578125, + "learning_rate": 7.723433775328384e-05, + "loss": 3.2037, + "step": 17765 + }, + { + "epoch": 0.7480421052631578, + "grad_norm": 0.44140625, + "learning_rate": 7.72099491334432e-05, + "loss": 3.2609, + "step": 17766 + }, + { + "epoch": 0.7480842105263158, + "grad_norm": 0.44921875, + "learning_rate": 7.718556366163427e-05, + "loss": 3.1533, + "step": 17767 + }, + { + "epoch": 0.7481263157894736, + "grad_norm": 0.44921875, + "learning_rate": 7.71611813383013e-05, + "loss": 3.0509, + "step": 17768 + }, + { + "epoch": 0.7481684210526316, + "grad_norm": 0.435546875, + "learning_rate": 7.713680216388855e-05, + "loss": 3.2013, + "step": 17769 + }, + { + "epoch": 0.7482105263157894, + "grad_norm": 0.4296875, + "learning_rate": 7.711242613884007e-05, + "loss": 3.1478, + "step": 17770 + }, + { + "epoch": 0.7482526315789474, + "grad_norm": 0.427734375, + "learning_rate": 7.708805326360013e-05, + "loss": 3.1851, + "step": 17771 + }, + { + "epoch": 0.7482947368421052, + "grad_norm": 0.412109375, + "learning_rate": 7.706368353861268e-05, + "loss": 2.7668, + "step": 17772 + }, + { + "epoch": 0.7483368421052632, + "grad_norm": 0.490234375, + "learning_rate": 7.703931696432174e-05, + "loss": 3.357, + "step": 17773 + }, + { + "epoch": 0.748378947368421, + "grad_norm": 0.42578125, + "learning_rate": 7.701495354117121e-05, + "loss": 3.493, + "step": 17774 + }, + { + "epoch": 0.748421052631579, + "grad_norm": 0.41015625, + "learning_rate": 7.699059326960492e-05, + "loss": 2.7973, + "step": 17775 + }, + { + "epoch": 0.7484631578947368, + "grad_norm": 0.435546875, + "learning_rate": 7.69662361500669e-05, + "loss": 2.5367, + "step": 17776 + }, + { + "epoch": 0.7485052631578948, + "grad_norm": 0.431640625, + "learning_rate": 7.694188218300061e-05, + "loss": 3.1258, + "step": 17777 + }, + { + "epoch": 0.7485473684210526, + "grad_norm": 0.439453125, + "learning_rate": 7.691753136884993e-05, + "loss": 3.0488, + "step": 17778 + }, + { + "epoch": 0.7485894736842105, + "grad_norm": 0.453125, + "learning_rate": 7.689318370805847e-05, + "loss": 2.9457, + "step": 17779 + }, + { + "epoch": 0.7486315789473684, + "grad_norm": 0.435546875, + "learning_rate": 7.686883920106982e-05, + "loss": 2.7233, + "step": 17780 + }, + { + "epoch": 0.7486736842105263, + "grad_norm": 0.431640625, + "learning_rate": 7.68444978483275e-05, + "loss": 3.1506, + "step": 17781 + }, + { + "epoch": 0.7487157894736842, + "grad_norm": 0.40625, + "learning_rate": 7.682015965027492e-05, + "loss": 3.1776, + "step": 17782 + }, + { + "epoch": 0.7487578947368421, + "grad_norm": 0.408203125, + "learning_rate": 7.67958246073557e-05, + "loss": 3.2016, + "step": 17783 + }, + { + "epoch": 0.7488, + "grad_norm": 0.455078125, + "learning_rate": 7.677149272001288e-05, + "loss": 3.0499, + "step": 17784 + }, + { + "epoch": 0.7488421052631579, + "grad_norm": 0.400390625, + "learning_rate": 7.674716398869009e-05, + "loss": 3.0218, + "step": 17785 + }, + { + "epoch": 0.7488842105263158, + "grad_norm": 0.53125, + "learning_rate": 7.672283841383027e-05, + "loss": 3.1776, + "step": 17786 + }, + { + "epoch": 0.7489263157894737, + "grad_norm": 0.3984375, + "learning_rate": 7.66985159958768e-05, + "loss": 2.9598, + "step": 17787 + }, + { + "epoch": 0.7489684210526316, + "grad_norm": 0.51171875, + "learning_rate": 7.667419673527273e-05, + "loss": 3.0317, + "step": 17788 + }, + { + "epoch": 0.7490105263157895, + "grad_norm": 0.42578125, + "learning_rate": 7.66498806324611e-05, + "loss": 2.7939, + "step": 17789 + }, + { + "epoch": 0.7490526315789474, + "grad_norm": 0.431640625, + "learning_rate": 7.662556768788512e-05, + "loss": 3.1224, + "step": 17790 + }, + { + "epoch": 0.7490947368421053, + "grad_norm": 0.4375, + "learning_rate": 7.660125790198741e-05, + "loss": 3.3367, + "step": 17791 + }, + { + "epoch": 0.7491368421052632, + "grad_norm": 0.439453125, + "learning_rate": 7.657695127521122e-05, + "loss": 2.8424, + "step": 17792 + }, + { + "epoch": 0.749178947368421, + "grad_norm": 0.423828125, + "learning_rate": 7.655264780799907e-05, + "loss": 3.4951, + "step": 17793 + }, + { + "epoch": 0.7492210526315789, + "grad_norm": 0.462890625, + "learning_rate": 7.652834750079394e-05, + "loss": 3.3853, + "step": 17794 + }, + { + "epoch": 0.7492631578947369, + "grad_norm": 0.419921875, + "learning_rate": 7.650405035403848e-05, + "loss": 3.1308, + "step": 17795 + }, + { + "epoch": 0.7493052631578947, + "grad_norm": 0.41796875, + "learning_rate": 7.647975636817542e-05, + "loss": 3.2264, + "step": 17796 + }, + { + "epoch": 0.7493473684210527, + "grad_norm": 0.44921875, + "learning_rate": 7.645546554364729e-05, + "loss": 3.1742, + "step": 17797 + }, + { + "epoch": 0.7493894736842105, + "grad_norm": 0.423828125, + "learning_rate": 7.643117788089659e-05, + "loss": 3.0633, + "step": 17798 + }, + { + "epoch": 0.7494315789473684, + "grad_norm": 0.439453125, + "learning_rate": 7.640689338036608e-05, + "loss": 3.2137, + "step": 17799 + }, + { + "epoch": 0.7494736842105263, + "grad_norm": 0.447265625, + "learning_rate": 7.638261204249783e-05, + "loss": 2.8879, + "step": 17800 + }, + { + "epoch": 0.7495157894736842, + "grad_norm": 0.412109375, + "learning_rate": 7.635833386773456e-05, + "loss": 3.1504, + "step": 17801 + }, + { + "epoch": 0.7495578947368421, + "grad_norm": 0.44921875, + "learning_rate": 7.633405885651826e-05, + "loss": 2.754, + "step": 17802 + }, + { + "epoch": 0.7496, + "grad_norm": 0.46875, + "learning_rate": 7.630978700929145e-05, + "loss": 3.0729, + "step": 17803 + }, + { + "epoch": 0.7496421052631579, + "grad_norm": 0.4296875, + "learning_rate": 7.628551832649625e-05, + "loss": 3.2097, + "step": 17804 + }, + { + "epoch": 0.7496842105263158, + "grad_norm": 0.451171875, + "learning_rate": 7.626125280857471e-05, + "loss": 3.1063, + "step": 17805 + }, + { + "epoch": 0.7497263157894737, + "grad_norm": 0.419921875, + "learning_rate": 7.623699045596916e-05, + "loss": 3.2728, + "step": 17806 + }, + { + "epoch": 0.7497684210526315, + "grad_norm": 0.443359375, + "learning_rate": 7.621273126912134e-05, + "loss": 2.7823, + "step": 17807 + }, + { + "epoch": 0.7498105263157895, + "grad_norm": 0.451171875, + "learning_rate": 7.618847524847349e-05, + "loss": 3.0244, + "step": 17808 + }, + { + "epoch": 0.7498526315789473, + "grad_norm": 0.466796875, + "learning_rate": 7.616422239446727e-05, + "loss": 2.8898, + "step": 17809 + }, + { + "epoch": 0.7498947368421053, + "grad_norm": 0.451171875, + "learning_rate": 7.613997270754474e-05, + "loss": 2.9765, + "step": 17810 + }, + { + "epoch": 0.7499368421052631, + "grad_norm": 0.44140625, + "learning_rate": 7.611572618814763e-05, + "loss": 3.5159, + "step": 17811 + }, + { + "epoch": 0.7499789473684211, + "grad_norm": 0.41796875, + "learning_rate": 7.609148283671771e-05, + "loss": 2.993, + "step": 17812 + }, + { + "epoch": 0.7500210526315789, + "grad_norm": 0.421875, + "learning_rate": 7.60672426536966e-05, + "loss": 3.0904, + "step": 17813 + }, + { + "epoch": 0.7500631578947369, + "grad_norm": 0.490234375, + "learning_rate": 7.60430056395259e-05, + "loss": 3.0397, + "step": 17814 + }, + { + "epoch": 0.7501052631578947, + "grad_norm": 0.412109375, + "learning_rate": 7.601877179464741e-05, + "loss": 3.0567, + "step": 17815 + }, + { + "epoch": 0.7501473684210527, + "grad_norm": 0.412109375, + "learning_rate": 7.599454111950232e-05, + "loss": 3.0329, + "step": 17816 + }, + { + "epoch": 0.7501894736842105, + "grad_norm": 0.42578125, + "learning_rate": 7.597031361453233e-05, + "loss": 3.4886, + "step": 17817 + }, + { + "epoch": 0.7502315789473685, + "grad_norm": 0.4296875, + "learning_rate": 7.594608928017874e-05, + "loss": 3.253, + "step": 17818 + }, + { + "epoch": 0.7502736842105263, + "grad_norm": 0.4296875, + "learning_rate": 7.59218681168829e-05, + "loss": 3.0556, + "step": 17819 + }, + { + "epoch": 0.7503157894736842, + "grad_norm": 0.44140625, + "learning_rate": 7.589765012508607e-05, + "loss": 3.2216, + "step": 17820 + }, + { + "epoch": 0.7503578947368421, + "grad_norm": 0.42578125, + "learning_rate": 7.587343530522944e-05, + "loss": 3.2909, + "step": 17821 + }, + { + "epoch": 0.7504, + "grad_norm": 0.51953125, + "learning_rate": 7.584922365775429e-05, + "loss": 3.2148, + "step": 17822 + }, + { + "epoch": 0.7504421052631579, + "grad_norm": 0.4453125, + "learning_rate": 7.582501518310167e-05, + "loss": 2.8767, + "step": 17823 + }, + { + "epoch": 0.7504842105263158, + "grad_norm": 0.45703125, + "learning_rate": 7.580080988171265e-05, + "loss": 3.209, + "step": 17824 + }, + { + "epoch": 0.7505263157894737, + "grad_norm": 0.431640625, + "learning_rate": 7.577660775402818e-05, + "loss": 3.1463, + "step": 17825 + }, + { + "epoch": 0.7505684210526316, + "grad_norm": 0.416015625, + "learning_rate": 7.57524088004892e-05, + "loss": 2.9476, + "step": 17826 + }, + { + "epoch": 0.7506105263157895, + "grad_norm": 0.458984375, + "learning_rate": 7.572821302153663e-05, + "loss": 3.1166, + "step": 17827 + }, + { + "epoch": 0.7506526315789473, + "grad_norm": 0.431640625, + "learning_rate": 7.570402041761121e-05, + "loss": 3.5198, + "step": 17828 + }, + { + "epoch": 0.7506947368421053, + "grad_norm": 0.447265625, + "learning_rate": 7.567983098915371e-05, + "loss": 3.1533, + "step": 17829 + }, + { + "epoch": 0.7507368421052631, + "grad_norm": 0.42578125, + "learning_rate": 7.565564473660494e-05, + "loss": 3.3258, + "step": 17830 + }, + { + "epoch": 0.7507789473684211, + "grad_norm": 0.42578125, + "learning_rate": 7.563146166040547e-05, + "loss": 3.378, + "step": 17831 + }, + { + "epoch": 0.750821052631579, + "grad_norm": 0.44921875, + "learning_rate": 7.56072817609959e-05, + "loss": 2.9641, + "step": 17832 + }, + { + "epoch": 0.7508631578947368, + "grad_norm": 0.41015625, + "learning_rate": 7.558310503881674e-05, + "loss": 2.9044, + "step": 17833 + }, + { + "epoch": 0.7509052631578947, + "grad_norm": 0.4140625, + "learning_rate": 7.555893149430851e-05, + "loss": 2.7828, + "step": 17834 + }, + { + "epoch": 0.7509473684210526, + "grad_norm": 0.41796875, + "learning_rate": 7.553476112791155e-05, + "loss": 2.9097, + "step": 17835 + }, + { + "epoch": 0.7509894736842105, + "grad_norm": 0.453125, + "learning_rate": 7.551059394006621e-05, + "loss": 2.7156, + "step": 17836 + }, + { + "epoch": 0.7510315789473684, + "grad_norm": 0.490234375, + "learning_rate": 7.548642993121288e-05, + "loss": 3.0724, + "step": 17837 + }, + { + "epoch": 0.7510736842105263, + "grad_norm": 0.453125, + "learning_rate": 7.546226910179177e-05, + "loss": 2.9513, + "step": 17838 + }, + { + "epoch": 0.7511157894736842, + "grad_norm": 0.43359375, + "learning_rate": 7.543811145224303e-05, + "loss": 3.4351, + "step": 17839 + }, + { + "epoch": 0.7511578947368421, + "grad_norm": 0.439453125, + "learning_rate": 7.541395698300679e-05, + "loss": 2.6673, + "step": 17840 + }, + { + "epoch": 0.7512, + "grad_norm": 0.400390625, + "learning_rate": 7.538980569452314e-05, + "loss": 3.2137, + "step": 17841 + }, + { + "epoch": 0.7512421052631579, + "grad_norm": 0.396484375, + "learning_rate": 7.536565758723207e-05, + "loss": 2.8238, + "step": 17842 + }, + { + "epoch": 0.7512842105263158, + "grad_norm": 0.419921875, + "learning_rate": 7.534151266157347e-05, + "loss": 3.0127, + "step": 17843 + }, + { + "epoch": 0.7513263157894737, + "grad_norm": 0.421875, + "learning_rate": 7.531737091798743e-05, + "loss": 3.286, + "step": 17844 + }, + { + "epoch": 0.7513684210526316, + "grad_norm": 0.40234375, + "learning_rate": 7.52932323569135e-05, + "loss": 2.7039, + "step": 17845 + }, + { + "epoch": 0.7514105263157895, + "grad_norm": 0.451171875, + "learning_rate": 7.526909697879167e-05, + "loss": 3.3882, + "step": 17846 + }, + { + "epoch": 0.7514526315789474, + "grad_norm": 0.462890625, + "learning_rate": 7.524496478406159e-05, + "loss": 2.9403, + "step": 17847 + }, + { + "epoch": 0.7514947368421052, + "grad_norm": 0.421875, + "learning_rate": 7.522083577316293e-05, + "loss": 3.196, + "step": 17848 + }, + { + "epoch": 0.7515368421052632, + "grad_norm": 0.4375, + "learning_rate": 7.519670994653529e-05, + "loss": 2.6317, + "step": 17849 + }, + { + "epoch": 0.751578947368421, + "grad_norm": 0.419921875, + "learning_rate": 7.517258730461815e-05, + "loss": 2.562, + "step": 17850 + }, + { + "epoch": 0.751621052631579, + "grad_norm": 0.423828125, + "learning_rate": 7.514846784785118e-05, + "loss": 3.1515, + "step": 17851 + }, + { + "epoch": 0.7516631578947368, + "grad_norm": 0.404296875, + "learning_rate": 7.512435157667354e-05, + "loss": 3.0127, + "step": 17852 + }, + { + "epoch": 0.7517052631578948, + "grad_norm": 0.42578125, + "learning_rate": 7.510023849152484e-05, + "loss": 3.2391, + "step": 17853 + }, + { + "epoch": 0.7517473684210526, + "grad_norm": 0.44140625, + "learning_rate": 7.507612859284427e-05, + "loss": 3.0148, + "step": 17854 + }, + { + "epoch": 0.7517894736842106, + "grad_norm": 0.3984375, + "learning_rate": 7.50520218810711e-05, + "loss": 3.3489, + "step": 17855 + }, + { + "epoch": 0.7518315789473684, + "grad_norm": 0.4296875, + "learning_rate": 7.502791835664458e-05, + "loss": 3.0919, + "step": 17856 + }, + { + "epoch": 0.7518736842105264, + "grad_norm": 0.447265625, + "learning_rate": 7.500381802000369e-05, + "loss": 3.1156, + "step": 17857 + }, + { + "epoch": 0.7519157894736842, + "grad_norm": 0.419921875, + "learning_rate": 7.497972087158778e-05, + "loss": 3.1994, + "step": 17858 + }, + { + "epoch": 0.7519578947368422, + "grad_norm": 0.40234375, + "learning_rate": 7.495562691183558e-05, + "loss": 2.8095, + "step": 17859 + }, + { + "epoch": 0.752, + "grad_norm": 0.431640625, + "learning_rate": 7.493153614118633e-05, + "loss": 3.2132, + "step": 17860 + }, + { + "epoch": 0.7520421052631578, + "grad_norm": 0.462890625, + "learning_rate": 7.490744856007864e-05, + "loss": 2.8838, + "step": 17861 + }, + { + "epoch": 0.7520842105263158, + "grad_norm": 0.4296875, + "learning_rate": 7.488336416895158e-05, + "loss": 2.9682, + "step": 17862 + }, + { + "epoch": 0.7521263157894736, + "grad_norm": 0.443359375, + "learning_rate": 7.48592829682439e-05, + "loss": 3.5944, + "step": 17863 + }, + { + "epoch": 0.7521684210526316, + "grad_norm": 0.39453125, + "learning_rate": 7.48352049583942e-05, + "loss": 2.9618, + "step": 17864 + }, + { + "epoch": 0.7522105263157894, + "grad_norm": 0.423828125, + "learning_rate": 7.481113013984142e-05, + "loss": 3.3649, + "step": 17865 + }, + { + "epoch": 0.7522526315789474, + "grad_norm": 0.490234375, + "learning_rate": 7.478705851302384e-05, + "loss": 2.653, + "step": 17866 + }, + { + "epoch": 0.7522947368421052, + "grad_norm": 0.431640625, + "learning_rate": 7.476299007838036e-05, + "loss": 3.1838, + "step": 17867 + }, + { + "epoch": 0.7523368421052632, + "grad_norm": 0.41015625, + "learning_rate": 7.473892483634915e-05, + "loss": 2.7595, + "step": 17868 + }, + { + "epoch": 0.752378947368421, + "grad_norm": 0.419921875, + "learning_rate": 7.471486278736888e-05, + "loss": 2.4145, + "step": 17869 + }, + { + "epoch": 0.752421052631579, + "grad_norm": 0.443359375, + "learning_rate": 7.469080393187786e-05, + "loss": 3.1352, + "step": 17870 + }, + { + "epoch": 0.7524631578947368, + "grad_norm": 0.46484375, + "learning_rate": 7.466674827031441e-05, + "loss": 2.647, + "step": 17871 + }, + { + "epoch": 0.7525052631578948, + "grad_norm": 0.42578125, + "learning_rate": 7.46426958031168e-05, + "loss": 3.1138, + "step": 17872 + }, + { + "epoch": 0.7525473684210526, + "grad_norm": 0.40234375, + "learning_rate": 7.461864653072317e-05, + "loss": 2.7776, + "step": 17873 + }, + { + "epoch": 0.7525894736842105, + "grad_norm": 0.431640625, + "learning_rate": 7.459460045357186e-05, + "loss": 3.0042, + "step": 17874 + }, + { + "epoch": 0.7526315789473684, + "grad_norm": 0.44140625, + "learning_rate": 7.457055757210071e-05, + "loss": 2.9169, + "step": 17875 + }, + { + "epoch": 0.7526736842105263, + "grad_norm": 0.408203125, + "learning_rate": 7.454651788674796e-05, + "loss": 3.1255, + "step": 17876 + }, + { + "epoch": 0.7527157894736842, + "grad_norm": 0.4140625, + "learning_rate": 7.452248139795148e-05, + "loss": 3.1371, + "step": 17877 + }, + { + "epoch": 0.7527578947368421, + "grad_norm": 0.44921875, + "learning_rate": 7.449844810614922e-05, + "loss": 3.0477, + "step": 17878 + }, + { + "epoch": 0.7528, + "grad_norm": 0.45703125, + "learning_rate": 7.447441801177903e-05, + "loss": 2.9108, + "step": 17879 + }, + { + "epoch": 0.7528421052631579, + "grad_norm": 0.439453125, + "learning_rate": 7.445039111527865e-05, + "loss": 3.1906, + "step": 17880 + }, + { + "epoch": 0.7528842105263158, + "grad_norm": 0.4296875, + "learning_rate": 7.442636741708602e-05, + "loss": 2.8447, + "step": 17881 + }, + { + "epoch": 0.7529263157894737, + "grad_norm": 0.50390625, + "learning_rate": 7.440234691763853e-05, + "loss": 3.03, + "step": 17882 + }, + { + "epoch": 0.7529684210526316, + "grad_norm": 0.439453125, + "learning_rate": 7.437832961737403e-05, + "loss": 2.5647, + "step": 17883 + }, + { + "epoch": 0.7530105263157895, + "grad_norm": 0.443359375, + "learning_rate": 7.435431551673005e-05, + "loss": 3.2236, + "step": 17884 + }, + { + "epoch": 0.7530526315789474, + "grad_norm": 0.423828125, + "learning_rate": 7.433030461614404e-05, + "loss": 3.224, + "step": 17885 + }, + { + "epoch": 0.7530947368421053, + "grad_norm": 0.41796875, + "learning_rate": 7.430629691605347e-05, + "loss": 3.074, + "step": 17886 + }, + { + "epoch": 0.7531368421052631, + "grad_norm": 0.44921875, + "learning_rate": 7.428229241689577e-05, + "loss": 3.2722, + "step": 17887 + }, + { + "epoch": 0.7531789473684211, + "grad_norm": 0.45703125, + "learning_rate": 7.42582911191082e-05, + "loss": 2.7538, + "step": 17888 + }, + { + "epoch": 0.7532210526315789, + "grad_norm": 0.421875, + "learning_rate": 7.423429302312804e-05, + "loss": 3.3023, + "step": 17889 + }, + { + "epoch": 0.7532631578947369, + "grad_norm": 0.447265625, + "learning_rate": 7.421029812939259e-05, + "loss": 3.398, + "step": 17890 + }, + { + "epoch": 0.7533052631578947, + "grad_norm": 0.419921875, + "learning_rate": 7.418630643833899e-05, + "loss": 3.2064, + "step": 17891 + }, + { + "epoch": 0.7533473684210527, + "grad_norm": 0.41796875, + "learning_rate": 7.416231795040429e-05, + "loss": 2.9016, + "step": 17892 + }, + { + "epoch": 0.7533894736842105, + "grad_norm": 0.42578125, + "learning_rate": 7.413833266602557e-05, + "loss": 3.1228, + "step": 17893 + }, + { + "epoch": 0.7534315789473685, + "grad_norm": 0.427734375, + "learning_rate": 7.411435058563981e-05, + "loss": 2.9197, + "step": 17894 + }, + { + "epoch": 0.7534736842105263, + "grad_norm": 0.427734375, + "learning_rate": 7.409037170968391e-05, + "loss": 3.3076, + "step": 17895 + }, + { + "epoch": 0.7535157894736842, + "grad_norm": 0.478515625, + "learning_rate": 7.406639603859467e-05, + "loss": 2.915, + "step": 17896 + }, + { + "epoch": 0.7535578947368421, + "grad_norm": 0.47265625, + "learning_rate": 7.404242357280908e-05, + "loss": 3.2305, + "step": 17897 + }, + { + "epoch": 0.7536, + "grad_norm": 0.447265625, + "learning_rate": 7.401845431276377e-05, + "loss": 3.1816, + "step": 17898 + }, + { + "epoch": 0.7536421052631579, + "grad_norm": 0.40625, + "learning_rate": 7.399448825889546e-05, + "loss": 2.893, + "step": 17899 + }, + { + "epoch": 0.7536842105263157, + "grad_norm": 0.41015625, + "learning_rate": 7.397052541164079e-05, + "loss": 3.0306, + "step": 17900 + }, + { + "epoch": 0.7537263157894737, + "grad_norm": 0.419921875, + "learning_rate": 7.394656577143629e-05, + "loss": 3.0031, + "step": 17901 + }, + { + "epoch": 0.7537684210526315, + "grad_norm": 0.4453125, + "learning_rate": 7.392260933871852e-05, + "loss": 3.2557, + "step": 17902 + }, + { + "epoch": 0.7538105263157895, + "grad_norm": 0.431640625, + "learning_rate": 7.389865611392394e-05, + "loss": 3.0523, + "step": 17903 + }, + { + "epoch": 0.7538526315789473, + "grad_norm": 0.421875, + "learning_rate": 7.387470609748883e-05, + "loss": 2.7884, + "step": 17904 + }, + { + "epoch": 0.7538947368421053, + "grad_norm": 0.419921875, + "learning_rate": 7.385075928984975e-05, + "loss": 3.1049, + "step": 17905 + }, + { + "epoch": 0.7539368421052631, + "grad_norm": 0.4140625, + "learning_rate": 7.382681569144284e-05, + "loss": 2.9251, + "step": 17906 + }, + { + "epoch": 0.7539789473684211, + "grad_norm": 0.4609375, + "learning_rate": 7.380287530270435e-05, + "loss": 3.2891, + "step": 17907 + }, + { + "epoch": 0.7540210526315789, + "grad_norm": 0.4609375, + "learning_rate": 7.377893812407046e-05, + "loss": 3.272, + "step": 17908 + }, + { + "epoch": 0.7540631578947369, + "grad_norm": 0.4453125, + "learning_rate": 7.375500415597727e-05, + "loss": 2.7189, + "step": 17909 + }, + { + "epoch": 0.7541052631578947, + "grad_norm": 0.458984375, + "learning_rate": 7.373107339886084e-05, + "loss": 2.7377, + "step": 17910 + }, + { + "epoch": 0.7541473684210527, + "grad_norm": 0.421875, + "learning_rate": 7.370714585315704e-05, + "loss": 3.4069, + "step": 17911 + }, + { + "epoch": 0.7541894736842105, + "grad_norm": 0.416015625, + "learning_rate": 7.368322151930201e-05, + "loss": 2.7286, + "step": 17912 + }, + { + "epoch": 0.7542315789473685, + "grad_norm": 0.423828125, + "learning_rate": 7.36593003977315e-05, + "loss": 2.8868, + "step": 17913 + }, + { + "epoch": 0.7542736842105263, + "grad_norm": 0.4296875, + "learning_rate": 7.363538248888136e-05, + "loss": 2.8195, + "step": 17914 + }, + { + "epoch": 0.7543157894736842, + "grad_norm": 0.416015625, + "learning_rate": 7.361146779318731e-05, + "loss": 2.8003, + "step": 17915 + }, + { + "epoch": 0.7543578947368421, + "grad_norm": 0.427734375, + "learning_rate": 7.358755631108507e-05, + "loss": 2.9281, + "step": 17916 + }, + { + "epoch": 0.7544, + "grad_norm": 0.458984375, + "learning_rate": 7.35636480430103e-05, + "loss": 2.9806, + "step": 17917 + }, + { + "epoch": 0.7544421052631579, + "grad_norm": 0.4140625, + "learning_rate": 7.353974298939847e-05, + "loss": 2.6939, + "step": 17918 + }, + { + "epoch": 0.7544842105263158, + "grad_norm": 0.44921875, + "learning_rate": 7.351584115068535e-05, + "loss": 2.8462, + "step": 17919 + }, + { + "epoch": 0.7545263157894737, + "grad_norm": 0.416015625, + "learning_rate": 7.349194252730607e-05, + "loss": 3.0892, + "step": 17920 + }, + { + "epoch": 0.7545684210526316, + "grad_norm": 0.427734375, + "learning_rate": 7.346804711969632e-05, + "loss": 3.2774, + "step": 17921 + }, + { + "epoch": 0.7546105263157895, + "grad_norm": 0.4375, + "learning_rate": 7.344415492829129e-05, + "loss": 3.2071, + "step": 17922 + }, + { + "epoch": 0.7546526315789474, + "grad_norm": 0.427734375, + "learning_rate": 7.342026595352628e-05, + "loss": 3.2088, + "step": 17923 + }, + { + "epoch": 0.7546947368421053, + "grad_norm": 0.416015625, + "learning_rate": 7.339638019583666e-05, + "loss": 3.3859, + "step": 17924 + }, + { + "epoch": 0.7547368421052632, + "grad_norm": 0.423828125, + "learning_rate": 7.337249765565737e-05, + "loss": 2.9565, + "step": 17925 + }, + { + "epoch": 0.7547789473684211, + "grad_norm": 0.43359375, + "learning_rate": 7.33486183334238e-05, + "loss": 3.3, + "step": 17926 + }, + { + "epoch": 0.754821052631579, + "grad_norm": 0.4609375, + "learning_rate": 7.33247422295707e-05, + "loss": 3.1472, + "step": 17927 + }, + { + "epoch": 0.7548631578947368, + "grad_norm": 0.423828125, + "learning_rate": 7.330086934453326e-05, + "loss": 3.2403, + "step": 17928 + }, + { + "epoch": 0.7549052631578947, + "grad_norm": 0.404296875, + "learning_rate": 7.327699967874638e-05, + "loss": 3.327, + "step": 17929 + }, + { + "epoch": 0.7549473684210526, + "grad_norm": 0.435546875, + "learning_rate": 7.325313323264493e-05, + "loss": 2.9435, + "step": 17930 + }, + { + "epoch": 0.7549894736842105, + "grad_norm": 0.431640625, + "learning_rate": 7.322927000666371e-05, + "loss": 3.4752, + "step": 17931 + }, + { + "epoch": 0.7550315789473684, + "grad_norm": 0.43359375, + "learning_rate": 7.320541000123743e-05, + "loss": 3.0975, + "step": 17932 + }, + { + "epoch": 0.7550736842105263, + "grad_norm": 0.4296875, + "learning_rate": 7.3181553216801e-05, + "loss": 3.533, + "step": 17933 + }, + { + "epoch": 0.7551157894736842, + "grad_norm": 0.443359375, + "learning_rate": 7.315769965378877e-05, + "loss": 3.1239, + "step": 17934 + }, + { + "epoch": 0.7551578947368421, + "grad_norm": 0.451171875, + "learning_rate": 7.313384931263565e-05, + "loss": 3.1477, + "step": 17935 + }, + { + "epoch": 0.7552, + "grad_norm": 0.427734375, + "learning_rate": 7.311000219377578e-05, + "loss": 3.0842, + "step": 17936 + }, + { + "epoch": 0.7552421052631579, + "grad_norm": 0.427734375, + "learning_rate": 7.308615829764395e-05, + "loss": 2.9766, + "step": 17937 + }, + { + "epoch": 0.7552842105263158, + "grad_norm": 0.408203125, + "learning_rate": 7.306231762467445e-05, + "loss": 3.3122, + "step": 17938 + }, + { + "epoch": 0.7553263157894737, + "grad_norm": 0.4375, + "learning_rate": 7.303848017530151e-05, + "loss": 3.195, + "step": 17939 + }, + { + "epoch": 0.7553684210526316, + "grad_norm": 0.435546875, + "learning_rate": 7.301464594995974e-05, + "loss": 2.8319, + "step": 17940 + }, + { + "epoch": 0.7554105263157894, + "grad_norm": 0.4296875, + "learning_rate": 7.299081494908299e-05, + "loss": 2.7748, + "step": 17941 + }, + { + "epoch": 0.7554526315789474, + "grad_norm": 0.40234375, + "learning_rate": 7.29669871731058e-05, + "loss": 2.7599, + "step": 17942 + }, + { + "epoch": 0.7554947368421052, + "grad_norm": 0.416015625, + "learning_rate": 7.294316262246189e-05, + "loss": 3.0617, + "step": 17943 + }, + { + "epoch": 0.7555368421052632, + "grad_norm": 0.412109375, + "learning_rate": 7.291934129758565e-05, + "loss": 3.6084, + "step": 17944 + }, + { + "epoch": 0.755578947368421, + "grad_norm": 0.427734375, + "learning_rate": 7.289552319891094e-05, + "loss": 3.1954, + "step": 17945 + }, + { + "epoch": 0.755621052631579, + "grad_norm": 0.416015625, + "learning_rate": 7.28717083268717e-05, + "loss": 2.8634, + "step": 17946 + }, + { + "epoch": 0.7556631578947368, + "grad_norm": 0.40625, + "learning_rate": 7.284789668190181e-05, + "loss": 2.6953, + "step": 17947 + }, + { + "epoch": 0.7557052631578948, + "grad_norm": 0.416015625, + "learning_rate": 7.282408826443505e-05, + "loss": 3.3015, + "step": 17948 + }, + { + "epoch": 0.7557473684210526, + "grad_norm": 0.4140625, + "learning_rate": 7.280028307490535e-05, + "loss": 3.1275, + "step": 17949 + }, + { + "epoch": 0.7557894736842106, + "grad_norm": 0.45703125, + "learning_rate": 7.277648111374616e-05, + "loss": 3.4385, + "step": 17950 + }, + { + "epoch": 0.7558315789473684, + "grad_norm": 0.400390625, + "learning_rate": 7.275268238139133e-05, + "loss": 2.8682, + "step": 17951 + }, + { + "epoch": 0.7558736842105264, + "grad_norm": 0.4140625, + "learning_rate": 7.272888687827434e-05, + "loss": 3.1425, + "step": 17952 + }, + { + "epoch": 0.7559157894736842, + "grad_norm": 0.443359375, + "learning_rate": 7.270509460482878e-05, + "loss": 2.4862, + "step": 17953 + }, + { + "epoch": 0.755957894736842, + "grad_norm": 0.435546875, + "learning_rate": 7.268130556148805e-05, + "loss": 3.1172, + "step": 17954 + }, + { + "epoch": 0.756, + "grad_norm": 0.4375, + "learning_rate": 7.265751974868553e-05, + "loss": 3.3407, + "step": 17955 + }, + { + "epoch": 0.7560421052631578, + "grad_norm": 0.4140625, + "learning_rate": 7.263373716685481e-05, + "loss": 2.8891, + "step": 17956 + }, + { + "epoch": 0.7560842105263158, + "grad_norm": 0.4375, + "learning_rate": 7.260995781642881e-05, + "loss": 3.3673, + "step": 17957 + }, + { + "epoch": 0.7561263157894736, + "grad_norm": 0.42578125, + "learning_rate": 7.258618169784106e-05, + "loss": 3.0176, + "step": 17958 + }, + { + "epoch": 0.7561684210526316, + "grad_norm": 0.439453125, + "learning_rate": 7.256240881152462e-05, + "loss": 3.1554, + "step": 17959 + }, + { + "epoch": 0.7562105263157894, + "grad_norm": 0.515625, + "learning_rate": 7.253863915791262e-05, + "loss": 3.3399, + "step": 17960 + }, + { + "epoch": 0.7562526315789474, + "grad_norm": 0.400390625, + "learning_rate": 7.25148727374381e-05, + "loss": 2.9801, + "step": 17961 + }, + { + "epoch": 0.7562947368421052, + "grad_norm": 0.412109375, + "learning_rate": 7.249110955053406e-05, + "loss": 3.0449, + "step": 17962 + }, + { + "epoch": 0.7563368421052632, + "grad_norm": 0.42578125, + "learning_rate": 7.246734959763343e-05, + "loss": 3.2464, + "step": 17963 + }, + { + "epoch": 0.756378947368421, + "grad_norm": 0.416015625, + "learning_rate": 7.244359287916901e-05, + "loss": 3.0581, + "step": 17964 + }, + { + "epoch": 0.756421052631579, + "grad_norm": 0.5390625, + "learning_rate": 7.241983939557381e-05, + "loss": 3.0583, + "step": 17965 + }, + { + "epoch": 0.7564631578947368, + "grad_norm": 0.412109375, + "learning_rate": 7.239608914728049e-05, + "loss": 3.0835, + "step": 17966 + }, + { + "epoch": 0.7565052631578948, + "grad_norm": 0.4296875, + "learning_rate": 7.237234213472174e-05, + "loss": 2.927, + "step": 17967 + }, + { + "epoch": 0.7565473684210526, + "grad_norm": 0.423828125, + "learning_rate": 7.234859835833022e-05, + "loss": 2.768, + "step": 17968 + }, + { + "epoch": 0.7565894736842105, + "grad_norm": 0.455078125, + "learning_rate": 7.232485781853849e-05, + "loss": 3.6237, + "step": 17969 + }, + { + "epoch": 0.7566315789473684, + "grad_norm": 0.431640625, + "learning_rate": 7.2301120515779e-05, + "loss": 3.1102, + "step": 17970 + }, + { + "epoch": 0.7566736842105263, + "grad_norm": 0.423828125, + "learning_rate": 7.227738645048443e-05, + "loss": 3.2453, + "step": 17971 + }, + { + "epoch": 0.7567157894736842, + "grad_norm": 0.404296875, + "learning_rate": 7.225365562308703e-05, + "loss": 3.1052, + "step": 17972 + }, + { + "epoch": 0.7567578947368421, + "grad_norm": 0.43359375, + "learning_rate": 7.222992803401918e-05, + "loss": 3.3128, + "step": 17973 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4140625, + "learning_rate": 7.22062036837132e-05, + "loss": 3.2558, + "step": 17974 + }, + { + "epoch": 0.7568421052631579, + "grad_norm": 0.447265625, + "learning_rate": 7.218248257260127e-05, + "loss": 2.6788, + "step": 17975 + }, + { + "epoch": 0.7568842105263158, + "grad_norm": 0.431640625, + "learning_rate": 7.215876470111558e-05, + "loss": 3.2525, + "step": 17976 + }, + { + "epoch": 0.7569263157894737, + "grad_norm": 0.427734375, + "learning_rate": 7.213505006968815e-05, + "loss": 3.3588, + "step": 17977 + }, + { + "epoch": 0.7569684210526316, + "grad_norm": 0.419921875, + "learning_rate": 7.21113386787513e-05, + "loss": 3.3014, + "step": 17978 + }, + { + "epoch": 0.7570105263157895, + "grad_norm": 0.427734375, + "learning_rate": 7.208763052873667e-05, + "loss": 2.8016, + "step": 17979 + }, + { + "epoch": 0.7570526315789474, + "grad_norm": 0.431640625, + "learning_rate": 7.20639256200765e-05, + "loss": 3.109, + "step": 17980 + }, + { + "epoch": 0.7570947368421053, + "grad_norm": 0.451171875, + "learning_rate": 7.204022395320251e-05, + "loss": 3.0281, + "step": 17981 + }, + { + "epoch": 0.7571368421052631, + "grad_norm": 0.40625, + "learning_rate": 7.201652552854654e-05, + "loss": 2.9049, + "step": 17982 + }, + { + "epoch": 0.7571789473684211, + "grad_norm": 0.435546875, + "learning_rate": 7.199283034654036e-05, + "loss": 3.2437, + "step": 17983 + }, + { + "epoch": 0.7572210526315789, + "grad_norm": 0.404296875, + "learning_rate": 7.196913840761557e-05, + "loss": 3.3204, + "step": 17984 + }, + { + "epoch": 0.7572631578947369, + "grad_norm": 0.421875, + "learning_rate": 7.194544971220407e-05, + "loss": 2.6717, + "step": 17985 + }, + { + "epoch": 0.7573052631578947, + "grad_norm": 0.470703125, + "learning_rate": 7.19217642607371e-05, + "loss": 2.8856, + "step": 17986 + }, + { + "epoch": 0.7573473684210527, + "grad_norm": 0.439453125, + "learning_rate": 7.189808205364645e-05, + "loss": 3.191, + "step": 17987 + }, + { + "epoch": 0.7573894736842105, + "grad_norm": 0.455078125, + "learning_rate": 7.187440309136348e-05, + "loss": 3.4122, + "step": 17988 + }, + { + "epoch": 0.7574315789473685, + "grad_norm": 0.4765625, + "learning_rate": 7.185072737431957e-05, + "loss": 3.0196, + "step": 17989 + }, + { + "epoch": 0.7574736842105263, + "grad_norm": 0.421875, + "learning_rate": 7.182705490294614e-05, + "loss": 3.4915, + "step": 17990 + }, + { + "epoch": 0.7575157894736843, + "grad_norm": 0.396484375, + "learning_rate": 7.18033856776743e-05, + "loss": 2.9076, + "step": 17991 + }, + { + "epoch": 0.7575578947368421, + "grad_norm": 0.41796875, + "learning_rate": 7.17797196989356e-05, + "loss": 3.1533, + "step": 17992 + }, + { + "epoch": 0.7576, + "grad_norm": 0.431640625, + "learning_rate": 7.175605696716082e-05, + "loss": 2.7835, + "step": 17993 + }, + { + "epoch": 0.7576421052631579, + "grad_norm": 0.408203125, + "learning_rate": 7.17323974827814e-05, + "loss": 3.0588, + "step": 17994 + }, + { + "epoch": 0.7576842105263157, + "grad_norm": 0.421875, + "learning_rate": 7.170874124622812e-05, + "loss": 3.0316, + "step": 17995 + }, + { + "epoch": 0.7577263157894737, + "grad_norm": 0.42578125, + "learning_rate": 7.168508825793216e-05, + "loss": 3.2328, + "step": 17996 + }, + { + "epoch": 0.7577684210526315, + "grad_norm": 0.435546875, + "learning_rate": 7.166143851832434e-05, + "loss": 3.0991, + "step": 17997 + }, + { + "epoch": 0.7578105263157895, + "grad_norm": 0.427734375, + "learning_rate": 7.163779202783555e-05, + "loss": 3.1678, + "step": 17998 + }, + { + "epoch": 0.7578526315789473, + "grad_norm": 0.423828125, + "learning_rate": 7.161414878689677e-05, + "loss": 3.1859, + "step": 17999 + }, + { + "epoch": 0.7578947368421053, + "grad_norm": 0.41015625, + "learning_rate": 7.159050879593843e-05, + "loss": 3.1153, + "step": 18000 + }, + { + "epoch": 0.7578947368421053, + "eval_loss": 3.08333420753479, + "eval_runtime": 335.6342, + "eval_samples_per_second": 44.692, + "eval_steps_per_second": 5.586, + "step": 18000 + } + ], + "logging_steps": 1, + "max_steps": 23750, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 2, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.28297586098176e+17, + "train_batch_size": 12, + "trial_name": null, + "trial_params": null +}